-
Notifications
You must be signed in to change notification settings - Fork 160
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2192 from moj-analytical-services/group_linker_fu…
…nctions Group linker functions thematically
- Loading branch information
Showing
99 changed files
with
129,657 additions
and
120,638 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,132 +1,132 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"!!! warning \"Work in Progress\"\n", | ||
" This page is currently under construction. \n", | ||
"\n", | ||
"# `cluster_studio_dashboard`\n", | ||
"\n", | ||
"!!! info \"At a glance\"\n", | ||
" **Useful for:** \n", | ||
"\n", | ||
" **API Documentation:** [cluster_studio_dashboard()](../linker.md#splink.linker.Linker.cluster_studio_dashboard)\n", | ||
"\n", | ||
" **What is needed to generate the chart?** " | ||
] | ||
"cells": [ | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"!!! warning \"Work in Progress\"\n", | ||
" This page is currently under construction. \n", | ||
"\n", | ||
"# `cluster_studio_dashboard`\n", | ||
"\n", | ||
"!!! info \"At a glance\"\n", | ||
" **Useful for:** \n", | ||
"\n", | ||
" **API Documentation:** [cluster_studio_dashboard()](../linker.md#splink.linker.linker.visualisations.cluster_studio_dashboard)\n", | ||
"\n", | ||
" **What is needed to generate the chart?** " | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Worked Example" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from splink.duckdb.linker import DuckDBLinker\n", | ||
"import splink.duckdb.comparison_library as cl\n", | ||
"import splink.duckdb.comparison_template_library as ctl\n", | ||
"from splink.duckdb.blocking_rule_library import block_on\n", | ||
"from splink.datasets import splink_datasets\n", | ||
"import logging, sys\n", | ||
"logging.disable(sys.maxsize)\n", | ||
"\n", | ||
"df = splink_datasets.fake_1000\n", | ||
"\n", | ||
"settings = {\n", | ||
" \"link_type\": \"dedupe_only\",\n", | ||
" \"blocking_rules_to_generate_predictions\": [\n", | ||
" block_on(\"first_name\"),\n", | ||
" block_on(\"surname\"),\n", | ||
" ],\n", | ||
" \"comparisons\": [\n", | ||
" ctl.name_comparison(\"first_name\"),\n", | ||
" ctl.name_comparison(\"surname\"),\n", | ||
" ctl.date_comparison(\"dob\", cast_strings_to_date=True),\n", | ||
" cl.exact_match(\"city\", term_frequency_adjustments=True),\n", | ||
" ctl.email_comparison(\"email\", include_username_fuzzy_level=False),\n", | ||
" ],\n", | ||
" \"retain_intermediate_calculation_columns\": True,\n", | ||
" \"retain_matching_columns\":True,\n", | ||
"}\n", | ||
"\n", | ||
"linker = DuckDBLinker(df, settings)\n", | ||
"linker.training.estimate_u_using_random_sampling(max_pairs=1e6)\n", | ||
"\n", | ||
"blocking_rule_for_training = block_on([\"first_name\", \"surname\"])\n", | ||
"\n", | ||
"linker.training.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)\n", | ||
"\n", | ||
"blocking_rule_for_training = block_on(\"dob\")\n", | ||
"linker.training.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)\n", | ||
"\n", | ||
"df_predictions = linker.inference.predict(threshold_match_probability=0.2)\n", | ||
"df_clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(df_predictions, threshold_match_probability=0.5)\n", | ||
"\n", | ||
"linker.visualisations.cluster_studio_dashboard(df_predictions, df_clusters, \"img/cluster_studio.html\", sampling_method=\"by_cluster_size\", overwrite=True)\n", | ||
"\n", | ||
"# You can view the cluster_studio.html file in your browser, or inline in a notebook as follows\n", | ||
"from IPython.display import IFrame\n", | ||
"IFrame(\n", | ||
" src=\"./img/cluster_studio.html\", width=\"100%\", height=1200\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### What the chart shows\n" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### How to interpret the chart\n" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Actions to take as a result of the chart\n", | ||
"\n" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "base", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.12" | ||
}, | ||
"orig_nbformat": 4 | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Worked Example" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from splink.duckdb.linker import DuckDBLinker\n", | ||
"import splink.duckdb.comparison_library as cl\n", | ||
"import splink.duckdb.comparison_template_library as ctl\n", | ||
"from splink.duckdb.blocking_rule_library import block_on\n", | ||
"from splink.datasets import splink_datasets\n", | ||
"import logging, sys\n", | ||
"logging.disable(sys.maxsize)\n", | ||
"\n", | ||
"df = splink_datasets.fake_1000\n", | ||
"\n", | ||
"settings = {\n", | ||
" \"link_type\": \"dedupe_only\",\n", | ||
" \"blocking_rules_to_generate_predictions\": [\n", | ||
" block_on(\"first_name\"),\n", | ||
" block_on(\"surname\"),\n", | ||
" ],\n", | ||
" \"comparisons\": [\n", | ||
" ctl.name_comparison(\"first_name\"),\n", | ||
" ctl.name_comparison(\"surname\"),\n", | ||
" ctl.date_comparison(\"dob\", cast_strings_to_date=True),\n", | ||
" cl.exact_match(\"city\", term_frequency_adjustments=True),\n", | ||
" ctl.email_comparison(\"email\", include_username_fuzzy_level=False),\n", | ||
" ],\n", | ||
" \"retain_intermediate_calculation_columns\": True,\n", | ||
" \"retain_matching_columns\":True,\n", | ||
"}\n", | ||
"\n", | ||
"linker = DuckDBLinker(df, settings)\n", | ||
"linker.estimate_u_using_random_sampling(max_pairs=1e6)\n", | ||
"\n", | ||
"blocking_rule_for_training = block_on([\"first_name\", \"surname\"])\n", | ||
"\n", | ||
"linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)\n", | ||
"\n", | ||
"blocking_rule_for_training = block_on(\"dob\")\n", | ||
"linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)\n", | ||
"\n", | ||
"df_predictions = linker.predict(threshold_match_probability=0.2)\n", | ||
"df_clusters = linker.cluster_pairwise_predictions_at_threshold(df_predictions, threshold_match_probability=0.5)\n", | ||
"\n", | ||
"linker.cluster_studio_dashboard(df_predictions, df_clusters, \"img/cluster_studio.html\", sampling_method=\"by_cluster_size\", overwrite=True)\n", | ||
"\n", | ||
"# You can view the cluster_studio.html file in your browser, or inline in a notebook as follows\n", | ||
"from IPython.display import IFrame\n", | ||
"IFrame(\n", | ||
" src=\"./img/cluster_studio.html\", width=\"100%\", height=1200\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### What the chart shows\n" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### How to interpret the chart\n" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Actions to take as a result of the chart\n", | ||
"\n" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "base", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.12" | ||
}, | ||
"orig_nbformat": 4 | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.