Skip to content

Commit

Permalink
Merge pull request #2192 from moj-analytical-services/group_linker_fu…
Browse files Browse the repository at this point in the history
…nctions

Group linker functions thematically
  • Loading branch information
RobinL authored Jun 7, 2024
2 parents 5f59ebc + 8d8482b commit a99f4e0
Show file tree
Hide file tree
Showing 99 changed files with 129,657 additions and 120,638 deletions.
436 changes: 218 additions & 218 deletions docs/charts/accuracy_chart_from_labels_table.ipynb

Large diffs are not rendered by default.

260 changes: 130 additions & 130 deletions docs/charts/cluster_studio_dashboard.ipynb
Original file line number Diff line number Diff line change
@@ -1,132 +1,132 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"!!! warning \"Work in Progress\"\n",
" This page is currently under construction. \n",
"\n",
"# `cluster_studio_dashboard`\n",
"\n",
"!!! info \"At a glance\"\n",
" **Useful for:** \n",
"\n",
" **API Documentation:** [cluster_studio_dashboard()](../linker.md#splink.linker.Linker.cluster_studio_dashboard)\n",
"\n",
" **What is needed to generate the chart?** "
]
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"!!! warning \"Work in Progress\"\n",
" This page is currently under construction. \n",
"\n",
"# `cluster_studio_dashboard`\n",
"\n",
"!!! info \"At a glance\"\n",
" **Useful for:** \n",
"\n",
" **API Documentation:** [cluster_studio_dashboard()](../linker.md#splink.linker.linker.visualisations.cluster_studio_dashboard)\n",
"\n",
" **What is needed to generate the chart?** "
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Worked Example"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from splink.duckdb.linker import DuckDBLinker\n",
"import splink.duckdb.comparison_library as cl\n",
"import splink.duckdb.comparison_template_library as ctl\n",
"from splink.duckdb.blocking_rule_library import block_on\n",
"from splink.datasets import splink_datasets\n",
"import logging, sys\n",
"logging.disable(sys.maxsize)\n",
"\n",
"df = splink_datasets.fake_1000\n",
"\n",
"settings = {\n",
" \"link_type\": \"dedupe_only\",\n",
" \"blocking_rules_to_generate_predictions\": [\n",
" block_on(\"first_name\"),\n",
" block_on(\"surname\"),\n",
" ],\n",
" \"comparisons\": [\n",
" ctl.name_comparison(\"first_name\"),\n",
" ctl.name_comparison(\"surname\"),\n",
" ctl.date_comparison(\"dob\", cast_strings_to_date=True),\n",
" cl.exact_match(\"city\", term_frequency_adjustments=True),\n",
" ctl.email_comparison(\"email\", include_username_fuzzy_level=False),\n",
" ],\n",
" \"retain_intermediate_calculation_columns\": True,\n",
" \"retain_matching_columns\":True,\n",
"}\n",
"\n",
"linker = DuckDBLinker(df, settings)\n",
"linker.training.estimate_u_using_random_sampling(max_pairs=1e6)\n",
"\n",
"blocking_rule_for_training = block_on([\"first_name\", \"surname\"])\n",
"\n",
"linker.training.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)\n",
"\n",
"blocking_rule_for_training = block_on(\"dob\")\n",
"linker.training.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)\n",
"\n",
"df_predictions = linker.inference.predict(threshold_match_probability=0.2)\n",
"df_clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(df_predictions, threshold_match_probability=0.5)\n",
"\n",
"linker.visualisations.cluster_studio_dashboard(df_predictions, df_clusters, \"img/cluster_studio.html\", sampling_method=\"by_cluster_size\", overwrite=True)\n",
"\n",
"# You can view the scv.html file in your browser, or inline in a notbook as follows\n",
"from IPython.display import IFrame\n",
"IFrame(\n",
" src=\"./img/cluster_studio.html\", width=\"100%\", height=1200\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### What the chart shows\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### How to interpret the chart\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Actions to take as a result of the chart\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"orig_nbformat": 4
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Worked Example"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from splink.duckdb.linker import DuckDBLinker\n",
"import splink.duckdb.comparison_library as cl\n",
"import splink.duckdb.comparison_template_library as ctl\n",
"from splink.duckdb.blocking_rule_library import block_on\n",
"from splink.datasets import splink_datasets\n",
"import logging, sys\n",
"logging.disable(sys.maxsize)\n",
"\n",
"df = splink_datasets.fake_1000\n",
"\n",
"settings = {\n",
" \"link_type\": \"dedupe_only\",\n",
" \"blocking_rules_to_generate_predictions\": [\n",
" block_on(\"first_name\"),\n",
" block_on(\"surname\"),\n",
" ],\n",
" \"comparisons\": [\n",
" ctl.name_comparison(\"first_name\"),\n",
" ctl.name_comparison(\"surname\"),\n",
" ctl.date_comparison(\"dob\", cast_strings_to_date=True),\n",
" cl.exact_match(\"city\", term_frequency_adjustments=True),\n",
" ctl.email_comparison(\"email\", include_username_fuzzy_level=False),\n",
" ],\n",
" \"retain_intermediate_calculation_columns\": True,\n",
" \"retain_matching_columns\":True,\n",
"}\n",
"\n",
"linker = DuckDBLinker(df, settings)\n",
"linker.estimate_u_using_random_sampling(max_pairs=1e6)\n",
"\n",
"blocking_rule_for_training = block_on([\"first_name\", \"surname\"])\n",
"\n",
"linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)\n",
"\n",
"blocking_rule_for_training = block_on(\"dob\")\n",
"linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)\n",
"\n",
"df_predictions = linker.predict(threshold_match_probability=0.2)\n",
"df_clusters = linker.cluster_pairwise_predictions_at_threshold(df_predictions, threshold_match_probability=0.5)\n",
"\n",
"linker.cluster_studio_dashboard(df_predictions, df_clusters, \"img/cluster_studio.html\", sampling_method=\"by_cluster_size\", overwrite=True)\n",
"\n",
"# You can view the scv.html file in your browser, or inline in a notbook as follows\n",
"from IPython.display import IFrame\n",
"IFrame(\n",
" src=\"./img/cluster_studio.html\", width=\"100%\", height=1200\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### What the chart shows\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### How to interpret the chart\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Actions to take as a result of the chart\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit a99f4e0

Please sign in to comment.