diff --git a/docs/demos/examples/athena/deduplicate_50k_synthetic.ipynb b/docs/demos/examples/athena/deduplicate_50k_synthetic.ipynb index 3a34dc0e14..3dd987d4a0 100644 --- a/docs/demos/examples/athena/deduplicate_50k_synthetic.ipynb +++ b/docs/demos/examples/athena/deduplicate_50k_synthetic.ipynb @@ -97389,7 +97389,7 @@ } ], "source": [ - "records = linker.inference.prediction_errors_from_labels_column(\n", + "records = linker.evaluation.prediction_errors_from_labels_column(\n", " \"cluster\",\n", " threshold=0.999,\n", " include_false_negatives=False,\n", @@ -109042,7 +109042,7 @@ ], "source": [ "# Some of the false negatives will be because they weren't detected by the blocking rules\n", - "records = linker.inference.prediction_errors_from_labels_column(\n", + "records = linker.evaluation.prediction_errors_from_labels_column(\n", " \"cluster\",\n", " threshold=0.5,\n", " include_false_negatives=True,\n", diff --git a/docs/demos/examples/duckdb/accuracy_analysis_from_labels_column.ipynb b/docs/demos/examples/duckdb/accuracy_analysis_from_labels_column.ipynb index c0d1e55324..2575f82b77 100644 --- a/docs/demos/examples/duckdb/accuracy_analysis_from_labels_column.ipynb +++ b/docs/demos/examples/duckdb/accuracy_analysis_from_labels_column.ipynb @@ -1,272 +1,1337 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Evaluation when you have fully labelled data\n", - "\n", - "In this example, our data contains a fully-populated ground-truth column called `cluster` that enables us to perform accuracy analysis of the final model\n" + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation when you have fully labelled data\n", + "\n", + "In this example, our data contains a fully-populated ground-truth column called `cluster` that enables us to perform accuracy analysis of the final model\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "\n" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:16.264709Z", + "iopub.status.busy": "2024-06-07T09:09:16.264397Z", + "iopub.status.idle": "2024-06-07T09:09:16.269613Z", + "shell.execute_reply": "2024-06-07T09:09:16.268968Z" + } + }, + "outputs": [], + "source": [ + "# Uncomment and run this cell if you're running in Google Colab.\n", + "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:16.273849Z", + "iopub.status.busy": "2024-06-07T09:09:16.273306Z", + "iopub.status.idle": "2024-06-07T09:09:17.467426Z", + "shell.execute_reply": "2024-06-07T09:09:17.466787Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_idfirst_namesurnamedobcityemailcluster
00RobertAlan1971-06-24NaNrobert255@smith.net0
11RobertAllen1971-05-24NaNroberta25@smith.net0
\n", + "
" + ], + "text/plain": [ + " unique_id first_name surname dob city email cluster\n", + "0 0 Robert Alan 1971-06-24 NaN robert255@smith.net 0\n", + "1 1 Robert Allen 1971-05-24 NaN roberta25@smith.net 0" ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink import splink_datasets\n", + "\n", + "df = splink_datasets.fake_1000\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:17.501913Z", + "iopub.status.busy": "2024-06-07T09:09:17.501641Z", + "iopub.status.idle": "2024-06-07T09:09:17.581434Z", + "shell.execute_reply": "2024-06-07T09:09:17.580667Z" + } + }, + "outputs": [], + "source": [ + "from splink import SettingsCreator, Linker, block_on, DuckDBAPI\n", + "import splink.comparison_template_library as ctl\n", + "import splink.comparison_library as cl\n", + "\n", + "settings = SettingsCreator(\n", + " link_type=\"dedupe_only\",\n", + " blocking_rules_to_generate_predictions=[\n", + " block_on(\"first_name\"),\n", + " block_on(\"surname\"),\n", + " ],\n", + " comparisons=[\n", + " ctl.NameComparison(\"first_name\"),\n", + " ctl.NameComparison(\"surname\"),\n", + " ctl.DateComparison(\n", + " \"dob\",\n", + " input_is_string=True,\n", + " datetime_metrics=[\"month\", \"year\", \"year\"],\n", + " datetime_thresholds=[1, 1, 10],\n", + " ),\n", + " cl.ExactMatch(\"city\").configure(term_frequency_adjustments=True),\n", + " ctl.EmailComparison(\"email\", include_username_fuzzy_level=False),\n", + " ],\n", + " retain_intermediate_calculation_columns=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:17.585114Z", + "iopub.status.busy": "2024-06-07T09:09:17.584837Z", + "iopub.status.idle": "2024-06-07T09:09:17.847471Z", + "shell.execute_reply": "2024-06-07T09:09:17.846845Z" + } + }, + "outputs": [ + { + 
"name": "stderr", + "output_type": "stream", + "text": [ + "Probability two random records match is estimated to be 0.00333.\n", + "This means that amongst all possible pairwise record comparisons, one in 300.13 are expected to match. With 499,500 total possible comparisons, we expect a total of around 1,664.29 matching pairs\n" + ] + } + ], + "source": [ + "db_api = DuckDBAPI()\n", + "linker = Linker(df, settings, database_api=db_api)\n", + "deterministic_rules = [\n", + " \"l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1\",\n", + " \"l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1\",\n", + " \"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2\",\n", + " \"l.email = r.email\",\n", + "]\n", + "\n", + "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:17.850459Z", + "iopub.status.busy": "2024-06-07T09:09:17.850216Z", + "iopub.status.idle": "2024-06-07T09:09:18.931010Z", + "shell.execute_reply": "2024-06-07T09:09:18.930397Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. 
Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n" + ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " \"Open\n", - "\n" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n" + ] }, { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:10:53.238834Z", - "iopub.status.busy": "2024-03-27T15:10:53.238466Z", - "iopub.status.idle": "2024-03-27T15:10:53.243675Z", - "shell.execute_reply": "2024-03-27T15:10:53.243004Z" - } - }, - "source": [ - "# Uncomment and run this cell if you're running in Google Colab.\n", - "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:10:53.247564Z", - "iopub.status.busy": "2024-03-27T15:10:53.247269Z", - "iopub.status.idle": "2024-03-27T15:10:55.196205Z", - "shell.execute_reply": "2024-03-27T15:10:55.195428Z" - } - }, - "source": [ - "from splink import splink_datasets\n", - "\n", - "df = splink_datasets.fake_1000\n", - "df.head(2)" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:10:55.242600Z", - "iopub.status.busy": "2024-03-27T15:10:55.242267Z", - "iopub.status.idle": "2024-03-27T15:10:55.601924Z", - "shell.execute_reply": "2024-03-27T15:10:55.601113Z" - } - }, - "source": [ - "from splink import SettingsCreator, Linker, block_on, DuckDBAPI\n", - "import splink.comparison_template_library as ctl\n", - "import splink.comparison_library as cl\n", - "\n", - "settings = SettingsCreator(\n", - " link_type=\"dedupe_only\",\n", - " blocking_rules_to_generate_predictions=[\n", - " block_on(\"first_name\"),\n", - " 
block_on(\"surname\"),\n", - " ],\n", - " comparisons=[\n", - " ctl.NameComparison(\"first_name\"),\n", - " ctl.NameComparison(\"surname\"),\n", - " ctl.DateComparison(\n", - " \"dob\",\n", - " input_is_string=True,\n", - " datetime_metrics=[\"month\", \"year\", \"year\"],\n", - " datetime_thresholds=[1, 1, 10],\n", - " ),\n", - " cl.ExactMatch(\"city\").configure(term_frequency_adjustments=True),\n", - " ctl.EmailComparison(\"email\", include_username_fuzzy_level=False),\n", - " ],\n", - " retain_intermediate_calculation_columns=True,\n", - ")" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:10:55.606354Z", - "iopub.status.busy": "2024-03-27T15:10:55.606011Z", - "iopub.status.idle": "2024-03-27T15:10:55.966147Z", - "shell.execute_reply": "2024-03-27T15:10:55.965434Z" - } - }, - "source": [ - "db_api = DuckDBAPI()\n", - "linker = Linker(df, settings, database_api=db_api)\n", - "deterministic_rules = [\n", - " \"l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1\",\n", - " \"l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1\",\n", - " \"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2\",\n", - " \"l.email = r.email\",\n", - "]\n", - "\n", - "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:10:55.970182Z", - "iopub.status.busy": "2024-03-27T15:10:55.969667Z", - "iopub.status.idle": "2024-03-27T15:10:57.008471Z", - "shell.execute_reply": "2024-03-27T15:10:57.007360Z" - } - }, - "source": [ - "linker.training.estimate_u_using_random_sampling(max_pairs=1e6, seed=5)" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:10:57.012299Z", - 
"iopub.status.busy": "2024-03-27T15:10:57.012041Z", - "iopub.status.idle": "2024-03-27T15:10:58.591902Z", - "shell.execute_reply": "2024-03-27T15:10:58.591381Z" - } - }, - "source": [ - "session_dob = linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"dob\"))\n", - "session_email = linker.training.estimate_parameters_using_expectation_maximisation(\n", - " block_on(\"email\")\n", - ")" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:10:58.594711Z", - "iopub.status.busy": "2024-03-27T15:10:58.594499Z", - "iopub.status.idle": "2024-03-27T15:10:58.945354Z", - "shell.execute_reply": "2024-03-27T15:10:58.944711Z" - } - }, - "source": [ - "linker.accuracy_analysis_from_labels_column(\n", - " \"cluster\", output_type=\"table\"\n", - ").as_pandas_dataframe(limit=5)" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:10:58.948920Z", - "iopub.status.busy": "2024-03-27T15:10:58.948640Z", - "iopub.status.idle": "2024-03-27T15:11:01.154581Z", - "shell.execute_reply": "2024-03-27T15:11:01.153881Z" - } - }, - "source": [ - "linker.accuracy_analysis_from_labels_column(\"cluster\", output_type=\"roc\")" + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - first_name (no m values are trained).\n", + " - surname (no m values are trained).\n", + " - dob (no m values are trained).\n", + " - city (no m values are trained).\n", + " - email (no m values are trained).\n" + ] + } + ], + "source": [ + "linker.training.estimate_u_using_random_sampling(max_pairs=1e6, seed=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:18.934824Z", + "iopub.status.busy": "2024-06-07T09:09:18.934551Z", + "iopub.status.idle": "2024-06-07T09:09:20.495494Z", + "shell.execute_reply": "2024-06-07T09:09:20.494833Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"dob\" = r.\"dob\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - first_name\n", + " - surname\n", + " - city\n", + " - email\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - dob\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.417 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was 0.121 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 0.0354 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was 0.0127 in 
probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 5: Largest change in params was 0.00539 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 6: Largest change in params was 0.0025 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 7: Largest change in params was 0.0012 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 8: Largest change in params was 0.000599 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 9: Largest change in params was 0.000313 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 10: Largest change in params was 0.000186 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 11: Largest change in params was 0.000147 in the m_probability of first_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 12: Largest change in params was 0.000158 in the m_probability of first_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 13: Largest change in params was 0.000184 in the m_probability of first_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 14: Largest change in params was 0.000195 in the m_probability of first_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 15: Largest change in params was 0.000179 in the m_probability of first_name, 
level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 16: Largest change in params was 0.000144 in the m_probability of first_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 17: Largest change in params was 0.000105 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 18: Largest change in params was 7.27e-05 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 18 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - dob (no m values are trained).\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"email\" = r.\"email\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - first_name\n", + " - surname\n", + " - dob\n", + " - city\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - email\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.466 in the m_probability of dob, level `Exact match on dob`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was 0.0884 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest 
change in params was 0.0193 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was 0.00688 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 5: Largest change in params was 0.00294 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 6: Largest change in params was 0.00138 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 7: Largest change in params was 0.000681 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 8: Largest change in params was 0.000346 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 9: Largest change in params was 0.000178 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 10: Largest change in params was 9.26e-05 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 10 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is fully trained. 
All comparisons have at least one estimate for their m and u values\n" + ] + } + ], + "source": [ + "session_dob = linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"dob\"))\n", + "session_email = linker.training.estimate_parameters_using_expectation_maximisation(\n", + " block_on(\"email\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:20.498372Z", + "iopub.status.busy": "2024-06-07T09:09:20.498155Z", + "iopub.status.idle": "2024-06-07T09:09:20.768827Z", + "shell.execute_reply": "2024-06-07T09:09:20.768326Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
truth_thresholdmatch_probabilitytotal_clerical_labelspntptnfpfnP_rate...precisionrecallspecificitynpvaccuracyf1f2f0_5p4phi
0-19.30.000002499500.02031.0497469.01027.0495147.02322.01004.00.004066...0.3066590.5056620.9953320.9979760.9933410.3817840.4475730.3328580.5520840.390667
1-19.20.000002499500.02031.0497469.01027.0495383.02086.01004.00.004066...0.3299070.5056620.9958070.9979770.9938140.3993000.4569730.3545540.5702070.405492
2-18.00.000004499500.02031.0497469.01027.0495584.01885.01004.00.004066...0.3526790.5056620.9962110.9979780.9942160.4155370.4652950.3753930.5866070.419506
3-17.10.000007499500.02031.0497469.01027.0495836.01633.01004.00.004066...0.3860900.5056620.9967170.9979790.9947210.4378600.4761680.4052560.6085510.439259
4-17.00.000008499500.02031.0497469.01027.0495957.01512.01004.00.004066...0.4044900.5056620.9969610.9979800.9949630.4494530.4815720.4213510.6196820.449767
\n", + "

5 rows × 25 columns

\n", + "
" ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:11:01.168311Z", - "iopub.status.busy": "2024-03-27T15:11:01.167847Z", - "iopub.status.idle": "2024-03-27T15:11:03.812090Z", - "shell.execute_reply": "2024-03-27T15:11:03.811329Z" - } - }, - "source": [ - "linker.accuracy_analysis_from_labels_column(\n", - " \"cluster\",\n", - " output_type=\"threshold_selection\",\n", - " threshold_actual=0.5,\n", - " add_metrics=[\"f1\"],\n", - ")" + "text/plain": [ + " truth_threshold match_probability total_clerical_labels p \\\n", + "0 -19.3 0.000002 499500.0 2031.0 \n", + "1 -19.2 0.000002 499500.0 2031.0 \n", + "2 -18.0 0.000004 499500.0 2031.0 \n", + "3 -17.1 0.000007 499500.0 2031.0 \n", + "4 -17.0 0.000008 499500.0 2031.0 \n", + "\n", + " n tp tn fp fn P_rate ... precision \\\n", + "0 497469.0 1027.0 495147.0 2322.0 1004.0 0.004066 ... 0.306659 \n", + "1 497469.0 1027.0 495383.0 2086.0 1004.0 0.004066 ... 0.329907 \n", + "2 497469.0 1027.0 495584.0 1885.0 1004.0 0.004066 ... 0.352679 \n", + "3 497469.0 1027.0 495836.0 1633.0 1004.0 0.004066 ... 0.386090 \n", + "4 497469.0 1027.0 495957.0 1512.0 1004.0 0.004066 ... 
0.404490 \n", + "\n", + " recall specificity npv accuracy f1 f2 f0_5 \\\n", + "0 0.505662 0.995332 0.997976 0.993341 0.381784 0.447573 0.332858 \n", + "1 0.505662 0.995807 0.997977 0.993814 0.399300 0.456973 0.354554 \n", + "2 0.505662 0.996211 0.997978 0.994216 0.415537 0.465295 0.375393 \n", + "3 0.505662 0.996717 0.997979 0.994721 0.437860 0.476168 0.405256 \n", + "4 0.505662 0.996961 0.997980 0.994963 0.449453 0.481572 0.421351 \n", + "\n", + " p4 phi \n", + "0 0.552084 0.390667 \n", + "1 0.570207 0.405492 \n", + "2 0.586607 0.419506 \n", + "3 0.608551 0.439259 \n", + "4 0.619682 0.449767 \n", + "\n", + "[5 rows x 25 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.evaluation.accuracy_analysis_from_labels_column(\n", + " \"cluster\", output_type=\"table\"\n", + ").as_pandas_dataframe(limit=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:20.771736Z", + "iopub.status.busy": "2024-06-07T09:09:20.771453Z", + "iopub.status.idle": "2024-06-07T09:09:21.322647Z", + "shell.execute_reply": "2024-06-07T09:09:21.322088Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:11:03.822254Z", - "iopub.status.busy": "2024-03-27T15:11:03.821939Z", - "iopub.status.idle": "2024-03-27T15:11:04.205976Z", - "shell.execute_reply": "2024-03-27T15:11:04.205179Z" - } - }, - "source": [ - "# Plot some false positives\n", - "linker.inference.prediction_errors_from_labels_column(\n", - " \"cluster\", include_false_negatives=True, include_false_positives=True\n", - ").as_pandas_dataframe(limit=5)" + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.evaluation.accuracy_analysis_from_labels_column(\"cluster\", output_type=\"roc\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:21.327370Z", + "iopub.status.busy": "2024-06-07T09:09:21.327111Z", + "iopub.status.idle": "2024-06-07T09:09:22.635682Z", + "shell.execute_reply": "2024-06-07T09:09:22.635098Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:11:04.209998Z", - "iopub.status.busy": "2024-03-27T15:11:04.209676Z", - "iopub.status.idle": "2024-03-27T15:11:05.510086Z", - "shell.execute_reply": "2024-03-27T15:11:05.509348Z" - } - }, - "source": [ - "records = linker.inference.prediction_errors_from_labels_column(\n", - " \"cluster\", include_false_negatives=True, include_false_positives=True\n", - ").as_record_dict(limit=5)\n", - "\n", - "linker.visualisations.waterfall_chart(records)" + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.evaluation.accuracy_analysis_from_labels_column(\n", + " \"cluster\",\n", + " output_type=\"threshold_selection\",\n", + " threshold_actual=0.5,\n", + " add_metrics=[\"f1\"],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:22.638822Z", + "iopub.status.busy": "2024-06-07T09:09:22.638569Z", + "iopub.status.idle": "2024-06-07T09:09:22.853941Z", + "shell.execute_reply": "2024-06-07T09:09:22.853250Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
clerical_match_scorefound_by_blocking_rulesmatch_weightmatch_probabilityunique_id_lunique_id_rfirst_name_lfirst_name_rgamma_first_namebf_first_name...tf_city_rbf_citybf_tf_adj_cityemail_lemail_rgamma_emailbf_emailcluster_lcluster_rmatch_key
01.0False-24.1659145.312940e-08417418FlorenceBrown00.213986...0.001230.4278451.0fb@reose.cemf@b@reese.com00.0010231081082
11.0False-21.9415062.482839e-07796797TaylorNone-11.000000...0.007380.4278451.0jt40o@combs.netjt40@cotbs.nm00.0010232012012
21.0False-19.5172771.332642e-06452454NoneDavies-11.000000...0.015990.4278451.0rd@lewis.comidlewrs.cocm00.0010231151152
31.0False-17.9783643.872323e-06717718MiaJones00.213986...0.006150.4278451.0mia.j63@martinez.bizNone-11.0000001821822
41.0True-15.5186902.130097e-05594595GraceGrace385.794621...0.001230.4278451.0gk@frey-robinson.orgrgk@frey-robinon.org00.0010231461460
\n", + "

5 rows × 32 columns

\n", + "
" ], - "outputs": [] + "text/plain": [ + " clerical_match_score found_by_blocking_rules match_weight \\\n", + "0 1.0 False -24.165914 \n", + "1 1.0 False -21.941506 \n", + "2 1.0 False -19.517277 \n", + "3 1.0 False -17.978364 \n", + "4 1.0 True -15.518690 \n", + "\n", + " match_probability unique_id_l unique_id_r first_name_l first_name_r \\\n", + "0 5.312940e-08 417 418 Florence Brown \n", + "1 2.482839e-07 796 797 Taylor None \n", + "2 1.332642e-06 452 454 None Davies \n", + "3 3.872323e-06 717 718 Mia Jones \n", + "4 2.130097e-05 594 595 Grace Grace \n", + "\n", + " gamma_first_name bf_first_name ... tf_city_r bf_city bf_tf_adj_city \\\n", + "0 0 0.213986 ... 0.00123 0.427845 1.0 \n", + "1 -1 1.000000 ... 0.00738 0.427845 1.0 \n", + "2 -1 1.000000 ... 0.01599 0.427845 1.0 \n", + "3 0 0.213986 ... 0.00615 0.427845 1.0 \n", + "4 3 85.794621 ... 0.00123 0.427845 1.0 \n", + "\n", + " email_l email_r gamma_email bf_email \\\n", + "0 fb@reose.cem f@b@reese.com 0 0.001023 \n", + "1 jt40o@combs.net jt40@cotbs.nm 0 0.001023 \n", + "2 rd@lewis.com idlewrs.cocm 0 0.001023 \n", + "3 mia.j63@martinez.biz None -1 1.000000 \n", + "4 gk@frey-robinson.org rgk@frey-robinon.org 0 0.001023 \n", + "\n", + " cluster_l cluster_r match_key \n", + "0 108 108 2 \n", + "1 201 201 2 \n", + "2 115 115 2 \n", + "3 182 182 2 \n", + "4 146 146 0 \n", + "\n", + "[5 rows x 32 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Plot some false positives\n", + "linker.evaluation.prediction_errors_from_labels_column(\n", + " \"cluster\", include_false_negatives=True, include_false_positives=True\n", + ").as_pandas_dataframe(limit=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:22.857193Z", + "iopub.status.busy": "2024-06-07T09:09:22.856931Z", + "iopub.status.idle": "2024-06-07T09:09:23.602967Z", + "shell.execute_reply": 
"2024-06-07T09:09:23.602410Z" } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "records = linker.evaluation.prediction_errors_from_labels_column(\n", + " \"cluster\", include_false_negatives=True, include_false_positives=True\n", + ").as_record_dict(limit=5)\n", + "\n", + "linker.visualisations.waterfall_chart(records)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb b/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb index 383ea109a2..3e30041105 100644 --- a/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb +++ b/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb @@ -1,2135 +1,2343 @@ { - "cells": [ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Linking a dataset of real historical persons\n", + "\n", + "In this example, we deduplicate a more realistic dataset. The data is based on historical persons scraped from wikidata. 
Duplicate records are introduced with a variety of errors.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:25.613571Z", + "iopub.status.busy": "2024-06-07T09:09:25.613270Z", + "iopub.status.idle": "2024-06-07T09:09:25.618664Z", + "shell.execute_reply": "2024-06-07T09:09:25.617985Z" + } + }, + "outputs": [], + "source": [ + "# Uncomment and run this cell if you're running in Google Colab.\n", + "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:25.622132Z", + "iopub.status.busy": "2024-06-07T09:09:25.621861Z", + "iopub.status.idle": "2024-06-07T09:09:28.057830Z", + "shell.execute_reply": "2024-06-07T09:09:28.057112Z" + } + }, + "outputs": [ { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Linking a dataset of real historical persons\n", - "\n", - "In this example, we deduplicate a more realistic dataset. The data is based on historical persons scraped from wikidata. 
Duplicate records are introduced with a variety of errors introduced.\n" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "downloading: https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/historical_figures_with_errors_50k.parquet\n" + ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " \"Open\n", - "\n" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " download progress: 0 %\t(..........)\r", + " download progress: 1 %\t(..........)\r", + " download progress: 1 %\t(..........)\r", + " download progress: 2 %\t(..........)\r", + " download progress: 2 %\t(..........)\r", + " download progress: 3 %\t(..........)\r", + " download progress: 3 %\t(..........)\r", + " download progress: 4 %\t(..........)\r", + " download progress: 4 %\t(..........)\r", + " download progress: 5 %\t(..........)\r", + " download progress: 5 %\t(..........)\r", + " download progress: 6 %\t(..........)\r", + " download progress: 6 %\t(..........)\r", + " download progress: 7 %\t(..........)\r", + " download progress: 7 %\t(..........)\r", + " download progress: 8 %\t(..........)\r", + " download progress: 8 %\t(..........)\r", + " download progress: 9 %\t(..........)\r", + " download progress: 9 %\t(..........)\r", + " download progress: 10 %\t(..........)\r", + " download progress: 10 %\t(=.........)\r", + " download progress: 11 %\t(=.........)\r", + " download progress: 11 %\t(=.........)\r", + " download progress: 12 %\t(=.........)\r", + " download progress: 12 %\t(=.........)\r", + " download progress: 13 %\t(=.........)\r", + " download progress: 13 %\t(=.........)\r", + " download progress: 14 %\t(=.........)\r", + " download progress: 14 %\t(=.........)\r", + " download progress: 15 %\t(=.........)\r", + " download progress: 15 %\t(=.........)\r", + " download progress: 16 %\t(=.........)\r", + " download progress: 16 %\t(=.........)\r", + " download progress: 17 %\t(=.........)\r", + 
" download progress: 17 %\t(=.........)\r", + " download progress: 18 %\t(=.........)\r", + " download progress: 18 %\t(=.........)\r", + " download progress: 19 %\t(=.........)\r", + " download progress: 19 %\t(=.........)\r", + " download progress: 20 %\t(=.........)\r", + " download progress: 20 %\t(==........)\r", + " download progress: 21 %\t(==........)\r", + " download progress: 21 %\t(==........)\r", + " download progress: 22 %\t(==........)\r", + " download progress: 22 %\t(==........)\r", + " download progress: 23 %\t(==........)\r", + " download progress: 23 %\t(==........)\r", + " download progress: 24 %\t(==........)\r", + " download progress: 24 %\t(==........)\r", + " download progress: 25 %\t(==........)\r", + " download progress: 25 %\t(==........)\r", + " download progress: 26 %\t(==........)\r", + " download progress: 26 %\t(==........)\r", + " download progress: 27 %\t(==........)\r", + " download progress: 27 %\t(==........)\r", + " download progress: 28 %\t(==........)\r", + " download progress: 28 %\t(==........)\r", + " download progress: 29 %\t(==........)\r", + " download progress: 29 %\t(==........)\r", + " download progress: 30 %\t(==........)\r", + " download progress: 30 %\t(===.......)\r", + " download progress: 31 %\t(===.......)\r", + " download progress: 31 %\t(===.......)\r", + " download progress: 32 %\t(===.......)\r", + " download progress: 32 %\t(===.......)\r", + " download progress: 33 %\t(===.......)\r", + " download progress: 33 %\t(===.......)\r", + " download progress: 34 %\t(===.......)\r", + " download progress: 34 %\t(===.......)\r", + " download progress: 35 %\t(===.......)\r", + " download progress: 35 %\t(===.......)\r", + " download progress: 36 %\t(===.......)\r", + " download progress: 36 %\t(===.......)\r", + " download progress: 37 %\t(===.......)\r", + " download progress: 37 %\t(===.......)\r", + " download progress: 38 %\t(===.......)\r", + " download progress: 38 %\t(===.......)\r", + " download progress: 
39 %\t(===.......)\r", + " download progress: 39 %\t(===.......)\r", + " download progress: 40 %\t(===.......)\r", + " download progress: 40 %\t(====......)\r", + " download progress: 41 %\t(====......)\r", + " download progress: 41 %\t(====......)\r", + " download progress: 42 %\t(====......)\r", + " download progress: 42 %\t(====......)\r", + " download progress: 43 %\t(====......)\r", + " download progress: 43 %\t(====......)\r", + " download progress: 44 %\t(====......)\r", + " download progress: 44 %\t(====......)\r", + " download progress: 45 %\t(====......)\r", + " download progress: 45 %\t(====......)\r", + " download progress: 46 %\t(====......)\r", + " download progress: 46 %\t(====......)\r", + " download progress: 47 %\t(====......)\r", + " download progress: 47 %\t(====......)\r", + " download progress: 48 %\t(====......)\r", + " download progress: 48 %\t(====......)\r", + " download progress: 49 %\t(====......)\r", + " download progress: 49 %\t(====......)\r", + " download progress: 50 %\t(====......)\r", + " download progress: 50 %\t(=====.....)\r", + " download progress: 51 %\t(=====.....)\r", + " download progress: 51 %\t(=====.....)\r", + " download progress: 52 %\t(=====.....)\r", + " download progress: 52 %\t(=====.....)\r", + " download progress: 53 %\t(=====.....)\r", + " download progress: 53 %\t(=====.....)\r", + " download progress: 54 %\t(=====.....)\r", + " download progress: 54 %\t(=====.....)\r", + " download progress: 55 %\t(=====.....)\r", + " download progress: 55 %\t(=====.....)\r", + " download progress: 56 %\t(=====.....)\r", + " download progress: 56 %\t(=====.....)\r", + " download progress: 57 %\t(=====.....)\r", + " download progress: 57 %\t(=====.....)\r", + " download progress: 58 %\t(=====.....)\r", + " download progress: 58 %\t(=====.....)\r", + " download progress: 59 %\t(=====.....)\r", + " download progress: 59 %\t(=====.....)\r", + " download progress: 60 %\t(=====.....)\r", + " download progress: 60 
%\t(======....)\r", + " download progress: 61 %\t(======....)\r", + " download progress: 61 %\t(======....)\r", + " download progress: 62 %\t(======....)\r", + " download progress: 62 %\t(======....)\r", + " download progress: 63 %\t(======....)\r", + " download progress: 63 %\t(======....)\r", + " download progress: 64 %\t(======....)\r", + " download progress: 64 %\t(======....)\r", + " download progress: 65 %\t(======....)\r", + " download progress: 65 %\t(======....)\r", + " download progress: 66 %\t(======....)\r", + " download progress: 66 %\t(======....)\r", + " download progress: 67 %\t(======....)\r", + " download progress: 67 %\t(======....)\r", + " download progress: 68 %\t(======....)\r", + " download progress: 68 %\t(======....)\r", + " download progress: 69 %\t(======....)\r", + " download progress: 69 %\t(======....)\r", + " download progress: 70 %\t(======....)\r", + " download progress: 70 %\t(=======...)\r", + " download progress: 71 %\t(=======...)\r", + " download progress: 71 %\t(=======...)\r", + " download progress: 72 %\t(=======...)\r", + " download progress: 72 %\t(=======...)\r", + " download progress: 73 %\t(=======...)\r", + " download progress: 73 %\t(=======...)\r", + " download progress: 74 %\t(=======...)\r", + " download progress: 74 %\t(=======...)\r", + " download progress: 75 %\t(=======...)\r", + " download progress: 75 %\t(=======...)\r", + " download progress: 76 %\t(=======...)\r", + " download progress: 76 %\t(=======...)\r", + " download progress: 77 %\t(=======...)\r", + " download progress: 77 %\t(=======...)\r", + " download progress: 78 %\t(=======...)\r", + " download progress: 78 %\t(=======...)\r", + " download progress: 79 %\t(=======...)\r", + " download progress: 79 %\t(=======...)\r", + " download progress: 80 %\t(=======...)\r", + " download progress: 80 %\t(========..)\r", + " download progress: 81 %\t(========..)\r", + " download progress: 81 %\t(========..)\r", + " download progress: 82 %\t(========..)\r", + 
" download progress: 82 %\t(========..)\r", + " download progress: 83 %\t(========..)\r", + " download progress: 83 %\t(========..)\r", + " download progress: 84 %\t(========..)\r", + " download progress: 84 %\t(========..)\r", + " download progress: 85 %\t(========..)\r", + " download progress: 85 %\t(========..)\r", + " download progress: 86 %\t(========..)\r", + " download progress: 86 %\t(========..)\r", + " download progress: 87 %\t(========..)\r", + " download progress: 87 %\t(========..)\r", + " download progress: 88 %\t(========..)\r", + " download progress: 88 %\t(========..)\r", + " download progress: 89 %\t(========..)\r", + " download progress: 89 %\t(========..)\r", + " download progress: 90 %\t(========..)\r", + " download progress: 90 %\t(=========.)\r", + " download progress: 91 %\t(=========.)\r", + " download progress: 91 %\t(=========.)\r", + " download progress: 92 %\t(=========.)\r", + " download progress: 92 %\t(=========.)\r", + " download progress: 93 %\t(=========.)\r", + " download progress: 93 %\t(=========.)\r", + " download progress: 94 %\t(=========.)\r", + " download progress: 94 %\t(=========.)\r", + " download progress: 95 %\t(=========.)\r", + " download progress: 95 %\t(=========.)\r", + " download progress: 96 %\t(=========.)\r", + " download progress: 96 %\t(=========.)\r", + " download progress: 97 %\t(=========.)\r", + " download progress: 97 %\t(=========.)\r", + " download progress: 98 %\t(=========.)\r", + " download progress: 98 %\t(=========.)\r", + " download progress: 99 %\t(=========.)\r", + " download progress: 99 %\t(=========.)\r", + " download progress: 100 %\t(=========.)\r", + " download progress: 100 %\t(==========)\n" + ] }, { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T16:07:03.040913Z", - "iopub.status.busy": "2024-05-15T16:07:03.040529Z", - "iopub.status.idle": "2024-05-15T16:07:03.045834Z", - "shell.execute_reply": 
"2024-05-15T16:07:03.045063Z" - } - }, - "outputs": [], - "source": [ - "# Uncomment and run this cell if you're running in Google Colab.\n", - "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_idclusterfull_namefirst_and_surnamefirst_namesurnamedobbirth_placepostcode_fakegenderoccupation
0Q2296770-1Q2296770thomas clifford, 1st baron clifford of chudleighthomas chudleighthomaschudleigh1630-08-01devontq13 8dfmalepolitician
1Q2296770-2Q2296770thomas of chudleighthomas chudleighthomaschudleigh1630-08-01devontq13 8dfmalepolitician
2Q2296770-3Q2296770tom 1st baron clifford of chudleightom chudleightomchudleigh1630-08-01devontq13 8dfmalepolitician
3Q2296770-4Q2296770thomas 1st chudleighthomas chudleighthomaschudleigh1630-08-01devontq13 8huNonepolitician
4Q2296770-5Q2296770thomas clifford, 1st baron chudleighthomas chudleighthomaschudleigh1630-08-01devontq13 8dfNonepolitician
\n", + "
" + ], + "text/plain": [ + " unique_id cluster full_name \\\n", + "0 Q2296770-1 Q2296770 thomas clifford, 1st baron clifford of chudleigh \n", + "1 Q2296770-2 Q2296770 thomas of chudleigh \n", + "2 Q2296770-3 Q2296770 tom 1st baron clifford of chudleigh \n", + "3 Q2296770-4 Q2296770 thomas 1st chudleigh \n", + "4 Q2296770-5 Q2296770 thomas clifford, 1st baron chudleigh \n", + "\n", + " first_and_surname first_name surname dob birth_place \\\n", + "0 thomas chudleigh thomas chudleigh 1630-08-01 devon \n", + "1 thomas chudleigh thomas chudleigh 1630-08-01 devon \n", + "2 tom chudleigh tom chudleigh 1630-08-01 devon \n", + "3 thomas chudleigh thomas chudleigh 1630-08-01 devon \n", + "4 thomas chudleigh thomas chudleigh 1630-08-01 devon \n", + "\n", + " postcode_fake gender occupation \n", + "0 tq13 8df male politician \n", + "1 tq13 8df male politician \n", + "2 tq13 8df male politician \n", + "3 tq13 8hu None politician \n", + "4 tq13 8df None politician " ] - }, + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink import splink_datasets\n", + "\n", + "df = splink_datasets.historical_50k\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:28.061677Z", + "iopub.status.busy": "2024-06-07T09:09:28.061319Z", + "iopub.status.idle": "2024-06-07T09:09:28.892623Z", + "shell.execute_reply": "2024-06-07T09:09:28.891638Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T16:07:03.049635Z", - "iopub.status.busy": "2024-05-15T16:07:03.049337Z", - "iopub.status.idle": "2024-05-15T16:07:04.275040Z", - "shell.execute_reply": "2024-05-15T16:07:04.274317Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
unique_idclusterfull_namefirst_and_surnamefirst_namesurnamedobbirth_placepostcode_fakegenderoccupation
0Q2296770-1Q2296770thomas clifford, 1st baron clifford of chudleighthomas chudleighthomaschudleigh1630-08-01devontq13 8dfmalepolitician
1Q2296770-2Q2296770thomas of chudleighthomas chudleighthomaschudleigh1630-08-01devontq13 8dfmalepolitician
2Q2296770-3Q2296770tom 1st baron clifford of chudleightom chudleightomchudleigh1630-08-01devontq13 8dfmalepolitician
3Q2296770-4Q2296770thomas 1st chudleighthomas chudleighthomaschudleigh1630-08-01devontq13 8huNonepolitician
4Q2296770-5Q2296770thomas clifford, 1st baron chudleighthomas chudleighthomaschudleigh1630-08-01devontq13 8dfNonepolitician
\n", - "
" - ], - "text/plain": [ - " unique_id cluster full_name \\\n", - "0 Q2296770-1 Q2296770 thomas clifford, 1st baron clifford of chudleigh \n", - "1 Q2296770-2 Q2296770 thomas of chudleigh \n", - "2 Q2296770-3 Q2296770 tom 1st baron clifford of chudleigh \n", - "3 Q2296770-4 Q2296770 thomas 1st chudleigh \n", - "4 Q2296770-5 Q2296770 thomas clifford, 1st baron chudleigh \n", - "\n", - " first_and_surname first_name surname dob birth_place \\\n", - "0 thomas chudleigh thomas chudleigh 1630-08-01 devon \n", - "1 thomas chudleigh thomas chudleigh 1630-08-01 devon \n", - "2 tom chudleigh tom chudleigh 1630-08-01 devon \n", - "3 thomas chudleigh thomas chudleigh 1630-08-01 devon \n", - "4 thomas chudleigh thomas chudleigh 1630-08-01 devon \n", - "\n", - " postcode_fake gender occupation \n", - "0 tq13 8df male politician \n", - "1 tq13 8df male politician \n", - "2 tq13 8df male politician \n", - "3 tq13 8hu None politician \n", - "4 tq13 8df None politician " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "source": [ - "from splink import splink_datasets\n", - "\n", - "df = splink_datasets.historical_50k\n", - "df.head()" + "text/plain": [ + "alt.VConcatChart(...)" ] - }, + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink import DuckDBAPI\n", + "from splink.exploratory import profile_columns\n", + "\n", + "db_api = DuckDBAPI()\n", + "profile_columns(df, db_api, column_expressions=[\"first_name\", \"substr(surname,1,2)\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:28.898009Z", + "iopub.status.busy": "2024-06-07T09:09:28.897643Z", + "iopub.status.idle": "2024-06-07T09:09:29.356811Z", + "shell.execute_reply": "2024-06-07T09:09:29.356107Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T16:07:04.316719Z", - "iopub.status.busy": "2024-05-15T16:07:04.315783Z", - "iopub.status.idle": "2024-05-15T16:07:05.112833Z", - "shell.execute_reply": "2024-05-15T16:07:05.112087Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.VConcatChart(...)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "source": [ - "from splink import DuckDBAPI\n", - "from splink.exploratory import profile_columns\n", - "\n", - "db_api = DuckDBAPI()\n", - "profile_columns(df, db_api, column_expressions=[\"first_name\", \"substr(surname,1,2)\"])" + "text/plain": [ + "alt.Chart(...)" ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink import DuckDBAPI, block_on\n", + "from splink.blocking_analysis import (\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + ")\n", + "\n", + "blocking_rules = [block_on(\"first_name\", \"surname\"),\n", + " block_on(\"surname\", \"dob\"),\n", + " block_on(\"first_name\", \"dob\"),\n", + " block_on(\"postcode_fake\", \"first_name\")]\n", + "\n", + "db_api = DuckDBAPI()\n", + "\n", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=df,\n", + " blocking_rules=blocking_rules,\n", + " db_api=db_api,\n", + " link_type=\"dedupe_only\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:29.359888Z", + "iopub.status.busy": "2024-06-07T09:09:29.359671Z", + "iopub.status.idle": "2024-06-07T09:09:29.491413Z", + "shell.execute_reply": "2024-06-07T09:09:29.490645Z" + } + }, + "outputs": [], + "source": [ + "import splink.comparison_library as cl\n", + "import splink.comparison_template_library as ctl\n", + "from splink import Linker, SettingsCreator\n", + "\n", + "settings = SettingsCreator(\n", + " link_type=\"dedupe_only\",\n", + " blocking_rules_to_generate_predictions=blocking_rules,\n", + " comparisons=[\n", + " ctl.NameComparison(\"first_name\").configure(term_frequency_adjustments=True),\n", + " ctl.NameComparison(\"surname\").configure(term_frequency_adjustments=True),\n", + " ctl.DateComparison(\n", + " \"dob\",\n", + " input_is_string=True,\n", + " datetime_metrics=[\"month\", \"year\", \"year\"],\n", + " 
datetime_thresholds=[1, 1, 10],\n", + " ),\n", + " # TODO: Restore ctl.PostcodeComparison level here\n", + " cl.LevenshteinAtThresholds(\"postcode_fake\"),\n", + " cl.ExactMatch(\"birth_place\").configure(term_frequency_adjustments=True),\n", + " cl.ExactMatch(\"occupation\").configure(term_frequency_adjustments=True),\n", + " ],\n", + " retain_intermediate_calculation_columns=True,\n", + ")\n", + "\n", + "linker = Linker(df, settings, database_api=db_api)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:29.494674Z", + "iopub.status.busy": "2024-06-07T09:09:29.494441Z", + "iopub.status.idle": "2024-06-07T09:09:29.778569Z", + "shell.execute_reply": "2024-06-07T09:09:29.778006Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Probability two random records match is estimated to be 0.000136.\n", + "This means that amongst all possible pairwise record comparisons, one in 7,362.31 are expected to match. 
With 1,279,041,753 total possible comparisons, we expect a total of around 173,728.33 matching pairs\n" + ] + } + ], + "source": [ + "linker.training.estimate_probability_two_random_records_match(\n", + " [\n", + " \"l.first_name = r.first_name and l.surname = r.surname and l.dob = r.dob\",\n", + " \"substr(l.first_name,1,2) = substr(r.first_name,1,2) and l.surname = r.surname and substr(l.postcode_fake,1,2) = substr(r.postcode_fake,1,2)\",\n", + " \"l.dob = r.dob and l.postcode_fake = r.postcode_fake\",\n", + " ],\n", + " recall=0.6,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:29.781900Z", + "iopub.status.busy": "2024-06-07T09:09:29.781630Z", + "iopub.status.idle": "2024-06-07T09:09:37.047025Z", + "shell.execute_reply": "2024-06-07T09:09:37.046527Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n" + ] }, { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T16:07:05.117580Z", - "iopub.status.busy": "2024-05-15T16:07:05.117224Z", - "iopub.status.idle": "2024-05-15T16:07:05.620193Z", - "shell.execute_reply": "2024-05-15T16:07:05.619557Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from splink import DuckDBAPI, block_on\n", - "from splink.blocking_analysis import (\n", - " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", - ")\n", - "\n", - "blocking_rules = [block_on(\"first_name\", \"surname\"),\n", - " block_on(\"surname\", \"dob\"),\n", - " block_on(\"first_name\", \"dob\"),\n", - " block_on(\"postcode_fake\", \"first_name\")]\n", - "\n", - "db_api = DuckDBAPI()\n", - "\n", - "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", - " table_or_tables=df,\n", - " blocking_rules=blocking_rules,\n", - " db_api=db_api,\n", - " link_type=\"dedupe_only\",\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n" + ] }, { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T16:07:05.623477Z", - "iopub.status.busy": "2024-05-15T16:07:05.623213Z", - "iopub.status.idle": "2024-05-15T16:07:05.768956Z", - "shell.execute_reply": "2024-05-15T16:07:05.768275Z" - } - }, - "outputs": [], - "source": [ - "import splink.comparison_library as cl\n", - "import splink.comparison_template_library as ctl\n", - "from splink import Linker, SettingsCreator\n", - "\n", - "settings = SettingsCreator(\n", - " link_type=\"dedupe_only\",\n", - " blocking_rules_to_generate_predictions=blocking_rules,\n", - " comparisons=[\n", - " ctl.NameComparison(\"first_name\").configure(term_frequency_adjustments=True),\n", - " ctl.NameComparison(\"surname\").configure(term_frequency_adjustments=True),\n", - " ctl.DateComparison(\n", - " \"dob\",\n", - " input_is_string=True,\n", - " datetime_metrics=[\"month\", \"year\", \"year\"],\n", - " datetime_thresholds=[1, 1, 10],\n", - " ),\n", - " # TODO: Restore ctl.PostcodeComparison level here\n", - " 
cl.LevenshteinAtThresholds(\"postcode_fake\"),\n", - " cl.ExactMatch(\"birth_place\").configure(term_frequency_adjustments=True),\n", - " cl.ExactMatch(\"occupation\").configure(term_frequency_adjustments=True),\n", - " ],\n", - " retain_intermediate_calculation_columns=True,\n", - ")\n", - "\n", - "linker = Linker(df, settings, database_api=db_api)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - first_name (no m values are trained).\n", + " - surname (no m values are trained).\n", + " - dob (no m values are trained).\n", + " - postcode_fake (no m values are trained).\n", + " - birth_place (no m values are trained).\n", + " - occupation (no m values are trained).\n" + ] + } + ], + "source": [ + "linker.training.estimate_u_using_random_sampling(max_pairs=5e6)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:37.049884Z", + "iopub.status.busy": "2024-06-07T09:09:37.049671Z", + "iopub.status.idle": "2024-06-07T09:09:40.142517Z", + "shell.execute_reply": "2024-06-07T09:09:40.141723Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] }, { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T16:07:05.772775Z", - "iopub.status.busy": "2024-05-15T16:07:05.772497Z", - "iopub.status.idle": "2024-05-15T16:07:06.084481Z", - "shell.execute_reply": "2024-05-15T16:07:06.083929Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Probability two random records match is estimated to be 0.000136.\n", - "This means that amongst all possible pairwise record comparisons, one in 7,362.31 are expected to match. 
With 1,279,041,753 total possible comparisons, we expect a total of around 173,728.33 matching pairs\n" - ] - } - ], - "source": [ - "linker.training.estimate_probability_two_random_records_match(\n", - " [\n", - " \"l.first_name = r.first_name and l.surname = r.surname and l.dob = r.dob\",\n", - " \"substr(l.first_name,1,2) = substr(r.first_name,1,2) and l.surname = r.surname and substr(l.postcode_fake,1,2) = substr(r.postcode_fake,1,2)\",\n", - " \"l.dob = r.dob and l.postcode_fake = r.postcode_fake\",\n", - " ],\n", - " recall=0.6,\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "(l.\"first_name\" = r.\"first_name\") AND (l.\"surname\" = r.\"surname\")\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - dob\n", + " - postcode_fake\n", + " - birth_place\n", + " - occupation\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - first_name\n", + " - surname\n" + ] }, { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T16:07:06.087609Z", - "iopub.status.busy": "2024-05-15T16:07:06.087384Z", - "iopub.status.idle": "2024-05-15T16:07:13.105199Z", - "shell.execute_reply": "2024-05-15T16:07:13.104695Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "----- Estimating u probabilities using random sampling -----\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Estimated u probabilities using random sampling\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is not yet fully trained. 
Missing estimates for:\n", - " - first_name (no m values are trained).\n", - " - surname (no m values are trained).\n", - " - dob (no m values are trained).\n", - " - postcode_fake (no m values are trained).\n", - " - birth_place (no m values are trained).\n", - " - occupation (no m values are trained).\n" - ] - } - ], - "source": [ - "linker.training.estimate_u_using_random_sampling(max_pairs=5e6)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] }, { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T16:07:13.108034Z", - "iopub.status.busy": "2024-05-15T16:07:13.107820Z", - "iopub.status.idle": "2024-05-15T16:07:16.289385Z", - "shell.execute_reply": "2024-05-15T16:07:16.288708Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "----- Starting EM training session -----\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Estimating the m probabilities of the model by blocking on:\n", - "(l.\"first_name\" = r.\"first_name\") AND (l.\"surname\" = r.\"surname\")\n", - "\n", - "Parameter estimates will be made for the following comparison(s):\n", - " - dob\n", - " - postcode_fake\n", - " - birth_place\n", - " - occupation\n", - "\n", - "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", - " - first_name\n", - " - surname\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 1: Largest change in params was -0.533 in probability_two_random_records_match\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 2: Largest change in params was -0.034 in probability_two_random_records_match\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 3: Largest change in 
params was 0.0136 in the m_probability of birth_place, level `Exact match on birth_place`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 4: Largest change in params was -0.00579 in the m_probability of birth_place, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 5: Largest change in params was 0.00268 in the m_probability of birth_place, level `Exact match on birth_place`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 6: Largest change in params was 0.00129 in the m_probability of birth_place, level `Exact match on birth_place`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 7: Largest change in params was -0.000682 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 year'`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 8: Largest change in params was -0.000373 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 year'`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 9: Largest change in params was -0.000203 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 year'`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 10: Largest change in params was -0.000111 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 year'`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 11: Largest change in params was -6.05e-05 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 year'`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "EM converged after 11 iterations\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is not yet fully trained. 
Missing estimates for:\n", - " - first_name (no m values are trained).\n", - " - surname (no m values are trained).\n" - ] - } - ], - "source": [ - "training_blocking_rule = block_on(\"first_name\", \"surname\")\n", - "training_session_names = linker.training.estimate_parameters_using_expectation_maximisation(\n", - " training_blocking_rule\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.524 in probability_two_random_records_match\n" + ] }, { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T16:07:16.292730Z", - "iopub.status.busy": "2024-05-15T16:07:16.292472Z", - "iopub.status.idle": "2024-05-15T16:07:26.076237Z", - "shell.execute_reply": "2024-05-15T16:07:26.075402Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "----- Starting EM training session -----\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Estimating the m probabilities of the model by blocking on:\n", - "l.\"dob\" = r.\"dob\"\n", - "\n", - "Parameter estimates will be made for the following comparison(s):\n", - " - first_name\n", - " - surname\n", - " - postcode_fake\n", - " - birth_place\n", - " - occupation\n", - "\n", - "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", - " - dob\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 1: Largest change in params was -0.362 in the m_probability of first_name, level `Exact match on first_name`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 2: Largest change in params was 0.0343 in the m_probability of first_name, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - 
"text": [ - "Iteration 3: Largest change in params was 0.00489 in the m_probability of first_name, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 4: Largest change in params was 0.00109 in the m_probability of surname, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 5: Largest change in params was 0.000261 in the m_probability of surname, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 6: Largest change in params was 6.1e-05 in the m_probability of surname, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "EM converged after 6 iterations\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n" - ] - } - ], - "source": [ - "training_blocking_rule = block_on(\"dob\")\n", - "training_session_dob = linker.training.estimate_parameters_using_expectation_maximisation(\n", - " training_blocking_rule\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was -0.0289 in probability_two_random_records_match\n" + ] }, { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The final match weights can be viewed in the match weights chart:\n" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was -0.0106 in the m_probability of birth_place, level `All other comparisons`\n" + ] }, { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T16:07:26.079934Z", - "iopub.status.busy": "2024-05-15T16:07:26.079660Z", - "iopub.status.idle": "2024-05-15T16:07:26.364087Z", - 
"shell.execute_reply": "2024-05-15T16:07:26.363559Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.VConcatChart(...)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker.visualisations.match_weights_chart()" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was 0.00439 in the m_probability of birth_place, level `Exact match on birth_place`\n" + ] }, { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T16:07:26.367083Z", - "iopub.status.busy": "2024-05-15T16:07:26.366860Z", - "iopub.status.idle": "2024-05-15T16:07:28.387226Z", - "shell.execute_reply": "2024-05-15T16:07:28.386186Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker.evaluation.unlinkables_chart()" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 5: Largest change in params was -0.00199 in the m_probability of birth_place, level `All other comparisons`\n" + ] }, { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T16:07:28.393039Z", - "iopub.status.busy": "2024-05-15T16:07:28.392726Z", - "iopub.status.idle": "2024-05-15T16:07:30.731337Z", - "shell.execute_reply": "2024-05-15T16:07:30.730612Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
match_weightmatch_probabilityunique_id_lunique_id_rfirst_name_lfirst_name_rgamma_first_nametf_first_name_ltf_first_name_rbf_first_name...bf_birth_placebf_tf_adj_birth_placeoccupation_loccupation_rgamma_occupationtf_occupation_ltf_occupation_rbf_occupationbf_tf_adj_occupationmatch_key
0-15.8404270.000017Q5971253-3Q75867928-4sirsir30.0249850.02498544.906565...0.1567561.0naval officermilitary officer00.0094510.0107560.1049891.00
1-15.8404270.000017Q5971253-3Q75867928-7sirsir30.0249850.02498544.906565...0.1567561.0naval officermilitary officer00.0094510.0107560.1049891.00
2-15.8404270.000017Q5971253-2Q75867928-4sirsir30.0249850.02498544.906565...0.1567561.0naval officermilitary officer00.0094510.0107560.1049891.00
3-15.8404270.000017Q5971253-2Q75867928-7sirsir30.0249850.02498544.906565...0.1567561.0naval officermilitary officer00.0094510.0107560.1049891.00
4-15.8404270.000017Q5971253-1Q75867928-4sirsir30.0249850.02498544.906565...0.1567561.0naval officermilitary officer00.0094510.0107560.1049891.00
\n", - "

5 rows × 41 columns

\n", - "
" - ], - "text/plain": [ - " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n", - "0 -15.840427 0.000017 Q5971253-3 Q75867928-4 sir \n", - "1 -15.840427 0.000017 Q5971253-3 Q75867928-7 sir \n", - "2 -15.840427 0.000017 Q5971253-2 Q75867928-4 sir \n", - "3 -15.840427 0.000017 Q5971253-2 Q75867928-7 sir \n", - "4 -15.840427 0.000017 Q5971253-1 Q75867928-4 sir \n", - "\n", - " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n", - "0 sir 3 0.024985 0.024985 \n", - "1 sir 3 0.024985 0.024985 \n", - "2 sir 3 0.024985 0.024985 \n", - "3 sir 3 0.024985 0.024985 \n", - "4 sir 3 0.024985 0.024985 \n", - "\n", - " bf_first_name ... bf_birth_place bf_tf_adj_birth_place occupation_l \\\n", - "0 44.906565 ... 0.156756 1.0 naval officer \n", - "1 44.906565 ... 0.156756 1.0 naval officer \n", - "2 44.906565 ... 0.156756 1.0 naval officer \n", - "3 44.906565 ... 0.156756 1.0 naval officer \n", - "4 44.906565 ... 0.156756 1.0 naval officer \n", - "\n", - " occupation_r gamma_occupation tf_occupation_l tf_occupation_r \\\n", - "0 military officer 0 0.009451 0.010756 \n", - "1 military officer 0 0.009451 0.010756 \n", - "2 military officer 0 0.009451 0.010756 \n", - "3 military officer 0 0.009451 0.010756 \n", - "4 military officer 0 0.009451 0.010756 \n", - "\n", - " bf_occupation bf_tf_adj_occupation match_key \n", - "0 0.104989 1.0 0 \n", - "1 0.104989 1.0 0 \n", - "2 0.104989 1.0 0 \n", - "3 0.104989 1.0 0 \n", - "4 0.104989 1.0 0 \n", - "\n", - "[5 rows x 41 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_predict = linker.inference.predict()\n", - "df_e = df_predict.as_pandas_dataframe(limit=5)\n", - "df_e" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 6: Largest change in params was -0.000951 in the m_probability of birth_place, level `All other comparisons`\n" + ] }, { - "attachments": {}, - "cell_type": "markdown", - "metadata": 
{}, - "source": [ - "You can also view rows in this dataset as a waterfall chart as follows:\n" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 7: Largest change in params was -0.000495 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 year'`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 8: Largest change in params was -0.000267 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 year'`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 9: Largest change in params was -0.000144 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 year'`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 10: Largest change in params was -7.75e-05 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 year'`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 10 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - first_name (no m values are trained).\n", + " - surname (no m values are trained).\n" + ] + } + ], + "source": [ + "training_blocking_rule = block_on(\"first_name\", \"surname\")\n", + "training_session_names = linker.training.estimate_parameters_using_expectation_maximisation(\n", + " training_blocking_rule\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:40.146047Z", + "iopub.status.busy": "2024-06-07T09:09:40.145755Z", + "iopub.status.idle": "2024-06-07T09:09:49.809293Z", + "shell.execute_reply": "2024-06-07T09:09:49.808482Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"dob\" = r.\"dob\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - first_name\n", + " - surname\n", + " - postcode_fake\n", + " - birth_place\n", + " - occupation\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - dob\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.362 in the m_probability of first_name, level `Exact match on first_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was 0.0336 in the m_probability of first_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 0.00479 in the m_probability of first_name, level `All other comparisons`\n" + ] }, { - "cell_type": 
"code", - "execution_count": 13, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T16:07:30.735380Z", - "iopub.status.busy": "2024-05-15T16:07:30.735079Z", - "iopub.status.idle": "2024-05-15T16:07:31.361460Z", - "shell.execute_reply": "2024-05-15T16:07:31.360879Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was 0.00106 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 5: Largest change in params was 0.000254 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 6: Largest change in params was 5.93e-05 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 6 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n" + ] + } + ], + "source": [ + "training_blocking_rule = block_on(\"dob\")\n", + "training_session_dob = linker.training.estimate_parameters_using_expectation_maximisation(\n", + " training_blocking_rule\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The final match weights can be viewed in the match weights chart:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:49.813047Z", + "iopub.status.busy": "2024-06-07T09:09:49.812768Z", + "iopub.status.idle": "2024-06-07T09:09:50.107589Z", + "shell.execute_reply": "2024-06-07T09:09:50.106949Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "source": [ - "\n", - "records_to_plot = df_e.to_dict(orient=\"records\")\n", - "linker.visualisations.waterfall_chart(records_to_plot, filter_nulls=False)" + "text/plain": [ + "alt.VConcatChart(...)" ] - }, + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.match_weights_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:50.110966Z", + "iopub.status.busy": "2024-06-07T09:09:50.110700Z", + "iopub.status.idle": "2024-06-07T09:09:52.080683Z", + "shell.execute_reply": "2024-06-07T09:09:52.080017Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T16:07:31.364481Z", - "iopub.status.busy": "2024-05-15T16:07:31.364255Z", - "iopub.status.idle": "2024-05-15T16:07:31.746356Z", - "shell.execute_reply": "2024-05-15T16:07:31.745671Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Completed iteration 1, root rows count 625\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Completed iteration 2, root rows count 93\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Completed iteration 3, root rows count 19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Completed iteration 4, root rows count 4\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Completed iteration 5, root rows count 0\n" - ] - } + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "source": [ - "clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(\n", - " df_predict, threshold_match_probability=0.95\n", - ")" + "text/plain": [ + "alt.LayerChart(...)" ] - }, + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.evaluation.unlinkables_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:52.085817Z", + "iopub.status.busy": "2024-06-07T09:09:52.085551Z", + "iopub.status.idle": "2024-06-07T09:09:54.516650Z", + "shell.execute_reply": "2024-06-07T09:09:54.515921Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T16:07:31.749625Z", - "iopub.status.busy": "2024-05-15T16:07:31.749370Z", - "iopub.status.idle": "2024-05-15T16:07:31.898014Z", - "shell.execute_reply": "2024-05-15T16:07:31.897301Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilityunique_id_lunique_id_rfirst_name_lfirst_name_rgamma_first_nametf_first_name_ltf_first_name_rbf_first_name...bf_birth_placebf_tf_adj_birth_placeoccupation_loccupation_rgamma_occupationtf_occupation_ltf_occupation_rbf_occupationbf_tf_adj_occupationmatch_key
0-15.8293330.000017Q7528564-9Q75867928-1sirsir30.0249850.02498538.34881...0.1570161.0historianmilitary officer00.0124560.0107560.1050281.00
1-15.8293330.000017Q7528564-9Q75867928-2sirsir30.0249850.02498538.34881...0.1570161.0historianmilitary officer00.0124560.0107560.1050281.00
2-15.8293330.000017Q7528564-9Q75867928-3sirsir30.0249850.02498538.34881...0.1570161.0historianmilitary officer00.0124560.0107560.1050281.00
3-15.8293330.000017Q7528564-9Q75867928-4sirsir30.0249850.02498538.34881...0.1570161.0historianmilitary officer00.0124560.0107560.1050281.00
4-15.8293330.000017Q7528564-9Q75867928-6sirsir30.0249850.02498538.34881...0.1570161.0historianmilitary officer00.0124560.0107560.1050281.00
\n", + "

5 rows × 41 columns

\n", + "
" ], - "source": [ - "from IPython.display import IFrame\n", - "\n", - "linker.visualisations.cluster_studio_dashboard(\n", - " df_predict,\n", - " clusters,\n", - " \"dashboards/50k_cluster.html\",\n", - " sampling_method=\"by_cluster_size\",\n", - " overwrite=True,\n", - ")\n", - "\n", - "\n", - "IFrame(src=\"./dashboards/50k_cluster.html\", width=\"100%\", height=1200)" + "text/plain": [ + " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n", + "0 -15.829333 0.000017 Q7528564-9 Q75867928-1 sir \n", + "1 -15.829333 0.000017 Q7528564-9 Q75867928-2 sir \n", + "2 -15.829333 0.000017 Q7528564-9 Q75867928-3 sir \n", + "3 -15.829333 0.000017 Q7528564-9 Q75867928-4 sir \n", + "4 -15.829333 0.000017 Q7528564-9 Q75867928-6 sir \n", + "\n", + " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n", + "0 sir 3 0.024985 0.024985 \n", + "1 sir 3 0.024985 0.024985 \n", + "2 sir 3 0.024985 0.024985 \n", + "3 sir 3 0.024985 0.024985 \n", + "4 sir 3 0.024985 0.024985 \n", + "\n", + " bf_first_name ... bf_birth_place bf_tf_adj_birth_place occupation_l \\\n", + "0 38.34881 ... 0.157016 1.0 historian \n", + "1 38.34881 ... 0.157016 1.0 historian \n", + "2 38.34881 ... 0.157016 1.0 historian \n", + "3 38.34881 ... 0.157016 1.0 historian \n", + "4 38.34881 ... 
0.157016 1.0 historian \n", + "\n", + " occupation_r gamma_occupation tf_occupation_l tf_occupation_r \\\n", + "0 military officer 0 0.012456 0.010756 \n", + "1 military officer 0 0.012456 0.010756 \n", + "2 military officer 0 0.012456 0.010756 \n", + "3 military officer 0 0.012456 0.010756 \n", + "4 military officer 0 0.012456 0.010756 \n", + "\n", + " bf_occupation bf_tf_adj_occupation match_key \n", + "0 0.105028 1.0 0 \n", + "1 0.105028 1.0 0 \n", + "2 0.105028 1.0 0 \n", + "3 0.105028 1.0 0 \n", + "4 0.105028 1.0 0 \n", + "\n", + "[5 rows x 41 columns]" ] - }, + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_predict = linker.inference.predict()\n", + "df_e = df_predict.as_pandas_dataframe(limit=5)\n", + "df_e" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also view rows in this dataset as a waterfall chart as follows:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:54.520577Z", + "iopub.status.busy": "2024-06-07T09:09:54.520273Z", + "iopub.status.idle": "2024-06-07T09:09:55.151653Z", + "shell.execute_reply": "2024-06-07T09:09:55.150935Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T16:07:31.901400Z", - "iopub.status.busy": "2024-05-15T16:07:31.901154Z", - "iopub.status.idle": "2024-05-15T16:07:44.228710Z", - "shell.execute_reply": "2024-05-15T16:07:44.227315Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "source": [ - "linker.accuracy_analysis_from_labels_column(\n", - " \"cluster\", output_type=\"roc\", match_weight_round_to_nearest=0.02\n", - ")" + "text/plain": [ + "alt.LayerChart(...)" ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "records_to_plot = df_e.to_dict(orient=\"records\")\n", + "linker.visualisations.waterfall_chart(records_to_plot, filter_nulls=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:55.155050Z", + "iopub.status.busy": "2024-06-07T09:09:55.154811Z", + "iopub.status.idle": "2024-06-07T09:09:55.525689Z", + "shell.execute_reply": "2024-06-07T09:09:55.524936Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Completed iteration 1, root rows count 623\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Completed iteration 2, root rows count 100\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Completed iteration 3, root rows count 22\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Completed iteration 4, root rows count 4\n" + ] }, { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T16:07:44.268428Z", - "iopub.status.busy": "2024-05-15T16:07:44.268099Z", - "iopub.status.idle": "2024-05-15T16:07:47.826572Z", - "shell.execute_reply": "2024-05-15T16:07:47.826055Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } + "name": "stderr", + "output_type": "stream", + "text": [ + "Completed iteration 5, root rows count 0\n" + ] + } + ], + "source": [ + "clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(\n", + " df_predict, threshold_match_probability=0.95\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:55.528997Z", + "iopub.status.busy": "2024-06-07T09:09:55.528732Z", + "iopub.status.idle": "2024-06-07T09:09:55.705059Z", + "shell.execute_reply": "2024-06-07T09:09:55.704305Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "records = linker.inference.prediction_errors_from_labels_column(\n", - " \"cluster\",\n", - " threshold=0.999,\n", - " include_false_negatives=False,\n", - " include_false_positives=True,\n", - ").as_record_dict()\n", - "linker.visualisations.waterfall_chart(records)" + "text/plain": [ + "" ] - }, + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from IPython.display import IFrame\n", + "\n", + "linker.visualisations.cluster_studio_dashboard(\n", + " df_predict,\n", + " clusters,\n", + " \"dashboards/50k_cluster.html\",\n", + " sampling_method=\"by_cluster_size\",\n", + " overwrite=True,\n", + ")\n", + "\n", + "\n", + "IFrame(src=\"./dashboards/50k_cluster.html\", width=\"100%\", height=1200)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:09:55.708587Z", + "iopub.status.busy": "2024-06-07T09:09:55.708313Z", + "iopub.status.idle": "2024-06-07T09:10:07.358895Z", + "shell.execute_reply": "2024-06-07T09:10:07.358097Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 18, - "metadata": { - 
"execution": { - "iopub.execute_input": "2024-05-15T16:07:47.834324Z", - "iopub.status.busy": "2024-05-15T16:07:47.834092Z", - "iopub.status.idle": "2024-05-15T16:07:51.080047Z", - "shell.execute_reply": "2024-05-15T16:07:51.079464Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "source": [ - "# Some of the false negatives will be because they weren't detected by the blocking rules\n", - "records = linker.inference.prediction_errors_from_labels_column(\n", - " \"cluster\",\n", - " threshold=0.5,\n", - " include_false_negatives=True,\n", - " include_false_positives=False,\n", - ").as_record_dict(limit=50)\n", - "\n", - "linker.visualisations.waterfall_chart(records)" + "text/plain": [ + "alt.Chart(...)" ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": { - "06545e908438426c8185e5bc9b35b182": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_5fe1cfa86b4f4e9bbecf34be2378fbe7", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_40efa7951ddc4ca8bb74e0c91d1abf66", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "1e58ea15a76f4887b75ee41c6210f8bd": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - 
"_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - }, - "40efa7951ddc4ca8bb74e0c91d1abf66": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - }, - "4816c47151d145b994566568e51c630c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_a5007a2aafc44df2b5a0932cc17d4a0c", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_1e58ea15a76f4887b75ee41c6210f8bd", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "5fcb5354d9c746bcbbd42fb211dc84ec": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - 
"border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - }, - "5fe1cfa86b4f4e9bbecf34be2378fbe7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - 
"right": null, - "top": null, - "visibility": null, - "width": "auto" - } - }, - "8d093bba3d464dafaebcaeb55dcbef47": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_f0a6693a457e40b68664a1569829e678", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_ef73e739ae9e4d9e942ff4eb0ec2338c", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "9fe5f6a7b06a455fa9fb04d4088d3a78": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_5fcb5354d9c746bcbbd42fb211dc84ec", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_ad735d4f18bb437fb01e563f9175ba97", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "a5007a2aafc44df2b5a0932cc17d4a0c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": 
"LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - }, - "ad735d4f18bb437fb01e563f9175ba97": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - }, - "ef73e739ae9e4d9e942ff4eb0ec2338c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - }, - "f0a6693a457e40b68664a1569829e678": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": 
"LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - } - }, - "version_major": 2, - "version_minor": 0 - } + ], + "source": [ + "linker.evaluation.accuracy_analysis_from_labels_column(\n", + " \"cluster\", output_type=\"roc\", match_weight_round_to_nearest=0.02\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:10:07.391167Z", + "iopub.status.busy": "2024-06-07T09:10:07.390901Z", + "iopub.status.idle": "2024-06-07T09:10:10.809464Z", + "shell.execute_reply": "2024-06-07T09:10:10.808740Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "records = linker.evaluation.prediction_errors_from_labels_column(\n", + " \"cluster\",\n", + " threshold=0.999,\n", + " include_false_negatives=False,\n", + " include_false_positives=True,\n", + ").as_record_dict()\n", + "linker.visualisations.waterfall_chart(records)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:10:10.819376Z", + "iopub.status.busy": "2024-06-07T09:10:10.818967Z", + "iopub.status.idle": "2024-06-07T09:10:13.601958Z", + "shell.execute_reply": "2024-06-07T09:10:13.601341Z" } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Some of the false negatives will be because they weren't detected by the blocking rules\n", + "records = linker.evaluation.prediction_errors_from_labels_column(\n", + " \"cluster\",\n", + " threshold=0.5,\n", + " include_false_negatives=True,\n", + " include_false_positives=False,\n", + ").as_record_dict(limit=50)\n", + "\n", + "linker.visualisations.waterfall_chart(records)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "06545e908438426c8185e5bc9b35b182": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_5fe1cfa86b4f4e9bbecf34be2378fbe7", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_40efa7951ddc4ca8bb74e0c91d1abf66", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "1e58ea15a76f4887b75ee41c6210f8bd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": 
"ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "40efa7951ddc4ca8bb74e0c91d1abf66": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "4816c47151d145b994566568e51c630c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_a5007a2aafc44df2b5a0932cc17d4a0c", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1e58ea15a76f4887b75ee41c6210f8bd", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "5fcb5354d9c746bcbbd42fb211dc84ec": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": 
"LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + }, + "5fe1cfa86b4f4e9bbecf34be2378fbe7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": 
null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + }, + "8d093bba3d464dafaebcaeb55dcbef47": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_f0a6693a457e40b68664a1569829e678", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ef73e739ae9e4d9e942ff4eb0ec2338c", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "9fe5f6a7b06a455fa9fb04d4088d3a78": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_5fcb5354d9c746bcbbd42fb211dc84ec", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ad735d4f18bb437fb01e563f9175ba97", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "a5007a2aafc44df2b5a0932cc17d4a0c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + }, + "ad735d4f18bb437fb01e563f9175ba97": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "ef73e739ae9e4d9e942ff4eb0ec2338c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "f0a6693a457e40b68664a1569829e678": { + 
"model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/demos/examples/duckdb/deterministic_dedupe.ipynb b/docs/demos/examples/duckdb/deterministic_dedupe.ipynb index b3afdff7b1..0065baeda9 100644 --- a/docs/demos/examples/duckdb/deterministic_dedupe.ipynb +++ b/docs/demos/examples/duckdb/deterministic_dedupe.ipynb @@ -1,267 +1,826 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Linking a dataset of real historical persons with Deterrministic Rules\n", - "\n", - "While Splink is primarily a tool for probabilistic records linkage, it includes functionality to perform deterministic (i.e. 
rules based) linkage.\n", - "\n", - "Significant work has gone into optimising the performance of rules based matching, so Splink is likely to be significantly faster than writing the basic SQL by hand.\n", - "\n", - "In this example, we deduplicate a 50k row dataset based on historical persons scraped from wikidata. Duplicate records are introduced with a variety of errors introduced. The probabilistic dedupe of the same dataset can be found at `Deduplicate 50k rows historical persons`.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " \"Open\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:43:50.508953Z", - "iopub.status.busy": "2024-05-15T15:43:50.508620Z", - "iopub.status.idle": "2024-05-15T15:43:50.514416Z", - "shell.execute_reply": "2024-05-15T15:43:50.513604Z" - } - }, - "source": [ - "# Uncomment and run this cell if you're running in Google Colab.\n", - "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" - ], - "outputs": [] - }, + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Linking a dataset of real historical persons with Deterministic Rules\n", + "\n", + "While Splink is primarily a tool for probabilistic records linkage, it includes functionality to perform deterministic (i.e. rules based) linkage.\n", + "\n", + "Significant work has gone into optimising the performance of rules based matching, so Splink is likely to be significantly faster than writing the basic SQL by hand.\n", + "\n", + "In this example, we deduplicate a 50k row dataset based on historical persons scraped from wikidata. Duplicate records are introduced with a variety of errors introduced. 
The probabilistic dedupe of the same dataset can be found at `Deduplicate 50k rows historical persons`.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:10:59.567669Z", + "iopub.status.busy": "2024-06-07T09:10:59.567311Z", + "iopub.status.idle": "2024-06-07T09:10:59.591784Z", + "shell.execute_reply": "2024-06-07T09:10:59.590923Z" + } + }, + "outputs": [], + "source": [ + "# Uncomment and run this cell if you're running in Google Colab.\n", + "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:10:59.595969Z", + "iopub.status.busy": "2024-06-07T09:10:59.595667Z", + "iopub.status.idle": "2024-06-07T09:11:01.007136Z", + "shell.execute_reply": "2024-06-07T09:11:01.006553Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:43:50.519532Z", - "iopub.status.busy": "2024-05-15T15:43:50.519159Z", - "iopub.status.idle": "2024-05-15T15:43:53.171104Z", - "shell.execute_reply": "2024-05-15T15:43:53.170070Z" - } - }, - "source": [ - "import pandas as pd\n", - "\n", - "from splink import splink_datasets\n", - "\n", - "pd.options.display.max_rows = 1000\n", - "df = splink_datasets.historical_50k\n", - "df.head()" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_idclusterfull_namefirst_and_surnamefirst_namesurnamedobbirth_placepostcode_fakegenderoccupation
0Q2296770-1Q2296770thomas clifford, 1st baron clifford of chudleighthomas chudleighthomaschudleigh1630-08-01devontq13 8dfmalepolitician
1Q2296770-2Q2296770thomas of chudleighthomas chudleighthomaschudleigh1630-08-01devontq13 8dfmalepolitician
2Q2296770-3Q2296770tom 1st baron clifford of chudleightom chudleightomchudleigh1630-08-01devontq13 8dfmalepolitician
3Q2296770-4Q2296770thomas 1st chudleighthomas chudleighthomaschudleigh1630-08-01devontq13 8huNonepolitician
4Q2296770-5Q2296770thomas clifford, 1st baron chudleighthomas chudleighthomaschudleigh1630-08-01devontq13 8dfNonepolitician
\n", + "
" ], - "outputs": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When defining the settings object, specity your deterministic rules in the `blocking_rules_to_generate_predictions` key.\n", - "\n", - "For a deterministic linkage, the linkage methodology is based solely on these rules, so there is no need to define `comparisons` nor any other parameters required for model training in a probabilistic model.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Prior to running the linkage, it's usually a good idea to check how many record comparisons will be generated by your deterministic rules:\n" + "text/plain": [ + " unique_id cluster full_name \\\n", + "0 Q2296770-1 Q2296770 thomas clifford, 1st baron clifford of chudleigh \n", + "1 Q2296770-2 Q2296770 thomas of chudleigh \n", + "2 Q2296770-3 Q2296770 tom 1st baron clifford of chudleigh \n", + "3 Q2296770-4 Q2296770 thomas 1st chudleigh \n", + "4 Q2296770-5 Q2296770 thomas clifford, 1st baron chudleigh \n", + "\n", + " first_and_surname first_name surname dob birth_place \\\n", + "0 thomas chudleigh thomas chudleigh 1630-08-01 devon \n", + "1 thomas chudleigh thomas chudleigh 1630-08-01 devon \n", + "2 tom chudleigh tom chudleigh 1630-08-01 devon \n", + "3 thomas chudleigh thomas chudleigh 1630-08-01 devon \n", + "4 thomas chudleigh thomas chudleigh 1630-08-01 devon \n", + "\n", + " postcode_fake gender occupation \n", + "0 tq13 8df male politician \n", + "1 tq13 8df male politician \n", + "2 tq13 8df male politician \n", + "3 tq13 8hu None politician \n", + "4 tq13 8df None politician " ] - }, + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "from splink import splink_datasets\n", + "\n", + "pd.options.display.max_rows = 1000\n", + "df = splink_datasets.historical_50k\n", + "df.head()" + ] + }, + { + "attachments": {}, + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "When defining the settings object, specity your deterministic rules in the `blocking_rules_to_generate_predictions` key.\n", + "\n", + "For a deterministic linkage, the linkage methodology is based solely on these rules, so there is no need to define `comparisons` nor any other parameters required for model training in a probabilistic model.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Prior to running the linkage, it's usually a good idea to check how many record comparisons will be generated by your deterministic rules:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:01.050336Z", + "iopub.status.busy": "2024-06-07T09:11:01.049679Z", + "iopub.status.idle": "2024-06-07T09:11:01.602823Z", + "shell.execute_reply": "2024-06-07T09:11:01.601902Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:43:53.273619Z", - "iopub.status.busy": "2024-05-15T15:43:53.271060Z", - "iopub.status.idle": "2024-05-15T15:43:54.139302Z", - "shell.execute_reply": "2024-05-15T15:43:54.138451Z" - } - }, - "source": [ - "from splink import DuckDBAPI, block_on\n", - "from splink.blocking_analysis import (\n", - " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", - ")\n", - "\n", - "db_api = DuckDBAPI()\n", - "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", - " table_or_tables=df,\n", - " blocking_rules=[\n", - " block_on(\"first_name\", \"surname\", \"dob\"),\n", - " block_on(\"surname\", \"dob\", \"postcode_fake\"),\n", - " block_on(\"first_name\", \"dob\", \"occupation\"),\n", - " ],\n", - " db_api=db_api,\n", - " link_type=\"dedupe_only\",\n", - ")" + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "outputs": [] - }, + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink import DuckDBAPI, block_on\n", + "from splink.blocking_analysis import (\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + ")\n", + "\n", + "db_api = DuckDBAPI()\n", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=df,\n", + " blocking_rules=[\n", + " block_on(\"first_name\", \"surname\", \"dob\"),\n", + " block_on(\"surname\", \"dob\", \"postcode_fake\"),\n", + " block_on(\"first_name\", \"dob\", \"occupation\"),\n", + " ],\n", + " db_api=db_api,\n", + " link_type=\"dedupe_only\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:01.606853Z", + "iopub.status.busy": "2024-06-07T09:11:01.606539Z", + "iopub.status.idle": "2024-06-07T09:11:01.691839Z", + "shell.execute_reply": "2024-06-07T09:11:01.690988Z" + } + }, + "outputs": [], + "source": [ + "from splink import Linker, SettingsCreator\n", + "\n", + "settings = SettingsCreator(\n", + " link_type=\"dedupe_only\",\n", + " blocking_rules_to_generate_predictions=[\n", + " block_on(\"first_name\", \"surname\", \"dob\"),\n", + " block_on(\"surname\", \"dob\", \"postcode_fake\"),\n", + " block_on(\"first_name\", \"dob\", \"occupation\"),\n", + " ],\n", + " retain_intermediate_calculation_columns=True,\n", + ")\n", + "\n", + "linker = Linker(df, settings, database_api=db_api)\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results of the linkage can be viewed with the `deterministic_link` function.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:01.695906Z", + "iopub.status.busy": "2024-06-07T09:11:01.695600Z", + 
"iopub.status.idle": "2024-06-07T09:11:01.995020Z", + "shell.execute_reply": "2024-06-07T09:11:01.994289Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:43:54.144031Z", - "iopub.status.busy": "2024-05-15T15:43:54.143555Z", - "iopub.status.idle": "2024-05-15T15:43:54.254120Z", - "shell.execute_reply": "2024-05-15T15:43:54.252360Z" - } - }, - "source": [ - "from splink import Linker, SettingsCreator\n", - "\n", - "settings = SettingsCreator(\n", - " link_type=\"dedupe_only\",\n", - " blocking_rules_to_generate_predictions=[\n", - " block_on(\"first_name\", \"surname\", \"dob\"),\n", - " block_on(\"surname\", \"dob\", \"postcode_fake\"),\n", - " block_on(\"first_name\", \"dob\", \"occupation\"),\n", - " ],\n", - " retain_intermediate_calculation_columns=True,\n", - ")\n", - "\n", - "linker = Linker(df, settings, database_api=db_api)\n" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_id_lunique_id_roccupation_loccupation_rfirst_name_lfirst_name_rdob_ldob_rsurname_lsurname_rpostcode_fake_lpostcode_fake_rmatch_key
0Q55455287-12Q55455287-2Nonewriterjaidojaido1836-01-011836-01-01moratamoratata4 2ugta4 2uu0
1Q55455287-12Q55455287-3Nonewriterjaidojaido1836-01-011836-01-01moratamoratata4 2ugta4 2uu0
2Q55455287-12Q55455287-4Nonewriterjaidojaido1836-01-011836-01-01moratamoratata4 2ugta4 2sz0
3Q55455287-12Q55455287-5NoneNonejaidojaido1836-01-011836-01-01moratamoratata4 2ugta4 2ug0
4Q55455287-12Q55455287-6Nonewriterjaidojaido1836-01-011836-01-01moratamoratata4 2ugNone0
\n", + "
" ], - "outputs": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The results of the linkage can be viewed with the `deterministic_link` function.\n" + "text/plain": [ + " unique_id_l unique_id_r occupation_l occupation_r first_name_l \\\n", + "0 Q55455287-12 Q55455287-2 None writer jaido \n", + "1 Q55455287-12 Q55455287-3 None writer jaido \n", + "2 Q55455287-12 Q55455287-4 None writer jaido \n", + "3 Q55455287-12 Q55455287-5 None None jaido \n", + "4 Q55455287-12 Q55455287-6 None writer jaido \n", + "\n", + " first_name_r dob_l dob_r surname_l surname_r postcode_fake_l \\\n", + "0 jaido 1836-01-01 1836-01-01 morata morata ta4 2ug \n", + "1 jaido 1836-01-01 1836-01-01 morata morata ta4 2ug \n", + "2 jaido 1836-01-01 1836-01-01 morata morata ta4 2ug \n", + "3 jaido 1836-01-01 1836-01-01 morata morata ta4 2ug \n", + "4 jaido 1836-01-01 1836-01-01 morata morata ta4 2ug \n", + "\n", + " postcode_fake_r match_key \n", + "0 ta4 2uu 0 \n", + "1 ta4 2uu 0 \n", + "2 ta4 2sz 0 \n", + "3 ta4 2ug 0 \n", + "4 None 0 " ] - }, + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_predict = linker.inference.deterministic_link()\n", + "df_predict.as_pandas_dataframe().head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Which can be used to generate clusters.\n", + "\n", + "Note, for deterministic linkage, each comparison has been assigned a match probability of 1, so to generate clusters, set `threshold_match_probability=1` in the `cluster_pairwise_predictions_at_threshold` function.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:01.998965Z", + "iopub.status.busy": "2024-06-07T09:11:01.998665Z", + "iopub.status.idle": "2024-06-07T09:11:02.348788Z", + "shell.execute_reply": "2024-06-07T09:11:02.348039Z" + } + }, + "outputs": [ { - 
"cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:43:54.259538Z", - "iopub.status.busy": "2024-05-15T15:43:54.258905Z", - "iopub.status.idle": "2024-05-15T15:43:54.922593Z", - "shell.execute_reply": "2024-05-15T15:43:54.921796Z" - } - }, - "source": [ - "df_predict = linker.deterministic_link()\n", - "df_predict.as_pandas_dataframe().head()" - ], - "outputs": [] + "name": "stderr", + "output_type": "stream", + "text": [ + "Completed iteration 1, root rows count 94\n" + ] }, { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Which can be used to generate clusters.\n", - "\n", - "Note, for deterministic linkage, each comparison has been assigned a match probability of 1, so to generate clusters, set `threshold_match_probability=1` in the `cluster_pairwise_predictions_at_threshold` function.\n" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Completed iteration 2, root rows count 10\n" + ] }, { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:43:54.928175Z", - "iopub.status.busy": "2024-05-15T15:43:54.927807Z", - "iopub.status.idle": "2024-05-15T15:43:55.547697Z", - "shell.execute_reply": "2024-05-15T15:43:55.543024Z" - } - }, - "source": [ - "clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(\n", - " df_predict, threshold_match_probability=1\n", - ")" - ], - "outputs": [] - }, + "name": "stderr", + "output_type": "stream", + "text": [ + "Completed iteration 3, root rows count 0\n" + ] + } + ], + "source": [ + "clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(\n", + " df_predict, threshold_match_probability=1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:02.352872Z", + "iopub.status.busy": "2024-06-07T09:11:02.352366Z", + 
"iopub.status.idle": "2024-06-07T09:11:02.367858Z", + "shell.execute_reply": "2024-06-07T09:11:02.367179Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:43:55.555934Z", - "iopub.status.busy": "2024-05-15T15:43:55.554006Z", - "iopub.status.idle": "2024-05-15T15:43:55.592918Z", - "shell.execute_reply": "2024-05-15T15:43:55.589688Z" - } - }, - "source": [ - "clusters.as_pandas_dataframe(limit=5)" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cluster_idunique_idclusterfull_namefirst_and_surnamefirst_namesurnamedobbirth_placepostcode_fakegenderoccupation__splink_salt
0Q16025107-1Q5497940-9Q5497940frederick hallfrederick hallfrederickhall1855-01-01bristol, city ofbs11 9pnNoneNone0.002739
1Q1149445-1Q1149445-9Q1149445earl egertonearl egertonearlegerton1800-01-01westminsterw1d 2hfNoneNone0.991459
2Q20664532-1Q21466387-2Q21466387harry brookerharry brookerharrybrooker1848-01-01plymouthpl4 9hxmalepainter0.506127
3Q1124636-1Q1124636-12Q1124636tom stapletontom stapletontomstapleton1535-01-01Nonebn6 9namaletheologian0.612694
4Q18508292-1Q21466711-4Q21466711harry s0enceharry s0enceharrys0ence1860-01-01londonse1 7pbmalepainter0.488917
\n", + "
" ], - "outputs": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "These results can then be passed into the `Cluster Studio Dashboard`.\n" + "text/plain": [ + " cluster_id unique_id cluster full_name first_and_surname \\\n", + "0 Q16025107-1 Q5497940-9 Q5497940 frederick hall frederick hall \n", + "1 Q1149445-1 Q1149445-9 Q1149445 earl egerton earl egerton \n", + "2 Q20664532-1 Q21466387-2 Q21466387 harry brooker harry brooker \n", + "3 Q1124636-1 Q1124636-12 Q1124636 tom stapleton tom stapleton \n", + "4 Q18508292-1 Q21466711-4 Q21466711 harry s0ence harry s0ence \n", + "\n", + " first_name surname dob birth_place postcode_fake gender \\\n", + "0 frederick hall 1855-01-01 bristol, city of bs11 9pn None \n", + "1 earl egerton 1800-01-01 westminster w1d 2hf None \n", + "2 harry brooker 1848-01-01 plymouth pl4 9hx male \n", + "3 tom stapleton 1535-01-01 None bn6 9na male \n", + "4 harry s0ence 1860-01-01 london se1 7pb male \n", + "\n", + " occupation __splink_salt \n", + "0 None 0.002739 \n", + "1 None 0.991459 \n", + "2 painter 0.506127 \n", + "3 theologian 0.612694 \n", + "4 painter 0.488917 " ] - }, + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clusters.as_pandas_dataframe(limit=5)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These results can then be passed into the `Cluster Studio Dashboard`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:02.371850Z", + "iopub.status.busy": "2024-06-07T09:11:02.371545Z", + "iopub.status.idle": "2024-06-07T09:11:02.462645Z", + "shell.execute_reply": "2024-06-07T09:11:02.461886Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:43:55.600959Z", - "iopub.status.busy": 
"2024-05-15T15:43:55.600358Z", - "iopub.status.idle": "2024-05-15T15:43:55.761150Z", - "shell.execute_reply": "2024-05-15T15:43:55.759988Z" - } - }, - "source": [ - "linker.visualisations.cluster_studio_dashboard(\n", - " df_predict,\n", - " clusters,\n", - " \"dashboards/50k_deterministic_cluster.html\",\n", - " sampling_method=\"by_cluster_size\",\n", - " overwrite=True,\n", - ")\n", - "\n", - "from IPython.display import IFrame\n", - "\n", - "IFrame(src=\"./dashboards/50k_deterministic_cluster.html\", width=\"100%\", height=1200)" + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "outputs": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "linker.visualisations.cluster_studio_dashboard(\n", + " df_predict,\n", + " clusters,\n", + " \"dashboards/50k_deterministic_cluster.html\",\n", + " sampling_method=\"by_cluster_size\",\n", + " overwrite=True,\n", + ")\n", + "\n", + "from IPython.display import IFrame\n", + "\n", + "IFrame(src=\"./dashboards/50k_deterministic_cluster.html\", width=\"100%\", height=1200)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + 
"nbformat_minor": 4 +} diff --git a/docs/demos/examples/duckdb/febrl3.ipynb b/docs/demos/examples/duckdb/febrl3.ipynb index 080b7c6607..9a102d4519 100644 --- a/docs/demos/examples/duckdb/febrl3.ipynb +++ b/docs/demos/examples/duckdb/febrl3.ipynb @@ -1,630 +1,1802 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deduplicating the febrl3 dataset\n", - "\n", - "See A.2 [here](https://arxiv.org/pdf/2008.04443.pdf) and [here](https://recordlinkage.readthedocs.io/en/latest/ref-datasets.html) for the source of this data\n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deduplicating the febrl3 dataset\n", + "\n", + "See A.2 [here](https://arxiv.org/pdf/2008.04443.pdf) and [here](https://recordlinkage.readthedocs.io/en/latest/ref-datasets.html) for the source of this data\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:24.420657Z", + "iopub.status.busy": "2024-06-07T09:11:24.420336Z", + "iopub.status.idle": "2024-06-07T09:11:24.443364Z", + "shell.execute_reply": "2024-06-07T09:11:24.442120Z" + } + }, + "outputs": [], + "source": [ + "# Uncomment and run this cell if you're running in Google Colab.\n", + "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:24.447798Z", + "iopub.status.busy": "2024-06-07T09:11:24.447495Z", + "iopub.status.idle": "2024-06-07T09:11:26.149918Z", + "shell.execute_reply": "2024-06-07T09:11:26.149230Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "downloading: https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/febrl/dataset3.csv\n" + ] }, 
{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " \"Open\n", - "\n" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " download progress: 0 %\t(..........)\r", + " download progress: 2 %\t(..........)\r", + " download progress: 3 %\t(..........)\r", + " download progress: 5 %\t(..........)\r", + " download progress: 6 %\t(..........)\r", + " download progress: 8 %\t(..........)\r", + " download progress: 10 %\t(..........)\r", + " download progress: 11 %\t(=.........)\r", + " download progress: 13 %\t(=.........)\r", + " download progress: 14 %\t(=.........)\r", + " download progress: 16 %\t(=.........)\r", + " download progress: 18 %\t(=.........)\r", + " download progress: 19 %\t(=.........)\r", + " download progress: 21 %\t(==........)\r", + " download progress: 22 %\t(==........)\r", + " download progress: 24 %\t(==........)\r", + " download progress: 25 %\t(==........)\r", + " download progress: 27 %\t(==........)\r", + " download progress: 29 %\t(==........)\r", + " download progress: 30 %\t(===.......)\r", + " download progress: 32 %\t(===.......)\r", + " download progress: 33 %\t(===.......)\r", + " download progress: 35 %\t(===.......)\r", + " download progress: 37 %\t(===.......)\r", + " download progress: 38 %\t(===.......)\r", + " download progress: 40 %\t(===.......)\r", + " download progress: 41 %\t(====......)\r", + " download progress: 43 %\t(====......)\r", + " download progress: 45 %\t(====......)\r", + " download progress: 46 %\t(====......)\r", + " download progress: 48 %\t(====......)\r", + " download progress: 49 %\t(====......)\r", + " download progress: 51 %\t(=====.....)\r", + " download progress: 53 %\t(=====.....)\r", + " download progress: 54 %\t(=====.....)\r", + " download progress: 56 %\t(=====.....)\r", + " download progress: 57 %\t(=====.....)\r", + " download progress: 59 %\t(=====.....)\r", + " download progress: 61 %\t(======....)\r", + " download progress: 62 %\t(======....)\r", + " 
download progress: 64 %\t(======....)\r", + " download progress: 65 %\t(======....)\r", + " download progress: 67 %\t(======....)\r", + " download progress: 69 %\t(======....)\r", + " download progress: 70 %\t(=======...)\r", + " download progress: 72 %\t(=======...)\r", + " download progress: 73 %\t(=======...)\r", + " download progress: 75 %\t(=======...)\r", + " download progress: 76 %\t(=======...)\r", + " download progress: 78 %\t(=======...)\r", + " download progress: 80 %\t(=======...)\r", + " download progress: 81 %\t(========..)\r", + " download progress: 83 %\t(========..)\r", + " download progress: 84 %\t(========..)\r", + " download progress: 86 %\t(========..)\r", + " download progress: 88 %\t(========..)\r", + " download progress: 89 %\t(========..)\r", + " download progress: 91 %\t(=========.)\r", + " download progress: 92 %\t(=========.)\r", + " download progress: 94 %\t(=========.)\r", + " download progress: 96 %\t(=========.)\r", + " download progress: 97 %\t(=========.)\r", + " download progress: 99 %\t(=========.)\r", + " download progress: 100 %\t(==========)\n" + ] }, { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:50:53.970752Z", - "iopub.status.busy": "2024-05-15T15:50:53.970419Z", - "iopub.status.idle": "2024-05-15T15:50:53.975673Z", - "shell.execute_reply": "2024-05-15T15:50:53.974958Z" - } - }, - "outputs": [], - "source": [ - "# Uncomment and run this cell if you're running in Google Colab.\n", - "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rec_idgiven_namesurnamestreet_numberaddress_1address_2suburbpostcodestatedate_of_birthsoc_sec_idcluster
0rec-1496-orgmitchellgreen7wallaby placedelmarcleveland2119sa195604091804974rec-1496
1rec-552-dup-3harleymccarthy177pridhamstreetmiltonmarsden3165nsw190804196089216rec-552
\n", + "
" + ], + "text/plain": [ + " rec_id given_name surname street_number address_1 \\\n", + "0 rec-1496-org mitchell green 7 wallaby place \n", + "1 rec-552-dup-3 harley mccarthy 177 pridhamstreet \n", + "\n", + " address_2 suburb postcode state date_of_birth soc_sec_id cluster \n", + "0 delmar cleveland 2119 sa 19560409 1804974 rec-1496 \n", + "1 milton marsden 3165 nsw 19080419 6089216 rec-552 " ] - }, + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink.datasets import splink_datasets\n", + "\n", + "df = splink_datasets.febrl3\n", + "df = df.rename(columns=lambda x: x.strip())\n", + "\n", + "df[\"cluster\"] = df[\"rec_id\"].apply(lambda x: \"-\".join(x.split(\"-\")[:2]))\n", + "\n", + "df[\"date_of_birth\"] = df[\"date_of_birth\"].astype(str).str.strip()\n", + "df[\"soc_sec_id\"] = df[\"soc_sec_id\"].astype(str).str.strip()\n", + "\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:26.153666Z", + "iopub.status.busy": "2024-06-07T09:11:26.153378Z", + "iopub.status.idle": "2024-06-07T09:11:26.160666Z", + "shell.execute_reply": "2024-06-07T09:11:26.159911Z" + } + }, + "outputs": [], + "source": [ + "df[\"date_of_birth\"] = df[\"date_of_birth\"].astype(str).str.strip()\n", + "df[\"soc_sec_id\"] = df[\"soc_sec_id\"].astype(str).str.strip()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:26.164000Z", + "iopub.status.busy": "2024-06-07T09:11:26.163726Z", + "iopub.status.idle": "2024-06-07T09:11:26.170794Z", + "shell.execute_reply": "2024-06-07T09:11:26.170146Z" + } + }, + "outputs": [], + "source": [ + "df[\"date_of_birth\"] = df[\"date_of_birth\"].astype(str).str.strip()\n", + "df[\"soc_sec_id\"] = df[\"soc_sec_id\"].astype(str).str.strip()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + 
"execution": { + "iopub.execute_input": "2024-06-07T09:11:26.174301Z", + "iopub.status.busy": "2024-06-07T09:11:26.174024Z", + "iopub.status.idle": "2024-06-07T09:11:26.331196Z", + "shell.execute_reply": "2024-06-07T09:11:26.330465Z" + } + }, + "outputs": [], + "source": [ + "from splink import DuckDBAPI, Linker, SettingsCreator\n", + "\n", + "# TODO: Allow missingness to be analysed without a linker\n", + "settings = SettingsCreator(\n", + " unique_id_column_name=\"rec_id\",\n", + " link_type=\"dedupe_only\",\n", + ")\n", + "\n", + "linker = Linker(df, settings, database_api=DuckDBAPI())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's usually a good idea to perform exploratory analysis on your data so you understand what's in each column and how often it's missing:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:26.334644Z", + "iopub.status.busy": "2024-06-07T09:11:26.334398Z", + "iopub.status.idle": "2024-06-07T09:11:26.630134Z", + "shell.execute_reply": "2024-06-07T09:11:26.629629Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:50:53.979321Z", - "iopub.status.busy": "2024-05-15T15:50:53.979040Z", - "iopub.status.idle": "2024-05-15T15:50:55.403280Z", - "shell.execute_reply": "2024-05-15T15:50:55.402512Z" - } - }, - "outputs": [], - "source": [ - "from splink.datasets import splink_datasets\n", - "\n", - "df = splink_datasets.febrl3\n", - "df = df.rename(columns=lambda x: x.strip())\n", - "\n", - "df[\"cluster\"] = df[\"rec_id\"].apply(lambda x: \"-\".join(x.split(\"-\")[:2]))\n", - "\n", - "df[\"date_of_birth\"] = df[\"date_of_birth\"].astype(str).str.strip()\n", - "df[\"soc_sec_id\"] = df[\"soc_sec_id\"].astype(str).str.strip()\n", - "\n", - "df.head(2)" + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" ] - }, + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink.exploratory import completeness_chart\n", + "\n", + "completeness_chart(df, db_api=DuckDBAPI())" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:26.633200Z", + "iopub.status.busy": "2024-06-07T09:11:26.632979Z", + "iopub.status.idle": "2024-06-07T09:11:27.047469Z", + "shell.execute_reply": "2024-06-07T09:11:27.046951Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:50:55.445888Z", - "iopub.status.busy": "2024-05-15T15:50:55.445564Z", - "iopub.status.idle": "2024-05-15T15:50:55.453559Z", - "shell.execute_reply": "2024-05-15T15:50:55.452728Z" - } - }, - "outputs": [], - "source": [ - "df[\"date_of_birth\"] = df[\"date_of_birth\"].astype(str).str.strip()\n", - "df[\"soc_sec_id\"] = df[\"soc_sec_id\"].astype(str).str.strip()" + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" ] - }, + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink.exploratory import profile_columns\n", + "\n", + "profile_columns(df, db_api=DuckDBAPI(), column_expressions=[\"given_name\", \"surname\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:27.050491Z", + "iopub.status.busy": "2024-06-07T09:11:27.050266Z", + "iopub.status.idle": "2024-06-07T09:11:27.428593Z", + "shell.execute_reply": "2024-06-07T09:11:27.428055Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:50:55.457023Z", - "iopub.status.busy": "2024-05-15T15:50:55.456741Z", - "iopub.status.idle": "2024-05-15T15:50:55.464209Z", - "shell.execute_reply": "2024-05-15T15:50:55.463386Z" - } - }, - "outputs": [], - "source": [ - "df[\"date_of_birth\"] = df[\"date_of_birth\"].astype(str).str.strip()\n", - "df[\"soc_sec_id\"] = df[\"soc_sec_id\"].astype(str).str.strip()" + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink import DuckDBAPI, block_on\n", + "from splink.blocking_analysis import (\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + ")\n", + "\n", + "blocking_rules = [\n", + " block_on(\"soc_sec_id\"),\n", + " block_on(\"given_name\"),\n", + " block_on(\"surname\"),\n", + " block_on(\"date_of_birth\"),\n", + " block_on(\"postcode\"),\n", + "]\n", + "\n", + "db_api = DuckDBAPI()\n", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=df,\n", + " blocking_rules=blocking_rules,\n", + " db_api=db_api,\n", + " link_type=\"dedupe_only\",\n", + " unique_id_column_name=\"rec_id\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:27.431702Z", + "iopub.status.busy": "2024-06-07T09:11:27.431466Z", + "iopub.status.idle": "2024-06-07T09:11:27.591229Z", + "shell.execute_reply": "2024-06-07T09:11:27.590491Z" + } + }, + "outputs": [], + "source": [ + "import splink.comparison_library as cl\n", + "import splink.comparison_template_library as ctl\n", + "from splink import Linker\n", + "\n", + "settings = SettingsCreator(\n", + " unique_id_column_name=\"rec_id\",\n", + " link_type=\"dedupe_only\",\n", + " blocking_rules_to_generate_predictions=blocking_rules,\n", + " comparisons=[\n", + " ctl.NameComparison(\"given_name\").configure(term_frequency_adjustments=True),\n", + " ctl.NameComparison(\"surname\").configure(term_frequency_adjustments=True),\n", + " ctl.DateComparison(\n", + " \"date_of_birth\",\n", + " input_is_string=True,\n", + " datetime_format=\"%Y%m%d\",\n", + " invalid_dates_as_null=True,\n", + " datetime_metrics=[\"month\", \"year\", \"year\"],\n", + " datetime_thresholds=[1, 1, 10],\n", + " ),\n", + " cl.LevenshteinAtThresholds(\"soc_sec_id\", 
[2]),\n", + " cl.ExactMatch(\"street_number\").configure(term_frequency_adjustments=True),\n", + " cl.ExactMatch(\"postcode\").configure(term_frequency_adjustments=True),\n", + " ],\n", + " retain_intermediate_calculation_columns=True,\n", + ")\n", + "\n", + "linker = Linker(df, settings, database_api=DuckDBAPI())" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:27.594493Z", + "iopub.status.busy": "2024-06-07T09:11:27.594264Z", + "iopub.status.idle": "2024-06-07T09:11:27.787352Z", + "shell.execute_reply": "2024-06-07T09:11:27.786769Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Probability two random records match is estimated to be 0.000528.\n", + "This means that amongst all possible pairwise record comparisons, one in 1,893.56 are expected to match. With 12,497,500 total possible comparisons, we expect a total of around 6,600.00 matching pairs\n" + ] + } + ], + "source": [ + "from splink import block_on\n", + "\n", + "deterministic_rules = [\n", + " block_on(\"soc_sec_id\"),\n", + " block_on(\"given_name\", \"surname\", \"date_of_birth\"),\n", + " \"l.given_name = r.surname and l.surname = r.given_name and l.date_of_birth = r.date_of_birth\",\n", + "]\n", + "\n", + "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.9)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:27.790368Z", + "iopub.status.busy": "2024-06-07T09:11:27.790145Z", + "iopub.status.idle": "2024-06-07T09:11:35.433199Z", + "shell.execute_reply": "2024-06-07T09:11:35.431006Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. 
Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n" + ] }, { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:50:55.467779Z", - "iopub.status.busy": "2024-05-15T15:50:55.467486Z", - "iopub.status.idle": "2024-05-15T15:50:55.617978Z", - "shell.execute_reply": "2024-05-15T15:50:55.617331Z" - } - }, - "outputs": [], - "source": [ - "from splink import DuckDBAPI, Linker, SettingsCreator\n", - "\n", - "# TODO: Allow missingness to be analysed without a linker\n", - "settings = SettingsCreator(\n", - " unique_id_column_name=\"rec_id\",\n", - " link_type=\"dedupe_only\",\n", - ")\n", - "\n", - "linker = Linker(df, settings, database_api=DuckDBAPI())" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n" + ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It's usually a good idea to perform exploratory analysis on your data so you understand what's in each column and how often it's missing:\n" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). 
This usually means the comparison level was never observed in the training data.\n" + ] }, { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:50:55.621604Z", - "iopub.status.busy": "2024-05-15T15:50:55.621314Z", - "iopub.status.idle": "2024-05-15T15:50:55.930689Z", - "shell.execute_reply": "2024-05-15T15:50:55.929809Z" - } - }, - "outputs": [], - "source": [ - "from splink.exploratory import completeness_chart\n", - "\n", - "completeness_chart(df, db_api=DuckDBAPI())" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n" + ] }, { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:50:55.933815Z", - "iopub.status.busy": "2024-05-15T15:50:55.933588Z", - "iopub.status.idle": "2024-05-15T15:50:56.393881Z", - "shell.execute_reply": "2024-05-15T15:50:56.393363Z" - } - }, - "outputs": [], - "source": [ - "from splink.exploratory import profile_columns\n", - "\n", - "profile_columns(df, db_api=DuckDBAPI(), column_expressions=[\"given_name\", \"surname\"])" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). 
This usually means the comparison level was never observed in the training data.\n" + ] }, { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:50:56.397337Z", - "iopub.status.busy": "2024-05-15T15:50:56.396993Z", - "iopub.status.idle": "2024-05-15T15:50:56.749566Z", - "shell.execute_reply": "2024-05-15T15:50:56.748922Z" - } - }, - "outputs": [], - "source": [ - "from splink import DuckDBAPI, block_on\n", - "from splink.blocking_analysis import (\n", - " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", - ")\n", - "\n", - "blocking_rules = [\n", - " block_on(\"soc_sec_id\"),\n", - " block_on(\"given_name\"),\n", - " block_on(\"surname\"),\n", - " block_on(\"date_of_birth\"),\n", - " block_on(\"postcode\"),\n", - "]\n", - "\n", - "db_api = DuckDBAPI()\n", - "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", - " table_or_tables=df,\n", - " blocking_rules=blocking_rules,\n", - " db_api=db_api,\n", - " link_type=\"dedupe_only\",\n", - " unique_id_column_name=\"rec_id\",\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n" + ] }, { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:50:56.752854Z", - "iopub.status.busy": "2024-05-15T15:50:56.752596Z", - "iopub.status.idle": "2024-05-15T15:50:56.907514Z", - "shell.execute_reply": "2024-05-15T15:50:56.906772Z" - } - }, - "outputs": [], - "source": [ - "import splink.comparison_library as cl\n", - "import splink.comparison_template_library as ctl\n", - "from splink import Linker\n", - "\n", - "settings = SettingsCreator(\n", - " unique_id_column_name=\"rec_id\",\n", - " link_type=\"dedupe_only\",\n", - " blocking_rules_to_generate_predictions=blocking_rules,\n", - " comparisons=[\n", - " ctl.NameComparison(\"given_name\").configure(term_frequency_adjustments=True),\n", - " 
ctl.NameComparison(\"surname\").configure(term_frequency_adjustments=True),\n", - " ctl.DateComparison(\n", - " \"date_of_birth\",\n", - " input_is_string=True,\n", - " datetime_format=\"%Y%m%d\",\n", - " invalid_dates_as_null=True,\n", - " datetime_metrics=[\"month\", \"year\", \"year\"],\n", - " datetime_thresholds=[1, 1, 10],\n", - " ),\n", - " cl.LevenshteinAtThresholds(\"soc_sec_id\", [2]),\n", - " cl.ExactMatch(\"street_number\").configure(term_frequency_adjustments=True),\n", - " cl.ExactMatch(\"postcode\").configure(term_frequency_adjustments=True),\n", - " ],\n", - " retain_intermediate_calculation_columns=True,\n", - ")\n", - "\n", - "linker = Linker(df, settings, database_api=DuckDBAPI())" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - given_name (no m values are trained).\n", + " - surname (no m values are trained).\n", + " - date_of_birth (some u values are not trained, no m values are trained).\n", + " - soc_sec_id (no m values are trained).\n", + " - street_number (no m values are trained).\n", + " - postcode (no m values are trained).\n" + ] + } + ], + "source": [ + "linker.training.estimate_u_using_random_sampling(max_pairs=1e6)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:35.446472Z", + "iopub.status.busy": "2024-06-07T09:11:35.440198Z", + "iopub.status.idle": "2024-06-07T09:11:36.895235Z", + "shell.execute_reply": "2024-06-07T09:11:36.894603Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] }, { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:50:56.910709Z", - "iopub.status.busy": "2024-05-15T15:50:56.910470Z", - "iopub.status.idle": "2024-05-15T15:50:57.119744Z", - 
"shell.execute_reply": "2024-05-15T15:50:57.119133Z" - } - }, - "outputs": [], - "source": [ - "from splink import block_on\n", - "\n", - "deterministic_rules = [\n", - " block_on(\"soc_sec_id\"),\n", - " block_on(\"given_name\", \"surname\", \"date_of_birth\"),\n", - " \"l.given_name = r.surname and l.surname = r.given_name and l.date_of_birth = r.date_of_birth\",\n", - "]\n", - "\n", - "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.9)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"date_of_birth\" = r.\"date_of_birth\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - given_name\n", + " - surname\n", + " - soc_sec_id\n", + " - street_number\n", + " - postcode\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - date_of_birth\n" + ] }, { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:50:57.122905Z", - "iopub.status.busy": "2024-05-15T15:50:57.122623Z", - "iopub.status.idle": "2024-05-15T15:51:01.161828Z", - "shell.execute_reply": "2024-05-15T15:51:01.161251Z" - } - }, - "outputs": [], - "source": [ - "linker.training.estimate_u_using_random_sampling(max_pairs=1e6)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] }, { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:51:01.165539Z", - "iopub.status.busy": "2024-05-15T15:51:01.165298Z", - "iopub.status.idle": "2024-05-15T15:51:01.704281Z", - "shell.execute_reply": "2024-05-15T15:51:01.703690Z" - } - }, - "outputs": [], - "source": [ - "em_blocking_rule_1 = block_on(\"date_of_birth\")\n", - "session_dob = linker.training.estimate_parameters_using_expectation_maximisation(\n", - " 
em_blocking_rule_1\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.376 in the m_probability of surname, level `Exact match on surname`\n" + ] }, { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:51:01.707325Z", - "iopub.status.busy": "2024-05-15T15:51:01.707114Z", - "iopub.status.idle": "2024-05-15T15:51:02.290513Z", - "shell.execute_reply": "2024-05-15T15:51:02.290020Z" - } - }, - "outputs": [], - "source": [ - "em_blocking_rule_2 = block_on(\"postcode\")\n", - "session_postcode = linker.training.estimate_parameters_using_expectation_maximisation(\n", - " em_blocking_rule_2\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was 0.0158 in the m_probability of given_name, level `All other comparisons`\n" + ] }, { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:51:02.294783Z", - "iopub.status.busy": "2024-05-15T15:51:02.294498Z", - "iopub.status.idle": "2024-05-15T15:51:02.665651Z", - "shell.execute_reply": "2024-05-15T15:51:02.665073Z" - } - }, - "outputs": [], - "source": [ - "linker.visualisations.match_weights_chart()" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was -0.000688 in the m_probability of postcode, level `Exact match on postcode`\n" + ] }, { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:51:02.668752Z", - "iopub.status.busy": "2024-05-15T15:51:02.668512Z", - "iopub.status.idle": "2024-05-15T15:51:09.240685Z", - "shell.execute_reply": "2024-05-15T15:51:09.240109Z" - } - }, - "outputs": [], - "source": [ - "results = linker.inference.predict(threshold_match_probability=0.2)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + 
"Iteration 4: Largest change in params was 3.65e-05 in the m_probability of postcode, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 4 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - date_of_birth (some u values are not trained, no m values are trained).\n" + ] + } + ], + "source": [ + "em_blocking_rule_1 = block_on(\"date_of_birth\")\n", + "session_dob = linker.training.estimate_parameters_using_expectation_maximisation(\n", + " em_blocking_rule_1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:36.898638Z", + "iopub.status.busy": "2024-06-07T09:11:36.898156Z", + "iopub.status.idle": "2024-06-07T09:11:37.517318Z", + "shell.execute_reply": "2024-06-07T09:11:37.516459Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"postcode\" = r.\"postcode\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - given_name\n", + " - surname\n", + " - date_of_birth\n", + " - soc_sec_id\n", + " - street_number\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - postcode\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:\n", + "Level Abs difference of 'transformed date_of_birth <= 1 month' on comparison date_of_birth not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "WARNING:\n", + "Level Abs difference of 'transformed date_of_birth <= 1 year' on comparison date_of_birth not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:\n", + "Level Abs difference of 'transformed date_of_birth <= 10 year' on comparison date_of_birth not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was 0.0627 in probability_two_random_records_match\n" + ] }, { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:51:09.243955Z", - "iopub.status.busy": "2024-05-15T15:51:09.243667Z", - "iopub.status.idle": "2024-05-15T15:51:11.811265Z", - "shell.execute_reply": "2024-05-15T15:51:11.810638Z" - } - }, - "outputs": [], - "source": [ - "linker.accuracy_analysis_from_labels_column(\n", - " \"cluster\", match_weight_round_to_nearest=0.1, output_type=\"roc\"\n", - ")" + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was -0.00188 in the m_probability of date_of_birth, level `Exact match on date_of_birth`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 5.26e-05 in the m_probability of soc_sec_id, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 3 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). 
This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - date_of_birth (some u values are not trained, some m values are not trained).\n" + ] + } + ], + "source": [ + "em_blocking_rule_2 = block_on(\"postcode\")\n", + "session_postcode = linker.training.estimate_parameters_using_expectation_maximisation(\n", + " em_blocking_rule_2\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:37.523135Z", + "iopub.status.busy": "2024-06-07T09:11:37.522810Z", + "iopub.status.idle": "2024-06-07T09:11:37.957335Z", + "shell.execute_reply": "2024-06-07T09:11:37.956712Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.match_weights_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:37.960629Z", + "iopub.status.busy": "2024-06-07T09:11:37.960358Z", + "iopub.status.idle": "2024-06-07T09:11:44.496784Z", + "shell.execute_reply": "2024-06-07T09:11:44.496254Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " -- WARNING --\n", + "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n", + "Comparison: 'date_of_birth':\n", + " m values not fully trained\n", + "Comparison: 'date_of_birth':\n", + " u values not fully trained\n" + ] + } + ], + "source": [ + "results = linker.inference.predict(threshold_match_probability=0.2)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:44.499943Z", + "iopub.status.busy": "2024-06-07T09:11:44.499693Z", + "iopub.status.idle": "2024-06-07T09:11:47.310831Z", + "shell.execute_reply": "2024-06-07T09:11:47.310208Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " -- WARNING --\n", + "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. 
To produce predictions the following untrained trained parameters will use default values.\n", + "Comparison: 'date_of_birth':\n", + " m values not fully trained\n", + "Comparison: 'date_of_birth':\n", + " u values not fully trained\n" + ] }, { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:51:11.820946Z", - "iopub.status.busy": "2024-05-15T15:51:11.820644Z", - "iopub.status.idle": "2024-05-15T15:51:12.084284Z", - "shell.execute_reply": "2024-05-15T15:51:12.083443Z" - } - }, - "outputs": [], - "source": [ - "pred_errors_df = linker.inference.prediction_errors_from_labels_column(\n", - " \"cluster\"\n", - ").as_pandas_dataframe()\n", - "len(pred_errors_df)\n", - "pred_errors_df.head()" + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.evaluation.accuracy_analysis_from_labels_column(\n", + " \"cluster\", match_weight_round_to_nearest=0.1, output_type=\"roc\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:47.319625Z", + "iopub.status.busy": "2024-06-07T09:11:47.319347Z", + "iopub.status.idle": "2024-06-07T09:11:47.588558Z", + "shell.execute_reply": "2024-06-07T09:11:47.587940Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " -- WARNING --\n", + "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n", + "Comparison: 'date_of_birth':\n", + " m values not fully trained\n", + "Comparison: 'date_of_birth':\n", + " u values not fully trained\n" + ] }, { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:51:12.087291Z", - "iopub.status.busy": "2024-05-15T15:51:12.087021Z", - "iopub.status.idle": "2024-05-15T15:51:13.092062Z", - "shell.execute_reply": "2024-05-15T15:51:13.091503Z" - } - }, - "outputs": [], - "source": [ - "records = linker.inference.prediction_errors_from_labels_column(\"cluster\").as_record_dict(\n", - " limit=10\n", - ")\n", - "linker.visualisations.waterfall_chart(records)" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
clerical_match_scorefound_by_blocking_rulesmatch_weightmatch_probabilityrec_id_lrec_id_rgiven_name_lgiven_name_rgamma_given_nametf_given_name_l...postcode_lpostcode_rgamma_postcodetf_postcode_ltf_postcode_rbf_postcodebf_tf_adj_postcodecluster_lcluster_rmatch_key
01.0False-27.4482155.460897e-09rec-993-dup-1rec-993-dup-3westbrookjake00.0004...2704207400.00020.00140.2300371.0rec-993rec-9935
11.0False-27.4482155.460897e-09rec-829-dup-0rec-829-dup-2wildekyra00.0002...3859359500.00040.00060.2300371.0rec-829rec-8295
21.0False-19.3621991.483873e-06rec-829-dup-0rec-829-dup-1wildekyra00.0002...3859388900.00040.00020.2300371.0rec-829rec-8295
31.0True-15.2331222.596344e-05rec-721-dup-0rec-721-dup-1mikhailielly00.0008...4806486000.00080.00140.2300371.0rec-721rec-7212
41.0True-12.6003281.610102e-04rec-401-dup-1rec-401-dup-3whitbealexa-ose00.0002...3040304100.00200.00040.2300371.0rec-401rec-4010
\n", + "

5 rows × 45 columns

\n", + "
" + ], + "text/plain": [ + " clerical_match_score found_by_blocking_rules match_weight \\\n", + "0 1.0 False -27.448215 \n", + "1 1.0 False -27.448215 \n", + "2 1.0 False -19.362199 \n", + "3 1.0 True -15.233122 \n", + "4 1.0 True -12.600328 \n", + "\n", + " match_probability rec_id_l rec_id_r given_name_l given_name_r \\\n", + "0 5.460897e-09 rec-993-dup-1 rec-993-dup-3 westbrook jake \n", + "1 5.460897e-09 rec-829-dup-0 rec-829-dup-2 wilde kyra \n", + "2 1.483873e-06 rec-829-dup-0 rec-829-dup-1 wilde kyra \n", + "3 2.596344e-05 rec-721-dup-0 rec-721-dup-1 mikhaili elly \n", + "4 1.610102e-04 rec-401-dup-1 rec-401-dup-3 whitbe alexa-ose \n", + "\n", + " gamma_given_name tf_given_name_l ... postcode_l postcode_r \\\n", + "0 0 0.0004 ... 2704 2074 \n", + "1 0 0.0002 ... 3859 3595 \n", + "2 0 0.0002 ... 3859 3889 \n", + "3 0 0.0008 ... 4806 4860 \n", + "4 0 0.0002 ... 3040 3041 \n", + "\n", + " gamma_postcode tf_postcode_l tf_postcode_r bf_postcode \\\n", + "0 0 0.0002 0.0014 0.230037 \n", + "1 0 0.0004 0.0006 0.230037 \n", + "2 0 0.0004 0.0002 0.230037 \n", + "3 0 0.0008 0.0014 0.230037 \n", + "4 0 0.0020 0.0004 0.230037 \n", + "\n", + " bf_tf_adj_postcode cluster_l cluster_r match_key \n", + "0 1.0 rec-993 rec-993 5 \n", + "1 1.0 rec-829 rec-829 5 \n", + "2 1.0 rec-829 rec-829 5 \n", + "3 1.0 rec-721 rec-721 2 \n", + "4 1.0 rec-401 rec-401 0 \n", + "\n", + "[5 rows x 45 columns]" ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": { - "94aaeff2f888492ea321d4e4492526ff": { - "model_module": 
"@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_bdf3a462cd3d48bda4269ac1cc8ed9ef", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_e05a7090510949ac956ea05719a3b8c2", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "b179423ef9d24cb1ac973b4b55daa86c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - }, - 
"bdf3a462cd3d48bda4269ac1cc8ed9ef": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - }, - "db3fd6bdb9884f5a88fd4cf5d39330d4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - }, - "e05a7090510949ac956ea05719a3b8c2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - 
"_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - }, - "e181cb7618b74e4bbf9f2e144b68b87e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_b179423ef9d24cb1ac973b4b55daa86c", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_db3fd6bdb9884f5a88fd4cf5d39330d4", - "tabbable": null, - "tooltip": null, - "value": 100 - } - } - }, - "version_major": 2, - "version_minor": 0 - } + ], + "source": [ + "pred_errors_df = linker.evaluation.prediction_errors_from_labels_column(\n", + " \"cluster\"\n", + ").as_pandas_dataframe()\n", + "len(pred_errors_df)\n", + "pred_errors_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:11:47.591674Z", + "iopub.status.busy": "2024-06-07T09:11:47.591437Z", + "iopub.status.idle": "2024-06-07T09:11:48.630581Z", + "shell.execute_reply": "2024-06-07T09:11:48.629955Z" } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " -- WARNING --\n", + "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. 
To produce predictions the following untrained trained parameters will use default values.\n", + "Comparison: 'date_of_birth':\n", + " m values not fully trained\n", + "Comparison: 'date_of_birth':\n", + " u values not fully trained\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "records = linker.evaluation.prediction_errors_from_labels_column(\"cluster\").as_record_dict(\n", + " limit=10\n", + ")\n", + "linker.visualisations.waterfall_chart(records)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "94aaeff2f888492ea321d4e4492526ff": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_bdf3a462cd3d48bda4269ac1cc8ed9ef", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e05a7090510949ac956ea05719a3b8c2", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "b179423ef9d24cb1ac973b4b55daa86c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + }, + "bdf3a462cd3d48bda4269ac1cc8ed9ef": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": 
null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + }, + "db3fd6bdb9884f5a88fd4cf5d39330d4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "e05a7090510949ac956ea05719a3b8c2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "e181cb7618b74e4bbf9f2e144b68b87e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_b179423ef9d24cb1ac973b4b55daa86c", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_db3fd6bdb9884f5a88fd4cf5d39330d4", + "tabbable": null, + "tooltip": null, + "value": 100 + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + 
"nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/demos/examples/duckdb/febrl4.ipynb b/docs/demos/examples/duckdb/febrl4.ipynb index 1a162201ff..a7cf929e46 100644 --- a/docs/demos/examples/duckdb/febrl4.ipynb +++ b/docs/demos/examples/duckdb/febrl4.ipynb @@ -30,17 +30,17 @@ "id": "9c2be649", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:56:42.115992Z", - "iopub.status.busy": "2024-05-15T15:56:42.115623Z", - "iopub.status.idle": "2024-05-15T15:56:42.138818Z", - "shell.execute_reply": "2024-05-15T15:56:42.137554Z" + "iopub.execute_input": "2024-06-07T09:16:39.973571Z", + "iopub.status.busy": "2024-06-07T09:16:39.973235Z", + "iopub.status.idle": "2024-06-07T09:16:39.993885Z", + "shell.execute_reply": "2024-06-07T09:16:39.992799Z" } }, + "outputs": [], "source": [ "# Uncomment and run this cell if you're running in Google Colab.\n", "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -64,12 +64,184 @@ "id": "832113c9-13b2-43b7-86d0-6051a9db79e8", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:56:42.144735Z", - "iopub.status.busy": "2024-05-15T15:56:42.144299Z", - "iopub.status.idle": "2024-05-15T15:56:44.123585Z", - "shell.execute_reply": "2024-05-15T15:56:44.122726Z" + "iopub.execute_input": "2024-06-07T09:16:39.999281Z", + "iopub.status.busy": "2024-06-07T09:16:39.998928Z", + "iopub.status.idle": "2024-06-07T09:16:41.957056Z", + "shell.execute_reply": "2024-06-07T09:16:41.956423Z" } }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rec_idgiven_namesurnamestreet_numberaddress_1address_2suburbpostcodestatedate_of_birthsoc_sec_idcluster
0rec-1070-orgmichaelaneumann8stanley streetmiamiwinston hills4223nsw191511115304218rec-1070
1rec-1016-orgcourtneypainter12pinkerton circuitbega flatsrichlands4560vic191612144066625rec-1016
\n", + "
" + ], + "text/plain": [ + " rec_id given_name surname street_number address_1 \\\n", + "0 rec-1070-org michaela neumann 8 stanley street \n", + "1 rec-1016-org courtney painter 12 pinkerton circuit \n", + "\n", + " address_2 suburb postcode state date_of_birth soc_sec_id \\\n", + "0 miami winston hills 4223 nsw 19151111 5304218 \n", + "1 bega flats richlands 4560 vic 19161214 4066625 \n", + "\n", + " cluster \n", + "0 rec-1070 \n", + "1 rec-1016 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rec_idgiven_namesurnamestreet_numberaddress_1address_2suburbpostcodestatedate_of_birthsoc_sec_idcluster
0rec-561-dup-0elton3light setreetpinehillwindermere3212vic196510131551941rec-561
1rec-2642-dup-0mitchellmaxon47edkins streetlochaoairnorth ryde3355nsw193902128859999rec-2642
\n", + "
" + ], + "text/plain": [ + " rec_id given_name surname street_number address_1 \\\n", + "0 rec-561-dup-0 elton 3 light setreet \n", + "1 rec-2642-dup-0 mitchell maxon 47 edkins street \n", + "\n", + " address_2 suburb postcode state date_of_birth soc_sec_id cluster \n", + "0 pinehill windermere 3212 vic 19651013 1551941 rec-561 \n", + "1 lochaoair north ryde 3355 nsw 19390212 8859999 rec-2642 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "from splink import splink_datasets\n", "\n", @@ -90,8 +262,7 @@ "\n", "display(dfs[0].head(2))\n", "display(dfs[1].head(2))" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -107,12 +278,13 @@ "id": "3233c3e1-3e6b-4abc-8bed-c26e8b463c2a", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:56:44.128064Z", - "iopub.status.busy": "2024-05-15T15:56:44.127470Z", - "iopub.status.idle": "2024-05-15T15:56:44.412449Z", - "shell.execute_reply": "2024-05-15T15:56:44.410927Z" + "iopub.execute_input": "2024-06-07T09:16:41.960684Z", + "iopub.status.busy": "2024-06-07T09:16:41.960330Z", + "iopub.status.idle": "2024-06-07T09:16:42.175342Z", + "shell.execute_reply": "2024-06-07T09:16:42.174611Z" } }, + "outputs": [], "source": [ "from splink import DuckDBAPI, Linker, SettingsCreator\n", "\n", @@ -127,8 +299,7 @@ ")\n", "\n", "linker = Linker(dfs, basic_settings, database_api=DuckDBAPI())" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -144,18 +315,97 @@ "id": "319ffdbc-7853-40a9-b331-e635d96b6fdc", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:56:44.418048Z", - "iopub.status.busy": "2024-05-15T15:56:44.417174Z", - "iopub.status.idle": "2024-05-15T15:56:45.018140Z", - "shell.execute_reply": "2024-05-15T15:56:45.017233Z" + "iopub.execute_input": "2024-06-07T09:16:42.178669Z", + "iopub.status.busy": "2024-06-07T09:16:42.178397Z", + "iopub.status.idle": "2024-06-07T09:16:42.558301Z", + "shell.execute_reply": "2024-06-07T09:16:42.557736Z" } }, + "outputs": 
[ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from splink.exploratory import completeness_chart\n", "\n", "completeness_chart(dfs, db_api=DuckDBAPI())" - ], - "outputs": [] + ] }, { "cell_type": "code", @@ -163,18 +413,97 @@ "id": "dff8dfca-57c8-42bf-878c-da9dd23d2682", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:56:45.022368Z", - "iopub.status.busy": "2024-05-15T15:56:45.021805Z", - "iopub.status.idle": "2024-05-15T15:56:45.760354Z", - "shell.execute_reply": "2024-05-15T15:56:45.759671Z" + "iopub.execute_input": "2024-06-07T09:16:42.561536Z", + "iopub.status.busy": "2024-06-07T09:16:42.561314Z", + "iopub.status.idle": "2024-06-07T09:16:43.066015Z", + "shell.execute_reply": "2024-06-07T09:16:43.065469Z" } }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from splink.exploratory import profile_columns\n", "\n", "profile_columns(dfs, db_api=DuckDBAPI(), column_expressions=[\"given_name\", \"surname\"])" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -192,12 +521,92 @@ "id": "e745280e-fe2f-4563-bd7e-6e4c70d0c9de", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:56:45.764541Z", - "iopub.status.busy": "2024-05-15T15:56:45.764220Z", - "iopub.status.idle": "2024-05-15T15:56:46.595508Z", - "shell.execute_reply": "2024-05-15T15:56:46.594573Z" + "iopub.execute_input": "2024-06-07T09:16:43.069224Z", + "iopub.status.busy": "2024-06-07T09:16:43.068982Z", + "iopub.status.idle": "2024-06-07T09:16:43.684745Z", + "shell.execute_reply": "2024-06-07T09:16:43.684041Z" } }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from splink import DuckDBAPI, block_on\n", "from splink.blocking_analysis import (\n", @@ -225,8 +634,7 @@ " unique_id_column_name=\"rec_id\",\n", " source_dataset_column_name=\"source_dataset\",\n", ")" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -253,12 +661,13 @@ "id": "f6360b69-2d52-4f1a-9199-2edf2339ec63", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:56:46.600071Z", - "iopub.status.busy": "2024-05-15T15:56:46.599766Z", - "iopub.status.idle": "2024-05-15T15:56:47.112399Z", - "shell.execute_reply": "2024-05-15T15:56:47.111220Z" + "iopub.execute_input": "2024-06-07T09:16:43.687914Z", + "iopub.status.busy": "2024-06-07T09:16:43.687640Z", + "iopub.status.idle": "2024-06-07T09:16:44.021204Z", + "shell.execute_reply": "2024-06-07T09:16:44.020435Z" } }, + "outputs": [], "source": [ "import splink.comparison_level_library as cll\n", "import splink.comparison_library as cl\n", @@ -307,8 +716,7 @@ "\n", "linker_simple = Linker(dfs, simple_model_settings, database_api=DuckDBAPI())\n", "linker_detailed = Linker(dfs, detailed_model_settings, database_api=DuckDBAPI())" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -342,23 +750,32 @@ "id": "7ad48419-4eda-4fe5-b00f-2ec9f798e0e8", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:56:47.118143Z", - "iopub.status.busy": "2024-05-15T15:56:47.117804Z", - "iopub.status.idle": "2024-05-15T15:56:47.491169Z", - "shell.execute_reply": "2024-05-15T15:56:47.489974Z" + "iopub.execute_input": "2024-06-07T09:16:44.024887Z", + "iopub.status.busy": "2024-06-07T09:16:44.024650Z", + "iopub.status.idle": "2024-06-07T09:16:44.225016Z", + "shell.execute_reply": "2024-06-07T09:16:44.224395Z" } }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Probability two random records match is estimated 
to be 0.000239.\n", + "This means that amongst all possible pairwise record comparisons, one in 4,185.85 are expected to match. With 25,000,000 total possible comparisons, we expect a total of around 5,972.50 matching pairs\n" + ] + } + ], "source": [ "deterministic_rules = [\n", " block_on(\"soc_sec_id\"),\n", " block_on(\"given_name\", \"surname\", \"date_of_birth\"),\n", "]\n", "\n", - "linker_detailed.estimate_probability_two_random_records_match(\n", + "linker_detailed.training.estimate_probability_two_random_records_match(\n", " deterministic_rules, recall=0.8\n", ")" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -382,18 +799,76 @@ "id": "e40ee288-0c42-4cda-aaf1-3ffb2ea02383", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:56:47.497349Z", - "iopub.status.busy": "2024-05-15T15:56:47.496965Z", - "iopub.status.idle": "2024-05-15T15:56:59.095072Z", - "shell.execute_reply": "2024-05-15T15:56:59.094337Z" + "iopub.execute_input": "2024-06-07T09:16:44.228813Z", + "iopub.status.busy": "2024-06-07T09:16:44.228526Z", + "iopub.status.idle": "2024-06-07T09:16:50.708588Z", + "shell.execute_reply": "2024-06-07T09:16:50.707955Z" } }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). 
This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - given_name (no m values are trained).\n", + " - surname (no m values are trained).\n", + " - date_of_birth (some u values are not trained, no m values are trained).\n", + " - soc_sec_id (no m values are trained).\n", + " - street_number (no m values are trained).\n", + " - postcode (no m values are trained).\n" + ] + } + ], "source": [ "# We generally recommend setting max pairs higher (e.g. 
1e7 or more)\n", "# But this will run faster for the purpose of this demo\n", - "linker_detailed.estimate_u_using_random_sampling(max_pairs=1e6)" - ], - "outputs": [] + "linker_detailed.training.estimate_u_using_random_sampling(max_pairs=1e6)" + ] }, { "cell_type": "markdown", @@ -411,21 +886,214 @@ "id": "9ee0f49b-084c-45aa-8c6b-ec5da11c2cc4", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:56:59.100504Z", - "iopub.status.busy": "2024-05-15T15:56:59.100174Z", - "iopub.status.idle": "2024-05-15T15:57:01.059609Z", - "shell.execute_reply": "2024-05-15T15:57:01.058521Z" + "iopub.execute_input": "2024-06-07T09:16:50.712950Z", + "iopub.status.busy": "2024-06-07T09:16:50.712681Z", + "iopub.status.idle": "2024-06-07T09:16:52.276811Z", + "shell.execute_reply": "2024-06-07T09:16:52.276216Z" } }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"date_of_birth\" = r.\"date_of_birth\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - given_name\n", + " - surname\n", + " - soc_sec_id\n", + " - street_number\n", + " - postcode\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - date_of_birth\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.316 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was 0.00365 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"Iteration 3: Largest change in params was 8.84e-05 in the m_probability of soc_sec_id, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 3 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - date_of_birth (some u values are not trained, no m values are trained).\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"postcode\" = r.\"postcode\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - given_name\n", + " - surname\n", + " - date_of_birth\n", + " - soc_sec_id\n", + " - street_number\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - postcode\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:\n", + "Level Abs difference of 'transformed date_of_birth <= 1 month' on comparison date_of_birth not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:\n", + "Level Abs difference of 'transformed date_of_birth <= 1 year' on comparison date_of_birth not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:\n", + "Level Abs difference of 'transformed date_of_birth <= 10 year' on comparison date_of_birth not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"Iteration 1: Largest change in params was 0.0374 in the m_probability of date_of_birth, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was 0.000489 in the m_probability of date_of_birth, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 9.4e-06 in the m_probability of soc_sec_id, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 3 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - date_of_birth (some u values are not trained, some m values are not trained).\n" + ] + } + ], "source": [ - "session_dob = linker_detailed.estimate_parameters_using_expectation_maximisation(\n", + "session_dob = linker_detailed.training.estimate_parameters_using_expectation_maximisation(\n", " block_on(\"date_of_birth\"), estimate_without_term_frequencies=True\n", ")\n", - "session_pc = linker_detailed.estimate_parameters_using_expectation_maximisation(\n", + "session_pc = linker_detailed.training.estimate_parameters_using_expectation_maximisation(\n", " block_on(\"postcode\"), estimate_without_term_frequencies=True\n", ")" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -441,16 +1109,95 @@ "id": "31ef6844-6be8-4f01-9ff7-5dfebcf12ae1", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:57:01.065654Z", - "iopub.status.busy": "2024-05-15T15:57:01.065325Z", - "iopub.status.idle": "2024-05-15T15:57:01.389061Z", - "shell.execute_reply": "2024-05-15T15:57:01.388339Z" + "iopub.execute_input": "2024-06-07T09:16:52.281563Z", + "iopub.status.busy": "2024-06-07T09:16:52.281303Z", + "iopub.status.idle": "2024-06-07T09:16:52.513958Z", + "shell.execute_reply": "2024-06-07T09:16:52.513314Z" } }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "session_dob.m_u_values_interactive_history_chart()" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -466,16 +1213,95 @@ "id": "8d260a60-a4fa-4c0d-9853-8b8256a24257", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:57:01.393145Z", - "iopub.status.busy": "2024-05-15T15:57:01.392842Z", - "iopub.status.idle": "2024-05-15T15:57:01.561233Z", - "shell.execute_reply": "2024-05-15T15:57:01.560475Z" + "iopub.execute_input": "2024-06-07T09:16:52.517168Z", + "iopub.status.busy": "2024-06-07T09:16:52.516948Z", + "iopub.status.idle": "2024-06-07T09:16:52.637604Z", + "shell.execute_reply": "2024-06-07T09:16:52.636662Z" } }, - "source": [ - "linker_detailed.parameter_estimate_comparisons_chart()" + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } ], - "outputs": [] + "source": [ + "linker_detailed.visualisations.parameter_estimate_comparisons_chart()" + ] }, { "cell_type": "markdown", @@ -491,26 +1317,577 @@ "id": "71f2f166-05cd-4038-a289-a053a1f0b5c5", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:57:01.565611Z", - "iopub.status.busy": "2024-05-15T15:57:01.565220Z", - "iopub.status.idle": "2024-05-15T15:57:04.177024Z", - "shell.execute_reply": "2024-05-15T15:57:04.176371Z" + "iopub.execute_input": "2024-06-07T09:16:52.640970Z", + "iopub.status.busy": "2024-06-07T09:16:52.640725Z", + "iopub.status.idle": "2024-06-07T09:16:54.701590Z", + "shell.execute_reply": "2024-06-07T09:16:54.701058Z" } }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Probability two random records match is estimated to be 0.000239.\n", + "This means that amongst all possible pairwise record comparisons, one in 4,185.85 are expected to match. With 25,000,000 total possible comparisons, we expect a total of around 5,972.50 matching pairs\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - given_name (no m values are trained).\n", + " - surname (no m values are trained).\n", + " - street_number (no m values are trained).\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"given_name\" = r.\"given_name\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - surname\n", + " - street_number\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - given_name\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.0816 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was 0.0263 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was -0.0249 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was 0.0229 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 5: Largest change in params was -0.02 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 6: Largest change in params was -0.0165 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + 
"name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 7: Largest change in params was -0.0132 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 8: Largest change in params was 0.0102 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 9: Largest change in params was -0.00772 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 10: Largest change in params was 0.00577 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 11: Largest change in params was -0.00428 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 12: Largest change in params was 0.00316 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 13: Largest change in params was -0.00233 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 14: Largest change in params was -0.00172 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 15: Largest change in params was 0.00127 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 16: Largest change in params was 0.000936 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 17: 
Largest change in params was -0.000691 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 18: Largest change in params was -0.000511 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 19: Largest change in params was 0.000378 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 20: Largest change in params was -0.00028 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 21: Largest change in params was 0.000208 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 22: Largest change in params was -0.000154 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 23: Largest change in params was 0.000114 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 24: Largest change in params was -8.48e-05 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 24 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - given_name (no m values are trained).\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"street_number\" = r.\"street_number\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - given_name\n", + " - surname\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - street_number\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was 0.0445 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was 0.0288 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was -0.0278 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was 0.0269 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 5: Largest change in params was -0.0245 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 6: Largest change in params was 0.0209 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 7: 
Largest change in params was -0.0169 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 8: Largest change in params was 0.0132 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 9: Largest change in params was -0.00995 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 10: Largest change in params was 0.00738 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 11: Largest change in params was -0.00541 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 12: Largest change in params was -0.00396 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 13: Largest change in params was -0.0029 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 14: Largest change in params was 0.00213 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 15: Largest change in params was -0.00158 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 16: Largest change in params was 0.00118 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 17: Largest change in params 
was -0.000894 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 18: Largest change in params was 0.000683 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 19: Largest change in params was -0.000561 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 20: Largest change in params was 0.000469 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 21: Largest change in params was -0.000389 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 22: Largest change in params was -0.000321 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 23: Largest change in params was 0.000264 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 24: Largest change in params was 0.000217 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 25: Largest change in params was 0.000177 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 25 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is fully trained. 
All comparisons have at least one estimate for their m and u values\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "linker_simple.estimate_probability_two_random_records_match(\n", + "linker_simple.training.estimate_probability_two_random_records_match(\n", " deterministic_rules, recall=0.8\n", ")\n", - "linker_simple.estimate_u_using_random_sampling(max_pairs=1e7)\n", - "session_ssid = linker_simple.estimate_parameters_using_expectation_maximisation(\n", + "linker_simple.training.estimate_u_using_random_sampling(max_pairs=1e7)\n", + "session_ssid = linker_simple.training.estimate_parameters_using_expectation_maximisation(\n", " block_on(\"given_name\"), estimate_without_term_frequencies=True\n", ")\n", - "session_pc = linker_simple.estimate_parameters_using_expectation_maximisation(\n", + "session_pc = linker_simple.training.estimate_parameters_using_expectation_maximisation(\n", " block_on(\"street_number\"), estimate_without_term_frequencies=True\n", ")\n", - "linker_simple.parameter_estimate_comparisons_chart()" - ], - "outputs": [] + "linker_simple.visualisations.parameter_estimate_comparisons_chart()" + ] }, { "cell_type": "code", @@ -518,12 +1895,13 @@ "id": "3a87cb78-0e97-40a3-b757-6c99bb19d7b1", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:57:04.180496Z", - "iopub.status.busy": "2024-05-15T15:57:04.180247Z", - "iopub.status.idle": "2024-05-15T15:57:04.183145Z", - "shell.execute_reply": "2024-05-15T15:57:04.182523Z" + "iopub.execute_input": "2024-06-07T09:16:54.704569Z", + "iopub.status.busy": "2024-06-07T09:16:54.704327Z", + "iopub.status.idle": "2024-06-07T09:16:54.707573Z", + "shell.execute_reply": "2024-06-07T09:16:54.707000Z" } }, + "outputs": [], "source": [ "# import json\n", "# we can have a look at the full settings if we wish, including the values of our estimated parameters:\n", @@ -531,8 +1909,7 @@ "# we can also get a handy summary of of the model in an easily readable format 
if we wish:\n", "# print(linker_detailed._settings_obj.human_readable_description)\n", "# (we suppress output here for brevity)" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -550,16 +1927,95 @@ "id": "b17b131c-c83e-4c32-bfad-c12021d2c3b7", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:57:04.186220Z", - "iopub.status.busy": "2024-05-15T15:57:04.185782Z", - "iopub.status.idle": "2024-05-15T15:57:04.541188Z", - "shell.execute_reply": "2024-05-15T15:57:04.540169Z" + "iopub.execute_input": "2024-06-07T09:16:54.710434Z", + "iopub.status.busy": "2024-06-07T09:16:54.710226Z", + "iopub.status.idle": "2024-06-07T09:16:54.974408Z", + "shell.execute_reply": "2024-06-07T09:16:54.973855Z" } }, - "source": [ - "linker_simple.match_weights_chart()" + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } ], - "outputs": [] + "source": [ + "linker_simple.visualisations.match_weights_chart()" + ] }, { "cell_type": "code", @@ -567,16 +2023,95 @@ "id": "c095ff2b-405b-427c-849f-1468f6ca98e0", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:57:04.545921Z", - "iopub.status.busy": "2024-05-15T15:57:04.545071Z", - "iopub.status.idle": "2024-05-15T15:57:04.888788Z", - "shell.execute_reply": "2024-05-15T15:57:04.887944Z" + "iopub.execute_input": "2024-06-07T09:16:54.977562Z", + "iopub.status.busy": "2024-06-07T09:16:54.977352Z", + "iopub.status.idle": "2024-06-07T09:16:55.252915Z", + "shell.execute_reply": "2024-06-07T09:16:55.251950Z" } }, - "source": [ - "linker_detailed.match_weights_chart()" + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } ], - "outputs": [] + "source": [ + "linker_detailed.visualisations.match_weights_chart()" + ] }, { "cell_type": "markdown", @@ -594,17 +2129,96 @@ "id": "26e5dbe5-a621-44ab-bdb4-0bcd53b220b6", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:57:04.893722Z", - "iopub.status.busy": "2024-05-15T15:57:04.893207Z", - "iopub.status.idle": "2024-05-15T15:57:05.067224Z", - "shell.execute_reply": "2024-05-15T15:57:05.066686Z" + "iopub.execute_input": "2024-06-07T09:16:55.256437Z", + "iopub.status.busy": "2024-06-07T09:16:55.256148Z", + "iopub.status.idle": "2024-06-07T09:16:55.408274Z", + "shell.execute_reply": "2024-06-07T09:16:55.407631Z" } }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# linker_simple.m_u_parameters_chart()\n", - "linker_detailed.m_u_parameters_chart()" - ], - "outputs": [] + "linker_detailed.visualisations.m_u_parameters_chart()" + ] }, { "cell_type": "markdown", @@ -622,16 +2236,95 @@ "id": "149962d6-a2ad-412f-aa05-8697beb12ed0", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:57:05.070283Z", - "iopub.status.busy": "2024-05-15T15:57:05.070040Z", - "iopub.status.idle": "2024-05-15T15:57:06.960773Z", - "shell.execute_reply": "2024-05-15T15:57:06.959848Z" + "iopub.execute_input": "2024-06-07T09:16:55.411718Z", + "iopub.status.busy": "2024-06-07T09:16:55.411484Z", + "iopub.status.idle": "2024-06-07T09:16:57.179378Z", + "shell.execute_reply": "2024-06-07T09:16:57.178861Z" } }, - "source": [ - "linker_simple.unlinkables_chart()" + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } ], - "outputs": [] + "source": [ + "linker_simple.evaluation.unlinkables_chart()" + ] }, { "cell_type": "code", @@ -639,16 +2332,95 @@ "id": "cac493dd-ea43-4550-8fd4-f758ae90ed75", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:57:06.965159Z", - "iopub.status.busy": "2024-05-15T15:57:06.964863Z", - "iopub.status.idle": "2024-05-15T15:57:07.337075Z", - "shell.execute_reply": "2024-05-15T15:57:07.336337Z" + "iopub.execute_input": "2024-06-07T09:16:57.182832Z", + "iopub.status.busy": "2024-06-07T09:16:57.182595Z", + "iopub.status.idle": "2024-06-07T09:16:57.517285Z", + "shell.execute_reply": "2024-06-07T09:16:57.516677Z" } }, - "source": [ - "linker_detailed.unlinkables_chart()" + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } ], - "outputs": [] + "source": [ + "linker_detailed.evaluation.unlinkables_chart()" + ] }, { "cell_type": "markdown", @@ -676,18 +2448,252 @@ "id": "03348477-c3c1-42e7-a8af-8f678acc9d58", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:57:07.340733Z", - "iopub.status.busy": "2024-05-15T15:57:07.340494Z", - "iopub.status.idle": "2024-05-15T15:57:12.239689Z", - "shell.execute_reply": "2024-05-15T15:57:12.238900Z" + "iopub.execute_input": "2024-06-07T09:16:57.520557Z", + "iopub.status.busy": "2024-06-07T09:16:57.520288Z", + "iopub.status.idle": "2024-06-07T09:17:01.939499Z", + "shell.execute_reply": "2024-06-07T09:17:01.938793Z" } }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " -- WARNING --\n", + "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n", + "Comparison: 'date_of_birth':\n", + " m values not fully trained\n", + "Comparison: 'date_of_birth':\n", + " u values not fully trained\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrec_id_lrec_id_rgiven_name_lgiven_name_rgamma_given_nametf_given_name_l...gamma_postcodetf_postcode_ltf_postcode_rbf_postcodebf_tf_adj_postcodeaddress_1_laddress_1_rstate_lstate_rmatch_key
0-1.8250030.220115__splink__input_table_0__splink__input_table_1rec-760-orgrec-3951-dup-0lachlanlachlan30.0113...30.00070.0007718.8240031.672690bushby closetemplestoew avenuenswvic0
1-1.6373660.243251__splink__input_table_0__splink__input_table_1rec-4980-orgrec-4980-dup-0isabellactercteko00.0069...30.00040.0004718.8240032.927207sturt avenuesturta venuevicvic2
2-1.0914450.319400__splink__input_table_0__splink__input_table_1rec-585-orgrec-585-dup-0dannystephenson00.0001...20.00160.001211.3956081.000000o'shanassy streeto'shanassy streettastas1
3-0.9421480.342303__splink__input_table_0__splink__input_table_1rec-1250-orgrec-1250-dup-0lukegazzola00.0055...20.00150.000211.3956081.000000newman morris circuitnewman morr is circuitnswnsw1
4-0.1864990.467727__splink__input_table_0__splink__input_table_1rec-4763-orgrec-4763-dup-0maxalisha00.0021...10.00040.00160.0444691.000000duffy streetduffy s treetnswnsw2
\n", + "

5 rows × 47 columns

\n", + "
" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l \\\n", + "0 -1.825003 0.220115 __splink__input_table_0 \n", + "1 -1.637366 0.243251 __splink__input_table_0 \n", + "2 -1.091445 0.319400 __splink__input_table_0 \n", + "3 -0.942148 0.342303 __splink__input_table_0 \n", + "4 -0.186499 0.467727 __splink__input_table_0 \n", + "\n", + " source_dataset_r rec_id_l rec_id_r given_name_l \\\n", + "0 __splink__input_table_1 rec-760-org rec-3951-dup-0 lachlan \n", + "1 __splink__input_table_1 rec-4980-org rec-4980-dup-0 isabella \n", + "2 __splink__input_table_1 rec-585-org rec-585-dup-0 danny \n", + "3 __splink__input_table_1 rec-1250-org rec-1250-dup-0 luke \n", + "4 __splink__input_table_1 rec-4763-org rec-4763-dup-0 max \n", + "\n", + " given_name_r gamma_given_name tf_given_name_l ... gamma_postcode \\\n", + "0 lachlan 3 0.0113 ... 3 \n", + "1 ctercteko 0 0.0069 ... 3 \n", + "2 stephenson 0 0.0001 ... 2 \n", + "3 gazzola 0 0.0055 ... 2 \n", + "4 alisha 0 0.0021 ... 1 \n", + "\n", + " tf_postcode_l tf_postcode_r bf_postcode bf_tf_adj_postcode \\\n", + "0 0.0007 0.0007 718.824003 1.672690 \n", + "1 0.0004 0.0004 718.824003 2.927207 \n", + "2 0.0016 0.0012 11.395608 1.000000 \n", + "3 0.0015 0.0002 11.395608 1.000000 \n", + "4 0.0004 0.0016 0.044469 1.000000 \n", + "\n", + " address_1_l address_1_r state_l state_r \\\n", + "0 bushby close templestoew avenue nsw vic \n", + "1 sturt avenue sturta venue vic vic \n", + "2 o'shanassy street o'shanassy street tas tas \n", + "3 newman morris circuit newman morr is circuit nsw nsw \n", + "4 duffy street duffy s treet nsw nsw \n", + "\n", + " match_key \n", + "0 0 \n", + "1 2 \n", + "2 1 \n", + "3 1 \n", + "4 2 \n", + "\n", + "[5 rows x 47 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "predictions = linker_detailed.predict(threshold_match_probability=0.2)\n", + "predictions = 
linker_detailed.inference.predict(threshold_match_probability=0.2)\n", "df_predictions = predictions.as_pandas_dataframe()\n", "df_predictions.head(5)" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -703,18 +2709,110 @@ "id": "ce8d409c-7ef5-4485-9ec0-8b539fdecb1f", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:57:12.244377Z", - "iopub.status.busy": "2024-05-15T15:57:12.243938Z", - "iopub.status.idle": "2024-05-15T15:57:15.174716Z", - "shell.execute_reply": "2024-05-15T15:57:15.173769Z" + "iopub.execute_input": "2024-06-07T09:17:01.942896Z", + "iopub.status.busy": "2024-06-07T09:17:01.942661Z", + "iopub.status.idle": "2024-06-07T09:17:04.159161Z", + "shell.execute_reply": "2024-06-07T09:17:04.158614Z" } }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " -- WARNING --\n", + "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n", + "Comparison: 'date_of_birth':\n", + " m values not fully trained\n", + "Comparison: 'date_of_birth':\n", + " u values not fully trained\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "linker_detailed.accuracy_analysis_from_labels_column(\n", + "linker_detailed.evaluation.accuracy_analysis_from_labels_column(\n", " \"cluster\", output_type=\"precision_recall\"\n", ")" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -730,20 +2828,40 @@ "id": "ade53248-212f-4776-8d7d-4632b1749425", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:57:15.183049Z", - "iopub.status.busy": "2024-05-15T15:57:15.182695Z", - "iopub.status.idle": "2024-05-15T15:57:15.493444Z", - "shell.execute_reply": "2024-05-15T15:57:15.492713Z" + "iopub.execute_input": "2024-06-07T09:17:04.165374Z", + "iopub.status.busy": "2024-06-07T09:17:04.165099Z", + "iopub.status.idle": "2024-06-07T09:17:04.301694Z", + "shell.execute_reply": "2024-06-07T09:17:04.301045Z" } }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Completed iteration 1, root rows count 0\n" + ] + }, + { + "data": { + "text/plain": [ + "2 4958\n", + "1 84\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "clusters = linker_detailed.cluster_pairwise_predictions_at_threshold(\n", + "clusters = linker_detailed.clustering.cluster_pairwise_predictions_at_threshold(\n", " predictions, threshold_match_probability=0.99\n", ")\n", "df_clusters = clusters.as_pandas_dataframe().sort_values(\"cluster_id\")\n", "df_clusters.groupby(\"cluster_id\").size().value_counts()" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -763,12 +2881,13 @@ "id": "ef77a8b1-1119-4cb0-b299-343a4022d65e", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:57:15.500107Z", - "iopub.status.busy": "2024-05-15T15:57:15.499499Z", - "iopub.status.idle": "2024-05-15T15:57:15.523366Z", - "shell.execute_reply": 
"2024-05-15T15:57:15.522625Z" + "iopub.execute_input": "2024-06-07T09:17:04.305169Z", + "iopub.status.busy": "2024-06-07T09:17:04.304886Z", + "iopub.status.idle": "2024-06-07T09:17:04.322035Z", + "shell.execute_reply": "2024-06-07T09:17:04.321351Z" } }, + "outputs": [], "source": [ "df_predictions[\"cluster_l\"] = df_predictions[\"rec_id_l\"].apply(\n", " lambda x: \"-\".join(x.split(\"-\")[:2])\n", @@ -779,8 +2898,7 @@ "df_true_links = df_predictions[\n", " df_predictions[\"cluster_l\"] == df_predictions[\"cluster_r\"]\n", "].sort_values(\"match_probability\")" - ], - "outputs": [] + ] }, { "cell_type": "code", @@ -788,19 +2906,98 @@ "id": "bc531ca3-fe0d-480d-b059-a7125474fb22", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:57:15.527453Z", - "iopub.status.busy": "2024-05-15T15:57:15.527121Z", - "iopub.status.idle": "2024-05-15T15:57:16.507088Z", - "shell.execute_reply": "2024-05-15T15:57:16.506251Z" + "iopub.execute_input": "2024-06-07T09:17:04.325739Z", + "iopub.status.busy": "2024-06-07T09:17:04.325483Z", + "iopub.status.idle": "2024-06-07T09:17:04.966790Z", + "shell.execute_reply": "2024-06-07T09:17:04.966182Z" } }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "records_to_view = 3\n", - "linker_detailed.waterfall_chart(\n", + "linker_detailed.visualisations.waterfall_chart(\n", " df_true_links.head(records_to_view).to_dict(orient=\"records\")\n", ")" - ], - "outputs": [] + ] }, { "cell_type": "code", @@ -808,21 +3005,100 @@ "id": "aacd9042-5672-4bc4-aa98-940d1f5fd28a", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:57:16.510992Z", - "iopub.status.busy": "2024-05-15T15:57:16.510681Z", - "iopub.status.idle": "2024-05-15T15:57:17.322254Z", - "shell.execute_reply": "2024-05-15T15:57:17.321456Z" + "iopub.execute_input": "2024-06-07T09:17:04.969789Z", + "iopub.status.busy": "2024-06-07T09:17:04.969553Z", + "iopub.status.idle": "2024-06-07T09:17:05.445307Z", + "shell.execute_reply": "2024-06-07T09:17:05.444530Z" } }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_non_links = df_predictions[\n", " df_predictions[\"cluster_l\"] != df_predictions[\"cluster_r\"]\n", "].sort_values(\"match_probability\", ascending=False)\n", - "linker_detailed.waterfall_chart(\n", + "linker_detailed.visualisations.waterfall_chart(\n", " df_non_links.head(records_to_view).to_dict(orient=\"records\")\n", ")" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -848,12 +3124,13 @@ "id": "2a7229da-9f79-4151-a6b1-018d17205f5f", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:57:17.327035Z", - "iopub.status.busy": "2024-05-15T15:57:17.326665Z", - "iopub.status.idle": "2024-05-15T15:57:17.342204Z", - "shell.execute_reply": "2024-05-15T15:57:17.341227Z" + "iopub.execute_input": "2024-06-07T09:17:05.448836Z", + "iopub.status.busy": "2024-06-07T09:17:05.448543Z", + "iopub.status.idle": "2024-06-07T09:17:05.460100Z", + "shell.execute_reply": "2024-06-07T09:17:05.459191Z" } }, + "outputs": [], "source": [ "# we need to append a full name column to our source data frames\n", "# so that we can use it for term frequency adjustments\n", @@ -945,8 +3222,7 @@ " ],\n", " \"retain_intermediate_calculation_columns\": True,\n", "}" - ], - "outputs": [] + ] }, { "cell_type": "code", @@ -954,25 +3230,83 @@ "id": "1581eeeb-246b-46de-be88-ba4dc821fce7", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:57:17.346493Z", - "iopub.status.busy": "2024-05-15T15:57:17.346091Z", - "iopub.status.idle": "2024-05-15T15:58:52.238122Z", - "shell.execute_reply": "2024-05-15T15:58:52.237374Z" + "iopub.execute_input": "2024-06-07T09:17:05.463764Z", + "iopub.status.busy": "2024-06-07T09:17:05.463499Z", + "iopub.status.idle": "2024-06-07T09:18:25.606071Z", + "shell.execute_reply": "2024-06-07T09:18:25.605371Z" } }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", 
+ "text": [ + "Probability two random records match is estimated to be 0.000239.\n", + "This means that amongst all possible pairwise record comparisons, one in 4,185.85 are expected to match. With 25,000,000 total possible comparisons, we expect a total of around 5,972.50 matching pairs\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - Full name (no m values are trained).\n", + " - date_of_birth (some u values are not trained, no m values are trained).\n", + " - Social security ID (no m values are trained).\n", + " - Street number (no m values are trained).\n", + " - Postcode (no m values are trained).\n" + ] + } + ], "source": [ "# train\n", "linker_advanced = Linker(dfs, extended_model_settings, database_api=DuckDBAPI())\n", - "linker_advanced.estimate_probability_two_random_records_match(\n", + "linker_advanced.training.estimate_probability_two_random_records_match(\n", " deterministic_rules, recall=0.8\n", ")\n", "# We recommend increasing target rows to 1e8 improve accuracy for u\n", "# values in full name comparison, as we have subdivided the data more finely\n", "\n", "# Here, 1e7 for speed\n", - "linker_advanced.estimate_u_using_random_sampling(max_pairs=1e7)" - ], - "outputs": [] + "linker_advanced.training.estimate_u_using_random_sampling(max_pairs=1e7)" + ] }, { "cell_type": "code", @@ -980,18 +3314,107 @@ "id": "265f0651", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:58:52.244579Z", - "iopub.status.busy": "2024-05-15T15:58:52.244307Z", - "iopub.status.idle": "2024-05-15T15:58:53.189566Z", - "shell.execute_reply": "2024-05-15T15:58:53.188815Z" + "iopub.execute_input": "2024-06-07T09:18:25.610698Z", + "iopub.status.busy": "2024-06-07T09:18:25.610416Z", + "iopub.status.idle": "2024-06-07T09:18:26.522700Z", + "shell.execute_reply": "2024-06-07T09:18:26.522017Z" } }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.date_of_birth = r.date_of_birth\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - Full name\n", + " - Social security ID\n", + " - Street number\n", 
+ " - Postcode\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - date_of_birth\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:\n", + "Level single name cross-matches on comparison Full name not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.465 in the m_probability of Full name, level `Exact match on full_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was 0.00249 in the m_probability of Social security ID, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 4.89e-05 in the m_probability of Social security ID, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 3 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for Full name - single name cross-matches (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - Full name (some m values are not trained).\n", + " - date_of_birth (some u values are not trained, no m values are trained).\n" + ] + } + ], "source": [ - "session_dob = linker_advanced.estimate_parameters_using_expectation_maximisation(\n", + "session_dob = linker_advanced.training.estimate_parameters_using_expectation_maximisation(\n", " \"l.date_of_birth = r.date_of_birth\", estimate_without_term_frequencies=True\n", ")" - ], - "outputs": [] + ] }, { "cell_type": "code", @@ -999,18 +3422,155 @@ "id": "ebcb15c8", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:58:53.193304Z", - "iopub.status.busy": "2024-05-15T15:58:53.193012Z", - "iopub.status.idle": "2024-05-15T15:58:54.287492Z", - "shell.execute_reply": "2024-05-15T15:58:54.286732Z" + "iopub.execute_input": "2024-06-07T09:18:26.526171Z", + "iopub.status.busy": "2024-06-07T09:18:26.525914Z", + "iopub.status.idle": "2024-06-07T09:18:27.518982Z", + "shell.execute_reply": "2024-06-07T09:18:27.518364Z" } }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.postcode = r.postcode\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - Full name\n", + " - date_of_birth\n", + " - Social security ID\n", + " - Street number\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - Postcode\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:\n", + "Level single name cross-matches on comparison Full name not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "WARNING:\n", + "Level Abs difference of 'transformed date_of_birth <= 1 month' on comparison date_of_birth not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:\n", + "Level Abs difference of 'transformed date_of_birth <= 1 year' on comparison date_of_birth not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:\n", + "Level Abs difference of 'transformed date_of_birth <= 10 year' on comparison date_of_birth not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was 0.0375 in the m_probability of date_of_birth, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was 0.000645 in the m_probability of date_of_birth, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 1.72e-05 in the m_probability of Social security ID, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 3 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for Full name - single name cross-matches (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). 
This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - Full name (some m values are not trained).\n", + " - date_of_birth (some u values are not trained, some m values are not trained).\n" + ] + } + ], "source": [ - "session_pc = linker_advanced.estimate_parameters_using_expectation_maximisation(\n", + "session_pc = linker_advanced.training.estimate_parameters_using_expectation_maximisation(\n", " \"l.postcode = r.postcode\", estimate_without_term_frequencies=True\n", ")" - ], - "outputs": [] + ] }, { "cell_type": "code", @@ -1018,16 +3578,95 @@ "id": "d9d21e85-b89b-435a-8b75-142166ac3f31", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:58:54.292571Z", - "iopub.status.busy": "2024-05-15T15:58:54.292308Z", - "iopub.status.idle": "2024-05-15T15:58:54.443712Z", - "shell.execute_reply": "2024-05-15T15:58:54.443023Z" + "iopub.execute_input": "2024-06-07T09:18:27.523341Z", + "iopub.status.busy": "2024-06-07T09:18:27.523109Z", + "iopub.status.idle": "2024-06-07T09:18:27.711081Z", + "shell.execute_reply": "2024-06-07T09:18:27.710381Z" } }, - "source": [ - "linker_advanced.parameter_estimate_comparisons_chart()" + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } ], - "outputs": [] + "source": [ + "linker_advanced.visualisations.parameter_estimate_comparisons_chart()" + ] }, { "cell_type": "code", @@ -1035,16 +3674,95 @@ "id": "4a857c18-b0d5-48dc-b7f1-1f6389db5089", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:58:54.447134Z", - "iopub.status.busy": "2024-05-15T15:58:54.446857Z", - "iopub.status.idle": "2024-05-15T15:58:54.770678Z", - "shell.execute_reply": "2024-05-15T15:58:54.770024Z" + "iopub.execute_input": "2024-06-07T09:18:27.746299Z", + "iopub.status.busy": "2024-06-07T09:18:27.744495Z", + "iopub.status.idle": "2024-06-07T09:18:28.388134Z", + "shell.execute_reply": "2024-06-07T09:18:28.387392Z" } }, - "source": [ - "linker_advanced.match_weights_chart()" + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } ], - "outputs": [] + "source": [ + "linker_advanced.visualisations.match_weights_chart()" + ] }, { "cell_type": "code", @@ -1052,22 +3770,57 @@ "id": "e1ee24d9-1def-4b8d-bb85-1c63b595e75e", "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:58:54.773893Z", - "iopub.status.busy": "2024-05-15T15:58:54.773655Z", - "iopub.status.idle": "2024-05-15T15:58:56.607253Z", - "shell.execute_reply": "2024-05-15T15:58:56.606584Z" + "iopub.execute_input": "2024-06-07T09:18:28.392069Z", + "iopub.status.busy": "2024-06-07T09:18:28.391745Z", + "iopub.status.idle": "2024-06-07T09:18:30.289569Z", + "shell.execute_reply": "2024-06-07T09:18:30.288893Z" } }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " -- WARNING --\n", + "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. 
To produce predictions the following untrained trained parameters will use default values.\n", + "Comparison: 'Full name':\n", + " m values not fully trained\n", + "Comparison: 'date_of_birth':\n", + " m values not fully trained\n", + "Comparison: 'date_of_birth':\n", + " u values not fully trained\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Completed iteration 1, root rows count 0\n" + ] + }, + { + "data": { + "text/plain": [ + "2 4960\n", + "1 80\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "predictions_adv = linker_advanced.predict()\n", + "predictions_adv = linker_advanced.inference.predict()\n", "df_predictions_adv = predictions_adv.as_pandas_dataframe()\n", - "clusters_adv = linker_advanced.cluster_pairwise_predictions_at_threshold(\n", + "clusters_adv = linker_advanced.clustering.cluster_pairwise_predictions_at_threshold(\n", " predictions_adv, threshold_match_probability=0.99\n", ")\n", "df_clusters_adv = clusters_adv.as_pandas_dataframe().sort_values(\"cluster_id\")\n", "df_clusters_adv.groupby(\"cluster_id\").size().value_counts()" - ], - "outputs": [] + ] }, { "cell_type": "markdown", diff --git a/docs/demos/examples/duckdb/link_only.ipynb b/docs/demos/examples/duckdb/link_only.ipynb index a1f64626f5..dba266abe1 100644 --- a/docs/demos/examples/duckdb/link_only.ipynb +++ b/docs/demos/examples/duckdb/link_only.ipynb @@ -1,253 +1,992 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Linking without deduplication\n", - "\n", - "A simple record linkage model using the `link_only` [link type](https://moj-analytical-services.github.io/splink/settings_dict_guide.html#link_type).\n", - "\n", - "With `link_only`, only between-dataset record comparisons are generated. 
No within-dataset record comparisons are created, meaning that the model does not attempt to find within-dataset duplicates.\n" + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Linking without deduplication\n", + "\n", + "A simple record linkage model using the `link_only` [link type](https://moj-analytical-services.github.io/splink/settings_dict_guide.html#link_type).\n", + "\n", + "With `link_only`, only between-dataset record comparisons are generated. No within-dataset record comparisons are created, meaning that the model does not attempt to find within-dataset duplicates.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:18:42.926356Z", + "iopub.status.busy": "2024-06-07T09:18:42.925982Z", + "iopub.status.idle": "2024-06-07T09:18:42.943456Z", + "shell.execute_reply": "2024-06-07T09:18:42.942569Z" + } + }, + "outputs": [], + "source": [ + "# Uncomment and run this cell if you're running in Google Colab.\n", + "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:18:42.947959Z", + "iopub.status.busy": "2024-06-07T09:18:42.947640Z", + "iopub.status.idle": "2024-06-07T09:18:44.652788Z", + "shell.execute_reply": "2024-06-07T09:18:44.652024Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_idfirst_namesurnamedobcityemailcluster
930930LukeRobinnso1981-10-18Coventrylrobinson@wolf.org233
385385LottieDavis1972-06-12NaNlottie.d7@morgan-pierce.com100
\n", + "
" + ], + "text/plain": [ + " unique_id first_name surname dob city \\\n", + "930 930 Luke Robinnso 1981-10-18 Coventry \n", + "385 385 Lottie Davis 1972-06-12 NaN \n", + "\n", + " email cluster \n", + "930 lrobinson@wolf.org 233 \n", + "385 lottie.d7@morgan-pierce.com 100 " ] - }, + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink import splink_datasets\n", + "\n", + "df = splink_datasets.fake_1000\n", + "\n", + "# Split a simple dataset into two, separate datasets which can be linked together.\n", + "df_l = df.sample(frac=0.5)\n", + "df_r = df.drop(df_l.index)\n", + "\n", + "df_l.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:18:44.695716Z", + "iopub.status.busy": "2024-06-07T09:18:44.695390Z", + "iopub.status.idle": "2024-06-07T09:18:44.942598Z", + "shell.execute_reply": "2024-06-07T09:18:44.942052Z" + } + }, + "outputs": [], + "source": [ + "import splink.comparison_library as cl\n", + "import splink.comparison_template_library as ctl\n", + "from splink import DuckDBAPI, Linker, SettingsCreator, block_on\n", + "\n", + "settings = SettingsCreator(\n", + " link_type=\"link_only\",\n", + " blocking_rules_to_generate_predictions=[\n", + " block_on(\"first_name\"),\n", + " block_on(\"surname\"),\n", + " ],\n", + " comparisons=[\n", + " ctl.NameComparison(\n", + " \"first_name\",\n", + " ),\n", + " ctl.NameComparison(\"surname\"),\n", + " ctl.DateComparison(\n", + " \"dob\",\n", + " input_is_string=True,\n", + " invalid_dates_as_null=True,\n", + " datetime_metrics=[\"month\", \"year\", \"year\"],\n", + " datetime_thresholds=[1, 1, 10],\n", + " ),\n", + " cl.ExactMatch(\"city\").configure(term_frequency_adjustments=True),\n", + " ctl.EmailComparison(\"email\", include_username_fuzzy_level=False),\n", + " ],\n", + ")\n", + "\n", + "linker = Linker(\n", + " [df_l, df_r],\n", + " settings,\n", + " 
database_api=DuckDBAPI(),\n", + " input_table_aliases=[\"df_left\", \"df_right\"],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:18:44.946395Z", + "iopub.status.busy": "2024-06-07T09:18:44.946113Z", + "iopub.status.idle": "2024-06-07T09:18:45.188705Z", + "shell.execute_reply": "2024-06-07T09:18:45.188192Z" + } + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " \"Open\n", - "\n" + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink.exploratory import completeness_chart\n", + "\n", + "completeness_chart(\n", + " [df_l, df_r],\n", + " cols=[\"first_name\", \"surname\", \"dob\", \"city\", \"email\"],\n", + " db_api=DuckDBAPI(),\n", + " table_names_for_chart=[\"df_left\", \"df_right\"],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:18:45.192584Z", + "iopub.status.busy": "2024-06-07T09:18:45.192253Z", + "iopub.status.idle": "2024-06-07T09:18:45.341533Z", + "shell.execute_reply": "2024-06-07T09:18:45.340965Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Probability two random records match is estimated to be 0.00346.\n", + "This means that amongst all possible pairwise record comparisons, one in 288.78 are expected to match. 
With 250,000 total possible comparisons, we expect a total of around 865.71 matching pairs\n" + ] + } + ], + "source": [ + "\n", + "deterministic_rules = [\n", + " \"l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1\",\n", + " \"l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1\",\n", + " \"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2\",\n", + " block_on(\"email\"),\n", + "]\n", + "\n", + "\n", + "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:18:45.344512Z", + "iopub.status.busy": "2024-06-07T09:18:45.344289Z", + "iopub.status.idle": "2024-06-07T09:18:46.142225Z", + "shell.execute_reply": "2024-06-07T09:18:46.141712Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. 
Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n" + ] }, { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:43.284619Z", - "iopub.status.busy": "2024-03-27T15:14:43.284336Z", - "iopub.status.idle": "2024-03-27T15:14:43.289588Z", - "shell.execute_reply": "2024-03-27T15:14:43.288971Z" - } - }, - "outputs": [], - "source": [ - "# Uncomment and run this cell if you're running in Google Colab.\n", - "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n" + ] }, { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:43.293314Z", - "iopub.status.busy": "2024-03-27T15:14:43.293026Z", - "iopub.status.idle": "2024-03-27T15:14:45.144216Z", - "shell.execute_reply": "2024-03-27T15:14:45.143259Z" - } - }, - "outputs": [], - "source": [ - "from splink import splink_datasets\n", - "\n", - "df = splink_datasets.fake_1000\n", - "\n", - "# Split a simple dataset into two, separate datasets which can be linked together.\n", - "df_l = df.sample(frac=0.5)\n", - "df_r = df.drop(df_l.index)\n", - "\n", - "df_l.head(2)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n" + ] }, { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:45.149667Z", - "iopub.status.busy": "2024-03-27T15:14:45.149322Z", - "iopub.status.idle": "2024-03-27T15:14:45.584636Z", - "shell.execute_reply": "2024-03-27T15:14:45.583909Z" - } - }, - "outputs": [], - "source": [ - "import splink.comparison_library as cl\n", - "import splink.comparison_template_library as ctl\n", - "from splink import DuckDBAPI, Linker, 
SettingsCreator, block_on\n", - "\n", - "settings = SettingsCreator(\n", - " link_type=\"link_only\",\n", - " blocking_rules_to_generate_predictions=[\n", - " block_on(\"first_name\"),\n", - " block_on(\"surname\"),\n", - " ],\n", - " comparisons=[\n", - " ctl.NameComparison(\n", - " \"first_name\",\n", - " ),\n", - " ctl.NameComparison(\"surname\"),\n", - " ctl.DateComparison(\n", - " \"dob\",\n", - " input_is_string=True,\n", - " invalid_dates_as_null=True,\n", - " datetime_metrics=[\"month\", \"year\", \"year\"],\n", - " datetime_thresholds=[1, 1, 10],\n", - " ),\n", - " cl.ExactMatch(\"city\").configure(term_frequency_adjustments=True),\n", - " ctl.EmailComparison(\"email\", include_username_fuzzy_level=False),\n", - " ],\n", - ")\n", - "\n", - "linker = Linker(\n", - " [df_l, df_r],\n", - " settings,\n", - " database_api=DuckDBAPI(),\n", - " input_table_aliases=[\"df_left\", \"df_right\"],\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - first_name (no m values are trained).\n", + " - surname (no m values are trained).\n", + " - dob (no m values are trained).\n", + " - city (no m values are trained).\n", + " - email (no m values are trained).\n" + ] + } + ], + "source": [ + "linker.training.estimate_u_using_random_sampling(max_pairs=1e6, seed=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:18:46.145662Z", + "iopub.status.busy": "2024-06-07T09:18:46.145393Z", + "iopub.status.idle": "2024-06-07T09:18:47.814138Z", + "shell.execute_reply": "2024-06-07T09:18:47.813573Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] }, { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:45.588957Z", - "iopub.status.busy": "2024-03-27T15:14:45.588354Z", - "iopub.status.idle": "2024-03-27T15:14:46.120692Z", - "shell.execute_reply": "2024-03-27T15:14:46.119623Z" - } - }, - "outputs": [], - "source": [ - "from splink.exploratory import completeness_chart\n", - "\n", - "completeness_chart(\n", - " [df_l, df_r],\n", - " cols=[\"first_name\", \"surname\", \"dob\", \"city\", \"email\"],\n", - " db_api=DuckDBAPI(),\n", - " table_names_for_chart=[\"df_left\", \"df_right\"],\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"dob\" = r.\"dob\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - first_name\n", + " - surname\n", + " - city\n", + " - email\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - dob\n" + ] }, { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - 
"iopub.execute_input": "2024-03-27T15:14:46.124880Z", - "iopub.status.busy": "2024-03-27T15:14:46.124449Z", - "iopub.status.idle": "2024-03-27T15:14:46.333422Z", - "shell.execute_reply": "2024-03-27T15:14:46.332477Z" - } - }, - "outputs": [], - "source": [ - "\n", - "deterministic_rules = [\n", - " \"l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1\",\n", - " \"l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1\",\n", - " \"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2\",\n", - " block_on(\"email\"),\n", - "]\n", - "\n", - "\n", - "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] }, { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:46.337604Z", - "iopub.status.busy": "2024-03-27T15:14:46.337231Z", - "iopub.status.idle": "2024-03-27T15:14:47.729876Z", - "shell.execute_reply": "2024-03-27T15:14:47.728440Z" - } - }, - "outputs": [], - "source": [ - "linker.training.estimate_u_using_random_sampling(max_pairs=1e6, seed=1)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.387 in the m_probability of surname, level `Exact match on surname`\n" + ] }, { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:47.735185Z", - "iopub.status.busy": "2024-03-27T15:14:47.734598Z", - "iopub.status.idle": "2024-03-27T15:14:49.944190Z", - "shell.execute_reply": "2024-03-27T15:14:49.943452Z" - } - }, - "outputs": [], - "source": [ - "session_dob = linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"dob\"))\n", - "session_email = linker.training.estimate_parameters_using_expectation_maximisation(\n", - " block_on(\"email\")\n", - ")\n", - "session_first_name = 
linker.training.estimate_parameters_using_expectation_maximisation(\n", - " block_on(\"first_name\")\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was 0.113 in probability_two_random_records_match\n" + ] }, { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:49.948346Z", - "iopub.status.busy": "2024-03-27T15:14:49.948058Z", - "iopub.status.idle": "2024-03-27T15:14:50.272696Z", - "shell.execute_reply": "2024-03-27T15:14:50.271981Z" - } - }, - "outputs": [], - "source": [ - "results = linker.inference.predict(threshold_match_probability=0.9)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 0.0347 in probability_two_random_records_match\n" + ] }, { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:50.276802Z", - "iopub.status.busy": "2024-03-27T15:14:50.276415Z", - "iopub.status.idle": "2024-03-27T15:14:50.299341Z", - "shell.execute_reply": "2024-03-27T15:14:50.298407Z" - } - }, - "outputs": [], - "source": [ - "results.as_pandas_dataframe(limit=5)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was 0.0122 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 5: Largest change in params was 0.00504 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 6: Largest change in params was 0.00226 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 7: Largest change in params was 0.00105 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 8: Largest change 
in params was 0.000497 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 9: Largest change in params was 0.000237 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 10: Largest change in params was 0.000114 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 11: Largest change in params was 5.46e-05 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 11 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - dob (no m values are trained).\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"email\" = r.\"email\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - first_name\n", + " - surname\n", + " - dob\n", + " - city\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - email\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.453 in the m_probability of dob, level `Exact match on dob`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was 0.0816 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: 
Largest change in params was 0.0173 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was 0.00584 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 5: Largest change in params was 0.00237 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 6: Largest change in params was 0.00106 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 7: Largest change in params was 0.000497 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 8: Largest change in params was 0.000238 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 9: Largest change in params was 0.000115 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 10: Largest change in params was 5.6e-05 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 10 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is fully trained. 
All comparisons have at least one estimate for their m and u values\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"first_name\" = r.\"first_name\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - surname\n", + " - dob\n", + " - city\n", + " - email\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - first_name\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was 0.182 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was -0.0082 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was -0.00119 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was -0.000228 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 5: Largest change in params was -4.89e-05 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 5 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is fully trained. 
All comparisons have at least one estimate for their m and u values\n" + ] } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" + ], + "source": [ + "session_dob = linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"dob\"))\n", + "session_email = linker.training.estimate_parameters_using_expectation_maximisation(\n", + " block_on(\"email\")\n", + ")\n", + "session_first_name = linker.training.estimate_parameters_using_expectation_maximisation(\n", + " block_on(\"first_name\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:18:47.817058Z", + "iopub.status.busy": "2024-06-07T09:18:47.816828Z", + "iopub.status.idle": "2024-06-07T09:18:48.064527Z", + "shell.execute_reply": "2024-06-07T09:18:48.063844Z" + } + }, + "outputs": [], + "source": [ + "results = linker.inference.predict(threshold_match_probability=0.9)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:18:48.067845Z", + "iopub.status.busy": "2024-06-07T09:18:48.067582Z", + "iopub.status.idle": "2024-06-07T09:18:48.084784Z", + "shell.execute_reply": "2024-06-07T09:18:48.084179Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilitysource_dataset_lsource_dataset_runique_id_lunique_id_rfirst_name_lfirst_name_rgamma_first_namesurname_l...dob_ldob_rgamma_dobcity_lcity_rgamma_cityemail_lemail_rgamma_emailmatch_key
03.201620.90196df_leftdf_right445444JacobJacob3Campbell...1988-06-051997-05-041LononLondon0j.c65@ortiz.comNone-10
13.201620.90196df_leftdf_right774778ArmstrongArmstrong3Eva...2027-04-212017-04-231PeterborouhgPeterbotrough0e.armstrong16odonnell.infoNone-10
23.201620.90196df_leftdf_right239242FreyaFreya3Shah...1972-01-171970-12-171LondonLonnod0f.s@flynn.comNone-10
33.201620.90196df_leftdf_right833834MasonMason3Smith...1983-03-161993-03-131Kingston-uponH-ullKingston-upon-Hull0masons52@reed.comNone-10
43.201620.90196df_leftdf_right439444JacobJacob3Campbell...1987-06-061997-05-041LonnodLondon0NoneNone-10
\n", + "

5 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l source_dataset_r \\\n", + "0 3.20162 0.90196 df_left df_right \n", + "1 3.20162 0.90196 df_left df_right \n", + "2 3.20162 0.90196 df_left df_right \n", + "3 3.20162 0.90196 df_left df_right \n", + "4 3.20162 0.90196 df_left df_right \n", + "\n", + " unique_id_l unique_id_r first_name_l first_name_r gamma_first_name \\\n", + "0 445 444 Jacob Jacob 3 \n", + "1 774 778 Armstrong Armstrong 3 \n", + "2 239 242 Freya Freya 3 \n", + "3 833 834 Mason Mason 3 \n", + "4 439 444 Jacob Jacob 3 \n", + "\n", + " surname_l ... dob_l dob_r gamma_dob city_l \\\n", + "0 Campbell ... 1988-06-05 1997-05-04 1 Lonon \n", + "1 Eva ... 2027-04-21 2017-04-23 1 Peterborouhg \n", + "2 Shah ... 1972-01-17 1970-12-17 1 London \n", + "3 Smith ... 1983-03-16 1993-03-13 1 Kingston-uponH-ull \n", + "4 Campbell ... 1987-06-06 1997-05-04 1 Lonnod \n", + "\n", + " city_r gamma_city email_l email_r \\\n", + "0 London 0 j.c65@ortiz.com None \n", + "1 Peterbotrough 0 e.armstrong16odonnell.info None \n", + "2 Lonnod 0 f.s@flynn.com None \n", + "3 Kingston-upon-Hull 0 masons52@reed.com None \n", + "4 London 0 None None \n", + "\n", + " gamma_email match_key \n", + "0 -1 0 \n", + "1 -1 0 \n", + "2 -1 0 \n", + "3 -1 0 \n", + "4 -1 0 \n", + "\n", + "[5 rows x 22 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "results.as_pandas_dataframe(limit=5)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git 
a/docs/demos/examples/duckdb/pairwise_labels.ipynb b/docs/demos/examples/duckdb/pairwise_labels.ipynb index 168c09f915..f99d3a566c 100644 --- a/docs/demos/examples/duckdb/pairwise_labels.ipynb +++ b/docs/demos/examples/duckdb/pairwise_labels.ipynb @@ -1,390 +1,776 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Estimating m from a sample of pairwise labels\n", - "\n", - "In this example, we estimate the m probabilities of the model from a table containing pairwise record comparisons which we know are 'true' matches. For example, these may be the result of work by a clerical team who have manually labelled a sample of matches.\n", - "\n", - "The table must be in the following format:\n", - "\n", - "| source_dataset_l | unique_id_l | source_dataset_r | unique_id_r |\n", - "| ---------------- | ----------- | ---------------- | ----------- |\n", - "| df_1 | 1 | df_2 | 2 |\n", - "| df_1 | 1 | df_2 | 3 |\n", - "\n", - "It is assumed that every record in the table represents a certain match.\n", - "\n", - "Note that the column names above are the defaults. 
They should correspond to the values you've set for [`unique_id_column_name`](https://moj-analytical-services.github.io/splink/settings_dict_guide.html#unique_id_column_name) and [`source_dataset_column_name`](https://moj-analytical-services.github.io/splink/settings_dict_guide.html#source_dataset_column_name), if you've chosen custom values.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " \"Open\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:53.117258Z", - "iopub.status.busy": "2024-03-27T15:14:53.116906Z", - "iopub.status.idle": "2024-03-27T15:14:53.122096Z", - "shell.execute_reply": "2024-03-27T15:14:53.121308Z" - } - }, - "outputs": [], - "source": [ - "# Uncomment and run this cell if you're running in Google Colab.\n", - "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:53.126249Z", - "iopub.status.busy": "2024-03-27T15:14:53.125905Z", - "iopub.status.idle": "2024-03-27T15:14:54.649822Z", - "shell.execute_reply": "2024-03-27T15:14:54.649099Z" - } - }, - "outputs": [], - "source": [ - "from splink.datasets import splink_dataset_labels\n", - "\n", - "pairwise_labels = splink_dataset_labels.fake_1000_labels\n", - "\n", - "# Choose labels indicating a match\n", - "pairwise_labels = pairwise_labels[pairwise_labels[\"clerical_match_score\"] == 1]\n", - "pairwise_labels" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We now proceed to estimate the Fellegi Sunter model:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:54.653768Z", - "iopub.status.busy": "2024-03-27T15:14:54.653468Z", - "iopub.status.idle": 
"2024-03-27T15:14:54.668003Z", - "shell.execute_reply": "2024-03-27T15:14:54.667271Z" - } - }, - "outputs": [], - "source": [ - "from splink import splink_datasets\n", - "\n", - "df = splink_datasets.fake_1000\n", - "df.head(2)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:54.671717Z", - "iopub.status.busy": "2024-03-27T15:14:54.671406Z", - "iopub.status.idle": "2024-03-27T15:14:54.912700Z", - "shell.execute_reply": "2024-03-27T15:14:54.911624Z" - } - }, - "outputs": [], - "source": [ - "import splink.comparison_library as cl\n", - "import splink.comparison_template_library as ctl\n", - "from splink import DuckDBAPI, Linker, SettingsCreator, block_on\n", - "\n", - "settings = SettingsCreator(\n", - " link_type=\"dedupe_only\",\n", - " blocking_rules_to_generate_predictions=[\n", - " block_on(\"first_name\"),\n", - " block_on(\"surname\"),\n", - " ],\n", - " comparisons=[\n", - " ctl.NameComparison(\"first_name\"),\n", - " ctl.NameComparison(\"surname\"),\n", - " ctl.DateComparison(\n", - " \"dob\",\n", - " input_is_string=True,\n", - " invalid_dates_as_null=True,\n", - " datetime_metrics=[\"month\", \"year\", \"year\"],\n", - " datetime_thresholds=[1, 1, 10],\n", - " ),\n", - " cl.LevenshteinAtThresholds(\"dob\", [2]),\n", - " cl.ExactMatch(\"city\").configure(term_frequency_adjustments=True),\n", - " ctl.EmailComparison(\"email\", include_username_fuzzy_level=False),\n", - " ],\n", - " retain_intermediate_calculation_columns=True,\n", - ")" - ] - }, + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Estimating m from a sample of pairwise labels\n", + "\n", + "In this example, we estimate the m probabilities of the model from a table containing pairwise record comparisons which we know are 'true' matches. 
For example, these may be the result of work by a clerical team who have manually labelled a sample of matches.\n", + "\n", + "The table must be in the following format:\n", + "\n", + "| source_dataset_l | unique_id_l | source_dataset_r | unique_id_r |\n", + "| ---------------- | ----------- | ---------------- | ----------- |\n", + "| df_1 | 1 | df_2 | 2 |\n", + "| df_1 | 1 | df_2 | 3 |\n", + "\n", + "It is assumed that every record in the table represents a certain match.\n", + "\n", + "Note that the column names above are the defaults. They should correspond to the values you've set for [`unique_id_column_name`](https://moj-analytical-services.github.io/splink/settings_dict_guide.html#unique_id_column_name) and [`source_dataset_column_name`](https://moj-analytical-services.github.io/splink/settings_dict_guide.html#source_dataset_column_name), if you've chosen custom values.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:20:22.461384Z", + "iopub.status.busy": "2024-06-07T09:20:22.461075Z", + "iopub.status.idle": "2024-06-07T09:20:22.466162Z", + "shell.execute_reply": "2024-06-07T09:20:22.465529Z" + } + }, + "outputs": [], + "source": [ + "# Uncomment and run this cell if you're running in Google Colab.\n", + "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:20:22.470034Z", + "iopub.status.busy": "2024-06-07T09:20:22.469740Z", + "iopub.status.idle": "2024-06-07T09:20:24.546756Z", + "shell.execute_reply": "2024-06-07T09:20:24.546033Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:54.917049Z", - 
"iopub.status.busy": "2024-03-27T15:14:54.916681Z", - "iopub.status.idle": "2024-03-27T15:14:55.221629Z", - "shell.execute_reply": "2024-03-27T15:14:55.220884Z" - } - }, - "outputs": [], - "source": [ - "linker = Linker(df, settings, database_api=DuckDBAPI(), set_up_basic_logging=False)\n", - "deterministic_rules = [\n", - " \"l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1\",\n", - " \"l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1\",\n", - " \"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2\",\n", - " \"l.email = r.email\",\n", - "]\n", - "\n", - "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_id_lsource_dataset_lunique_id_rsource_dataset_rclerical_match_score
00fake_10001fake_10001.0
10fake_10002fake_10001.0
20fake_10003fake_10001.0
491fake_10002fake_10001.0
501fake_10003fake_10001.0
..................
3171994fake_1000996fake_10001.0
3172995fake_1000996fake_10001.0
3173997fake_1000998fake_10001.0
3174997fake_1000999fake_10001.0
3175998fake_1000999fake_10001.0
\n", + "

2031 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " unique_id_l source_dataset_l unique_id_r source_dataset_r \\\n", + "0 0 fake_1000 1 fake_1000 \n", + "1 0 fake_1000 2 fake_1000 \n", + "2 0 fake_1000 3 fake_1000 \n", + "49 1 fake_1000 2 fake_1000 \n", + "50 1 fake_1000 3 fake_1000 \n", + "... ... ... ... ... \n", + "3171 994 fake_1000 996 fake_1000 \n", + "3172 995 fake_1000 996 fake_1000 \n", + "3173 997 fake_1000 998 fake_1000 \n", + "3174 997 fake_1000 999 fake_1000 \n", + "3175 998 fake_1000 999 fake_1000 \n", + "\n", + " clerical_match_score \n", + "0 1.0 \n", + "1 1.0 \n", + "2 1.0 \n", + "49 1.0 \n", + "50 1.0 \n", + "... ... \n", + "3171 1.0 \n", + "3172 1.0 \n", + "3173 1.0 \n", + "3174 1.0 \n", + "3175 1.0 \n", + "\n", + "[2031 rows x 5 columns]" ] - }, + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink.datasets import splink_dataset_labels\n", + "\n", + "pairwise_labels = splink_dataset_labels.fake_1000_labels\n", + "\n", + "# Choose labels indicating a match\n", + "pairwise_labels = pairwise_labels[pairwise_labels[\"clerical_match_score\"] == 1]\n", + "pairwise_labels" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now proceed to estimate the Fellegi Sunter model:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:20:24.588843Z", + "iopub.status.busy": "2024-06-07T09:20:24.588530Z", + "iopub.status.idle": "2024-06-07T09:20:24.602952Z", + "shell.execute_reply": "2024-06-07T09:20:24.602047Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:55.224831Z", - "iopub.status.busy": "2024-03-27T15:14:55.224593Z", - "iopub.status.idle": "2024-03-27T15:14:57.430946Z", - "shell.execute_reply": "2024-03-27T15:14:57.430131Z" - } - }, - "outputs": [], - "source": [ - 
"linker.training.estimate_u_using_random_sampling(max_pairs=1e6)" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_idfirst_namesurnamedobcityemailcluster
00RobertAlan1971-06-24NaNrobert255@smith.net0
11RobertAllen1971-05-24NaNroberta25@smith.net0
\n", + "
" + ], + "text/plain": [ + " unique_id first_name surname dob city email cluster\n", + "0 0 Robert Alan 1971-06-24 NaN robert255@smith.net 0\n", + "1 1 Robert Allen 1971-05-24 NaN roberta25@smith.net 0" ] - }, + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink import splink_datasets\n", + "\n", + "df = splink_datasets.fake_1000\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:20:24.607247Z", + "iopub.status.busy": "2024-06-07T09:20:24.606935Z", + "iopub.status.idle": "2024-06-07T09:20:24.711369Z", + "shell.execute_reply": "2024-06-07T09:20:24.710531Z" + } + }, + "outputs": [], + "source": [ + "import splink.comparison_library as cl\n", + "import splink.comparison_template_library as ctl\n", + "from splink import DuckDBAPI, Linker, SettingsCreator, block_on\n", + "\n", + "settings = SettingsCreator(\n", + " link_type=\"dedupe_only\",\n", + " blocking_rules_to_generate_predictions=[\n", + " block_on(\"first_name\"),\n", + " block_on(\"surname\"),\n", + " ],\n", + " comparisons=[\n", + " ctl.NameComparison(\"first_name\"),\n", + " ctl.NameComparison(\"surname\"),\n", + " ctl.DateComparison(\n", + " \"dob\",\n", + " input_is_string=True,\n", + " invalid_dates_as_null=True,\n", + " datetime_metrics=[\"month\", \"year\", \"year\"],\n", + " datetime_thresholds=[1, 1, 10],\n", + " ),\n", + " cl.LevenshteinAtThresholds(\"dob\", [2]),\n", + " cl.ExactMatch(\"city\").configure(term_frequency_adjustments=True),\n", + " ctl.EmailComparison(\"email\", include_username_fuzzy_level=False),\n", + " ],\n", + " retain_intermediate_calculation_columns=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:20:24.715481Z", + "iopub.status.busy": "2024-06-07T09:20:24.715162Z", + "iopub.status.idle": 
"2024-06-07T09:20:25.100461Z", + "shell.execute_reply": "2024-06-07T09:20:25.099741Z" + } + }, + "outputs": [], + "source": [ + "linker = Linker(df, settings, database_api=DuckDBAPI(), set_up_basic_logging=False)\n", + "deterministic_rules = [\n", + " \"l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1\",\n", + " \"l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1\",\n", + " \"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2\",\n", + " \"l.email = r.email\",\n", + "]\n", + "\n", + "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:20:25.104541Z", + "iopub.status.busy": "2024-06-07T09:20:25.104116Z", + "iopub.status.idle": "2024-06-07T09:20:26.866642Z", + "shell.execute_reply": "2024-06-07T09:20:26.866007Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:57.434534Z", - "iopub.status.busy": "2024-03-27T15:14:57.434260Z", - "iopub.status.idle": "2024-03-27T15:14:57.657154Z", - "shell.execute_reply": "2024-03-27T15:14:57.656336Z" - } - }, - "outputs": [], - "source": [ - "# Register the pairwise labels table with the database, and then use it to estimate the m values\n", - "labels_df = linker.table_management.register_labels_table(pairwise_labels, overwrite=True)\n", - "linker.estimate_m_from_pairwise_labels(labels_df)\n", - "\n", - "\n", - "# If the labels table already existing in the dataset you could run\n", - "# linker.estimate_m_from_pairwise_labels(\"labels_tablename_here\")" - ] - }, + "name": "stderr", + "output_type": "stream", + "text": [ + "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. 
Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n" + ] + } + ], + "source": [ + "linker.training.estimate_u_using_random_sampling(max_pairs=1e6)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:20:26.871363Z", + "iopub.status.busy": "2024-06-07T09:20:26.871016Z", + "iopub.status.idle": "2024-06-07T09:20:27.051023Z", + "shell.execute_reply": "2024-06-07T09:20:27.050407Z" + } + }, + "outputs": [], + "source": [ + "# Register the pairwise labels table with the database, and then use it to estimate the m values\n", + "labels_df = linker.table_management.register_labels_table(pairwise_labels, overwrite=True)\n", + "linker.training.estimate_m_from_pairwise_labels(labels_df)\n", + "\n", + "\n", + "# If the labels table already exists in the database, you could run\n", + "# linker.training.estimate_m_from_pairwise_labels(\"labels_tablename_here\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:20:27.054211Z", + "iopub.status.busy": "2024-06-07T09:20:27.053972Z", + "iopub.status.idle": "2024-06-07T09:20:27.489093Z", + "shell.execute_reply": "2024-06-07T09:20:27.488564Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:57.662065Z", - "iopub.status.busy": "2024-03-27T15:14:57.661552Z", - "iopub.status.idle": "2024-03-27T15:14:58.144518Z", - "shell.execute_reply": "2024-03-27T15:14:58.143799Z" - } - }, - "outputs": [], - "source": [ - "training_blocking_rule = block_on(\"first_name\")\n", - "linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)" + "data": { + "text/plain": [ + "" ] - }, + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_blocking_rule
= block_on(\"first_name\")\n", + "linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:20:27.492742Z", + "iopub.status.busy": "2024-06-07T09:20:27.492510Z", + "iopub.status.idle": "2024-06-07T09:20:27.624619Z", + "shell.execute_reply": "2024-06-07T09:20:27.624114Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:58.149118Z", - "iopub.status.busy": "2024-03-27T15:14:58.148820Z", - "iopub.status.idle": "2024-03-27T15:14:58.295802Z", - "shell.execute_reply": "2024-03-27T15:14:58.294855Z" - } - }, - "outputs": [], - "source": [ - "linker.parameter_estimate_comparisons_chart()" + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" ] - }, + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.parameter_estimate_comparisons_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:20:27.628602Z", + "iopub.status.busy": "2024-06-07T09:20:27.628256Z", + "iopub.status.idle": "2024-06-07T09:20:27.933374Z", + "shell.execute_reply": "2024-06-07T09:20:27.932702Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:14:58.299160Z", - "iopub.status.busy": "2024-03-27T15:14:58.298915Z", - "iopub.status.idle": "2024-03-27T15:14:58.605413Z", - "shell.execute_reply": "2024-03-27T15:14:58.604766Z" - } - }, - "outputs": [], - "source": [ - "linker.visualisations.match_weights_chart()" + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": { - "54186cece08b4f6fa03f33cc282f36a6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_8863bef4905c44fc9705add5d5165a71", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_ce7ee37dfbeb4d26ae9171f7f3b857e7", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "8863bef4905c44fc9705add5d5165a71": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - 
"border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - }, - "ce7ee37dfbeb4d26ae9171f7f3b857e7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - } - }, - "version_major": 2, - "version_minor": 0 - } + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "linker.visualisations.match_weights_chart()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file + "widgets": { + 
"application/vnd.jupyter.widget-state+json": { + "state": { + "54186cece08b4f6fa03f33cc282f36a6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_8863bef4905c44fc9705add5d5165a71", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ce7ee37dfbeb4d26ae9171f7f3b857e7", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "8863bef4905c44fc9705add5d5165a71": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": 
null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + }, + "ce7ee37dfbeb4d26ae9171f7f3b857e7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/demos/examples/duckdb/quick_and_dirty_persons.ipynb b/docs/demos/examples/duckdb/quick_and_dirty_persons.ipynb index e02245bf78..12bc1af90b 100644 --- a/docs/demos/examples/duckdb/quick_and_dirty_persons.ipynb +++ b/docs/demos/examples/duckdb/quick_and_dirty_persons.ipynb @@ -1,303 +1,665 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Historical people: Quick and dirty\n", - "\n", - "This example shows how to get some initial record linkage results as quickly as possible.\n", - "\n", - "There are many ways to improve the accuracy of this model. 
But this may be a good place to start if you just want to give Splink a try and see what it's capable of.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " \"Open\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:15:01.166130Z", - "iopub.status.busy": "2024-03-27T15:15:01.165782Z", - "iopub.status.idle": "2024-03-27T15:15:01.171295Z", - "shell.execute_reply": "2024-03-27T15:15:01.170553Z" - } - }, - "source": [ - "# Uncomment and run this cell if you're running in Google Colab.\n", - "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:15:01.174969Z", - "iopub.status.busy": "2024-03-27T15:15:01.174678Z", - "iopub.status.idle": "2024-03-27T15:15:02.750516Z", - "shell.execute_reply": "2024-03-27T15:15:02.749785Z" - } - }, - "source": [ - "from splink.datasets import splink_datasets\n", - "\n", - "df = splink_datasets.historical_50k\n", - "df.head(5)" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:15:02.754599Z", - "iopub.status.busy": "2024-03-27T15:15:02.754204Z", - "iopub.status.idle": "2024-03-27T15:15:02.762053Z", - "shell.execute_reply": "2024-03-27T15:15:02.761258Z" - } - }, - "source": [ - "from splink import block_on, SettingsCreator\n", - "import splink.comparison_library as cl\n", - "import splink.comparison_template_library as ctl\n", - "\n", - "\n", - "settings = SettingsCreator(\n", - " link_type=\"dedupe_only\",\n", - " blocking_rules_to_generate_predictions=[\n", - " block_on(\"full_name\"),\n", - " block_on(\"substr(full_name,1,6)\", \"dob\", \"birth_place\"),\n", - " block_on(\"dob\", \"birth_place\"),\n", - " 
block_on(\"postcode_fake\"),\n", - " ],\n", - " comparisons=[\n", - " cl.JaroWinklerAtThresholds(\"full_name\", [0.9, 0.7]).configure(\n", - " term_frequency_adjustments=True\n", - " ),\n", - " ctl.DateComparison(\n", - " \"dob\",\n", - " input_is_string=True,\n", - " datetime_metrics=[\"day\", \"month\", \"year\"],\n", - " datetime_thresholds=[5, 1, 5],\n", - " ),\n", - " cl.LevenshteinAtThresholds(\"postcode_fake\", 2),\n", - " cl.JaroWinklerAtThresholds(\"birth_place\", 0.9).configure(\n", - " term_frequency_adjustments=True\n", - " ),\n", - " cl.ExactMatch(\"occupation\").configure(term_frequency_adjustments=True),\n", - " ],\n", - ")" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:15:02.766456Z", - "iopub.status.busy": "2024-03-27T15:15:02.766123Z", - "iopub.status.idle": "2024-03-27T15:15:03.425993Z", - "shell.execute_reply": "2024-03-27T15:15:03.424984Z" - } - }, - "source": [ - "from splink import Linker, DuckDBAPI\n", - "\n", - "\n", - "linker = Linker(df, settings, database_api=DuckDBAPI(), set_up_basic_logging=False)\n", - "deterministic_rules = [\n", - " \"l.full_name = r.full_name\",\n", - " \"l.postcode_fake = r.postcode_fake and l.dob = r.dob\",\n", - "]\n", - "\n", - "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.6)" - ], - "outputs": [] - }, + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Historical people: Quick and dirty\n", + "\n", + "This example shows how to get some initial record linkage results as quickly as possible.\n", + "\n", + "There are many ways to improve the accuracy of this model. 
But this may be a good place to start if you just want to give Splink a try and see what it's capable of.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:20:37.624889Z", + "iopub.status.busy": "2024-06-07T09:20:37.624517Z", + "iopub.status.idle": "2024-06-07T09:20:37.644289Z", + "shell.execute_reply": "2024-06-07T09:20:37.643404Z" + } + }, + "outputs": [], + "source": [ + "# Uncomment and run this cell if you're running in Google Colab.\n", + "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:20:37.648712Z", + "iopub.status.busy": "2024-06-07T09:20:37.648404Z", + "iopub.status.idle": "2024-06-07T09:20:39.278642Z", + "shell.execute_reply": "2024-06-07T09:20:39.277984Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:15:03.430749Z", - "iopub.status.busy": "2024-03-27T15:15:03.430387Z", - "iopub.status.idle": "2024-03-27T15:15:07.041399Z", - "shell.execute_reply": "2024-03-27T15:15:07.040743Z" - } - }, - "source": [ - "linker.training.estimate_u_using_random_sampling(max_pairs=2e6)" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_idclusterfull_namefirst_and_surnamefirst_namesurnamedobbirth_placepostcode_fakegenderoccupation
0Q2296770-1Q2296770thomas clifford, 1st baron clifford of chudleighthomas chudleighthomaschudleigh1630-08-01devontq13 8dfmalepolitician
1Q2296770-2Q2296770thomas of chudleighthomas chudleighthomaschudleigh1630-08-01devontq13 8dfmalepolitician
2Q2296770-3Q2296770tom 1st baron clifford of chudleightom chudleightomchudleigh1630-08-01devontq13 8dfmalepolitician
3Q2296770-4Q2296770thomas 1st chudleighthomas chudleighthomaschudleigh1630-08-01devontq13 8huNonepolitician
4Q2296770-5Q2296770thomas clifford, 1st baron chudleighthomas chudleighthomaschudleigh1630-08-01devontq13 8dfNonepolitician
\n", + "
" ], - "outputs": [] - }, + "text/plain": [ + " unique_id cluster full_name \\\n", + "0 Q2296770-1 Q2296770 thomas clifford, 1st baron clifford of chudleigh \n", + "1 Q2296770-2 Q2296770 thomas of chudleigh \n", + "2 Q2296770-3 Q2296770 tom 1st baron clifford of chudleigh \n", + "3 Q2296770-4 Q2296770 thomas 1st chudleigh \n", + "4 Q2296770-5 Q2296770 thomas clifford, 1st baron chudleigh \n", + "\n", + " first_and_surname first_name surname dob birth_place \\\n", + "0 thomas chudleigh thomas chudleigh 1630-08-01 devon \n", + "1 thomas chudleigh thomas chudleigh 1630-08-01 devon \n", + "2 tom chudleigh tom chudleigh 1630-08-01 devon \n", + "3 thomas chudleigh thomas chudleigh 1630-08-01 devon \n", + "4 thomas chudleigh thomas chudleigh 1630-08-01 devon \n", + "\n", + " postcode_fake gender occupation \n", + "0 tq13 8df male politician \n", + "1 tq13 8df male politician \n", + "2 tq13 8df male politician \n", + "3 tq13 8hu None politician \n", + "4 tq13 8df None politician " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink.datasets import splink_datasets\n", + "\n", + "df = splink_datasets.historical_50k\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:20:39.330739Z", + "iopub.status.busy": "2024-06-07T09:20:39.330384Z", + "iopub.status.idle": "2024-06-07T09:20:39.345331Z", + "shell.execute_reply": "2024-06-07T09:20:39.344598Z" + } + }, + "outputs": [], + "source": [ + "from splink import block_on, SettingsCreator\n", + "import splink.comparison_library as cl\n", + "import splink.comparison_template_library as ctl\n", + "\n", + "\n", + "settings = SettingsCreator(\n", + " link_type=\"dedupe_only\",\n", + " blocking_rules_to_generate_predictions=[\n", + " block_on(\"full_name\"),\n", + " block_on(\"substr(full_name,1,6)\", \"dob\", \"birth_place\"),\n", + " block_on(\"dob\", 
\"birth_place\"),\n", + " block_on(\"postcode_fake\"),\n", + " ],\n", + " comparisons=[\n", + " cl.JaroWinklerAtThresholds(\"full_name\", [0.9, 0.7]).configure(\n", + " term_frequency_adjustments=True\n", + " ),\n", + " ctl.DateComparison(\n", + " \"dob\",\n", + " input_is_string=True,\n", + " datetime_metrics=[\"day\", \"month\", \"year\"],\n", + " datetime_thresholds=[5, 1, 5],\n", + " ),\n", + " cl.LevenshteinAtThresholds(\"postcode_fake\", 2),\n", + " cl.JaroWinklerAtThresholds(\"birth_place\", 0.9).configure(\n", + " term_frequency_adjustments=True\n", + " ),\n", + " cl.ExactMatch(\"occupation\").configure(term_frequency_adjustments=True),\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:20:39.349123Z", + "iopub.status.busy": "2024-06-07T09:20:39.348832Z", + "iopub.status.idle": "2024-06-07T09:20:39.807802Z", + "shell.execute_reply": "2024-06-07T09:20:39.807089Z" + } + }, + "outputs": [], + "source": [ + "from splink import Linker, DuckDBAPI\n", + "\n", + "\n", + "linker = Linker(df, settings, database_api=DuckDBAPI(), set_up_basic_logging=False)\n", + "deterministic_rules = [\n", + " \"l.full_name = r.full_name\",\n", + " \"l.postcode_fake = r.postcode_fake and l.dob = r.dob\",\n", + "]\n", + "\n", + "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.6)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:20:39.811242Z", + "iopub.status.busy": "2024-06-07T09:20:39.810994Z", + "iopub.status.idle": "2024-06-07T09:20:42.328241Z", + "shell.execute_reply": "2024-06-07T09:20:42.327675Z" + } + }, + "outputs": [], + "source": [ + "linker.training.estimate_u_using_random_sampling(max_pairs=2e6)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:20:42.331754Z", + 
"iopub.status.busy": "2024-06-07T09:20:42.331463Z", + "iopub.status.idle": "2024-06-07T09:20:44.521913Z", + "shell.execute_reply": "2024-06-07T09:20:44.521209Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:15:07.045995Z", - "iopub.status.busy": "2024-03-27T15:15:07.045404Z", - "iopub.status.idle": "2024-03-27T15:15:09.400752Z", - "shell.execute_reply": "2024-03-27T15:15:09.400029Z" - } - }, - "source": [ - "results = linker.inference.predict(threshold_match_probability=0.9)" - ], - "outputs": [] - }, + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " -- WARNING --\n", + "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n", + "Comparison: 'full_name':\n", + " m values not fully trained\n", + "Comparison: 'dob':\n", + " m values not fully trained\n", + "Comparison: 'postcode_fake':\n", + " m values not fully trained\n", + "Comparison: 'birth_place':\n", + " m values not fully trained\n", + "Comparison: 'occupation':\n", + " m values not fully trained\n" + ] + } + ], + "source": [ + "results = linker.inference.predict(threshold_match_probability=0.9)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:20:44.525778Z", + "iopub.status.busy": "2024-06-07T09:20:44.525492Z", + "iopub.status.idle": "2024-06-07T09:20:44.543212Z", + "shell.execute_reply": "2024-06-07T09:20:44.542595Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:15:09.404703Z", - "iopub.status.busy": "2024-03-27T15:15:09.404377Z", - "iopub.status.idle": "2024-03-27T15:15:09.428537Z", - "shell.execute_reply": 
"2024-03-27T15:15:09.427244Z" - } - }, - "source": [ - "results.as_pandas_dataframe(limit=5)" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilityunique_id_lunique_id_rfull_name_lfull_name_rgamma_full_namedob_ldob_rgamma_dobpostcode_fake_lpostcode_fake_rgamma_postcode_fakebirth_place_lbirth_place_rgamma_birth_placeoccupation_loccupation_rgamma_occupationmatch_key
03.1722460.900145Q16198727-6Q16198727-8henry juppjupp01802-08-061802-08-065Nonee4 9re-1waltham forestwaltham forest2cricketercricketer12
13.1724230.900156Q16220644-12Q16220644-71st bt.1st bt.31840-11-211810-11-214NoneNone-1liverpoolliverpool2Nonephysician-10
23.1732560.900208Q6180874-12Q6180874-19richard slaterslater01854-01-011854-01-015al5 2ayal3 7rq0st albansst albans2hymnwriterhymnwriter12
33.1741820.900265Q7519167-10Q7519167-8simeon langtonsimeon langton31150-01-811152-01-010NoneNone-1wealdenwealden2priestNone-10
43.1785670.900538Q15980561-12Q15980561-8harry roslinghenry rosling11828-01-111858-01-010tn27 0sytn27 0sy2ashfordashford2photographerNone-13
\n", + "
" ], - "outputs": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": { - "6b84d4a42f1a479ca6d8e1b02ccd8eda": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - }, - "df640bcb35b2441a904ae87dc47249f9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": 
"@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_6b84d4a42f1a479ca6d8e1b02ccd8eda", - "max": 100.0, - "min": 0.0, - "orientation": "horizontal", - "style": "IPY_MODEL_e2e4b97696234790991bc2a5ca2e731a", - "tabbable": null, - "tooltip": null, - "value": 100.0 - } - }, - "e2e4b97696234790991bc2a5ca2e731a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - } - }, - "version_major": 2, - "version_minor": 0 - } + "text/plain": [ + " match_weight match_probability unique_id_l unique_id_r full_name_l \\\n", + "0 3.172246 0.900145 Q16198727-6 Q16198727-8 henry jupp \n", + "1 3.172423 0.900156 Q16220644-12 Q16220644-7 1st bt. \n", + "2 3.173256 0.900208 Q6180874-12 Q6180874-19 richard slater \n", + "3 3.174182 0.900265 Q7519167-10 Q7519167-8 simeon langton \n", + "4 3.178567 0.900538 Q15980561-12 Q15980561-8 harry rosling \n", + "\n", + " full_name_r gamma_full_name dob_l dob_r gamma_dob \\\n", + "0 jupp 0 1802-08-06 1802-08-06 5 \n", + "1 1st bt. 
3 1840-11-21 1810-11-21 4 \n", + "2 slater 0 1854-01-01 1854-01-01 5 \n", + "3 simeon langton 3 1150-01-81 1152-01-01 0 \n", + "4 henry rosling 1 1828-01-11 1858-01-01 0 \n", + "\n", + " postcode_fake_l postcode_fake_r gamma_postcode_fake birth_place_l \\\n", + "0 None e4 9re -1 waltham forest \n", + "1 None None -1 liverpool \n", + "2 al5 2ay al3 7rq 0 st albans \n", + "3 None None -1 wealden \n", + "4 tn27 0sy tn27 0sy 2 ashford \n", + "\n", + " birth_place_r gamma_birth_place occupation_l occupation_r \\\n", + "0 waltham forest 2 cricketer cricketer \n", + "1 liverpool 2 None physician \n", + "2 st albans 2 hymnwriter hymnwriter \n", + "3 wealden 2 priest None \n", + "4 ashford 2 photographer None \n", + "\n", + " gamma_occupation match_key \n", + "0 1 2 \n", + "1 -1 0 \n", + "2 1 2 \n", + "3 -1 0 \n", + "4 -1 3 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "results.as_pandas_dataframe(limit=5)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "6b84d4a42f1a479ca6d8e1b02ccd8eda": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + 
"border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + }, + "df640bcb35b2441a904ae87dc47249f9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_6b84d4a42f1a479ca6d8e1b02ccd8eda", + "max": 100.0, + "min": 0.0, + "orientation": "horizontal", + "style": "IPY_MODEL_e2e4b97696234790991bc2a5ca2e731a", + "tabbable": null, + "tooltip": null, + "value": 100.0 + } + }, + "e2e4b97696234790991bc2a5ca2e731a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + 
"bar_color": "black", + "description_width": "" + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/demos/examples/duckdb/real_time_record_linkage.ipynb b/docs/demos/examples/duckdb/real_time_record_linkage.ipynb index 1f6adf946f..63bbe23d56 100644 --- a/docs/demos/examples/duckdb/real_time_record_linkage.ipynb +++ b/docs/demos/examples/duckdb/real_time_record_linkage.ipynb @@ -37,13 +37,13 @@ "shell.execute_reply": "2024-03-27T15:15:11.889929Z" } }, + "outputs": [], "source": [ "# Uncomment and run this cell if you're running in Google Colab.\n", "# !pip install ipywidgets\n", "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev\n", "# !jupyter nbextension enable --py widgetsnbextension" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -63,6 +63,7 @@ "shell.execute_reply": "2024-03-27T15:15:13.841226Z" } }, + "outputs": [], "source": [ "import urllib.request\n", "import json\n", @@ -78,8 +79,7 @@ "\n", "\n", "linker = Linker(df, settings, database_api=DuckDBAPI())" - ], - "outputs": [] + ] }, { "cell_type": "code", @@ -92,10 +92,10 @@ "shell.execute_reply": "2024-03-27T15:15:14.720417Z" } }, + "outputs": [], "source": [ "linker.visualisations.waterfall_chart(linker.inference.predict().as_record_dict(limit=2))" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -117,6 +117,7 @@ "shell.execute_reply": "2024-03-27T15:15:14.961740Z" } }, + "outputs": [], "source": [ "record_1 = {\n", " \"unique_id\": 1,\n", @@ -142,17 +143,16 @@ "\n", "# To `compare_two_records` the linker needs to compute term frequency tables\n", "# If you have precomputed tables, you can linker.register_term_frequency_lookup()\n", - "linker.compute_tf_table(\"first_name\")\n", - "linker.compute_tf_table(\"surname\")\n", - "linker.compute_tf_table(\"dob\")\n", - "linker.compute_tf_table(\"city\")\n", - "linker.compute_tf_table(\"email\")\n", + 
"linker.table_management.compute_tf_table(\"first_name\")\n", + "linker.table_management.compute_tf_table(\"surname\")\n", + "linker.table_management.compute_tf_table(\"dob\")\n", + "linker.table_management.compute_tf_table(\"city\")\n", + "linker.table_management.compute_tf_table(\"email\")\n", "\n", "\n", - "df_two = linker.compare_two_records(record_1, record_2)\n", + "df_two = linker.inference.compare_two_records(record_1, record_2)\n", "df_two.as_pandas_dataframe()" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -176,6 +176,7 @@ "shell.execute_reply": "2024-03-27T15:15:15.925656Z" } }, + "outputs": [], "source": [ "import ipywidgets as widgets\n", "from IPython.display import display\n", @@ -218,7 +219,7 @@ " # Assuming 'linker' is defined earlier in your code\n", " linker._settings_obj._retain_intermediate_calculation_columns = True\n", "\n", - " df_two = linker.compare_two_records(record_left, record_right)\n", + " df_two = linker.inference.compare_two_records(record_left, record_right)\n", "\n", " recs = df_two.as_pandas_dataframe().to_dict(orient=\"records\")\n", " from splink.charts import waterfall_chart\n", @@ -229,8 +230,7 @@ "out = widgets.interactive_output(myfn, inputs_to_interactive_output)\n", "\n", "display(ui, out)" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -252,6 +252,7 @@ "shell.execute_reply": "2024-03-27T15:15:16.474896Z" } }, + "outputs": [], "source": [ "record = {\n", " \"unique_id\": 123987,\n", @@ -263,12 +264,11 @@ "}\n", "\n", "\n", - "df_inc = linker.find_matches_to_new_records(\n", + "df_inc = linker.inference.find_matches_to_new_records(\n", " [record], blocking_rules=[]\n", ").as_pandas_dataframe()\n", "df_inc.sort_values(\"match_weight\", ascending=False)" - ], - "outputs": [] + ] }, { "cell_type": "markdown", @@ -290,6 +290,7 @@ "shell.execute_reply": "2024-03-27T15:15:17.548423Z" } }, + "outputs": [], "source": [ "@widgets.interact(\n", " first_name=\"Robert\",\n", @@ -315,15 +316,14 @@ " if 
record[key].strip() == \"\":\n", " record[key] = None\n", "\n", - " df_inc = linker.find_matches_to_new_records(\n", + " df_inc = linker.inference.find_matches_to_new_records(\n", " [record], blocking_rules=[f\"(true)\"]\n", " ).as_pandas_dataframe()\n", " df_inc = df_inc.sort_values(\"match_weight\", ascending=False)\n", " recs = df_inc.to_dict(orient=\"records\")\n", "\n", " display(linker.visualisations.waterfall_chart(recs, filter_nulls=False))" - ], - "outputs": [] + ] }, { "cell_type": "code", @@ -336,10 +336,10 @@ "shell.execute_reply": "2024-03-27T15:15:17.884033Z" } }, + "outputs": [], "source": [ "linker.visualisations.match_weights_chart()" - ], - "outputs": [] + ] } ], "metadata": { @@ -358,7 +358,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.8" + "version": "3.10.8" }, "widgets": { "application/vnd.jupyter.widget-state+json": { @@ -2485,4 +2485,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/docs/demos/examples/duckdb/transactions.ipynb b/docs/demos/examples/duckdb/transactions.ipynb index d34119f9de..472e477116 100644 --- a/docs/demos/examples/duckdb/transactions.ipynb +++ b/docs/demos/examples/duckdb/transactions.ipynb @@ -1,731 +1,2044 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Linking banking transactions\n", - "\n", - "This example shows how to perform a one-to-one link on banking transactions.\n", - "\n", - "The data is fake data, and was generated has the following features:\n", - "\n", - "- Money shows up in the destination account with some time delay\n", - "- The amount sent and the amount received are not always the same - there are hidden fees and foreign exchange effects\n", - "- The memo is sometimes truncated and content is sometimes missing\n", - "\n", - "Since each origin payment should end up in the destination account, the `probability_two_random_records_match` of the 
model is known.\n" - ] + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Linking banking transactions\n", + "\n", + "This example shows how to perform a one-to-one link on banking transactions.\n", + "\n", + "The data is fake data, and was generated has the following features:\n", + "\n", + "- Money shows up in the destination account with some time delay\n", + "- The amount sent and the amount received are not always the same - there are hidden fees and foreign exchange effects\n", + "- The memo is sometimes truncated and content is sometimes missing\n", + "\n", + "Since each origin payment should end up in the destination account, the `probability_two_random_records_match` of the model is known.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:22:27.648457Z", + "iopub.status.busy": "2024-06-07T09:22:27.648128Z", + "iopub.status.idle": "2024-06-07T09:22:27.653498Z", + "shell.execute_reply": "2024-06-07T09:22:27.652626Z" + } + }, + "outputs": [], + "source": [ + "# Uncomment and run this cell if you're running in Google Colab.\n", + "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:22:27.657230Z", + "iopub.status.busy": "2024-06-07T09:22:27.656926Z", + "iopub.status.idle": "2024-06-07T09:22:31.983888Z", + "shell.execute_reply": "2024-06-07T09:22:31.983040Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "downloading: https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/transactions_origin.parquet\n" + ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " 
\"Open\n", - "\n" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " download progress: 0 %\t(..........)\r", + " download progress: 1 %\t(..........)\r", + " download progress: 1 %\t(..........)\r", + " download progress: 2 %\t(..........)\r", + " download progress: 3 %\t(..........)\r", + " download progress: 3 %\t(..........)\r", + " download progress: 4 %\t(..........)\r", + " download progress: 4 %\t(..........)\r", + " download progress: 5 %\t(..........)\r", + " download progress: 6 %\t(..........)\r", + " download progress: 6 %\t(..........)\r", + " download progress: 7 %\t(..........)\r", + " download progress: 8 %\t(..........)\r", + " download progress: 8 %\t(..........)\r", + " download progress: 9 %\t(..........)\r", + " download progress: 10 %\t(..........)\r", + " download progress: 10 %\t(=.........)\r", + " download progress: 11 %\t(=.........)\r", + " download progress: 12 %\t(=.........)\r", + " download progress: 12 %\t(=.........)\r", + " download progress: 13 %\t(=.........)\r", + " download progress: 13 %\t(=.........)\r", + " download progress: 14 %\t(=.........)\r", + " download progress: 15 %\t(=.........)\r", + " download progress: 15 %\t(=.........)\r", + " download progress: 16 %\t(=.........)\r", + " download progress: 17 %\t(=.........)\r", + " download progress: 17 %\t(=.........)\r", + " download progress: 18 %\t(=.........)\r", + " download progress: 19 %\t(=.........)\r", + " download progress: 19 %\t(=.........)\r", + " download progress: 20 %\t(=.........)\r", + " download progress: 21 %\t(==........)\r", + " download progress: 21 %\t(==........)\r", + " download progress: 22 %\t(==........)\r", + " download progress: 22 %\t(==........)\r", + " download progress: 23 %\t(==........)\r", + " download progress: 24 %\t(==........)\r", + " download progress: 24 %\t(==........)\r", + " download progress: 25 %\t(==........)\r", + " download progress: 26 %\t(==........)\r", + " download progress: 26 
%\t(==........)\r", + " download progress: 27 %\t(==........)\r", + " download progress: 28 %\t(==........)\r", + " download progress: 28 %\t(==........)\r", + " download progress: 29 %\t(==........)\r", + " download progress: 30 %\t(==........)\r", + " download progress: 30 %\t(===.......)\r", + " download progress: 31 %\t(===.......)\r", + " download progress: 31 %\t(===.......)\r", + " download progress: 32 %\t(===.......)\r", + " download progress: 33 %\t(===.......)\r", + " download progress: 33 %\t(===.......)\r", + " download progress: 34 %\t(===.......)\r", + " download progress: 35 %\t(===.......)\r", + " download progress: 35 %\t(===.......)\r", + " download progress: 36 %\t(===.......)\r", + " download progress: 37 %\t(===.......)\r", + " download progress: 37 %\t(===.......)\r", + " download progress: 38 %\t(===.......)\r", + " download progress: 39 %\t(===.......)\r", + " download progress: 39 %\t(===.......)\r", + " download progress: 40 %\t(===.......)\r", + " download progress: 40 %\t(====......)\r", + " download progress: 41 %\t(====......)\r", + " download progress: 42 %\t(====......)\r", + " download progress: 42 %\t(====......)\r", + " download progress: 43 %\t(====......)\r", + " download progress: 44 %\t(====......)\r", + " download progress: 44 %\t(====......)\r", + " download progress: 45 %\t(====......)\r", + " download progress: 46 %\t(====......)\r", + " download progress: 46 %\t(====......)\r", + " download progress: 47 %\t(====......)\r", + " download progress: 48 %\t(====......)\r", + " download progress: 48 %\t(====......)\r", + " download progress: 49 %\t(====......)\r", + " download progress: 49 %\t(====......)\r", + " download progress: 50 %\t(=====.....)\r", + " download progress: 51 %\t(=====.....)\r", + " download progress: 51 %\t(=====.....)\r", + " download progress: 52 %\t(=====.....)\r", + " download progress: 53 %\t(=====.....)\r", + " download progress: 53 %\t(=====.....)\r", + " download progress: 54 %\t(=====.....)\r", + 
" download progress: 55 %\t(=====.....)\r", + " download progress: 55 %\t(=====.....)\r", + " download progress: 56 %\t(=====.....)\r", + " download progress: 57 %\t(=====.....)\r", + " download progress: 57 %\t(=====.....)\r", + " download progress: 58 %\t(=====.....)\r", + " download progress: 58 %\t(=====.....)\r", + " download progress: 59 %\t(=====.....)\r", + " download progress: 60 %\t(=====.....)\r", + " download progress: 60 %\t(======....)\r", + " download progress: 61 %\t(======....)\r", + " download progress: 62 %\t(======....)\r", + " download progress: 62 %\t(======....)\r", + " download progress: 63 %\t(======....)\r", + " download progress: 64 %\t(======....)\r", + " download progress: 64 %\t(======....)\r", + " download progress: 65 %\t(======....)\r", + " download progress: 65 %\t(======....)\r", + " download progress: 66 %\t(======....)\r", + " download progress: 67 %\t(======....)\r", + " download progress: 67 %\t(======....)\r", + " download progress: 68 %\t(======....)\r", + " download progress: 69 %\t(======....)\r", + " download progress: 69 %\t(======....)\r", + " download progress: 70 %\t(======....)\r", + " download progress: 71 %\t(=======...)\r", + " download progress: 71 %\t(=======...)\r", + " download progress: 72 %\t(=======...)\r", + " download progress: 73 %\t(=======...)\r", + " download progress: 73 %\t(=======...)\r", + " download progress: 74 %\t(=======...)\r", + " download progress: 74 %\t(=======...)\r", + " download progress: 75 %\t(=======...)\r", + " download progress: 76 %\t(=======...)\r", + " download progress: 76 %\t(=======...)\r", + " download progress: 77 %\t(=======...)\r", + " download progress: 78 %\t(=======...)\r", + " download progress: 78 %\t(=======...)\r", + " download progress: 79 %\t(=======...)\r", + " download progress: 80 %\t(=======...)\r", + " download progress: 80 %\t(========..)\r", + " download progress: 81 %\t(========..)\r", + " download progress: 82 %\t(========..)\r", + " download progress: 
82 %\t(========..)\r", + " download progress: 83 %\t(========..)\r", + " download progress: 83 %\t(========..)\r", + " download progress: 84 %\t(========..)\r", + " download progress: 85 %\t(========..)\r", + " download progress: 85 %\t(========..)\r", + " download progress: 86 %\t(========..)\r", + " download progress: 87 %\t(========..)\r", + " download progress: 87 %\t(========..)\r", + " download progress: 88 %\t(========..)\r", + " download progress: 89 %\t(========..)\r", + " download progress: 89 %\t(========..)\r", + " download progress: 90 %\t(========..)\r", + " download progress: 91 %\t(=========.)\r", + " download progress: 91 %\t(=========.)\r", + " download progress: 92 %\t(=========.)\r", + " download progress: 92 %\t(=========.)\r", + " download progress: 93 %\t(=========.)\r", + " download progress: 94 %\t(=========.)\r", + " download progress: 94 %\t(=========.)\r", + " download progress: 95 %\t(=========.)\r", + " download progress: 96 %\t(=========.)\r", + " download progress: 96 %\t(=========.)\r", + " download progress: 97 %\t(=========.)\r", + " download progress: 98 %\t(=========.)\r", + " download progress: 98 %\t(=========.)\r", + " download progress: 99 %\t(=========.)\r", + " download progress: 100 %\t(=========.)\r", + " download progress: 100 %\t(==========)\n", + "downloading: https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/transactions_destination.parquet\n" + ] }, { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:14.252200Z", - "iopub.status.busy": "2024-05-16T12:13:14.251497Z", - "iopub.status.idle": "2024-05-16T12:13:14.257616Z", - "shell.execute_reply": "2024-05-16T12:13:14.256908Z" - } - }, - "source": [ - "# Uncomment and run this cell if you're running in Google Colab.\n", - "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" - ], - "outputs": [] - }, - { - "cell_type": "code", - 
"execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:14.261383Z", - "iopub.status.busy": "2024-05-16T12:13:14.261079Z", - "iopub.status.idle": "2024-05-16T12:13:16.084252Z", - "shell.execute_reply": "2024-05-16T12:13:16.083429Z" - } - }, - "source": [ - "from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets\n", - "\n", - "df_origin = splink_datasets.transactions_origin\n", - "df_destination = splink_datasets.transactions_destination\n", - "\n", - "display(df_origin.head(2))\n", - "display(df_destination.head(2))" + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " download progress: 0 %\t(..........)\r", + " download progress: 1 %\t(..........)\r", + " download progress: 1 %\t(..........)\r", + " download progress: 2 %\t(..........)\r", + " download progress: 3 %\t(..........)\r", + " download progress: 3 %\t(..........)\r", + " download progress: 4 %\t(..........)\r", + " download progress: 4 %\t(..........)\r", + " download progress: 5 %\t(..........)\r", + " download progress: 6 %\t(..........)\r", + " download progress: 6 %\t(..........)\r", + " download progress: 7 %\t(..........)\r", + " download progress: 8 %\t(..........)\r", + " download progress: 8 %\t(..........)\r", + " download progress: 9 %\t(..........)\r", + " download progress: 10 %\t(..........)\r", + " download progress: 10 %\t(=.........)\r", + " download progress: 11 %\t(=.........)\r", + " download progress: 12 %\t(=.........)\r", + " download progress: 12 %\t(=.........)\r", + " download progress: 13 %\t(=.........)\r", + " download progress: 13 %\t(=.........)\r", + " download progress: 14 %\t(=.........)\r", + " download progress: 15 %\t(=.........)\r", + " download progress: 15 %\t(=.........)\r", + " download progress: 16 %\t(=.........)\r", + " download progress: 17 %\t(=.........)\r", + " download progress: 17 %\t(=.........)\r", + " download progress: 18 %\t(=.........)\r", + " download progress: 
19 %\t(=.........)\r", + " download progress: 19 %\t(=.........)\r", + " download progress: 20 %\t(=.........)\r", + " download progress: 21 %\t(==........)\r", + " download progress: 21 %\t(==........)\r", + " download progress: 22 %\t(==........)\r", + " download progress: 22 %\t(==........)\r", + " download progress: 23 %\t(==........)\r", + " download progress: 24 %\t(==........)\r", + " download progress: 24 %\t(==........)\r", + " download progress: 25 %\t(==........)\r", + " download progress: 26 %\t(==........)\r", + " download progress: 26 %\t(==........)\r", + " download progress: 27 %\t(==........)\r", + " download progress: 28 %\t(==........)\r", + " download progress: 28 %\t(==........)\r", + " download progress: 29 %\t(==........)\r", + " download progress: 30 %\t(==........)\r", + " download progress: 30 %\t(===.......)\r", + " download progress: 31 %\t(===.......)\r", + " download progress: 31 %\t(===.......)\r", + " download progress: 32 %\t(===.......)\r", + " download progress: 33 %\t(===.......)\r", + " download progress: 33 %\t(===.......)\r", + " download progress: 34 %\t(===.......)\r", + " download progress: 35 %\t(===.......)\r", + " download progress: 35 %\t(===.......)\r", + " download progress: 36 %\t(===.......)\r", + " download progress: 37 %\t(===.......)\r", + " download progress: 37 %\t(===.......)\r", + " download progress: 38 %\t(===.......)\r", + " download progress: 38 %\t(===.......)\r", + " download progress: 39 %\t(===.......)\r", + " download progress: 40 %\t(===.......)\r", + " download progress: 40 %\t(====......)\r", + " download progress: 41 %\t(====......)\r", + " download progress: 42 %\t(====......)\r", + " download progress: 42 %\t(====......)\r", + " download progress: 43 %\t(====......)\r", + " download progress: 44 %\t(====......)\r", + " download progress: 44 %\t(====......)\r", + " download progress: 45 %\t(====......)\r", + " download progress: 46 %\t(====......)\r", + " download progress: 46 
%\t(====......)\r", + " download progress: 47 %\t(====......)\r", + " download progress: 47 %\t(====......)\r", + " download progress: 48 %\t(====......)\r", + " download progress: 49 %\t(====......)\r", + " download progress: 49 %\t(====......)\r", + " download progress: 50 %\t(=====.....)\r", + " download progress: 51 %\t(=====.....)\r", + " download progress: 51 %\t(=====.....)\r", + " download progress: 52 %\t(=====.....)\r", + " download progress: 53 %\t(=====.....)\r", + " download progress: 53 %\t(=====.....)\r", + " download progress: 54 %\t(=====.....)\r", + " download progress: 55 %\t(=====.....)\r", + " download progress: 55 %\t(=====.....)\r", + " download progress: 56 %\t(=====.....)\r", + " download progress: 56 %\t(=====.....)\r", + " download progress: 57 %\t(=====.....)\r", + " download progress: 58 %\t(=====.....)\r", + " download progress: 58 %\t(=====.....)\r", + " download progress: 59 %\t(=====.....)\r", + " download progress: 60 %\t(=====.....)\r", + " download progress: 60 %\t(======....)\r", + " download progress: 61 %\t(======....)\r", + " download progress: 62 %\t(======....)\r", + " download progress: 62 %\t(======....)\r", + " download progress: 63 %\t(======....)\r", + " download progress: 64 %\t(======....)\r", + " download progress: 64 %\t(======....)\r", + " download progress: 65 %\t(======....)\r", + " download progress: 65 %\t(======....)\r", + " download progress: 66 %\t(======....)\r", + " download progress: 67 %\t(======....)\r", + " download progress: 67 %\t(======....)\r", + " download progress: 68 %\t(======....)\r", + " download progress: 69 %\t(======....)\r", + " download progress: 69 %\t(======....)\r", + " download progress: 70 %\t(======....)\r", + " download progress: 71 %\t(=======...)\r", + " download progress: 71 %\t(=======...)\r", + " download progress: 72 %\t(=======...)\r", + " download progress: 72 %\t(=======...)\r", + " download progress: 73 %\t(=======...)\r", + " download progress: 74 %\t(=======...)\r", + 
" download progress: 74 %\t(=======...)\r", + " download progress: 75 %\t(=======...)\r", + " download progress: 76 %\t(=======...)\r", + " download progress: 76 %\t(=======...)\r", + " download progress: 77 %\t(=======...)\r", + " download progress: 78 %\t(=======...)\r", + " download progress: 78 %\t(=======...)\r", + " download progress: 79 %\t(=======...)\r", + " download progress: 80 %\t(=======...)\r", + " download progress: 80 %\t(========..)\r", + " download progress: 81 %\t(========..)\r", + " download progress: 81 %\t(========..)\r", + " download progress: 82 %\t(========..)\r", + " download progress: 83 %\t(========..)\r", + " download progress: 83 %\t(========..)\r", + " download progress: 84 %\t(========..)\r", + " download progress: 85 %\t(========..)\r", + " download progress: 85 %\t(========..)\r", + " download progress: 86 %\t(========..)\r", + " download progress: 87 %\t(========..)\r", + " download progress: 87 %\t(========..)\r", + " download progress: 88 %\t(========..)\r", + " download progress: 89 %\t(========..)\r", + " download progress: 89 %\t(========..)\r", + " download progress: 90 %\t(========..)\r", + " download progress: 90 %\t(=========.)\r", + " download progress: 91 %\t(=========.)\r", + " download progress: 92 %\t(=========.)\r", + " download progress: 92 %\t(=========.)\r", + " download progress: 93 %\t(=========.)\r", + " download progress: 94 %\t(=========.)\r", + " download progress: 94 %\t(=========.)\r", + " download progress: 95 %\t(=========.)\r", + " download progress: 96 %\t(=========.)\r", + " download progress: 96 %\t(=========.)\r", + " download progress: 97 %\t(=========.)\r", + " download progress: 98 %\t(=========.)\r", + " download progress: 98 %\t(=========.)\r", + " download progress: 99 %\t(=========.)\r", + " download progress: 99 %\t(=========.)\r", + " download progress: 100 %\t(==========)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ground_truthmemotransaction_dateamountunique_id
00MATTHIAS C paym2022-03-2836.360
11M CORVINUS dona2022-02-14221.911
\n", + "
" ], - "outputs": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the following chart, we can see this is a challenging dataset to link:\n", - "\n", - "- There are only 151 distinct transaction dates, with strong skew\n", - "- Some 'memos' are used multiple times (up to 48 times)\n", - "- There is strong skew in the 'amount' column, with 1,400 transactions of around 60.00\n" + "text/plain": [ + " ground_truth memo transaction_date amount unique_id\n", + "0 0 MATTHIAS C paym 2022-03-28 36.36 0\n", + "1 1 M CORVINUS dona 2022-02-14 221.91 1" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:16.143823Z", - "iopub.status.busy": "2024-05-16T12:13:16.143431Z", - "iopub.status.idle": "2024-05-16T12:13:16.849535Z", - "shell.execute_reply": "2024-05-16T12:13:16.848871Z" - } - }, - "source": [ - "from splink.exploratory import profile_columns\n", - "\n", - "db_api = DuckDBAPI()\n", - "profile_columns(\n", - " [df_origin, df_destination],\n", - " db_api=db_api,\n", - " column_expressions=[\n", - " \"memo\",\n", - " \"transaction_date\",\n", - " \"amount\",\n", - " ],\n", - ")" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:16.852855Z", - "iopub.status.busy": "2024-05-16T12:13:16.852594Z", - "iopub.status.idle": "2024-05-16T12:13:18.407824Z", - "shell.execute_reply": "2024-05-16T12:13:18.407265Z" - } - }, - "source": [ - "from splink import DuckDBAPI, block_on\n", - "from splink.blocking_analysis import (\n", - " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", - ")\n", - "\n", - "# Design blocking rules that allow for differences in transaction date and amounts\n", - "blocking_rule_date_1 = \"\"\"\n", - " strftime(l.transaction_date, '%Y%m') = strftime(r.transaction_date, 
'%Y%m')\n", - " and substr(l.memo, 1,3) = substr(r.memo,1,3)\n", - " and l.amount/r.amount > 0.7 and l.amount/r.amount < 1.3\n", - "\"\"\"\n", - "\n", - "# Offset by half a month to ensure we capture case when the dates are e.g. 31st Jan and 1st Feb\n", - "blocking_rule_date_2 = \"\"\"\n", - " strftime(l.transaction_date+15, '%Y%m') = strftime(r.transaction_date, '%Y%m')\n", - " and substr(l.memo, 1,3) = substr(r.memo,1,3)\n", - " and l.amount/r.amount > 0.7 and l.amount/r.amount < 1.3\n", - "\"\"\"\n", - "\n", - "blocking_rule_memo = block_on(\"substr(memo,1,9)\")\n", - "\n", - "blocking_rule_amount_1 = \"\"\"\n", - "round(l.amount/2,0)*2 = round(r.amount/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date)\n", - "\"\"\"\n", - "\n", - "blocking_rule_amount_2 = \"\"\"\n", - "round(l.amount/2,0)*2 = round((r.amount+1)/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date + 4)\n", - "\"\"\"\n", - "\n", - "blocking_rule_cheat = block_on(\"unique_id\")\n", - "\n", - "\n", - "brs = [\n", - " blocking_rule_date_1,\n", - " blocking_rule_date_2,\n", - " blocking_rule_memo,\n", - " blocking_rule_amount_1,\n", - " blocking_rule_amount_2,\n", - " blocking_rule_cheat,\n", - "]\n", - "\n", - "\n", - "db_api = DuckDBAPI()\n", - "\n", - "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", - " table_or_tables=[df_origin, df_destination],\n", - " blocking_rules=brs,\n", - " db_api=db_api,\n", - " link_type=\"link_only\"\n", - ")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ground_truthmemotransaction_dateamountunique_id
00MATTHIAS C payment BGC2022-03-2936.360
11M CORVINUS BGC2022-02-16221.911
\n", + "
" ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:18.411066Z", - "iopub.status.busy": "2024-05-16T12:13:18.410832Z", - "iopub.status.idle": "2024-05-16T12:13:18.418094Z", - "shell.execute_reply": "2024-05-16T12:13:18.416984Z" - } - }, - "source": [ - "# Full settings for linking model\n", - "import splink.comparison_level_library as cll\n", - "import splink.comparison_library as cl\n", - "\n", - "comparison_amount = {\n", - " \"output_column_name\": \"amount\",\n", - " \"comparison_levels\": [\n", - " cll.NullLevel(\"amount\"),\n", - " cll.ExactMatchLevel(\"amount\"),\n", - " cll.PercentageDifferenceLevel(\"amount\", 0.01),\n", - " cll.PercentageDifferenceLevel(\"amount\", 0.03),\n", - " cll.PercentageDifferenceLevel(\"amount\", 0.1),\n", - " cll.PercentageDifferenceLevel(\"amount\", 0.3),\n", - " cll.ElseLevel(),\n", - " ],\n", - " \"comparison_description\": \"Amount percentage difference\",\n", - "}\n", - "\n", - "# The date distance is one sided becaause transactions should only arrive after they've left\n", - "# As a result, the comparison_template_library date difference functions are not appropriate\n", - "within_n_days_template = \"transaction_date_r - transaction_date_l <= {n} and transaction_date_r >= transaction_date_l\"\n", - "\n", - "comparison_date = {\n", - " \"output_column_name\": \"transaction_date\",\n", - " \"comparison_levels\": [\n", - " cll.NullLevel(\"transaction_date\"),\n", - " {\n", - " \"sql_condition\": within_n_days_template.format(n=1),\n", - " \"label_for_charts\": \"1 day\",\n", - " },\n", - " {\n", - " \"sql_condition\": within_n_days_template.format(n=4),\n", - " \"label_for_charts\": \"<=4 days\",\n", - " },\n", - " {\n", - " \"sql_condition\": within_n_days_template.format(n=10),\n", - " \"label_for_charts\": \"<=10 days\",\n", - " },\n", - " {\n", - " \"sql_condition\": within_n_days_template.format(n=30),\n", - " 
\"label_for_charts\": \"<=30 days\",\n", - " },\n", - " cll.ElseLevel(),\n", - " ],\n", - " \"comparison_description\": \"Transaction date days apart\",\n", - "}\n", - "\n", - "\n", - "settings = SettingsCreator(\n", - " link_type=\"link_only\",\n", - " probability_two_random_records_match=1 / len(df_origin),\n", - " blocking_rules_to_generate_predictions=[\n", - " blocking_rule_date_1,\n", - " blocking_rule_date_2,\n", - " blocking_rule_memo,\n", - " blocking_rule_amount_1,\n", - " blocking_rule_amount_2,\n", - " blocking_rule_cheat,\n", - " ],\n", - " comparisons=[\n", - " comparison_amount,\n", - " cl.LevenshteinAtThresholds(\"memo\", [2, 6, 10]),\n", - " comparison_date,\n", - " ],\n", - " retain_intermediate_calculation_columns=True,\n", - ")" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:18.421517Z", - "iopub.status.busy": "2024-05-16T12:13:18.421286Z", - "iopub.status.idle": "2024-05-16T12:13:18.552970Z", - "shell.execute_reply": "2024-05-16T12:13:18.552184Z" - } - }, - "source": [ - "linker = Linker(\n", - " [df_origin, df_destination],\n", - " settings,\n", - " input_table_aliases=[\"__ori\", \"_dest\"],\n", - " database_api=db_api,\n", - ")" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:18.556284Z", - "iopub.status.busy": "2024-05-16T12:13:18.556053Z", - "iopub.status.idle": "2024-05-16T12:13:20.529952Z", - "shell.execute_reply": "2024-05-16T12:13:20.529065Z" - } - }, - "source": [ - "linker.training.estimate_u_using_random_sampling(max_pairs=1e6)" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:20.532832Z", - "iopub.status.busy": "2024-05-16T12:13:20.532606Z", - "iopub.status.idle": "2024-05-16T12:13:21.867808Z", - "shell.execute_reply": 
"2024-05-16T12:13:21.867084Z" - } - }, - "source": [ - "linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"memo\"))" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:21.871283Z", - "iopub.status.busy": "2024-05-16T12:13:21.871004Z", - "iopub.status.idle": "2024-05-16T12:13:23.094606Z", - "shell.execute_reply": "2024-05-16T12:13:23.093838Z" - } - }, - "source": [ - "session = linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"amount\"))" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:23.097922Z", - "iopub.status.busy": "2024-05-16T12:13:23.097670Z", - "iopub.status.idle": "2024-05-16T12:13:23.382589Z", - "shell.execute_reply": "2024-05-16T12:13:23.382014Z" - } - }, - "source": [ - "linker.visualisations.match_weights_chart()" + "text/plain": [ + " ground_truth memo transaction_date amount unique_id\n", + "0 0 MATTHIAS C payment BGC 2022-03-29 36.36 0\n", + "1 1 M CORVINUS BGC 2022-02-16 221.91 1" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets\n", + "\n", + "df_origin = splink_datasets.transactions_origin\n", + "df_destination = splink_datasets.transactions_destination\n", + "\n", + "display(df_origin.head(2))\n", + "display(df_destination.head(2))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the following chart, we can see this is a challenging dataset to link:\n", + "\n", + "- There are only 151 distinct transaction dates, with strong skew\n", + "- Some 'memos' are used multiple times (up to 48 times)\n", + "- There is strong skew in the 'amount' column, with 1,400 transactions of around 60.00\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:22:31.987843Z", + "iopub.status.busy": "2024-06-07T09:22:31.987459Z", + "iopub.status.idle": "2024-06-07T09:22:32.720064Z", + "shell.execute_reply": "2024-06-07T09:22:32.719389Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:23.385651Z", - "iopub.status.busy": "2024-05-16T12:13:23.385430Z", - "iopub.status.idle": "2024-05-16T12:13:47.966948Z", - "shell.execute_reply": "2024-05-16T12:13:47.966113Z" - } - }, - "source": [ - "df_predict = linker.inference.predict(threshold_match_probability=0.001)" + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink.exploratory import profile_columns\n", + "\n", + "db_api = DuckDBAPI()\n", + "profile_columns(\n", + " [df_origin, df_destination],\n", + " db_api=db_api,\n", + " column_expressions=[\n", + " \"memo\",\n", + " \"transaction_date\",\n", + " \"amount\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:22:32.724189Z", + "iopub.status.busy": "2024-06-07T09:22:32.723901Z", + "iopub.status.idle": "2024-06-07T09:22:33.500975Z", + "shell.execute_reply": "2024-06-07T09:22:33.500399Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:47.970901Z", - "iopub.status.busy": "2024-05-16T12:13:47.970603Z", - "iopub.status.idle": "2024-05-16T12:13:48.365220Z", - "shell.execute_reply": "2024-05-16T12:13:48.364442Z" - } - }, - "source": [ - "linker.visualisations.comparison_viewer_dashboard(\n", - " df_predict, \"dashboards/comparison_viewer_transactions.html\", overwrite=True\n", - ")\n", - "from IPython.display import IFrame\n", - "\n", - "IFrame(\n", - " src=\"./dashboards/comparison_viewer_transactions.html\", width=\"100%\", height=1200\n", - ")" + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink import DuckDBAPI, block_on\n", + "from splink.blocking_analysis import (\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + ")\n", + "\n", + "# Design blocking rules that allow for differences in transaction date and amounts\n", + "blocking_rule_date_1 = \"\"\"\n", + " strftime(l.transaction_date, '%Y%m') = strftime(r.transaction_date, '%Y%m')\n", + " and substr(l.memo, 1,3) = substr(r.memo,1,3)\n", + " and l.amount/r.amount > 0.7 and l.amount/r.amount < 1.3\n", + "\"\"\"\n", + "\n", + "# Offset by half a month to ensure we capture case when the dates are e.g. 
31st Jan and 1st Feb\n", + "blocking_rule_date_2 = \"\"\"\n", + " strftime(l.transaction_date+15, '%Y%m') = strftime(r.transaction_date, '%Y%m')\n", + " and substr(l.memo, 1,3) = substr(r.memo,1,3)\n", + " and l.amount/r.amount > 0.7 and l.amount/r.amount < 1.3\n", + "\"\"\"\n", + "\n", + "blocking_rule_memo = block_on(\"substr(memo,1,9)\")\n", + "\n", + "blocking_rule_amount_1 = \"\"\"\n", + "round(l.amount/2,0)*2 = round(r.amount/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date)\n", + "\"\"\"\n", + "\n", + "blocking_rule_amount_2 = \"\"\"\n", + "round(l.amount/2,0)*2 = round((r.amount+1)/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date + 4)\n", + "\"\"\"\n", + "\n", + "blocking_rule_cheat = block_on(\"unique_id\")\n", + "\n", + "\n", + "brs = [\n", + " blocking_rule_date_1,\n", + " blocking_rule_date_2,\n", + " blocking_rule_memo,\n", + " blocking_rule_amount_1,\n", + " blocking_rule_amount_2,\n", + " blocking_rule_cheat,\n", + "]\n", + "\n", + "\n", + "db_api = DuckDBAPI()\n", + "\n", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=[df_origin, df_destination],\n", + " blocking_rules=brs,\n", + " db_api=db_api,\n", + " link_type=\"link_only\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:22:33.504001Z", + "iopub.status.busy": "2024-06-07T09:22:33.503779Z", + "iopub.status.idle": "2024-06-07T09:22:33.511675Z", + "shell.execute_reply": "2024-06-07T09:22:33.511212Z" + } + }, + "outputs": [], + "source": [ + "# Full settings for linking model\n", + "import splink.comparison_level_library as cll\n", + "import splink.comparison_library as cl\n", + "\n", + "comparison_amount = {\n", + " \"output_column_name\": \"amount\",\n", + " \"comparison_levels\": [\n", + " cll.NullLevel(\"amount\"),\n", + " cll.ExactMatchLevel(\"amount\"),\n", + " cll.PercentageDifferenceLevel(\"amount\", 0.01),\n", + " 
cll.PercentageDifferenceLevel(\"amount\", 0.03),\n", + " cll.PercentageDifferenceLevel(\"amount\", 0.1),\n", + " cll.PercentageDifferenceLevel(\"amount\", 0.3),\n", + " cll.ElseLevel(),\n", + " ],\n", + " \"comparison_description\": \"Amount percentage difference\",\n", + "}\n", + "\n", + "# The date distance is one sided because transactions should only arrive after they've left\n", + "# As a result, the comparison_template_library date difference functions are not appropriate\n", + "within_n_days_template = \"transaction_date_r - transaction_date_l <= {n} and transaction_date_r >= transaction_date_l\"\n", + "\n", + "comparison_date = {\n", + " \"output_column_name\": \"transaction_date\",\n", + " \"comparison_levels\": [\n", + " cll.NullLevel(\"transaction_date\"),\n", + " {\n", + " \"sql_condition\": within_n_days_template.format(n=1),\n", + " \"label_for_charts\": \"1 day\",\n", + " },\n", + " {\n", + " \"sql_condition\": within_n_days_template.format(n=4),\n", + " \"label_for_charts\": \"<=4 days\",\n", + " },\n", + " {\n", + " \"sql_condition\": within_n_days_template.format(n=10),\n", + " \"label_for_charts\": \"<=10 days\",\n", + " },\n", + " {\n", + " \"sql_condition\": within_n_days_template.format(n=30),\n", + " \"label_for_charts\": \"<=30 days\",\n", + " },\n", + " cll.ElseLevel(),\n", + " ],\n", + " \"comparison_description\": \"Transaction date days apart\",\n", + "}\n", + "\n", + "\n", + "settings = SettingsCreator(\n", + " link_type=\"link_only\",\n", + " probability_two_random_records_match=1 / len(df_origin),\n", + " blocking_rules_to_generate_predictions=[\n", + " blocking_rule_date_1,\n", + " blocking_rule_date_2,\n", + " blocking_rule_memo,\n", + " blocking_rule_amount_1,\n", + " blocking_rule_amount_2,\n", + " blocking_rule_cheat,\n", + " ],\n", + " comparisons=[\n", + " comparison_amount,\n", + " cl.LevenshteinAtThresholds(\"memo\", [2, 6, 10]),\n", + " comparison_date,\n", + " ],\n", + " retain_intermediate_calculation_columns=True,\n", +
")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:22:33.514381Z", + "iopub.status.busy": "2024-06-07T09:22:33.514150Z", + "iopub.status.idle": "2024-06-07T09:22:33.621746Z", + "shell.execute_reply": "2024-06-07T09:22:33.621038Z" + } + }, + "outputs": [], + "source": [ + "linker = Linker(\n", + " [df_origin, df_destination],\n", + " settings,\n", + " input_table_aliases=[\"__ori\", \"_dest\"],\n", + " database_api=db_api,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:22:33.625044Z", + "iopub.status.busy": "2024-06-07T09:22:33.624807Z", + "iopub.status.idle": "2024-06-07T09:22:35.145751Z", + "shell.execute_reply": "2024-06-07T09:22:35.145280Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - amount (no m values are trained).\n", + " - memo (no m values are trained).\n", + " - transaction_date (no m values are trained).\n" + ] + } + ], + "source": [ + "linker.training.estimate_u_using_random_sampling(max_pairs=1e6)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:22:35.148614Z", + "iopub.status.busy": "2024-06-07T09:22:35.148331Z", + "iopub.status.idle": "2024-06-07T09:22:36.323460Z", + "shell.execute_reply": "2024-06-07T09:22:36.322736Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"memo\" = r.\"memo\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - amount\n", + " - transaction_date\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - memo\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.596 in the m_probability of amount, level `Exact match on amount`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was -0.167 in the m_probability of transaction_date, level `1 day`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 0.00961 in the m_probability of amount, level `Percentage difference of 'amount' within 10.00%`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was 0.00211 in the m_probability of transaction_date, level 
`<=30 days`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 5: Largest change in params was 0.000367 in the m_probability of transaction_date, level `<=30 days`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 6: Largest change in params was -0.000315 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 7: Largest change in params was -0.000282 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 8: Largest change in params was -0.000254 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 9: Largest change in params was -0.00023 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 10: Largest change in params was -0.000209 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 11: Largest change in params was -0.00019 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 12: Largest change in params was -0.000174 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 13: Largest change in params was -0.000159 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 14: Largest change in params was -0.000147 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + 
"text": [ + "Iteration 15: Largest change in params was -0.000135 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 16: Largest change in params was -0.000125 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 17: Largest change in params was -0.000116 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 18: Largest change in params was -0.000108 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 19: Largest change in params was -0.0001 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 20: Largest change in params was -9.33e-05 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 20 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - memo (no m values are trained).\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"memo\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:22:36.326561Z", + "iopub.status.busy": "2024-06-07T09:22:36.326344Z", + "iopub.status.idle": "2024-06-07T09:22:37.563023Z", + "shell.execute_reply": "2024-06-07T09:22:37.562461Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"amount\" = r.\"amount\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - memo\n", + " - transaction_date\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - amount\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.378 in the m_probability of memo, level `Exact match on memo`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was -0.104 in the m_probability of memo, level `Exact match on memo`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 0.0215 in the m_probability of memo, level `Levenshtein distance of memo <= 10`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was -0.00538 in the 
m_probability of memo, level `Exact match on memo`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 5: Largest change in params was 0.00474 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 6: Largest change in params was 0.00502 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 7: Largest change in params was 0.00499 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 8: Largest change in params was 0.00466 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 9: Largest change in params was 0.00413 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 10: Largest change in params was 0.00348 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 11: Largest change in params was 0.00283 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 12: Largest change in params was 0.00223 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 13: Largest change in params was 0.00171 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 14: Largest change in params was 0.00129 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", 
+ "text": [ + "Iteration 15: Largest change in params was 0.000959 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 16: Largest change in params was 0.000706 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 17: Largest change in params was 0.000516 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 18: Largest change in params was 0.000375 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 19: Largest change in params was 0.000272 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 20: Largest change in params was 0.000196 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 21: Largest change in params was 0.000141 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 22: Largest change in params was 0.000102 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 23: Largest change in params was 7.32e-05 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 23 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is fully trained. 
All comparisons have at least one estimate for their m and u values\n" + ] + } + ], + "source": [ + "session = linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"amount\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:22:37.565956Z", + "iopub.status.busy": "2024-06-07T09:22:37.565738Z", + "iopub.status.idle": "2024-06-07T09:22:37.832159Z", + "shell.execute_reply": "2024-06-07T09:22:37.831506Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:48.369330Z", - "iopub.status.busy": "2024-05-16T12:13:48.369001Z", - "iopub.status.idle": "2024-05-16T12:13:54.043730Z", - "shell.execute_reply": "2024-05-16T12:13:54.043073Z" - } - }, - "source": [ - "pred_errors = linker.inference.prediction_errors_from_labels_column(\n", - " \"ground_truth\", include_false_positives=True, include_false_negatives=False\n", - ")\n", - "linker.visualisations.waterfall_chart(pred_errors.as_record_dict(limit=5))" + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.match_weights_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:22:37.835082Z", + "iopub.status.busy": "2024-06-07T09:22:37.834871Z", + "iopub.status.idle": "2024-06-07T09:22:58.616771Z", + "shell.execute_reply": "2024-06-07T09:22:58.615862Z" + } + }, + "outputs": [], + "source": [ + "df_predict = linker.inference.predict(threshold_match_probability=0.001)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:22:58.620828Z", + "iopub.status.busy": "2024-06-07T09:22:58.620523Z", + "iopub.status.idle": "2024-06-07T09:22:59.018555Z", + "shell.execute_reply": "2024-06-07T09:22:59.017917Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:54.047308Z", - "iopub.status.busy": "2024-05-16T12:13:54.047030Z", - "iopub.status.idle": "2024-05-16T12:13:54.884355Z", - "shell.execute_reply": "2024-05-16T12:13:54.883814Z" - } - }, - "source": [ - 
"pred_errors = linker.inference.prediction_errors_from_labels_column(\n", - " \"ground_truth\", include_false_positives=False, include_false_negatives=True\n", - ")\n", - "linker.visualisations.waterfall_chart(pred_errors.as_record_dict(limit=5))" + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.comparison_viewer_dashboard(\n", + " df_predict, \"dashboards/comparison_viewer_transactions.html\", overwrite=True\n", + ")\n", + "from IPython.display import IFrame\n", + "\n", + "IFrame(\n", + " src=\"./dashboards/comparison_viewer_transactions.html\", width=\"100%\", height=1200\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:22:59.022067Z", + "iopub.status.busy": "2024-06-07T09:22:59.021794Z", + "iopub.status.idle": "2024-06-07T09:23:04.254280Z", + "shell.execute_reply": "2024-06-07T09:23:04.253648Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "outputs": [] + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": { - "0cb4a943a08a42c7841ca32d466f9eed": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_fd157120a2ca488496c737cec882713d", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_ed234594aea94bf98ffb67a51d3811f4", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "2bae68755fc34e38ac69e792f314ba8e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - }, - "4430006dcc174ff092d96adf68c301ff": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - 
"model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_5c32bb2a7a714bd79accac15915b17e5", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_6222247c7cbe45b19cfeb9b182147a18", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "5c32bb2a7a714bd79accac15915b17e5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - }, - "6222247c7cbe45b19cfeb9b182147a18": { - "model_module": 
"@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - }, - "63719efff46e49ecba53edb438f35c3f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_921bb606e07743f7a252c05830098a57", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_2bae68755fc34e38ac69e792f314ba8e", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "921bb606e07743f7a252c05830098a57": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, 
- "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - }, - "ed234594aea94bf98ffb67a51d3811f4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - }, - "fd157120a2ca488496c737cec882713d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": 
null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - } - }, - "version_major": 2, - "version_minor": 0 - } + ], + "source": [ + "pred_errors = linker.evaluation.prediction_errors_from_labels_column(\n", + " \"ground_truth\", include_false_positives=True, include_false_negatives=False\n", + ")\n", + "linker.visualisations.waterfall_chart(pred_errors.as_record_dict(limit=5))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "execution": { + "iopub.execute_input": "2024-06-07T09:23:04.257242Z", + "iopub.status.busy": "2024-06-07T09:23:04.257017Z", + "iopub.status.idle": "2024-06-07T09:23:05.029715Z", + "shell.execute_reply": "2024-06-07T09:23:05.029153Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "pred_errors = linker.evaluation.prediction_errors_from_labels_column(\n", + " \"ground_truth\", include_false_positives=False, include_false_negatives=True\n", + ")\n", + "linker.visualisations.waterfall_chart(pred_errors.as_record_dict(limit=5))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "0cb4a943a08a42c7841ca32d466f9eed": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_fd157120a2ca488496c737cec882713d", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ed234594aea94bf98ffb67a51d3811f4", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "2bae68755fc34e38ac69e792f314ba8e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": 
"ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "4430006dcc174ff092d96adf68c301ff": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_5c32bb2a7a714bd79accac15915b17e5", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6222247c7cbe45b19cfeb9b182147a18", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "5c32bb2a7a714bd79accac15915b17e5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": 
null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + }, + "6222247c7cbe45b19cfeb9b182147a18": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "63719efff46e49ecba53edb438f35c3f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_921bb606e07743f7a252c05830098a57", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2bae68755fc34e38ac69e792f314ba8e", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "921bb606e07743f7a252c05830098a57": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": 
null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + }, + "ed234594aea94bf98ffb67a51d3811f4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "fd157120a2ca488496c737cec882713d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb b/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb index ff1369e20a..3eea1b0b3e 100644 --- a/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb +++ b/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb @@ -385,7 +385,7 @@ }, "outputs": [], "source": [ - "linker.accuracy_analysis_from_labels_column(\n", + "linker.evaluation.accuracy_analysis_from_labels_column(\n", " \"cluster\", output_type=\"roc\", match_weight_round_to_nearest=0.02\n", ")" ] @@ -403,7 +403,7 @@ }, "outputs": [], "source": [ - "records = linker.inference.prediction_errors_from_labels_column(\n", + "records = linker.evaluation.prediction_errors_from_labels_column(\n", " \"cluster\",\n", " threshold=0.999,\n", " include_false_negatives=False,\n", @@ -426,7 +426,7 @@ "outputs": [], "source": [ "# Some of the false negatives will be because they weren't detected by the blocking rules\n", - "records = linker.inference.prediction_errors_from_labels_column(\n", + "records = linker.evaluation.prediction_errors_from_labels_column(\n", " \"cluster\",\n", " threshold=0.5,\n", " include_false_negatives=True,\n",