diff --git a/docs/topic_guides/comparisons/choosing_comparators.ipynb b/docs/topic_guides/comparisons/choosing_comparators.ipynb
index 28c8b2f73f..32172e4e5b 100644
--- a/docs/topic_guides/comparisons/choosing_comparators.ipynb
+++ b/docs/topic_guides/comparisons/choosing_comparators.ipynb
@@ -51,70 +51,12 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " string1 | \n",
- " string2 | \n",
- " levenshtein_distance | \n",
- " damerau_levenshtein_distance | \n",
- " jaro_similarity | \n",
- " jaro_winkler_similarity | \n",
- " jaccard_similarity | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Richard | \n",
- " iRchard | \n",
- " 2 | \n",
- " 1 | \n",
- " 0.95 | \n",
- " 0.95 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " string1 string2 levenshtein_distance damerau_levenshtein_distance \\\n",
- "0 Richard iRchard 2 1 \n",
- "\n",
- " jaro_similarity jaro_winkler_similarity jaccard_similarity \n",
- "0 0.95 0.95 1.0 "
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
"import splink.comparison_helpers as ch\n",
"\n",
"ch.comparator_score(\"Richard\", \"iRchard\")"
- ]
+ ],
+ "outputs": []
},
{
"attachments": {},
@@ -128,131 +70,6 @@
"cell_type": "code",
"execution_count": 2,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " string1 | \n",
- " string2 | \n",
- " error_type | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Richard | \n",
- " Richard | \n",
- " None | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Richard | \n",
- " ichard | \n",
- " Deletion | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Richard | \n",
- " Richar | \n",
- " Deletion | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Richard | \n",
- " iRchard | \n",
- " Transposition | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Richard | \n",
- " Richadr | \n",
- " Transposition | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " Richard | \n",
- " Rich | \n",
- " Shortening | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " Richard | \n",
- " Rick | \n",
- " Nickname/Alias | \n",
- "
\n",
- " \n",
- " 7 | \n",
- " Richard | \n",
- " Ricky | \n",
- " Nickname/Alias | \n",
- "
\n",
- " \n",
- " 8 | \n",
- " Richard | \n",
- " Dick | \n",
- " Nickname/Alias | \n",
- "
\n",
- " \n",
- " 9 | \n",
- " Richard | \n",
- " Rico | \n",
- " Nickname/Alias | \n",
- "
\n",
- " \n",
- " 10 | \n",
- " Richard | \n",
- " Rachael | \n",
- " Different Name | \n",
- "
\n",
- " \n",
- " 11 | \n",
- " Richard | \n",
- " Stephen | \n",
- " Different Name | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " string1 string2 error_type\n",
- "0 Richard Richard None\n",
- "1 Richard ichard Deletion\n",
- "2 Richard Richar Deletion\n",
- "3 Richard iRchard Transposition\n",
- "4 Richard Richadr Transposition\n",
- "5 Richard Rich Shortening\n",
- "6 Richard Rick Nickname/Alias\n",
- "7 Richard Ricky Nickname/Alias\n",
- "8 Richard Dick Nickname/Alias\n",
- "9 Richard Rico Nickname/Alias\n",
- "10 Richard Rachael Different Name\n",
- "11 Richard Stephen Different Name"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
"import pandas as pd\n",
"\n",
@@ -302,7 +119,8 @@
"}\n",
"df = pd.DataFrame(data)\n",
"df"
- ]
+ ],
+ "outputs": []
},
{
"attachments": {},
@@ -316,107 +134,10 @@
"cell_type": "code",
"execution_count": 3,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/rosskennedy/splink/splink/comparison_helpers.py:121: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " similarity_df[\"comparator\"] = similarity_df[\"comparator\"].str.replace(\n",
- "/Users/rosskennedy/splink/splink/comparison_helpers.py:126: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " distance_df[\"comparator\"] = distance_df[\"comparator\"].str.replace(\"_distance\", \"\")\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.HConcatChart(...)"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
"ch.comparator_score_chart(data, \"string1\", \"string2\")"
- ]
+ ],
+ "outputs": []
},
{
"attachments": {},
@@ -438,200 +159,10 @@
"cell_type": "code",
"execution_count": 4,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " string1 | \n",
- " string2 | \n",
- " levenshtein_distance | \n",
- " damerau_levenshtein_distance | \n",
- " jaro_similarity | \n",
- " jaro_winkler_similarity | \n",
- " jaccard_similarity | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Richard | \n",
- " Richard | \n",
- " 0 | \n",
- " 0 | \n",
- " 1.00 | \n",
- " 1.00 | \n",
- " 1.00 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Richard | \n",
- " ichard | \n",
- " 1 | \n",
- " 1 | \n",
- " 0.95 | \n",
- " 0.95 | \n",
- " 0.86 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Richard | \n",
- " Richar | \n",
- " 1 | \n",
- " 1 | \n",
- " 0.95 | \n",
- " 0.97 | \n",
- " 0.86 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Richard | \n",
- " iRchard | \n",
- " 2 | \n",
- " 1 | \n",
- " 0.95 | \n",
- " 0.95 | \n",
- " 1.00 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Richard | \n",
- " Richadr | \n",
- " 2 | \n",
- " 1 | \n",
- " 0.95 | \n",
- " 0.97 | \n",
- " 1.00 | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " Richard | \n",
- " Rich | \n",
- " 3 | \n",
- " 3 | \n",
- " 0.86 | \n",
- " 0.91 | \n",
- " 0.57 | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " Richard | \n",
- " Rick | \n",
- " 4 | \n",
- " 4 | \n",
- " 0.73 | \n",
- " 0.81 | \n",
- " 0.38 | \n",
- "
\n",
- " \n",
- " 7 | \n",
- " Richard | \n",
- " Ricky | \n",
- " 4 | \n",
- " 4 | \n",
- " 0.68 | \n",
- " 0.68 | \n",
- " 0.33 | \n",
- "
\n",
- " \n",
- " 8 | \n",
- " Richard | \n",
- " Dick | \n",
- " 5 | \n",
- " 5 | \n",
- " 0.60 | \n",
- " 0.60 | \n",
- " 0.22 | \n",
- "
\n",
- " \n",
- " 9 | \n",
- " Richard | \n",
- " Rico | \n",
- " 4 | \n",
- " 4 | \n",
- " 0.73 | \n",
- " 0.81 | \n",
- " 0.38 | \n",
- "
\n",
- " \n",
- " 10 | \n",
- " Richard | \n",
- " Rachael | \n",
- " 3 | \n",
- " 3 | \n",
- " 0.71 | \n",
- " 0.74 | \n",
- " 0.44 | \n",
- "
\n",
- " \n",
- " 11 | \n",
- " Richard | \n",
- " Stephen | \n",
- " 7 | \n",
- " 7 | \n",
- " 0.43 | \n",
- " 0.43 | \n",
- " 0.08 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " string1 string2 levenshtein_distance damerau_levenshtein_distance \\\n",
- "0 Richard Richard 0 0 \n",
- "1 Richard ichard 1 1 \n",
- "2 Richard Richar 1 1 \n",
- "3 Richard iRchard 2 1 \n",
- "4 Richard Richadr 2 1 \n",
- "5 Richard Rich 3 3 \n",
- "6 Richard Rick 4 4 \n",
- "7 Richard Ricky 4 4 \n",
- "8 Richard Dick 5 5 \n",
- "9 Richard Rico 4 4 \n",
- "10 Richard Rachael 3 3 \n",
- "11 Richard Stephen 7 7 \n",
- "\n",
- " jaro_similarity jaro_winkler_similarity jaccard_similarity \n",
- "0 1.00 1.00 1.00 \n",
- "1 0.95 0.95 0.86 \n",
- "2 0.95 0.97 0.86 \n",
- "3 0.95 0.95 1.00 \n",
- "4 0.95 0.97 1.00 \n",
- "5 0.86 0.91 0.57 \n",
- "6 0.73 0.81 0.38 \n",
- "7 0.68 0.68 0.33 \n",
- "8 0.60 0.60 0.22 \n",
- "9 0.73 0.81 0.38 \n",
- "10 0.71 0.74 0.44 \n",
- "11 0.43 0.43 0.08 "
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
"ch.comparator_score_df(data, \"string1\", \"string2\")"
- ]
+ ],
+ "outputs": []
},
{
"attachments": {},
@@ -647,109 +178,12 @@
"cell_type": "code",
"execution_count": 5,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/rosskennedy/splink/splink/comparison_helpers.py:172: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " similarity_df[\"comparator\"] = similarity_df[\"comparator\"].str.replace(\n",
- "/Users/rosskennedy/splink/splink/comparison_helpers.py:177: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " distance_df[\"comparator\"] = distance_df[\"comparator\"].str.replace(\"_distance\", \"\")\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.HConcatChart(...)"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
"ch.comparator_score_threshold_chart(\n",
" data, \"string1\", \"string2\", distance_threshold=2, similarity_threshold=0.8\n",
")"
- ]
+ ],
+ "outputs": []
},
{
"attachments": {},
@@ -767,12 +201,12 @@
"cell_type": "code",
"execution_count": 6,
"metadata": {},
- "outputs": [],
"source": [
"import splink.duckdb.comparison_library as cl\n",
"\n",
"first_name_comparison = cl.jaro_winkler_at_thresholds(\"first_name\", [0.9, 0.8, 0.7])"
- ]
+ ],
+ "outputs": []
},
{
"attachments": {},
@@ -786,34 +220,10 @@
"cell_type": "code",
"execution_count": 7,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'output_column_name': 'first_name',\n",
- " 'comparison_levels': [{'sql_condition': '\"first_name_l\" IS NULL OR \"first_name_r\" IS NULL',\n",
- " 'label_for_charts': 'Null',\n",
- " 'is_null_level': True},\n",
- " {'sql_condition': '\"first_name_l\" = \"first_name_r\"',\n",
- " 'label_for_charts': 'Exact match'},\n",
- " {'sql_condition': 'jaro_winkler_similarity(\"first_name_l\", \"first_name_r\") >= 0.9',\n",
- " 'label_for_charts': 'Jaro_winkler_similarity >= 0.9'},\n",
- " {'sql_condition': 'jaro_winkler_similarity(\"first_name_l\", \"first_name_r\") >= 0.8',\n",
- " 'label_for_charts': 'Jaro_winkler_similarity >= 0.8'},\n",
- " {'sql_condition': 'jaro_winkler_similarity(\"first_name_l\", \"first_name_r\") >= 0.7',\n",
- " 'label_for_charts': 'Jaro_winkler_similarity >= 0.7'},\n",
- " {'sql_condition': 'ELSE', 'label_for_charts': 'All other comparisons'}],\n",
- " 'comparison_description': 'Exact match vs. First_Name within jaro_winkler_similarity thresholds 0.9, 0.8, 0.7 vs. anything else'}"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
"first_name_comparison.as_dict()"
- ]
+ ],
+ "outputs": []
},
{
"attachments": {},
@@ -845,43 +255,21 @@
"cell_type": "code",
"execution_count": 8,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'soundex': 'R02063', 'metaphone': 'RXRT', 'dmetaphone': ('RXRT', 'RKRT')}"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
"import splink.comparison_helpers\n",
"\n",
"ch.phonetic_transform(\"Richard\")"
- ]
+ ],
+ "outputs": []
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'soundex': 'S30105', 'metaphone': 'STFN', 'dmetaphone': ('STFN', '')}"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
"ch.phonetic_transform(\"Steven\")"
- ]
+ ],
+ "outputs": []
},
{
"attachments": {},
@@ -895,124 +283,6 @@
"cell_type": "code",
"execution_count": 10,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " string1 | \n",
- " string2 | \n",
- " error_type | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Stephen | \n",
- " Stephen | \n",
- " None | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Stephen | \n",
- " Steven | \n",
- " Spelling Variation | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Stephen | \n",
- " Stephan | \n",
- " Spelling Variation/Similar Name | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Stephen | \n",
- " Steve | \n",
- " Nickname/Alias | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Stephen | \n",
- " Stehpen | \n",
- " Transposition | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " Stephen | \n",
- " tSephen | \n",
- " Transposition | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " Stephen | \n",
- " Stephne | \n",
- " Transposition | \n",
- "
\n",
- " \n",
- " 7 | \n",
- " Stephen | \n",
- " Stphen | \n",
- " Deletion | \n",
- "
\n",
- " \n",
- " 8 | \n",
- " Stephen | \n",
- " Stepheb | \n",
- " Replacement | \n",
- "
\n",
- " \n",
- " 9 | \n",
- " Stephen | \n",
- " Stephanie | \n",
- " Different Name | \n",
- "
\n",
- " \n",
- " 10 | \n",
- " Stephen | \n",
- " Richard | \n",
- " Different Name | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " string1 string2 error_type\n",
- "0 Stephen Stephen None\n",
- "1 Stephen Steven Spelling Variation\n",
- "2 Stephen Stephan Spelling Variation/Similar Name\n",
- "3 Stephen Steve Nickname/Alias\n",
- "4 Stephen Stehpen Transposition\n",
- "5 Stephen tSephen Transposition\n",
- "6 Stephen Stephne Transposition\n",
- "7 Stephen Stphen Deletion\n",
- "8 Stephen Stepheb Replacement\n",
- "9 Stephen Stephanie Different Name\n",
- "10 Stephen Richard Different Name"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
"data = {\n",
" \"string1\": [\n",
@@ -1058,7 +328,8 @@
"\n",
"df = pd.DataFrame(data)\n",
"df"
- ]
+ ],
+ "outputs": []
},
{
"attachments": {},
@@ -1072,89 +343,10 @@
"cell_type": "code",
"execution_count": 11,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
"ch.phonetic_match_chart(data, \"string1\", \"string2\")"
- ]
+ ],
+ "outputs": []
},
{
"attachments": {},
@@ -1178,164 +370,10 @@
"cell_type": "code",
"execution_count": 12,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " string1 | \n",
- " string2 | \n",
- " soundex | \n",
- " metaphone | \n",
- " dmetaphone | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Stephen | \n",
- " Stephen | \n",
- " [S30105, S30105] | \n",
- " [STFN, STFN] | \n",
- " [(STFN, ), (STFN, )] | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Stephen | \n",
- " Steven | \n",
- " [S30105, S30105] | \n",
- " [STFN, STFN] | \n",
- " [(STFN, ), (STFN, )] | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Stephen | \n",
- " Stephan | \n",
- " [S30105, S30105] | \n",
- " [STFN, STFN] | \n",
- " [(STFN, ), (STFN, )] | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Stephen | \n",
- " Steve | \n",
- " [S30105, S3010] | \n",
- " [STFN, STF] | \n",
- " [(STFN, ), (STF, )] | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Stephen | \n",
- " Stehpen | \n",
- " [S30105, S30105] | \n",
- " [STFN, STPN] | \n",
- " [(STFN, ), (STPN, )] | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " Stephen | \n",
- " tSephen | \n",
- " [S30105, t50105] | \n",
- " [STFN, TSFN] | \n",
- " [(STFN, ), (TSFN, )] | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " Stephen | \n",
- " Stephne | \n",
- " [S30105, S301050] | \n",
- " [STFN, STFN] | \n",
- " [(STFN, ), (STFN, )] | \n",
- "
\n",
- " \n",
- " 7 | \n",
- " Stephen | \n",
- " Stphen | \n",
- " [S30105, S3105] | \n",
- " [STFN, STFN] | \n",
- " [(STFN, ), (STFN, )] | \n",
- "
\n",
- " \n",
- " 8 | \n",
- " Stephen | \n",
- " Stepheb | \n",
- " [S30105, S30101] | \n",
- " [STFN, STFP] | \n",
- " [(STFN, ), (STFP, )] | \n",
- "
\n",
- " \n",
- " 9 | \n",
- " Stephen | \n",
- " Stephanie | \n",
- " [S30105, S301050] | \n",
- " [STFN, STFN] | \n",
- " [(STFN, ), (STFN, )] | \n",
- "
\n",
- " \n",
- " 10 | \n",
- " Stephen | \n",
- " Richard | \n",
- " [S30105, R02063] | \n",
- " [STFN, RXRT] | \n",
- " [(STFN, ), (RXRT, RKRT)] | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " string1 string2 soundex metaphone \\\n",
- "0 Stephen Stephen [S30105, S30105] [STFN, STFN] \n",
- "1 Stephen Steven [S30105, S30105] [STFN, STFN] \n",
- "2 Stephen Stephan [S30105, S30105] [STFN, STFN] \n",
- "3 Stephen Steve [S30105, S3010] [STFN, STF] \n",
- "4 Stephen Stehpen [S30105, S30105] [STFN, STPN] \n",
- "5 Stephen tSephen [S30105, t50105] [STFN, TSFN] \n",
- "6 Stephen Stephne [S30105, S301050] [STFN, STFN] \n",
- "7 Stephen Stphen [S30105, S3105] [STFN, STFN] \n",
- "8 Stephen Stepheb [S30105, S30101] [STFN, STFP] \n",
- "9 Stephen Stephanie [S30105, S301050] [STFN, STFN] \n",
- "10 Stephen Richard [S30105, R02063] [STFN, RXRT] \n",
- "\n",
- " dmetaphone \n",
- "0 [(STFN, ), (STFN, )] \n",
- "1 [(STFN, ), (STFN, )] \n",
- "2 [(STFN, ), (STFN, )] \n",
- "3 [(STFN, ), (STF, )] \n",
- "4 [(STFN, ), (STPN, )] \n",
- "5 [(STFN, ), (TSFN, )] \n",
- "6 [(STFN, ), (STFN, )] \n",
- "7 [(STFN, ), (STFN, )] \n",
- "8 [(STFN, ), (STFP, )] \n",
- "9 [(STFN, ), (STFN, )] \n",
- "10 [(STFN, ), (RXRT, RKRT)] "
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
"ch.phonetic_transform_df(data, \"string1\", \"string2\")"
- ]
+ ],
+ "outputs": []
},
{
"attachments": {},
@@ -1359,31 +397,6 @@
"cell_type": "code",
"execution_count": 13,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'output_column_name': 'custom_first_name_first_name_dm',\n",
- " 'comparison_levels': [{'sql_condition': '\"first_name_l\" IS NULL OR \"first_name_r\" IS NULL',\n",
- " 'label_for_charts': 'Null',\n",
- " 'is_null_level': True},\n",
- " {'sql_condition': '\"first_name_l\" = \"first_name_r\"',\n",
- " 'label_for_charts': 'Exact match first_name'},\n",
- " {'sql_condition': '\"first_name_dm_l\" = \"first_name_dm_r\"',\n",
- " 'label_for_charts': 'Exact match first_name_dm'},\n",
- " {'sql_condition': 'levenshtein(\"first_name_l\", \"first_name_r\") <= 2',\n",
- " 'label_for_charts': 'Levenshtein <= 2'},\n",
- " {'sql_condition': 'jaro_winkler_similarity(\"first_name_l\", \"first_name_r\") >= 0.8',\n",
- " 'label_for_charts': 'Jaro_winkler_similarity >= 0.8'},\n",
- " {'sql_condition': 'ELSE', 'label_for_charts': 'All other comparisons'}],\n",
- " 'comparison_description': 'Exact match vs. Names with phonetic exact match vs. First_Name within levenshtein threshold 2 vs. First_Name within jaro_winkler threshold 0.8 vs. anything else'}"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
"import splink.duckdb.comparison_template_library as ctl\n",
"\n",
@@ -1396,7 +409,8 @@
")\n",
"\n",
"first_name_comparison.as_dict()"
- ]
+ ],
+ "outputs": []
},
{
"attachments": {},
diff --git a/scripts/generate_dialect_comparison_docs.py b/scripts/generate_dialect_comparison_docs.py
index ebd9f8d616..fd485cbd56 100644
--- a/scripts/generate_dialect_comparison_docs.py
+++ b/scripts/generate_dialect_comparison_docs.py
@@ -4,8 +4,8 @@
import inspect
from pathlib import Path
-from splink.comparison import Comparison
-from splink.comparison_level import ComparisonLevel
+from splink.internals.comparison import Comparison
+from splink.internals.comparison_level import ComparisonLevel
from splink.dialect_base import DialectBase
# could always pick this up dynamically,
diff --git a/splink/comparison_library.py b/splink/comparison_library.py
index 28c7b17940..8be9600e12 100644
--- a/splink/comparison_library.py
+++ b/splink/comparison_library.py
@@ -2,10 +2,10 @@
from typing import Any, Iterable, List, Optional, Union
-from . import comparison_level_library as cll
-from .comparison_creator import ComparisonCreator
-from .comparison_level_creator import ComparisonLevelCreator
-from .comparison_level_library import CustomLevel, DateMetricType
+from .internals import comparison_level_library as cll
+from splink.internals.comparison_creator import ComparisonCreator
+from splink.internals.comparison_level_creator import ComparisonLevelCreator
+from splink.internals.comparison_level_library import CustomLevel, DateMetricType
from .misc import ensure_is_iterable
diff --git a/splink/comparison_template_library.py b/splink/comparison_template_library.py
index f7664f1214..ba7d74dd00 100644
--- a/splink/comparison_template_library.py
+++ b/splink/comparison_template_library.py
@@ -2,11 +2,11 @@
from typing import List, Type, Union
-from . import comparison_level_library as cll
+from .internals import comparison_level_library as cll
from splink.internals.column_expression import ColumnExpression
-from .comparison_creator import ComparisonCreator
-from .comparison_level_creator import ComparisonLevelCreator
-from .comparison_level_library import DateMetricType
+from splink.internals.comparison_creator import ComparisonCreator
+from splink.internals.comparison_level_creator import ComparisonLevelCreator
+from splink.internals.comparison_level_library import DateMetricType
from .misc import ensure_is_iterable
# alternatively we could stick an inheritance layer in these, just for typing:
diff --git a/splink/dialects.py b/splink/dialects.py
index 3167150943..0941657d16 100644
--- a/splink/dialects.py
+++ b/splink/dialects.py
@@ -4,7 +4,7 @@
from typing import TYPE_CHECKING, Type, TypeVar, final
if TYPE_CHECKING:
- from .comparison_level_library import (
+ from splink.internals.comparison_level_library import (
AbsoluteTimeDifferenceLevel,
ArrayIntersectLevel,
)
diff --git a/splink/em_training_session.py b/splink/em_training_session.py
index e3e58df69a..3edc8b03dd 100644
--- a/splink/em_training_session.py
+++ b/splink/em_training_session.py
@@ -10,8 +10,8 @@
probability_two_random_records_match_iteration_chart,
)
-from .comparison import Comparison
-from .comparison_level import ComparisonLevel
+from splink.internals.comparison import Comparison
+from splink.internals.comparison_level import ComparisonLevel
from .comparison_vector_values import compute_comparison_vector_values_sql
from .constants import LEVEL_NOT_OBSERVED_TEXT
from .database_api import DatabaseAPISubClass
diff --git a/splink/expectation_maximisation.py b/splink/expectation_maximisation.py
index 22eef53447..a5d1b678f9 100644
--- a/splink/expectation_maximisation.py
+++ b/splink/expectation_maximisation.py
@@ -6,8 +6,8 @@
import pandas as pd
-from .comparison import Comparison
-from .comparison_level import ComparisonLevel
+from splink.internals.comparison import Comparison
+from splink.internals.comparison_level import ComparisonLevel
from .constants import LEVEL_NOT_OBSERVED_TEXT
from .database_api import DatabaseAPISubClass
from .input_column import InputColumn
diff --git a/splink/comparison.py b/splink/internals/comparison.py
similarity index 99%
rename from splink/comparison.py
rename to splink/internals/comparison.py
index 46a9d46b77..a6f5500cf5 100644
--- a/splink/comparison.py
+++ b/splink/internals/comparison.py
@@ -3,11 +3,11 @@
from typing import TYPE_CHECKING, Any, List, Optional
from .comparison_level import ComparisonLevel, _default_m_values, _default_u_values
-from .misc import dedupe_preserving_order, join_list_with_commas_final_and
+from splink.misc import dedupe_preserving_order, join_list_with_commas_final_and
# https://stackoverflow.com/questions/39740632/python-type-hinting-without-cyclic-imports
if TYPE_CHECKING:
- from .settings import ColumnInfoSettings
+ from splink.settings import ColumnInfoSettings
class Comparison:
diff --git a/splink/comparison_creator.py b/splink/internals/comparison_creator.py
similarity index 99%
rename from splink/comparison_creator.py
rename to splink/internals/comparison_creator.py
index 8930657332..2f8343248c 100644
--- a/splink/comparison_creator.py
+++ b/splink/internals/comparison_creator.py
@@ -6,7 +6,7 @@
from splink.internals.column_expression import ColumnExpression
from .comparison import Comparison
from .comparison_level_creator import ComparisonLevelCreator
-from .exceptions import SplinkException
+from splink.exceptions import SplinkException
class ComparisonCreator(ABC):
diff --git a/splink/comparison_helpers.py b/splink/internals/comparison_helpers.py
similarity index 100%
rename from splink/comparison_helpers.py
rename to splink/internals/comparison_helpers.py
diff --git a/splink/comparison_level.py b/splink/internals/comparison_level.py
similarity index 99%
rename from splink/comparison_level.py
rename to splink/internals/comparison_level.py
index c177731e6a..63cbd1890f 100644
--- a/splink/comparison_level.py
+++ b/splink/internals/comparison_level.py
@@ -13,16 +13,16 @@
from sqlglot.optimizer.normalize import normalize
from sqlglot.optimizer.simplify import simplify
-from .constants import LEVEL_NOT_OBSERVED_TEXT
-from .input_column import InputColumn
-from .misc import (
+from splink.constants import LEVEL_NOT_OBSERVED_TEXT
+from splink.input_column import InputColumn
+from splink.misc import (
dedupe_preserving_order,
interpolate,
join_list_with_commas_final_and,
match_weight_to_bayes_factor,
)
-from .parse_sql import get_columns_used_from_sql
-from .sql_transform import sqlglot_tree_signature
+from splink.parse_sql import get_columns_used_from_sql
+from splink.sql_transform import sqlglot_tree_signature
logger = logging.getLogger(__name__)
diff --git a/splink/comparison_level_composition.py b/splink/internals/comparison_level_composition.py
similarity index 98%
rename from splink/comparison_level_composition.py
rename to splink/internals/comparison_level_composition.py
index 54f5fbdeb8..e64088aef0 100644
--- a/splink/comparison_level_composition.py
+++ b/splink/internals/comparison_level_composition.py
@@ -6,7 +6,7 @@
from .comparison_creator import ComparisonLevelCreator
from .comparison_level import ComparisonLevel
-from .dialects import SplinkDialect
+from splink.dialects import SplinkDialect
def _ensure_is_comparison_level_creator(
diff --git a/splink/comparison_level_creator.py b/splink/internals/comparison_level_creator.py
similarity index 99%
rename from splink/comparison_level_creator.py
rename to splink/internals/comparison_level_creator.py
index 5f6eee7302..6434c36e8e 100644
--- a/splink/comparison_level_creator.py
+++ b/splink/internals/comparison_level_creator.py
@@ -6,7 +6,7 @@
from splink.internals.column_expression import ColumnExpression
from .comparison_level import ComparisonLevel
-from .dialects import SplinkDialect
+from splink.dialects import SplinkDialect
class ComparisonLevelCreator(ABC):
diff --git a/splink/comparison_level_library.py b/splink/internals/comparison_level_library.py
similarity index 99%
rename from splink/comparison_level_library.py
rename to splink/internals/comparison_level_library.py
index 72b0036111..ad0aa2a2b7 100644
--- a/splink/comparison_level_library.py
+++ b/splink/internals/comparison_level_library.py
@@ -11,8 +11,8 @@
# import composition functions for export
from .comparison_level_composition import And, Not, Or # NOQA: F401
from .comparison_level_creator import ComparisonLevelCreator
-from .comparison_level_sql import great_circle_distance_km_sql
-from .dialects import SplinkDialect
+from splink.comparison_level_sql import great_circle_distance_km_sql
+from splink.dialects import SplinkDialect
# type aliases:
T = TypeVar("T", bound=ComparisonLevelCreator)
diff --git a/splink/linker.py b/splink/linker.py
index 3f52f736ed..54713758ac 100644
--- a/splink/linker.py
+++ b/splink/linker.py
@@ -40,8 +40,8 @@
render_splink_cluster_studio_html,
)
-from .comparison import Comparison
-from .comparison_level import ComparisonLevel
+from splink.internals.comparison import Comparison
+from splink.internals.comparison_level import ComparisonLevel
from .comparison_vector_distribution import (
comparison_vector_distribution_sql,
)
diff --git a/splink/m_u_records_to_parameters.py b/splink/m_u_records_to_parameters.py
index 8c7b6f42de..91376ebc2d 100644
--- a/splink/m_u_records_to_parameters.py
+++ b/splink/m_u_records_to_parameters.py
@@ -3,7 +3,7 @@
import logging
from typing import Any, Dict, List
-from .comparison_level import ComparisonLevel
+from splink.internals.comparison_level import ComparisonLevel
from .constants import LEVEL_NOT_OBSERVED_TEXT
logger = logging.getLogger(__name__)
diff --git a/splink/predict.py b/splink/predict.py
index cb65c2ee27..93150e0569 100644
--- a/splink/predict.py
+++ b/splink/predict.py
@@ -4,7 +4,7 @@
import logging
from typing import List
-from .comparison import Comparison
+from splink.internals.comparison import Comparison
from .input_column import InputColumn
from .misc import prob_to_bayes_factor, prob_to_match_weight
from .settings import CoreModelSettings, Settings
diff --git a/splink/settings.py b/splink/settings.py
index 7c83b7a16a..f40114aa87 100644
--- a/splink/settings.py
+++ b/splink/settings.py
@@ -12,8 +12,8 @@
)
from splink.internals.charts import m_u_parameters_chart, match_weights_chart
-from .comparison import Comparison
-from .comparison_level import ComparisonLevel
+from splink.internals.comparison import Comparison
+from splink.internals.comparison_level import ComparisonLevel
from .input_column import InputColumn
from .misc import dedupe_preserving_order, prob_to_bayes_factor, prob_to_match_weight
from .parse_sql import get_columns_used_from_sql
diff --git a/splink/settings_creator.py b/splink/settings_creator.py
index 407a793799..1c9c9559d1 100644
--- a/splink/settings_creator.py
+++ b/splink/settings_creator.py
@@ -9,7 +9,7 @@
from splink.internals.blocking_rule_creator import BlockingRuleCreator
from splink.internals.blocking_rule_creator_utils import to_blocking_rule_creator
-from .comparison_creator import ComparisonCreator
+from splink.internals.comparison_creator import ComparisonCreator
from .comparison_library import CustomComparison
from .settings import Settings
diff --git a/splink/settings_validation/log_invalid_columns.py b/splink/settings_validation/log_invalid_columns.py
index 4a8abe5aa8..b0d21ff3c4 100644
--- a/splink/settings_validation/log_invalid_columns.py
+++ b/splink/settings_validation/log_invalid_columns.py
@@ -7,7 +7,7 @@
import sqlglot
import sqlglot.expressions
-from ..comparison import Comparison
+from splink.internals.comparison import Comparison
from ..parse_sql import parse_columns_in_sql
from .settings_column_cleaner import (
SettingsColumnCleaner,
diff --git a/splink/waterfall_chart.py b/splink/waterfall_chart.py
index 8747bf2f95..900c813bdb 100644
--- a/splink/waterfall_chart.py
+++ b/splink/waterfall_chart.py
@@ -4,7 +4,7 @@
from copy import deepcopy
from typing import Any, Dict
-from .comparison import Comparison
+from splink.internals.comparison import Comparison
from .misc import prob_to_bayes_factor
diff --git a/tests/literal_utils.py b/tests/literal_utils.py
index f7095088ee..8c63b8c016 100644
--- a/tests/literal_utils.py
+++ b/tests/literal_utils.py
@@ -3,8 +3,8 @@
import pytest
-from splink.comparison_creator import ComparisonCreator
-from splink.comparison_level_creator import ComparisonLevelCreator
+from splink.internals.comparison_creator import ComparisonCreator
+from splink.internals.comparison_level_creator import ComparisonLevelCreator
class ComparisonLevelTestSpec:
diff --git a/tests/test_columns_selected.py b/tests/test_columns_selected.py
index 19bd2f111f..fe0efe7fa8 100644
--- a/tests/test_columns_selected.py
+++ b/tests/test_columns_selected.py
@@ -4,7 +4,7 @@
import pandas as pd
-import splink.comparison_level_library as cll
+import splink.internals.comparison_level_library as cll
from splink.duckdb.database_api import DuckDBAPI
from splink.linker import Linker
diff --git a/tests/test_comparison_level.py b/tests/test_comparison_level.py
index 968f13d982..dd25797422 100644
--- a/tests/test_comparison_level.py
+++ b/tests/test_comparison_level.py
@@ -1,6 +1,6 @@
from pytest import mark, raises
-from splink.comparison_level import ComparisonLevel
+from splink.internals.comparison_level import ComparisonLevel
from .decorator import mark_with_dialects_excluding
diff --git a/tests/test_comparison_level_composition.py b/tests/test_comparison_level_composition.py
index 9cb694113f..9967d05341 100644
--- a/tests/test_comparison_level_composition.py
+++ b/tests/test_comparison_level_composition.py
@@ -1,7 +1,7 @@
import pandas as pd
import pytest
-import splink.comparison_level_library as cll
+import splink.internals.comparison_level_library as cll
from splink.input_column import _get_dialect_quotes
from .decorator import mark_with_dialects_excluding
diff --git a/tests/test_comparison_level_lib.py b/tests/test_comparison_level_lib.py
index 4ae459cb45..60c2a4c09e 100644
--- a/tests/test_comparison_level_lib.py
+++ b/tests/test_comparison_level_lib.py
@@ -1,6 +1,6 @@
import pandas as pd
-import splink.comparison_level_library as cll
+import splink.internals.comparison_level_library as cll
from .decorator import mark_with_dialects_excluding
diff --git a/tests/test_compound_comparison_levels.py b/tests/test_compound_comparison_levels.py
index 107a9b8a60..ee802575e4 100644
--- a/tests/test_compound_comparison_levels.py
+++ b/tests/test_compound_comparison_levels.py
@@ -1,6 +1,6 @@
import pandas as pd
-import splink.comparison_level_library as cll
+import splink.internals.comparison_level_library as cll
import splink.comparison_library as cl
from splink.duckdb.database_api import DuckDBAPI
from splink.linker import Linker
diff --git a/tests/test_date_levels_and_comparisons.py b/tests/test_date_levels_and_comparisons.py
index 6d6291219b..199bd2b1c0 100644
--- a/tests/test_date_levels_and_comparisons.py
+++ b/tests/test_date_levels_and_comparisons.py
@@ -2,7 +2,7 @@
import pytest
-import splink.comparison_level_library as cll
+import splink.internals.comparison_level_library as cll
import splink.comparison_library as cl
import splink.comparison_template_library as ctl
from splink.internals.column_expression import ColumnExpression
diff --git a/tests/test_disable_tf_exact_match_detection.py b/tests/test_disable_tf_exact_match_detection.py
index ddf7257f5f..5b4ab37f9c 100644
--- a/tests/test_disable_tf_exact_match_detection.py
+++ b/tests/test_disable_tf_exact_match_detection.py
@@ -1,7 +1,7 @@
import pandas as pd
import pytest
-import splink.comparison_level_library as cll
+import splink.internals.comparison_level_library as cll
from splink import DuckDBAPI, Linker, SettingsCreator
diff --git a/tests/test_full_example_duckdb.py b/tests/test_full_example_duckdb.py
index 8eb6268951..afe2175a58 100644
--- a/tests/test_full_example_duckdb.py
+++ b/tests/test_full_example_duckdb.py
@@ -6,7 +6,7 @@
import pyarrow.parquet as pq
import pytest
-import splink.comparison_level_library as cll
+import splink.internals.comparison_level_library as cll
import splink.comparison_library as cl
from splink.blocking_analysis import count_comparisons_from_blocking_rule
from splink.duckdb.database_api import DuckDBAPI
diff --git a/tests/test_full_example_spark.py b/tests/test_full_example_spark.py
index 08246a6516..9f52b481c7 100644
--- a/tests/test_full_example_spark.py
+++ b/tests/test_full_example_spark.py
@@ -5,7 +5,7 @@
import pytest
from pyspark.sql.types import StringType, StructField, StructType
-import splink.comparison_level_library as cll
+import splink.internals.comparison_level_library as cll
import splink.comparison_library as cl
from splink.exploratory import completeness_chart, profile_columns
from splink.linker import Linker
diff --git a/tests/test_km_distance_level.py b/tests/test_km_distance_level.py
index 771f484a8f..862b70b234 100644
--- a/tests/test_km_distance_level.py
+++ b/tests/test_km_distance_level.py
@@ -1,6 +1,6 @@
import pandas as pd
-import splink.comparison_level_library as cll
+import splink.internals.comparison_level_library as cll
import splink.comparison_library as cl
from splink.duckdb.database_api import DuckDBAPI
from splink.linker import Linker
diff --git a/tests/test_new_comparison_levels.py b/tests/test_new_comparison_levels.py
index 2db086dc8e..4153256733 100644
--- a/tests/test_new_comparison_levels.py
+++ b/tests/test_new_comparison_levels.py
@@ -3,7 +3,7 @@
import pandas as pd
import pytest
-import splink.comparison_level_library as cll
+import splink.internals.comparison_level_library as cll
import splink.comparison_library as cl
import splink.comparison_template_library as ctl
from splink.internals.column_expression import ColumnExpression
diff --git a/tests/test_new_db_api.py b/tests/test_new_db_api.py
index 5019b44a92..9c15eddb54 100644
--- a/tests/test_new_db_api.py
+++ b/tests/test_new_db_api.py
@@ -1,6 +1,6 @@
import os
-import splink.comparison_level_library as cll
+import splink.internals.comparison_level_library as cll
import splink.comparison_library as cl
from splink import block_on
from splink.blocking_analysis import (
diff --git a/tests/test_regex_param.py b/tests/test_regex_param.py
index 5d1594d28d..a9d02642b4 100644
--- a/tests/test_regex_param.py
+++ b/tests/test_regex_param.py
@@ -1,7 +1,7 @@
import pandas as pd
import pytest
-import splink.comparison_level_library as cll
+import splink.internals.comparison_level_library as cll
from splink.internals.column_expression import ColumnExpression
from .decorator import mark_with_dialects_excluding
diff --git a/tests/test_spark_udfs.py b/tests/test_spark_udfs.py
index 73b28a6964..06599c169d 100644
--- a/tests/test_spark_udfs.py
+++ b/tests/test_spark_udfs.py
@@ -1,6 +1,6 @@
import pandas as pd
-import splink.comparison_level_library as cll
+import splink.internals.comparison_level_library as cll
from splink.linker import Linker
from tests.decorator import mark_with_dialects_including