diff --git a/docs/topic_guides/comparisons/choosing_comparators.ipynb b/docs/topic_guides/comparisons/choosing_comparators.ipynb index 28c8b2f73f..32172e4e5b 100644 --- a/docs/topic_guides/comparisons/choosing_comparators.ipynb +++ b/docs/topic_guides/comparisons/choosing_comparators.ipynb @@ -51,70 +51,12 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
string1string2levenshtein_distancedamerau_levenshtein_distancejaro_similarityjaro_winkler_similarityjaccard_similarity
0RichardiRchard210.950.951.0
\n", - "
" - ], - "text/plain": [ - " string1 string2 levenshtein_distance damerau_levenshtein_distance \\\n", - "0 Richard iRchard 2 1 \n", - "\n", - " jaro_similarity jaro_winkler_similarity jaccard_similarity \n", - "0 0.95 0.95 1.0 " - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "import splink.comparison_helpers as ch\n", "\n", "ch.comparator_score(\"Richard\", \"iRchard\")" - ] + ], + "outputs": [] }, { "attachments": {}, @@ -128,131 +70,6 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
string1string2error_type
0RichardRichardNone
1RichardichardDeletion
2RichardRicharDeletion
3RichardiRchardTransposition
4RichardRichadrTransposition
5RichardRichShortening
6RichardRickNickname/Alias
7RichardRickyNickname/Alias
8RichardDickNickname/Alias
9RichardRicoNickname/Alias
10RichardRachaelDifferent Name
11RichardStephenDifferent Name
\n", - "
" - ], - "text/plain": [ - " string1 string2 error_type\n", - "0 Richard Richard None\n", - "1 Richard ichard Deletion\n", - "2 Richard Richar Deletion\n", - "3 Richard iRchard Transposition\n", - "4 Richard Richadr Transposition\n", - "5 Richard Rich Shortening\n", - "6 Richard Rick Nickname/Alias\n", - "7 Richard Ricky Nickname/Alias\n", - "8 Richard Dick Nickname/Alias\n", - "9 Richard Rico Nickname/Alias\n", - "10 Richard Rachael Different Name\n", - "11 Richard Stephen Different Name" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "import pandas as pd\n", "\n", @@ -302,7 +119,8 @@ "}\n", "df = pd.DataFrame(data)\n", "df" - ] + ], + "outputs": [] }, { "attachments": {}, @@ -316,107 +134,10 @@ "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/rosskennedy/splink/splink/comparison_helpers.py:121: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " similarity_df[\"comparator\"] = similarity_df[\"comparator\"].str.replace(\n", - "/Users/rosskennedy/splink/splink/comparison_helpers.py:126: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " distance_df[\"comparator\"] = distance_df[\"comparator\"].str.replace(\"_distance\", \"\")\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.HConcatChart(...)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "ch.comparator_score_chart(data, \"string1\", \"string2\")" - ] + ], + "outputs": [] }, { "attachments": {}, @@ -438,200 +159,10 @@ "cell_type": "code", "execution_count": 4, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
string1string2levenshtein_distancedamerau_levenshtein_distancejaro_similarityjaro_winkler_similarityjaccard_similarity
0RichardRichard001.001.001.00
1Richardichard110.950.950.86
2RichardRichar110.950.970.86
3RichardiRchard210.950.951.00
4RichardRichadr210.950.971.00
5RichardRich330.860.910.57
6RichardRick440.730.810.38
7RichardRicky440.680.680.33
8RichardDick550.600.600.22
9RichardRico440.730.810.38
10RichardRachael330.710.740.44
11RichardStephen770.430.430.08
\n", - "
" - ], - "text/plain": [ - " string1 string2 levenshtein_distance damerau_levenshtein_distance \\\n", - "0 Richard Richard 0 0 \n", - "1 Richard ichard 1 1 \n", - "2 Richard Richar 1 1 \n", - "3 Richard iRchard 2 1 \n", - "4 Richard Richadr 2 1 \n", - "5 Richard Rich 3 3 \n", - "6 Richard Rick 4 4 \n", - "7 Richard Ricky 4 4 \n", - "8 Richard Dick 5 5 \n", - "9 Richard Rico 4 4 \n", - "10 Richard Rachael 3 3 \n", - "11 Richard Stephen 7 7 \n", - "\n", - " jaro_similarity jaro_winkler_similarity jaccard_similarity \n", - "0 1.00 1.00 1.00 \n", - "1 0.95 0.95 0.86 \n", - "2 0.95 0.97 0.86 \n", - "3 0.95 0.95 1.00 \n", - "4 0.95 0.97 1.00 \n", - "5 0.86 0.91 0.57 \n", - "6 0.73 0.81 0.38 \n", - "7 0.68 0.68 0.33 \n", - "8 0.60 0.60 0.22 \n", - "9 0.73 0.81 0.38 \n", - "10 0.71 0.74 0.44 \n", - "11 0.43 0.43 0.08 " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "ch.comparator_score_df(data, \"string1\", \"string2\")" - ] + ], + "outputs": [] }, { "attachments": {}, @@ -647,109 +178,12 @@ "cell_type": "code", "execution_count": 5, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/rosskennedy/splink/splink/comparison_helpers.py:172: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " similarity_df[\"comparator\"] = similarity_df[\"comparator\"].str.replace(\n", - "/Users/rosskennedy/splink/splink/comparison_helpers.py:177: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " distance_df[\"comparator\"] = distance_df[\"comparator\"].str.replace(\"_distance\", \"\")\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.HConcatChart(...)" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "ch.comparator_score_threshold_chart(\n", " data, \"string1\", \"string2\", distance_threshold=2, similarity_threshold=0.8\n", ")" - ] + ], + "outputs": [] }, { "attachments": {}, @@ -767,12 +201,12 @@ "cell_type": "code", "execution_count": 6, "metadata": {}, - "outputs": [], "source": [ "import splink.duckdb.comparison_library as cl\n", "\n", "first_name_comparison = cl.jaro_winkler_at_thresholds(\"first_name\", [0.9, 0.8, 0.7])" - ] + ], + "outputs": [] }, { "attachments": {}, @@ -786,34 +220,10 @@ "cell_type": "code", "execution_count": 7, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'output_column_name': 'first_name',\n", - " 'comparison_levels': [{'sql_condition': '\"first_name_l\" IS NULL OR \"first_name_r\" IS NULL',\n", - " 'label_for_charts': 'Null',\n", - " 'is_null_level': True},\n", - " {'sql_condition': '\"first_name_l\" = \"first_name_r\"',\n", - " 'label_for_charts': 'Exact match'},\n", - " {'sql_condition': 'jaro_winkler_similarity(\"first_name_l\", \"first_name_r\") >= 0.9',\n", - " 'label_for_charts': 'Jaro_winkler_similarity >= 0.9'},\n", - " {'sql_condition': 'jaro_winkler_similarity(\"first_name_l\", \"first_name_r\") >= 0.8',\n", - " 'label_for_charts': 'Jaro_winkler_similarity >= 0.8'},\n", - " {'sql_condition': 'jaro_winkler_similarity(\"first_name_l\", \"first_name_r\") >= 0.7',\n", - " 'label_for_charts': 'Jaro_winkler_similarity >= 0.7'},\n", - " {'sql_condition': 'ELSE', 'label_for_charts': 'All other comparisons'}],\n", - " 'comparison_description': 'Exact match vs. First_Name within jaro_winkler_similarity thresholds 0.9, 0.8, 0.7 vs. anything else'}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "first_name_comparison.as_dict()" - ] + ], + "outputs": [] }, { "attachments": {}, @@ -845,43 +255,21 @@ "cell_type": "code", "execution_count": 8, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'soundex': 'R02063', 'metaphone': 'RXRT', 'dmetaphone': ('RXRT', 'RKRT')}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "import splink.comparison_helpers\n", "\n", "ch.phonetic_transform(\"Richard\")" - ] + ], + "outputs": [] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'soundex': 'S30105', 'metaphone': 'STFN', 'dmetaphone': ('STFN', '')}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "ch.phonetic_transform(\"Steven\")" - ] + ], + "outputs": [] }, { "attachments": {}, @@ -895,124 +283,6 @@ "cell_type": "code", "execution_count": 10, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
string1string2error_type
0StephenStephenNone
1StephenStevenSpelling Variation
2StephenStephanSpelling Variation/Similar Name
3StephenSteveNickname/Alias
4StephenStehpenTransposition
5StephentSephenTransposition
6StephenStephneTransposition
7StephenStphenDeletion
8StephenStephebReplacement
9StephenStephanieDifferent Name
10StephenRichardDifferent Name
\n", - "
" - ], - "text/plain": [ - " string1 string2 error_type\n", - "0 Stephen Stephen None\n", - "1 Stephen Steven Spelling Variation\n", - "2 Stephen Stephan Spelling Variation/Similar Name\n", - "3 Stephen Steve Nickname/Alias\n", - "4 Stephen Stehpen Transposition\n", - "5 Stephen tSephen Transposition\n", - "6 Stephen Stephne Transposition\n", - "7 Stephen Stphen Deletion\n", - "8 Stephen Stepheb Replacement\n", - "9 Stephen Stephanie Different Name\n", - "10 Stephen Richard Different Name" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "data = {\n", " \"string1\": [\n", @@ -1058,7 +328,8 @@ "\n", "df = pd.DataFrame(data)\n", "df" - ] + ], + "outputs": [] }, { "attachments": {}, @@ -1072,89 +343,10 @@ "cell_type": "code", "execution_count": 11, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "ch.phonetic_match_chart(data, \"string1\", \"string2\")" - ] + ], + "outputs": [] }, { "attachments": {}, @@ -1178,164 +370,10 @@ "cell_type": "code", "execution_count": 12, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
string1string2soundexmetaphonedmetaphone
0StephenStephen[S30105, S30105][STFN, STFN][(STFN, ), (STFN, )]
1StephenSteven[S30105, S30105][STFN, STFN][(STFN, ), (STFN, )]
2StephenStephan[S30105, S30105][STFN, STFN][(STFN, ), (STFN, )]
3StephenSteve[S30105, S3010][STFN, STF][(STFN, ), (STF, )]
4StephenStehpen[S30105, S30105][STFN, STPN][(STFN, ), (STPN, )]
5StephentSephen[S30105, t50105][STFN, TSFN][(STFN, ), (TSFN, )]
6StephenStephne[S30105, S301050][STFN, STFN][(STFN, ), (STFN, )]
7StephenStphen[S30105, S3105][STFN, STFN][(STFN, ), (STFN, )]
8StephenStepheb[S30105, S30101][STFN, STFP][(STFN, ), (STFP, )]
9StephenStephanie[S30105, S301050][STFN, STFN][(STFN, ), (STFN, )]
10StephenRichard[S30105, R02063][STFN, RXRT][(STFN, ), (RXRT, RKRT)]
\n", - "
" - ], - "text/plain": [ - " string1 string2 soundex metaphone \\\n", - "0 Stephen Stephen [S30105, S30105] [STFN, STFN] \n", - "1 Stephen Steven [S30105, S30105] [STFN, STFN] \n", - "2 Stephen Stephan [S30105, S30105] [STFN, STFN] \n", - "3 Stephen Steve [S30105, S3010] [STFN, STF] \n", - "4 Stephen Stehpen [S30105, S30105] [STFN, STPN] \n", - "5 Stephen tSephen [S30105, t50105] [STFN, TSFN] \n", - "6 Stephen Stephne [S30105, S301050] [STFN, STFN] \n", - "7 Stephen Stphen [S30105, S3105] [STFN, STFN] \n", - "8 Stephen Stepheb [S30105, S30101] [STFN, STFP] \n", - "9 Stephen Stephanie [S30105, S301050] [STFN, STFN] \n", - "10 Stephen Richard [S30105, R02063] [STFN, RXRT] \n", - "\n", - " dmetaphone \n", - "0 [(STFN, ), (STFN, )] \n", - "1 [(STFN, ), (STFN, )] \n", - "2 [(STFN, ), (STFN, )] \n", - "3 [(STFN, ), (STF, )] \n", - "4 [(STFN, ), (STPN, )] \n", - "5 [(STFN, ), (TSFN, )] \n", - "6 [(STFN, ), (STFN, )] \n", - "7 [(STFN, ), (STFN, )] \n", - "8 [(STFN, ), (STFP, )] \n", - "9 [(STFN, ), (STFN, )] \n", - "10 [(STFN, ), (RXRT, RKRT)] " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "ch.phonetic_transform_df(data, \"string1\", \"string2\")" - ] + ], + "outputs": [] }, { "attachments": {}, @@ -1359,31 +397,6 @@ "cell_type": "code", "execution_count": 13, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'output_column_name': 'custom_first_name_first_name_dm',\n", - " 'comparison_levels': [{'sql_condition': '\"first_name_l\" IS NULL OR \"first_name_r\" IS NULL',\n", - " 'label_for_charts': 'Null',\n", - " 'is_null_level': True},\n", - " {'sql_condition': '\"first_name_l\" = \"first_name_r\"',\n", - " 'label_for_charts': 'Exact match first_name'},\n", - " {'sql_condition': '\"first_name_dm_l\" = \"first_name_dm_r\"',\n", - " 'label_for_charts': 'Exact match first_name_dm'},\n", - " {'sql_condition': 'levenshtein(\"first_name_l\", \"first_name_r\") <= 2',\n", - " 'label_for_charts': 'Levenshtein <= 2'},\n", - " {'sql_condition': 'jaro_winkler_similarity(\"first_name_l\", \"first_name_r\") >= 0.8',\n", - " 'label_for_charts': 'Jaro_winkler_similarity >= 0.8'},\n", - " {'sql_condition': 'ELSE', 'label_for_charts': 'All other comparisons'}],\n", - " 'comparison_description': 'Exact match vs. Names with phonetic exact match vs. First_Name within levenshtein threshold 2 vs. First_Name within jaro_winkler threshold 0.8 vs. anything else'}" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "import splink.duckdb.comparison_template_library as ctl\n", "\n", @@ -1396,7 +409,8 @@ ")\n", "\n", "first_name_comparison.as_dict()" - ] + ], + "outputs": [] }, { "attachments": {}, diff --git a/scripts/generate_dialect_comparison_docs.py b/scripts/generate_dialect_comparison_docs.py index ebd9f8d616..fd485cbd56 100644 --- a/scripts/generate_dialect_comparison_docs.py +++ b/scripts/generate_dialect_comparison_docs.py @@ -4,8 +4,8 @@ import inspect from pathlib import Path -from splink.comparison import Comparison -from splink.comparison_level import ComparisonLevel +from splink.internals.comparison import Comparison +from splink.internals.comparison_level import ComparisonLevel from splink.dialect_base import DialectBase # could always pick this up dynamically, diff --git a/splink/comparison_library.py b/splink/comparison_library.py index 28c7b17940..8be9600e12 100644 --- a/splink/comparison_library.py +++ b/splink/comparison_library.py @@ -2,10 +2,10 @@ from typing import Any, Iterable, List, Optional, Union -from . import comparison_level_library as cll -from .comparison_creator import ComparisonCreator -from .comparison_level_creator import ComparisonLevelCreator -from .comparison_level_library import CustomLevel, DateMetricType +from .internals import comparison_level_library as cll +from splink.internals.comparison_creator import ComparisonCreator +from splink.internals.comparison_level_creator import ComparisonLevelCreator +from splink.internals.comparison_level_library import CustomLevel, DateMetricType from .misc import ensure_is_iterable diff --git a/splink/comparison_template_library.py b/splink/comparison_template_library.py index f7664f1214..ba7d74dd00 100644 --- a/splink/comparison_template_library.py +++ b/splink/comparison_template_library.py @@ -2,11 +2,11 @@ from typing import List, Type, Union -from . import comparison_level_library as cll +from .internals import comparison_level_library as cll from splink.internals.column_expression import ColumnExpression -from .comparison_creator import ComparisonCreator -from .comparison_level_creator import ComparisonLevelCreator -from .comparison_level_library import DateMetricType +from splink.internals.comparison_creator import ComparisonCreator +from splink.internals.comparison_level_creator import ComparisonLevelCreator +from splink.internals.comparison_level_library import DateMetricType from .misc import ensure_is_iterable # alternatively we could stick an inheritance layer in these, just for typing: diff --git a/splink/dialects.py b/splink/dialects.py index 3167150943..0941657d16 100644 --- a/splink/dialects.py +++ b/splink/dialects.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING, Type, TypeVar, final if TYPE_CHECKING: - from .comparison_level_library import ( + from splink.internals.comparison_level_library import ( AbsoluteTimeDifferenceLevel, ArrayIntersectLevel, ) diff --git a/splink/em_training_session.py b/splink/em_training_session.py index e3e58df69a..3edc8b03dd 100644 --- a/splink/em_training_session.py +++ b/splink/em_training_session.py @@ -10,8 +10,8 @@ probability_two_random_records_match_iteration_chart, ) -from .comparison import Comparison -from .comparison_level import ComparisonLevel +from splink.internals.comparison import Comparison +from splink.internals.comparison_level import ComparisonLevel from .comparison_vector_values import compute_comparison_vector_values_sql from .constants import LEVEL_NOT_OBSERVED_TEXT from .database_api import DatabaseAPISubClass diff --git a/splink/expectation_maximisation.py b/splink/expectation_maximisation.py index 22eef53447..a5d1b678f9 100644 --- a/splink/expectation_maximisation.py +++ b/splink/expectation_maximisation.py @@ -6,8 +6,8 @@ import pandas as pd -from .comparison import Comparison -from .comparison_level import ComparisonLevel +from splink.internals.comparison import Comparison +from splink.internals.comparison_level import ComparisonLevel from .constants import LEVEL_NOT_OBSERVED_TEXT from .database_api import DatabaseAPISubClass from .input_column import InputColumn diff --git a/splink/comparison.py b/splink/internals/comparison.py similarity index 99% rename from splink/comparison.py rename to splink/internals/comparison.py index 46a9d46b77..a6f5500cf5 100644 --- a/splink/comparison.py +++ b/splink/internals/comparison.py @@ -3,11 +3,11 @@ from typing import TYPE_CHECKING, Any, List, Optional from .comparison_level import ComparisonLevel, _default_m_values, _default_u_values -from .misc import dedupe_preserving_order, join_list_with_commas_final_and +from splink.misc import dedupe_preserving_order, join_list_with_commas_final_and # https://stackoverflow.com/questions/39740632/python-type-hinting-without-cyclic-imports if TYPE_CHECKING: - from .settings import ColumnInfoSettings + from splink.settings import ColumnInfoSettings class Comparison: diff --git a/splink/comparison_creator.py b/splink/internals/comparison_creator.py similarity index 99% rename from splink/comparison_creator.py rename to splink/internals/comparison_creator.py index 8930657332..2f8343248c 100644 --- a/splink/comparison_creator.py +++ b/splink/internals/comparison_creator.py @@ -6,7 +6,7 @@ from splink.internals.column_expression import ColumnExpression from .comparison import Comparison from .comparison_level_creator import ComparisonLevelCreator -from .exceptions import SplinkException +from splink.exceptions import SplinkException class ComparisonCreator(ABC): diff --git a/splink/comparison_helpers.py b/splink/internals/comparison_helpers.py similarity index 100% rename from splink/comparison_helpers.py rename to splink/internals/comparison_helpers.py diff --git a/splink/comparison_level.py b/splink/internals/comparison_level.py similarity index 99% rename from splink/comparison_level.py rename to splink/internals/comparison_level.py index c177731e6a..63cbd1890f 100644 --- a/splink/comparison_level.py +++ b/splink/internals/comparison_level.py @@ -13,16 +13,16 @@ from sqlglot.optimizer.normalize import normalize from sqlglot.optimizer.simplify import simplify -from .constants import LEVEL_NOT_OBSERVED_TEXT -from .input_column import InputColumn -from .misc import ( +from splink.constants import LEVEL_NOT_OBSERVED_TEXT +from splink.input_column import InputColumn +from splink.misc import ( dedupe_preserving_order, interpolate, join_list_with_commas_final_and, match_weight_to_bayes_factor, ) -from .parse_sql import get_columns_used_from_sql -from .sql_transform import sqlglot_tree_signature +from splink.parse_sql import get_columns_used_from_sql +from splink.sql_transform import sqlglot_tree_signature logger = logging.getLogger(__name__) diff --git a/splink/comparison_level_composition.py b/splink/internals/comparison_level_composition.py similarity index 98% rename from splink/comparison_level_composition.py rename to splink/internals/comparison_level_composition.py index 54f5fbdeb8..e64088aef0 100644 --- a/splink/comparison_level_composition.py +++ b/splink/internals/comparison_level_composition.py @@ -6,7 +6,7 @@ from .comparison_creator import ComparisonLevelCreator from .comparison_level import ComparisonLevel -from .dialects import SplinkDialect +from splink.dialects import SplinkDialect def _ensure_is_comparison_level_creator( diff --git a/splink/comparison_level_creator.py b/splink/internals/comparison_level_creator.py similarity index 99% rename from splink/comparison_level_creator.py rename to splink/internals/comparison_level_creator.py index 5f6eee7302..6434c36e8e 100644 --- a/splink/comparison_level_creator.py +++ b/splink/internals/comparison_level_creator.py @@ -6,7 +6,7 @@ from splink.internals.column_expression import ColumnExpression from .comparison_level import ComparisonLevel -from .dialects import SplinkDialect +from splink.dialects import SplinkDialect class ComparisonLevelCreator(ABC): diff --git a/splink/comparison_level_library.py b/splink/internals/comparison_level_library.py similarity index 99% rename from splink/comparison_level_library.py rename to splink/internals/comparison_level_library.py index 72b0036111..ad0aa2a2b7 100644 --- a/splink/comparison_level_library.py +++ b/splink/internals/comparison_level_library.py @@ -11,8 +11,8 @@ # import composition functions for export from .comparison_level_composition import And, Not, Or # NOQA: F401 from .comparison_level_creator import ComparisonLevelCreator -from .comparison_level_sql import great_circle_distance_km_sql -from .dialects import SplinkDialect +from splink.comparison_level_sql import great_circle_distance_km_sql +from splink.dialects import SplinkDialect # type aliases: T = TypeVar("T", bound=ComparisonLevelCreator) diff --git a/splink/linker.py b/splink/linker.py index 3f52f736ed..54713758ac 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -40,8 +40,8 @@ render_splink_cluster_studio_html, ) -from .comparison import Comparison -from .comparison_level import ComparisonLevel +from splink.internals.comparison import Comparison +from splink.internals.comparison_level import ComparisonLevel from .comparison_vector_distribution import ( comparison_vector_distribution_sql, ) diff --git a/splink/m_u_records_to_parameters.py b/splink/m_u_records_to_parameters.py index 8c7b6f42de..91376ebc2d 100644 --- a/splink/m_u_records_to_parameters.py +++ b/splink/m_u_records_to_parameters.py @@ -3,7 +3,7 @@ import logging from typing import Any, Dict, List -from .comparison_level import ComparisonLevel +from splink.internals.comparison_level import ComparisonLevel from .constants import LEVEL_NOT_OBSERVED_TEXT logger = logging.getLogger(__name__) diff --git a/splink/predict.py b/splink/predict.py index cb65c2ee27..93150e0569 100644 --- a/splink/predict.py +++ b/splink/predict.py @@ -4,7 +4,7 @@ import logging from typing import List -from .comparison import Comparison +from splink.internals.comparison import Comparison from .input_column import InputColumn from .misc import prob_to_bayes_factor, prob_to_match_weight from .settings import CoreModelSettings, Settings diff --git a/splink/settings.py b/splink/settings.py index 7c83b7a16a..f40114aa87 100644 --- a/splink/settings.py +++ b/splink/settings.py @@ -12,8 +12,8 @@ ) from splink.internals.charts import m_u_parameters_chart, match_weights_chart -from .comparison import Comparison -from .comparison_level import ComparisonLevel +from splink.internals.comparison import Comparison +from splink.internals.comparison_level import ComparisonLevel from .input_column import InputColumn from .misc import dedupe_preserving_order, prob_to_bayes_factor, prob_to_match_weight from .parse_sql import get_columns_used_from_sql diff --git a/splink/settings_creator.py b/splink/settings_creator.py index 407a793799..1c9c9559d1 100644 --- a/splink/settings_creator.py +++ b/splink/settings_creator.py @@ -9,7 +9,7 @@ from splink.internals.blocking_rule_creator import BlockingRuleCreator from splink.internals.blocking_rule_creator_utils import to_blocking_rule_creator -from .comparison_creator import ComparisonCreator +from splink.internals.comparison_creator import ComparisonCreator from .comparison_library import CustomComparison from .settings import Settings diff --git a/splink/settings_validation/log_invalid_columns.py b/splink/settings_validation/log_invalid_columns.py index 4a8abe5aa8..b0d21ff3c4 100644 --- a/splink/settings_validation/log_invalid_columns.py +++ b/splink/settings_validation/log_invalid_columns.py @@ -7,7 +7,7 @@ import sqlglot import sqlglot.expressions -from ..comparison import Comparison +from splink.internals.comparison import Comparison from ..parse_sql import parse_columns_in_sql from .settings_column_cleaner import ( SettingsColumnCleaner, diff --git a/splink/waterfall_chart.py b/splink/waterfall_chart.py index 8747bf2f95..900c813bdb 100644 --- a/splink/waterfall_chart.py +++ b/splink/waterfall_chart.py @@ -4,7 +4,7 @@ from copy import deepcopy from typing import Any, Dict -from .comparison import Comparison +from splink.internals.comparison import Comparison from .misc import prob_to_bayes_factor diff --git a/tests/literal_utils.py b/tests/literal_utils.py index f7095088ee..8c63b8c016 100644 --- a/tests/literal_utils.py +++ b/tests/literal_utils.py @@ -3,8 +3,8 @@ import pytest -from splink.comparison_creator import ComparisonCreator -from splink.comparison_level_creator import ComparisonLevelCreator +from splink.internals.comparison_creator import ComparisonCreator +from splink.internals.comparison_level_creator import ComparisonLevelCreator class ComparisonLevelTestSpec: diff --git a/tests/test_columns_selected.py b/tests/test_columns_selected.py index 19bd2f111f..fe0efe7fa8 100644 --- a/tests/test_columns_selected.py +++ b/tests/test_columns_selected.py @@ -4,7 +4,7 @@ import pandas as pd -import splink.comparison_level_library as cll +import splink.internals.comparison_level_library as cll from splink.duckdb.database_api import DuckDBAPI from splink.linker import Linker diff --git a/tests/test_comparison_level.py b/tests/test_comparison_level.py index 968f13d982..dd25797422 100644 --- a/tests/test_comparison_level.py +++ b/tests/test_comparison_level.py @@ -1,6 +1,6 @@ from pytest import mark, raises -from splink.comparison_level import ComparisonLevel +from splink.internals.comparison_level import ComparisonLevel from .decorator import mark_with_dialects_excluding diff --git a/tests/test_comparison_level_composition.py b/tests/test_comparison_level_composition.py index 9cb694113f..9967d05341 100644 --- a/tests/test_comparison_level_composition.py +++ b/tests/test_comparison_level_composition.py @@ -1,7 +1,7 @@ import pandas as pd import pytest -import splink.comparison_level_library as cll +import splink.internals.comparison_level_library as cll from splink.input_column import _get_dialect_quotes from .decorator import mark_with_dialects_excluding diff --git a/tests/test_comparison_level_lib.py b/tests/test_comparison_level_lib.py index 4ae459cb45..60c2a4c09e 100644 --- a/tests/test_comparison_level_lib.py +++ b/tests/test_comparison_level_lib.py @@ -1,6 +1,6 @@ import pandas as pd -import splink.comparison_level_library as cll +import splink.internals.comparison_level_library as cll from .decorator import mark_with_dialects_excluding diff --git a/tests/test_compound_comparison_levels.py b/tests/test_compound_comparison_levels.py index 107a9b8a60..ee802575e4 100644 --- a/tests/test_compound_comparison_levels.py +++ b/tests/test_compound_comparison_levels.py @@ -1,6 +1,6 @@ import pandas as pd -import splink.comparison_level_library as cll +import splink.internals.comparison_level_library as cll import splink.comparison_library as cl from splink.duckdb.database_api import DuckDBAPI from splink.linker import Linker diff --git a/tests/test_date_levels_and_comparisons.py b/tests/test_date_levels_and_comparisons.py index 6d6291219b..199bd2b1c0 100644 --- a/tests/test_date_levels_and_comparisons.py +++ b/tests/test_date_levels_and_comparisons.py @@ -2,7 +2,7 @@ import pytest -import splink.comparison_level_library as cll +import splink.internals.comparison_level_library as cll import splink.comparison_library as cl import splink.comparison_template_library as ctl from splink.internals.column_expression import ColumnExpression diff --git a/tests/test_disable_tf_exact_match_detection.py b/tests/test_disable_tf_exact_match_detection.py index ddf7257f5f..5b4ab37f9c 100644 --- a/tests/test_disable_tf_exact_match_detection.py +++ b/tests/test_disable_tf_exact_match_detection.py @@ -1,7 +1,7 @@ import pandas as pd import pytest -import splink.comparison_level_library as cll +import splink.internals.comparison_level_library as cll from splink import DuckDBAPI, Linker, SettingsCreator diff --git a/tests/test_full_example_duckdb.py b/tests/test_full_example_duckdb.py index 8eb6268951..afe2175a58 100644 --- a/tests/test_full_example_duckdb.py +++ b/tests/test_full_example_duckdb.py @@ -6,7 +6,7 @@ import pyarrow.parquet as pq import pytest -import splink.comparison_level_library as cll +import splink.internals.comparison_level_library as cll import splink.comparison_library as cl from splink.blocking_analysis import count_comparisons_from_blocking_rule from splink.duckdb.database_api import DuckDBAPI diff --git a/tests/test_full_example_spark.py b/tests/test_full_example_spark.py index 08246a6516..9f52b481c7 100644 --- a/tests/test_full_example_spark.py +++ b/tests/test_full_example_spark.py @@ -5,7 +5,7 @@ import pytest from pyspark.sql.types import StringType, StructField, StructType -import splink.comparison_level_library as cll +import splink.internals.comparison_level_library as cll import splink.comparison_library as cl from splink.exploratory import completeness_chart, profile_columns from splink.linker import Linker diff --git a/tests/test_km_distance_level.py b/tests/test_km_distance_level.py index 771f484a8f..862b70b234 100644 --- a/tests/test_km_distance_level.py +++ b/tests/test_km_distance_level.py @@ -1,6 +1,6 @@ import pandas as pd -import splink.comparison_level_library as cll +import splink.internals.comparison_level_library as cll import splink.comparison_library as cl from splink.duckdb.database_api import DuckDBAPI from splink.linker import Linker diff --git a/tests/test_new_comparison_levels.py b/tests/test_new_comparison_levels.py index 2db086dc8e..4153256733 100644 --- a/tests/test_new_comparison_levels.py +++ b/tests/test_new_comparison_levels.py @@ -3,7 +3,7 @@ import pandas as pd import pytest -import splink.comparison_level_library as cll +import splink.internals.comparison_level_library as cll import splink.comparison_library as cl import splink.comparison_template_library as ctl from splink.internals.column_expression import ColumnExpression diff --git a/tests/test_new_db_api.py b/tests/test_new_db_api.py index 5019b44a92..9c15eddb54 100644 --- a/tests/test_new_db_api.py +++ b/tests/test_new_db_api.py @@ -1,6 +1,6 @@ import os -import splink.comparison_level_library as cll +import splink.internals.comparison_level_library as cll import splink.comparison_library as cl from splink import block_on from splink.blocking_analysis import ( diff --git a/tests/test_regex_param.py b/tests/test_regex_param.py index 5d1594d28d..a9d02642b4 100644 --- a/tests/test_regex_param.py +++ b/tests/test_regex_param.py @@ -1,7 +1,7 @@ import pandas as pd import pytest -import splink.comparison_level_library as cll +import splink.internals.comparison_level_library as cll from splink.internals.column_expression import ColumnExpression from .decorator import mark_with_dialects_excluding diff --git a/tests/test_spark_udfs.py b/tests/test_spark_udfs.py index 73b28a6964..06599c169d 100644 --- a/tests/test_spark_udfs.py +++ b/tests/test_spark_udfs.py @@ -1,6 +1,6 @@ import pandas as pd -import splink.comparison_level_library as cll +import splink.internals.comparison_level_library as cll from splink.linker import Linker from tests.decorator import mark_with_dialects_including