tests pass

moj-analytical-services · Jun 22, 2024 · f973871 · f973871
1 parent 247f417
commit f973871
Show file tree

Hide file tree

Showing 4 changed files with 167 additions and 123 deletions.
diff --git a/docs/demos/examples/spark/deduplicate_1k_synthetic.ipynb b/docs/demos/examples/spark/deduplicate_1k_synthetic.ipynb
@@ -54,7 +54,7 @@
           "name": "stderr",
           "output_type": "stream",
           "text": [
-            "24/06/22 10:47:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
+            "24/06/22 10:55:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
             "Setting default log level to \"WARN\".\n",
             "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
           ]
@@ -69,7 +69,7 @@
         "conf = SparkConf()\n",
         "# This parallelism setting is only suitable for a small toy example\n",
         "conf.set(\"spark.driver.memory\", \"12g\")\n",
-        "conf.set(\"spark.default.parallelism\", \"16\")\n",
+        "conf.set(\"spark.default.parallelism\", \"8\")\n",
         "\n",
         "\n",
         "# Add custom similarity functions, which are bundled with Splink\n",
@@ -286,7 +286,7 @@
             "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
             "    - dob\n",
             "                                                                                \n",
-            "WARNING:                                                                        \n",
+            "WARNING:\n",
             "Level Jaro-Winkler >0.88 on username on comparison email not observed in dataset, unable to train m value\n",
             "\n",
             "Iteration 1: Largest change in params was -0.548 in the m_probability of surname, level `Exact match on surname`\n",
@@ -332,7 +332,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 13,
+      "execution_count": 11,
       "metadata": {
         "execution": {
           "iopub.execute_input": "2024-03-13T12:31:44.605970Z",
@@ -390,122 +390,122 @@
               "  <tbody>\n",
               "    <tr>\n",
               "      <th>0</th>\n",
-              "      <td>3.255480</td>\n",
-              "      <td>0.905212</td>\n",
-              "      <td>171</td>\n",
-              "      <td>252</td>\n",
-              "      <td>NaN</td>\n",
-              "      <td>NaN</td>\n",
+              "      <td>14.992986</td>\n",
+              "      <td>0.999969</td>\n",
+              "      <td>192</td>\n",
+              "      <td>194</td>\n",
+              "      <td>Isla</td>\n",
+              "      <td>Isla</td>\n",
               "      <td>4</td>\n",
-              "      <td>0.169</td>\n",
-              "      <td>0.169</td>\n",
+              "      <td>0.009</td>\n",
+              "      <td>0.009</td>\n",
               "      <td>11.371023</td>\n",
               "      <td>...</td>\n",
-              "      <td>5.870654</td>\n",
-              "      <td>0.411903</td>\n",
-              "      <td>[email protected]</td>\n",
-              "      <td>NaN</td>\n",
-              "      <td>0</td>\n",
-              "      <td>0.005</td>\n",
-              "      <td>0.211</td>\n",
-              "      <td>0.349824</td>\n",
+              "      <td>0.626291</td>\n",
               "      <td>1.000000</td>\n",
+              "      <td>[email protected]</td>\n",
+              "      <td>[email protected]</td>\n",
+              "      <td>4</td>\n",
+              "      <td>0.004</td>\n",
+              "      <td>0.004</td>\n",
+              "      <td>8.473825</td>\n",
+              "      <td>11.429930</td>\n",
               "      <td>0</td>\n",
               "    </tr>\n",
               "    <tr>\n",
               "      <th>1</th>\n",
-              "      <td>17.789350</td>\n",
-              "      <td>0.999996</td>\n",
-              "      <td>939</td>\n",
-              "      <td>940</td>\n",
-              "      <td>Isabelle</td>\n",
-              "      <td>Isabelle</td>\n",
+              "      <td>10.461631</td>\n",
+              "      <td>0.999291</td>\n",
+              "      <td>303</td>\n",
+              "      <td>304</td>\n",
+              "      <td>Charlie</td>\n",
+              "      <td>Charlie</td>\n",
               "      <td>4</td>\n",
-              "      <td>0.008</td>\n",
-              "      <td>0.008</td>\n",
+              "      <td>0.006</td>\n",
+              "      <td>0.006</td>\n",
               "      <td>11.371023</td>\n",
               "      <td>...</td>\n",
               "      <td>5.870654</td>\n",
-              "      <td>5.481481</td>\n",
-              "      <td>[email protected]</td>\n",
-              "      <td>[email protected]</td>\n",
-              "      <td>4</td>\n",
-              "      <td>0.004</td>\n",
-              "      <td>0.004</td>\n",
-              "      <td>8.473825</td>\n",
-              "      <td>11.429930</td>\n",
+              "      <td>2.159371</td>\n",
+              "      <td>[email protected]</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0.002</td>\n",
+              "      <td>0.211</td>\n",
+              "      <td>0.349824</td>\n",
+              "      <td>1.000000</td>\n",
               "      <td>0</td>\n",
               "    </tr>\n",
               "    <tr>\n",
               "      <th>2</th>\n",
-              "      <td>21.767896</td>\n",
-              "      <td>1.000000</td>\n",
-              "      <td>63</td>\n",
-              "      <td>66</td>\n",
-              "      <td>Adam</td>\n",
-              "      <td>Adam</td>\n",
+              "      <td>15.283680</td>\n",
+              "      <td>0.999975</td>\n",
+              "      <td>79</td>\n",
+              "      <td>84</td>\n",
+              "      <td>Ryan</td>\n",
+              "      <td>Ryan</td>\n",
               "      <td>4</td>\n",
-              "      <td>0.006</td>\n",
-              "      <td>0.006</td>\n",
+              "      <td>0.005</td>\n",
+              "      <td>0.005</td>\n",
               "      <td>11.371023</td>\n",
               "      <td>...</td>\n",
-              "      <td>5.870654</td>\n",
-              "      <td>4.453704</td>\n",
-              "      <td>ajones6@cortez-wilcox.com</td>\n",
-              "      <td>ajones-@cortez6wilcox.com</td>\n",
-              "      <td>2</td>\n",
-              "      <td>0.004</td>\n",
+              "      <td>0.626291</td>\n",
+              "      <td>1.000000</td>\n",
+              "      <td>r.cole1@ramirez-anthony.com</td>\n",
+              "      <td>r.cole1@ramtrez-anihony.com</td>\n",
+              "      <td>3</td>\n",
+              "      <td>0.005</td>\n",
               "      <td>0.001</td>\n",
-              "      <td>252.479846</td>\n",
+              "      <td>210.647668</td>\n",
               "      <td>1.000000</td>\n",
               "      <td>0</td>\n",
               "    </tr>\n",
               "    <tr>\n",
               "      <th>3</th>\n",
-              "      <td>20.106275</td>\n",
-              "      <td>0.999999</td>\n",
-              "      <td>247</td>\n",
-              "      <td>250</td>\n",
-              "      <td>NaN</td>\n",
-              "      <td>NaN</td>\n",
+              "      <td>7.419792</td>\n",
+              "      <td>0.994194</td>\n",
+              "      <td>111</td>\n",
+              "      <td>113</td>\n",
+              "      <td>Oliver</td>\n",
+              "      <td>Oliver</td>\n",
               "      <td>4</td>\n",
-              "      <td>0.169</td>\n",
-              "      <td>0.169</td>\n",
+              "      <td>0.028</td>\n",
+              "      <td>0.028</td>\n",
               "      <td>11.371023</td>\n",
               "      <td>...</td>\n",
               "      <td>5.870654</td>\n",
-              "      <td>5.089947</td>\n",
-              "      <td>[email protected]</td>\n",
-              "      <td>[email protected]</td>\n",
+              "      <td>0.411903</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>NaN</td>\n",
               "      <td>4</td>\n",
-              "      <td>0.006</td>\n",
-              "      <td>0.006</td>\n",
+              "      <td>0.211</td>\n",
+              "      <td>0.211</td>\n",
               "      <td>8.473825</td>\n",
-              "      <td>7.619953</td>\n",
+              "      <td>0.216681</td>\n",
               "      <td>0</td>\n",
               "    </tr>\n",
               "    <tr>\n",
               "      <th>4</th>\n",
-              "      <td>15.904031</td>\n",
-              "      <td>0.999984</td>\n",
-              "      <td>428</td>\n",
-              "      <td>430</td>\n",
-              "      <td>Archie</td>\n",
-              "      <td>Archie</td>\n",
+              "      <td>4.030262</td>\n",
+              "      <td>0.942327</td>\n",
+              "      <td>220</td>\n",
+              "      <td>222</td>\n",
+              "      <td>Logan</td>\n",
+              "      <td>Logan</td>\n",
               "      <td>4</td>\n",
-              "      <td>0.003</td>\n",
-              "      <td>0.003</td>\n",
+              "      <td>0.010</td>\n",
+              "      <td>0.010</td>\n",
               "      <td>11.371023</td>\n",
               "      <td>...</td>\n",
-              "      <td>5.870654</td>\n",
-              "      <td>2.159371</td>\n",
-              "      <td>a.s@humphrey.com</td>\n",
-              "      <td>a.s@humphrey.com</td>\n",
-              "      <td>4</td>\n",
-              "      <td>0.005</td>\n",
-              "      <td>0.005</td>\n",
-              "      <td>8.473825</td>\n",
-              "      <td>9.143944</td>\n",
+              "      <td>0.626291</td>\n",
+              "      <td>1.000000</td>\n",
+              "      <td>l.feruson46@sahh.com</td>\n",
+              "      <td>l.ferguson46@shah.com</td>\n",
+              "      <td>2</td>\n",
+              "      <td>0.001</td>\n",
+              "      <td>0.002</td>\n",
+              "      <td>252.479846</td>\n",
+              "      <td>1.000000</td>\n",
               "      <td>0</td>\n",
               "    </tr>\n",
               "  </tbody>\n",
@@ -515,44 +515,44 @@
             ],
             "text/plain": [
               "   match_weight  match_probability  unique_id_l  unique_id_r first_name_l  \\\n",
-              "0      3.255480           0.905212          171          252          NaN   \n",
-              "1     17.789350           0.999996          939          940     Isabelle   \n",
-              "2     21.767896           1.000000           63           66         Adam   \n",
-              "3     20.106275           0.999999          247          250          NaN   \n",
-              "4     15.904031           0.999984          428          430       Archie   \n",
+              "0     14.992986           0.999969          192          194         Isla   \n",
+              "1     10.461631           0.999291          303          304      Charlie   \n",
+              "2     15.283680           0.999975           79           84         Ryan   \n",
+              "3      7.419792           0.994194          111          113       Oliver   \n",
+              "4      4.030262           0.942327          220          222        Logan   \n",
               "\n",
               "  first_name_r  gamma_first_name  tf_first_name_l  tf_first_name_r  \\\n",
-              "0          NaN                 4            0.169            0.169   \n",
-              "1     Isabelle                 4            0.008            0.008   \n",
-              "2         Adam                 4            0.006            0.006   \n",
-              "3          NaN                 4            0.169            0.169   \n",
-              "4       Archie                 4            0.003            0.003   \n",
+              "0         Isla                 4            0.009            0.009   \n",
+              "1      Charlie                 4            0.006            0.006   \n",
+              "2         Ryan                 4            0.005            0.005   \n",
+              "3       Oliver                 4            0.028            0.028   \n",
+              "4        Logan                 4            0.010            0.010   \n",
               "\n",
-              "   bf_first_name  ...   bf_city bf_tf_adj_city                        email_l  \\\n",
-              "0      11.371023  ...  5.870654       0.411903        [email protected]   \n",
-              "1      11.371023  ...  5.870654       5.481481  [email protected]   \n",
-              "2      11.371023  ...  5.870654       4.453704      ajones6@cortez-wilcox.com   \n",
-              "3      11.371023  ...  5.870654       5.089947  [email protected]   \n",
-              "4      11.371023  ...  5.870654       2.159371               a.s@humphrey.com   \n",
+              "   bf_first_name  ...   bf_city bf_tf_adj_city                      email_l  \\\n",
+              "0      11.371023  ...  0.626291       1.000000              [email protected]   \n",
+              "1      11.371023  ...  5.870654       2.159371      [email protected]   \n",
+              "2      11.371023  ...  0.626291       1.000000  r.cole1@ramirez-anthony.com   \n",
+              "3      11.371023  ...  5.870654       0.411903                          NaN   \n",
+              "4      11.371023  ...  0.626291       1.000000         l.feruson46@sahh.com   \n",
               "\n",
-              "                         email_r  gamma_email  tf_email_l  tf_email_r  \\\n",
-              "0                            NaN            0       0.005       0.211   \n",
-              "1  [email protected]            4       0.004       0.004   \n",
-              "2      ajones-@cortez6wilcox.com            2       0.004       0.001   \n",
-              "3  [email protected]            4       0.006       0.006   \n",
-              "4               a.s@humphrey.com            4       0.005       0.005   \n",
+              "                       email_r  gamma_email  tf_email_l  tf_email_r  \\\n",
+              "0              [email protected]            4       0.004       0.004   \n",
+              "1                          NaN            0       0.002       0.211   \n",
+              "2  r.cole1@ramtrez-anihony.com            3       0.005       0.001   \n",
+              "3                          NaN            4       0.211       0.211   \n",
+              "4        l.ferguson46@shah.com            2       0.001       0.002   \n",
               "\n",
               "     bf_email bf_tf_adj_email match_key  \n",
-              "0    0.349824        1.000000         0  \n",
-              "1    8.473825       11.429930         0  \n",
-              "2  252.479846        1.000000         0  \n",
-              "3    8.473825        7.619953         0  \n",
-              "4    8.473825        9.143944         0  \n",
+              "0    8.473825       11.429930         0  \n",
+              "1    0.349824        1.000000         0  \n",
+              "2  210.647668        1.000000         0  \n",
+              "3    8.473825        0.216681         0  \n",
+              "4  252.479846        1.000000         0  \n",
               "\n",
               "[5 rows x 37 columns]"
             ]
           },
-          "execution_count": 13,
+          "execution_count": 11,
           "metadata": {},
           "output_type": "execute_result"
         }

diff --git a/splink/internals/comparison_level_composition.py b/splink/internals/comparison_level_composition.py
@@ -46,7 +46,6 @@ def create_sql(self, sql_dialect: SplinkDialect) -> str:
             map(lambda cl: f"({cl.create_sql(sql_dialect)})", self.comparison_levels)
         )
 
-    @final
     def create_label_for_charts(self) -> str:
         return f" {self._clause} ".join(
             map(lambda cl: f"({cl.create_label_for_charts()})", self.comparison_levels)

diff --git a/splink/internals/comparison_library.py b/splink/internals/comparison_library.py
@@ -718,9 +718,6 @@ def create_comparison_levels(self) -> List[ComparisonLevelCreator]:
                 cll.ExactMatchLevel(date_as_iso_string.substr(0, 4)),
             )
 
-            level.create_label_for_charts = (
-                lambda: "Exact match on year (1st of January only)"
-            )
             levels.append(level)
 
         levels.append(cll.ExactMatchLevel(self.col_expression))
@@ -821,7 +818,7 @@ def __init__(
             cols["latitude"] = lat_col
             cols["longitude"] = long_col
         else:
-            self.km_thresholds = None
+            self.km_thresholds = []
         super().__init__(cols)
 
     def create_comparison_levels(self) -> List[ComparisonLevelCreator]:
@@ -830,8 +827,10 @@ def create_comparison_levels(self) -> List[ComparisonLevelCreator]:
         district_col_expression = full_col_expression.regex_extract(self.DISTRICT_REGEX)
         area_col_expression = full_col_expression.regex_extract(self.AREA_REGEX)
 
-        if not self.km_thresholds:
-            levels: list[ComparisonLevelCreator] = [
+        levels: list[ComparisonLevelCreator] = []
+
+        if len(self.km_thresholds) == 0:
+            levels = [
                 cll.NullLevel(
                     full_col_expression, valid_string_pattern=self.valid_postcode_regex
                 ),
@@ -843,7 +842,7 @@ def create_comparison_levels(self) -> List[ComparisonLevelCreator]:
         if self.km_thresholds:
             # Don't include the very high level postcode categories
             # if using km thresholds - they are better modelled as geo distances
-            levels: list[ComparisonLevelCreator] = [
+            levels = [
                 cll.NullLevel(
                     full_col_expression, valid_string_pattern=self.valid_postcode_regex
                 ),
@@ -903,7 +902,7 @@ def create_comparison_levels(self) -> List[ComparisonLevelCreator]:
         levels: list[ComparisonLevelCreator] = [
             cll.NullLevel(full_col_expression, valid_string_pattern=None),
             cll.ExactMatchLevel(full_col_expression).configure(
-                tf_adjustment_column=full_col_expression
+                tf_adjustment_column=full_col_expression.raw_sql_expression
             ),
             cll.ExactMatchLevel(username_col_expression).configure(
                 label_for_charts="Exact match on username"
@@ -956,8 +955,8 @@ def __init__(
                 contain arrays of dmetaphone values, which are of length 1 or 2.
         """
 
-        jaro_winkler_thresholds = ensure_is_iterable(jaro_winkler_thresholds)
-        self.jaro_winkler_thresholds = [*jaro_winkler_thresholds]
+        jaro_winkler_thresholds_itr = ensure_is_iterable(jaro_winkler_thresholds)
+        self.jaro_winkler_thresholds = list(jaro_winkler_thresholds_itr)
 
         cols = {"name": col_name}
         if dmeta_col_name is not None:
@@ -1045,8 +1044,8 @@ def __init__(
                 concatenated forename and surname values. If provided, term
                 frequencies are applied on the exact match using this column
         """
-        jaro_winkler_thresholds = ensure_is_iterable(jaro_winkler_thresholds)
-        self.jaro_winkler_thresholds = [*jaro_winkler_thresholds]
+        jaro_winkler_thresholds_itr = ensure_is_iterable(jaro_winkler_thresholds)
+        self.jaro_winkler_thresholds = list(jaro_winkler_thresholds_itr)
         cols = {"forename": forename_col_name, "surname": surname_col_name}
         if forename_surname_concat_col_name is not None:
             cols["forename_surname_concat"] = forename_surname_concat_col_name