Skip to content

Commit

Permalink
tests pass
Browse files Browse the repository at this point in the history
  • Loading branch information
RobinL committed Jun 22, 2024
1 parent 247f417 commit f973871
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 123 deletions.
220 changes: 110 additions & 110 deletions docs/demos/examples/spark/deduplicate_1k_synthetic.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"24/06/22 10:47:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
"24/06/22 10:55:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
"Setting default log level to \"WARN\".\n",
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
]
Expand All @@ -69,7 +69,7 @@
"conf = SparkConf()\n",
"# This parallelism setting is only suitable for a small toy example\n",
"conf.set(\"spark.driver.memory\", \"12g\")\n",
"conf.set(\"spark.default.parallelism\", \"16\")\n",
"conf.set(\"spark.default.parallelism\", \"8\")\n",
"\n",
"\n",
"# Add custom similarity functions, which are bundled with Splink\n",
Expand Down Expand Up @@ -286,7 +286,7 @@
"Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
" - dob\n",
" \n",
"WARNING: \n",
"WARNING:\n",
"Level Jaro-Winkler >0.88 on username on comparison email not observed in dataset, unable to train m value\n",
"\n",
"Iteration 1: Largest change in params was -0.548 in the m_probability of surname, level `Exact match on surname`\n",
Expand Down Expand Up @@ -332,7 +332,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 11,
"metadata": {
"execution": {
"iopub.execute_input": "2024-03-13T12:31:44.605970Z",
Expand Down Expand Up @@ -390,122 +390,122 @@
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3.255480</td>\n",
" <td>0.905212</td>\n",
" <td>171</td>\n",
" <td>252</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>14.992986</td>\n",
" <td>0.999969</td>\n",
" <td>192</td>\n",
" <td>194</td>\n",
" <td>Isla</td>\n",
" <td>Isla</td>\n",
" <td>4</td>\n",
" <td>0.169</td>\n",
" <td>0.169</td>\n",
" <td>0.009</td>\n",
" <td>0.009</td>\n",
" <td>11.371023</td>\n",
" <td>...</td>\n",
" <td>5.870654</td>\n",
" <td>0.411903</td>\n",
" <td>[email protected]</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>0.005</td>\n",
" <td>0.211</td>\n",
" <td>0.349824</td>\n",
" <td>0.626291</td>\n",
" <td>1.000000</td>\n",
" <td>[email protected]</td>\n",
" <td>[email protected]</td>\n",
" <td>4</td>\n",
" <td>0.004</td>\n",
" <td>0.004</td>\n",
" <td>8.473825</td>\n",
" <td>11.429930</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>17.789350</td>\n",
" <td>0.999996</td>\n",
" <td>939</td>\n",
" <td>940</td>\n",
" <td>Isabelle</td>\n",
" <td>Isabelle</td>\n",
" <td>10.461631</td>\n",
" <td>0.999291</td>\n",
" <td>303</td>\n",
" <td>304</td>\n",
" <td>Charlie</td>\n",
" <td>Charlie</td>\n",
" <td>4</td>\n",
" <td>0.008</td>\n",
" <td>0.008</td>\n",
" <td>0.006</td>\n",
" <td>0.006</td>\n",
" <td>11.371023</td>\n",
" <td>...</td>\n",
" <td>5.870654</td>\n",
" <td>5.481481</td>\n",
" <td>[email protected]</td>\n",
" <td>[email protected]</td>\n",
" <td>4</td>\n",
" <td>0.004</td>\n",
" <td>0.004</td>\n",
" <td>8.473825</td>\n",
" <td>11.429930</td>\n",
" <td>2.159371</td>\n",
" <td>[email protected]</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>0.002</td>\n",
" <td>0.211</td>\n",
" <td>0.349824</td>\n",
" <td>1.000000</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>21.767896</td>\n",
" <td>1.000000</td>\n",
" <td>63</td>\n",
" <td>66</td>\n",
" <td>Adam</td>\n",
" <td>Adam</td>\n",
" <td>15.283680</td>\n",
" <td>0.999975</td>\n",
" <td>79</td>\n",
" <td>84</td>\n",
" <td>Ryan</td>\n",
" <td>Ryan</td>\n",
" <td>4</td>\n",
" <td>0.006</td>\n",
" <td>0.006</td>\n",
" <td>0.005</td>\n",
" <td>0.005</td>\n",
" <td>11.371023</td>\n",
" <td>...</td>\n",
" <td>5.870654</td>\n",
" <td>4.453704</td>\n",
" <td>ajones6@cortez-wilcox.com</td>\n",
" <td>ajones-@cortez6wilcox.com</td>\n",
" <td>2</td>\n",
" <td>0.004</td>\n",
" <td>0.626291</td>\n",
" <td>1.000000</td>\n",
" <td>r.cole1@ramirez-anthony.com</td>\n",
" <td>r.cole1@ramtrez-anihony.com</td>\n",
" <td>3</td>\n",
" <td>0.005</td>\n",
" <td>0.001</td>\n",
" <td>252.479846</td>\n",
" <td>210.647668</td>\n",
" <td>1.000000</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>20.106275</td>\n",
" <td>0.999999</td>\n",
" <td>247</td>\n",
" <td>250</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.419792</td>\n",
" <td>0.994194</td>\n",
" <td>111</td>\n",
" <td>113</td>\n",
" <td>Oliver</td>\n",
" <td>Oliver</td>\n",
" <td>4</td>\n",
" <td>0.169</td>\n",
" <td>0.169</td>\n",
" <td>0.028</td>\n",
" <td>0.028</td>\n",
" <td>11.371023</td>\n",
" <td>...</td>\n",
" <td>5.870654</td>\n",
" <td>5.089947</td>\n",
" <td>[email protected]</td>\n",
" <td>[email protected]</td>\n",
" <td>0.411903</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4</td>\n",
" <td>0.006</td>\n",
" <td>0.006</td>\n",
" <td>0.211</td>\n",
" <td>0.211</td>\n",
" <td>8.473825</td>\n",
" <td>7.619953</td>\n",
" <td>0.216681</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>15.904031</td>\n",
" <td>0.999984</td>\n",
" <td>428</td>\n",
" <td>430</td>\n",
" <td>Archie</td>\n",
" <td>Archie</td>\n",
" <td>4.030262</td>\n",
" <td>0.942327</td>\n",
" <td>220</td>\n",
" <td>222</td>\n",
" <td>Logan</td>\n",
" <td>Logan</td>\n",
" <td>4</td>\n",
" <td>0.003</td>\n",
" <td>0.003</td>\n",
" <td>0.010</td>\n",
" <td>0.010</td>\n",
" <td>11.371023</td>\n",
" <td>...</td>\n",
" <td>5.870654</td>\n",
" <td>2.159371</td>\n",
" <td>a.s@humphrey.com</td>\n",
" <td>a.s@humphrey.com</td>\n",
" <td>4</td>\n",
" <td>0.005</td>\n",
" <td>0.005</td>\n",
" <td>8.473825</td>\n",
" <td>9.143944</td>\n",
" <td>0.626291</td>\n",
" <td>1.000000</td>\n",
" <td>l.feruson46@sahh.com</td>\n",
" <td>l.ferguson46@shah.com</td>\n",
" <td>2</td>\n",
" <td>0.001</td>\n",
" <td>0.002</td>\n",
" <td>252.479846</td>\n",
" <td>1.000000</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
Expand All @@ -515,44 +515,44 @@
],
"text/plain": [
" match_weight match_probability unique_id_l unique_id_r first_name_l \\\n",
"0 3.255480 0.905212 171 252 NaN \n",
"1 17.789350 0.999996 939 940 Isabelle \n",
"2 21.767896 1.000000 63 66 Adam \n",
"3 20.106275 0.999999 247 250 NaN \n",
"4 15.904031 0.999984 428 430 Archie \n",
"0 14.992986 0.999969 192 194 Isla \n",
"1 10.461631 0.999291 303 304 Charlie \n",
"2 15.283680 0.999975 79 84 Ryan \n",
"3 7.419792 0.994194 111 113 Oliver \n",
"4 4.030262 0.942327 220 222 Logan \n",
"\n",
" first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n",
"0 NaN 4 0.169 0.169 \n",
"1 Isabelle 4 0.008 0.008 \n",
"2 Adam 4 0.006 0.006 \n",
"3 NaN 4 0.169 0.169 \n",
"4 Archie 4 0.003 0.003 \n",
"0 Isla 4 0.009 0.009 \n",
"1 Charlie 4 0.006 0.006 \n",
"2 Ryan 4 0.005 0.005 \n",
"3 Oliver 4 0.028 0.028 \n",
"4 Logan 4 0.010 0.010 \n",
"\n",
" bf_first_name ... bf_city bf_tf_adj_city email_l \\\n",
"0 11.371023 ... 5.870654 0.411903 [email protected] \n",
"1 11.371023 ... 5.870654 5.481481 [email protected] \n",
"2 11.371023 ... 5.870654 4.453704 ajones6@cortez-wilcox.com \n",
"3 11.371023 ... 5.870654 5.089947 [email protected] \n",
"4 11.371023 ... 5.870654 2.159371 a.s@humphrey.com \n",
" bf_first_name ... bf_city bf_tf_adj_city email_l \\\n",
"0 11.371023 ... 0.626291 1.000000 [email protected] \n",
"1 11.371023 ... 5.870654 2.159371 [email protected] \n",
"2 11.371023 ... 0.626291 1.000000 r.cole1@ramirez-anthony.com \n",
"3 11.371023 ... 5.870654 0.411903 NaN \n",
"4 11.371023 ... 0.626291 1.000000 l.feruson46@sahh.com \n",
"\n",
" email_r gamma_email tf_email_l tf_email_r \\\n",
"0 NaN 0 0.005 0.211 \n",
"1 [email protected] 4 0.004 0.004 \n",
"2 ajones-@cortez6wilcox.com 2 0.004 0.001 \n",
"3 [email protected] 4 0.006 0.006 \n",
"4 a.s@humphrey.com 4 0.005 0.005 \n",
" email_r gamma_email tf_email_l tf_email_r \\\n",
"0 [email protected] 4 0.004 0.004 \n",
"1 NaN 0 0.002 0.211 \n",
"2 r.cole1@ramtrez-anihony.com 3 0.005 0.001 \n",
"3 NaN 4 0.211 0.211 \n",
"4 l.ferguson46@shah.com 2 0.001 0.002 \n",
"\n",
" bf_email bf_tf_adj_email match_key \n",
"0 0.349824 1.000000 0 \n",
"1 8.473825 11.429930 0 \n",
"2 252.479846 1.000000 0 \n",
"3 8.473825 7.619953 0 \n",
"4 8.473825 9.143944 0 \n",
"0 8.473825 11.429930 0 \n",
"1 0.349824 1.000000 0 \n",
"2 210.647668 1.000000 0 \n",
"3 8.473825 0.216681 0 \n",
"4 252.479846 1.000000 0 \n",
"\n",
"[5 rows x 37 columns]"
]
},
"execution_count": 13,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
Expand Down
1 change: 0 additions & 1 deletion splink/internals/comparison_level_composition.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ def create_sql(self, sql_dialect: SplinkDialect) -> str:
map(lambda cl: f"({cl.create_sql(sql_dialect)})", self.comparison_levels)
)

@final
def create_label_for_charts(self) -> str:
return f" {self._clause} ".join(
map(lambda cl: f"({cl.create_label_for_charts()})", self.comparison_levels)
Expand Down
23 changes: 11 additions & 12 deletions splink/internals/comparison_library.py
Original file line number Diff line number Diff line change
Expand Up @@ -718,9 +718,6 @@ def create_comparison_levels(self) -> List[ComparisonLevelCreator]:
cll.ExactMatchLevel(date_as_iso_string.substr(0, 4)),
)

level.create_label_for_charts = (
lambda: "Exact match on year (1st of January only)"
)
levels.append(level)

levels.append(cll.ExactMatchLevel(self.col_expression))
Expand Down Expand Up @@ -821,7 +818,7 @@ def __init__(
cols["latitude"] = lat_col
cols["longitude"] = long_col
else:
self.km_thresholds = None
self.km_thresholds = []
super().__init__(cols)

def create_comparison_levels(self) -> List[ComparisonLevelCreator]:
Expand All @@ -830,8 +827,10 @@ def create_comparison_levels(self) -> List[ComparisonLevelCreator]:
district_col_expression = full_col_expression.regex_extract(self.DISTRICT_REGEX)
area_col_expression = full_col_expression.regex_extract(self.AREA_REGEX)

if not self.km_thresholds:
levels: list[ComparisonLevelCreator] = [
levels: list[ComparisonLevelCreator] = []

if len(self.km_thresholds) == 0:
levels = [
cll.NullLevel(
full_col_expression, valid_string_pattern=self.valid_postcode_regex
),
Expand All @@ -843,7 +842,7 @@ def create_comparison_levels(self) -> List[ComparisonLevelCreator]:
if self.km_thresholds:
# Don't include the very high level postcode categories
# if using km thresholds - they are better modelled as geo distances
levels: list[ComparisonLevelCreator] = [
levels = [
cll.NullLevel(
full_col_expression, valid_string_pattern=self.valid_postcode_regex
),
Expand Down Expand Up @@ -903,7 +902,7 @@ def create_comparison_levels(self) -> List[ComparisonLevelCreator]:
levels: list[ComparisonLevelCreator] = [
cll.NullLevel(full_col_expression, valid_string_pattern=None),
cll.ExactMatchLevel(full_col_expression).configure(
tf_adjustment_column=full_col_expression
tf_adjustment_column=full_col_expression.raw_sql_expression
),
cll.ExactMatchLevel(username_col_expression).configure(
label_for_charts="Exact match on username"
Expand Down Expand Up @@ -956,8 +955,8 @@ def __init__(
contain arrays of dmetaphone values, which are of length 1 or 2.
"""

jaro_winkler_thresholds = ensure_is_iterable(jaro_winkler_thresholds)
self.jaro_winkler_thresholds = [*jaro_winkler_thresholds]
jaro_winkler_thresholds_itr = ensure_is_iterable(jaro_winkler_thresholds)
self.jaro_winkler_thresholds = list(jaro_winkler_thresholds_itr)

cols = {"name": col_name}
if dmeta_col_name is not None:
Expand Down Expand Up @@ -1045,8 +1044,8 @@ def __init__(
concatenated forename and surname values. If provided, term
frequencies are applied on the exact match using this column
"""
jaro_winkler_thresholds = ensure_is_iterable(jaro_winkler_thresholds)
self.jaro_winkler_thresholds = [*jaro_winkler_thresholds]
jaro_winkler_thresholds_itr = ensure_is_iterable(jaro_winkler_thresholds)
self.jaro_winkler_thresholds = list(jaro_winkler_thresholds_itr)
cols = {"forename": forename_col_name, "surname": surname_col_name}
if forename_surname_concat_col_name is not None:
cols["forename_surname_concat"] = forename_surname_concat_col_name
Expand Down
Loading

0 comments on commit f973871

Please sign in to comment.