-
Notifications
You must be signed in to change notification settings - Fork 166
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
167 additions
and
123 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -54,7 +54,7 @@ | |
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"24/06/22 10:47:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", | ||
"24/06/22 10:55:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", | ||
"Setting default log level to \"WARN\".\n", | ||
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" | ||
] | ||
|
@@ -69,7 +69,7 @@ | |
"conf = SparkConf()\n", | ||
"# This parallelism setting is only suitable for a small toy example\n", | ||
"conf.set(\"spark.driver.memory\", \"12g\")\n", | ||
"conf.set(\"spark.default.parallelism\", \"16\")\n", | ||
"conf.set(\"spark.default.parallelism\", \"8\")\n", | ||
"\n", | ||
"\n", | ||
"# Add custom similarity functions, which are bundled with Splink\n", | ||
|
@@ -286,7 +286,7 @@ | |
"Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", | ||
" - dob\n", | ||
" \n", | ||
"WARNING: \n", | ||
"WARNING:\n", | ||
"Level Jaro-Winkler >0.88 on username on comparison email not observed in dataset, unable to train m value\n", | ||
"\n", | ||
"Iteration 1: Largest change in params was -0.548 in the m_probability of surname, level `Exact match on surname`\n", | ||
|
@@ -332,7 +332,7 @@ | |
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 13, | ||
"execution_count": 11, | ||
"metadata": { | ||
"execution": { | ||
"iopub.execute_input": "2024-03-13T12:31:44.605970Z", | ||
|
@@ -390,122 +390,122 @@ | |
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>3.255480</td>\n", | ||
" <td>0.905212</td>\n", | ||
" <td>171</td>\n", | ||
" <td>252</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>14.992986</td>\n", | ||
" <td>0.999969</td>\n", | ||
" <td>192</td>\n", | ||
" <td>194</td>\n", | ||
" <td>Isla</td>\n", | ||
" <td>Isla</td>\n", | ||
" <td>4</td>\n", | ||
" <td>0.169</td>\n", | ||
" <td>0.169</td>\n", | ||
" <td>0.009</td>\n", | ||
" <td>0.009</td>\n", | ||
" <td>11.371023</td>\n", | ||
" <td>...</td>\n", | ||
" <td>5.870654</td>\n", | ||
" <td>0.411903</td>\n", | ||
" <td>[email protected]</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>0</td>\n", | ||
" <td>0.005</td>\n", | ||
" <td>0.211</td>\n", | ||
" <td>0.349824</td>\n", | ||
" <td>0.626291</td>\n", | ||
" <td>1.000000</td>\n", | ||
" <td>[email protected]</td>\n", | ||
" <td>[email protected]</td>\n", | ||
" <td>4</td>\n", | ||
" <td>0.004</td>\n", | ||
" <td>0.004</td>\n", | ||
" <td>8.473825</td>\n", | ||
" <td>11.429930</td>\n", | ||
" <td>0</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1</th>\n", | ||
" <td>17.789350</td>\n", | ||
" <td>0.999996</td>\n", | ||
" <td>939</td>\n", | ||
" <td>940</td>\n", | ||
" <td>Isabelle</td>\n", | ||
" <td>Isabelle</td>\n", | ||
" <td>10.461631</td>\n", | ||
" <td>0.999291</td>\n", | ||
" <td>303</td>\n", | ||
" <td>304</td>\n", | ||
" <td>Charlie</td>\n", | ||
" <td>Charlie</td>\n", | ||
" <td>4</td>\n", | ||
" <td>0.008</td>\n", | ||
" <td>0.008</td>\n", | ||
" <td>0.006</td>\n", | ||
" <td>0.006</td>\n", | ||
" <td>11.371023</td>\n", | ||
" <td>...</td>\n", | ||
" <td>5.870654</td>\n", | ||
" <td>5.481481</td>\n", | ||
" <td>[email protected]</td>\n", | ||
" <td>[email protected]</td>\n", | ||
" <td>4</td>\n", | ||
" <td>0.004</td>\n", | ||
" <td>0.004</td>\n", | ||
" <td>8.473825</td>\n", | ||
" <td>11.429930</td>\n", | ||
" <td>2.159371</td>\n", | ||
" <td>[email protected]</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>0</td>\n", | ||
" <td>0.002</td>\n", | ||
" <td>0.211</td>\n", | ||
" <td>0.349824</td>\n", | ||
" <td>1.000000</td>\n", | ||
" <td>0</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>2</th>\n", | ||
" <td>21.767896</td>\n", | ||
" <td>1.000000</td>\n", | ||
" <td>63</td>\n", | ||
" <td>66</td>\n", | ||
" <td>Adam</td>\n", | ||
" <td>Adam</td>\n", | ||
" <td>15.283680</td>\n", | ||
" <td>0.999975</td>\n", | ||
" <td>79</td>\n", | ||
" <td>84</td>\n", | ||
" <td>Ryan</td>\n", | ||
" <td>Ryan</td>\n", | ||
" <td>4</td>\n", | ||
" <td>0.006</td>\n", | ||
" <td>0.006</td>\n", | ||
" <td>0.005</td>\n", | ||
" <td>0.005</td>\n", | ||
" <td>11.371023</td>\n", | ||
" <td>...</td>\n", | ||
" <td>5.870654</td>\n", | ||
" <td>4.453704</td>\n", | ||
" <td>ajones6@cortez-wilcox.com</td>\n", | ||
" <td>ajones-@cortez6wilcox.com</td>\n", | ||
" <td>2</td>\n", | ||
" <td>0.004</td>\n", | ||
" <td>0.626291</td>\n", | ||
" <td>1.000000</td>\n", | ||
" <td>r.cole1@ramirez-anthony.com</td>\n", | ||
" <td>r.cole1@ramtrez-anihony.com</td>\n", | ||
" <td>3</td>\n", | ||
" <td>0.005</td>\n", | ||
" <td>0.001</td>\n", | ||
" <td>252.479846</td>\n", | ||
" <td>210.647668</td>\n", | ||
" <td>1.000000</td>\n", | ||
" <td>0</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>3</th>\n", | ||
" <td>20.106275</td>\n", | ||
" <td>0.999999</td>\n", | ||
" <td>247</td>\n", | ||
" <td>250</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>7.419792</td>\n", | ||
" <td>0.994194</td>\n", | ||
" <td>111</td>\n", | ||
" <td>113</td>\n", | ||
" <td>Oliver</td>\n", | ||
" <td>Oliver</td>\n", | ||
" <td>4</td>\n", | ||
" <td>0.169</td>\n", | ||
" <td>0.169</td>\n", | ||
" <td>0.028</td>\n", | ||
" <td>0.028</td>\n", | ||
" <td>11.371023</td>\n", | ||
" <td>...</td>\n", | ||
" <td>5.870654</td>\n", | ||
" <td>5.089947</td>\n", | ||
" <td>[email protected]</td>\n", | ||
" <td>[email protected]</td>\n", | ||
" <td>0.411903</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>4</td>\n", | ||
" <td>0.006</td>\n", | ||
" <td>0.006</td>\n", | ||
" <td>0.211</td>\n", | ||
" <td>0.211</td>\n", | ||
" <td>8.473825</td>\n", | ||
" <td>7.619953</td>\n", | ||
" <td>0.216681</td>\n", | ||
" <td>0</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>4</th>\n", | ||
" <td>15.904031</td>\n", | ||
" <td>0.999984</td>\n", | ||
" <td>428</td>\n", | ||
" <td>430</td>\n", | ||
" <td>Archie</td>\n", | ||
" <td>Archie</td>\n", | ||
" <td>4.030262</td>\n", | ||
" <td>0.942327</td>\n", | ||
" <td>220</td>\n", | ||
" <td>222</td>\n", | ||
" <td>Logan</td>\n", | ||
" <td>Logan</td>\n", | ||
" <td>4</td>\n", | ||
" <td>0.003</td>\n", | ||
" <td>0.003</td>\n", | ||
" <td>0.010</td>\n", | ||
" <td>0.010</td>\n", | ||
" <td>11.371023</td>\n", | ||
" <td>...</td>\n", | ||
" <td>5.870654</td>\n", | ||
" <td>2.159371</td>\n", | ||
" <td>a.s@humphrey.com</td>\n", | ||
" <td>a.s@humphrey.com</td>\n", | ||
" <td>4</td>\n", | ||
" <td>0.005</td>\n", | ||
" <td>0.005</td>\n", | ||
" <td>8.473825</td>\n", | ||
" <td>9.143944</td>\n", | ||
" <td>0.626291</td>\n", | ||
" <td>1.000000</td>\n", | ||
" <td>l.feruson46@sahh.com</td>\n", | ||
" <td>l.ferguson46@shah.com</td>\n", | ||
" <td>2</td>\n", | ||
" <td>0.001</td>\n", | ||
" <td>0.002</td>\n", | ||
" <td>252.479846</td>\n", | ||
" <td>1.000000</td>\n", | ||
" <td>0</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
|
@@ -515,44 +515,44 @@ | |
], | ||
"text/plain": [ | ||
" match_weight match_probability unique_id_l unique_id_r first_name_l \\\n", | ||
"0 3.255480 0.905212 171 252 NaN \n", | ||
"1 17.789350 0.999996 939 940 Isabelle \n", | ||
"2 21.767896 1.000000 63 66 Adam \n", | ||
"3 20.106275 0.999999 247 250 NaN \n", | ||
"4 15.904031 0.999984 428 430 Archie \n", | ||
"0 14.992986 0.999969 192 194 Isla \n", | ||
"1 10.461631 0.999291 303 304 Charlie \n", | ||
"2 15.283680 0.999975 79 84 Ryan \n", | ||
"3 7.419792 0.994194 111 113 Oliver \n", | ||
"4 4.030262 0.942327 220 222 Logan \n", | ||
"\n", | ||
" first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n", | ||
"0 NaN 4 0.169 0.169 \n", | ||
"1 Isabelle 4 0.008 0.008 \n", | ||
"2 Adam 4 0.006 0.006 \n", | ||
"3 NaN 4 0.169 0.169 \n", | ||
"4 Archie 4 0.003 0.003 \n", | ||
"0 Isla 4 0.009 0.009 \n", | ||
"1 Charlie 4 0.006 0.006 \n", | ||
"2 Ryan 4 0.005 0.005 \n", | ||
"3 Oliver 4 0.028 0.028 \n", | ||
"4 Logan 4 0.010 0.010 \n", | ||
"\n", | ||
" bf_first_name ... bf_city bf_tf_adj_city email_l \\\n", | ||
"0 11.371023 ... 5.870654 0.411903 [email protected] \n", | ||
"1 11.371023 ... 5.870654 5.481481 [email protected] \n", | ||
"2 11.371023 ... 5.870654 4.453704 ajones6@cortez-wilcox.com \n", | ||
"3 11.371023 ... 5.870654 5.089947 [email protected] \n", | ||
"4 11.371023 ... 5.870654 2.159371 a.s@humphrey.com \n", | ||
" bf_first_name ... bf_city bf_tf_adj_city email_l \\\n", | ||
"0 11.371023 ... 0.626291 1.000000 [email protected] \n", | ||
"1 11.371023 ... 5.870654 2.159371 [email protected] \n", | ||
"2 11.371023 ... 0.626291 1.000000 r.cole1@ramirez-anthony.com \n", | ||
"3 11.371023 ... 5.870654 0.411903 NaN \n", | ||
"4 11.371023 ... 0.626291 1.000000 l.feruson46@sahh.com \n", | ||
"\n", | ||
" email_r gamma_email tf_email_l tf_email_r \\\n", | ||
"0 NaN 0 0.005 0.211 \n", | ||
"1 [email protected] 4 0.004 0.004 \n", | ||
"2 ajones-@cortez6wilcox.com 2 0.004 0.001 \n", | ||
"3 [email protected] 4 0.006 0.006 \n", | ||
"4 a.s@humphrey.com 4 0.005 0.005 \n", | ||
" email_r gamma_email tf_email_l tf_email_r \\\n", | ||
"0 [email protected] 4 0.004 0.004 \n", | ||
"1 NaN 0 0.002 0.211 \n", | ||
"2 r.cole1@ramtrez-anihony.com 3 0.005 0.001 \n", | ||
"3 NaN 4 0.211 0.211 \n", | ||
"4 l.ferguson46@shah.com 2 0.001 0.002 \n", | ||
"\n", | ||
" bf_email bf_tf_adj_email match_key \n", | ||
"0 0.349824 1.000000 0 \n", | ||
"1 8.473825 11.429930 0 \n", | ||
"2 252.479846 1.000000 0 \n", | ||
"3 8.473825 7.619953 0 \n", | ||
"4 8.473825 9.143944 0 \n", | ||
"0 8.473825 11.429930 0 \n", | ||
"1 0.349824 1.000000 0 \n", | ||
"2 210.647668 1.000000 0 \n", | ||
"3 8.473825 0.216681 0 \n", | ||
"4 252.479846 1.000000 0 \n", | ||
"\n", | ||
"[5 rows x 37 columns]" | ||
] | ||
}, | ||
"execution_count": 13, | ||
"execution_count": 11, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.