Refactor of 1664: add ability to do efficient blocking based on list/array intersections #1692
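
As an illustrative sketch of the feature rather than code from this PR: per the commit "Add logic for blocking on array intersections by unnesting tables", the idea is to explode each record's array column so that candidate pairs come from an equi-join on the exploded values instead of a cross join filtered on array overlap. A minimal DuckDB version, with a made-up table df and column name_tokens:

import duckdb

con = duckdb.connect()

# Toy input: each record carries an array of name tokens.
con.execute(
    """
    create table df as
    select * from (values
        (1, ['alice', 'ally']),
        (2, ['alice', 'al']),
        (3, ['bob'])
    ) as t(unique_id, name_tokens)
    """
)

# Unnest (explode) the arrays, then self-join on the exploded value.
# distinct collapses pairs that share more than one token; the id
# inequality avoids self-pairs and duplicate orderings.
pairs = con.execute(
    """
    with exploded as (
        select unique_id, unnest(name_tokens) as token
        from df
    )
    select distinct l.unique_id as unique_id_l, r.unique_id as unique_id_r
    from exploded as l
    join exploded as r
      on l.token = r.token
     and l.unique_id < r.unique_id
    """
).fetchall()

print(pairs)  # [(1, 2)] - records 1 and 2 share the token 'alice'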

Merged Jan 17, 2024 · 43 commits

Changes from 1 commit

Commits (43)
ed6e47b
add tests for array-based blocking
nerskin Oct 23, 2023
f3024ef
Add logic for blocking on array intersections by unnesting tables
nerskin Oct 23, 2023
bfea53c
update hardcoded hash in test_correctness_of_convergence.py
nerskin Oct 23, 2023
4399142
Merge branch 'moj-analytical-services:master' into master
nerskin Oct 23, 2023
d3fe4dd
linting/formatting
nerskin Oct 24, 2023
a48b0ec
update table names for consistency with splink conventions
nerskin Oct 24, 2023
3b8222a
Update tests
nerskin Oct 24, 2023
1f1ea76
ensure that table names are unique
nerskin Oct 24, 2023
eacdc04
lint
nerskin Oct 24, 2023
a9076ea
wip
RobinL Nov 1, 2023
8ed833f
wip
RobinL Nov 1, 2023
217f633
move materialisation logic to separate function
RobinL Nov 2, 2023
43298b1
rename for clarity
RobinL Nov 3, 2023
90fadb5
improve clarity of names
RobinL Nov 3, 2023
efb72c3
pushing logic into blockingrule class
RobinL Nov 3, 2023
4caa847
better names
RobinL Nov 6, 2023
4c40ffd
remove materialised tables after use
RobinL Nov 6, 2023
836ea36
is salted
RobinL Nov 6, 2023
46d834b
merge in master
RobinL Nov 6, 2023
fa096cd
fix merge
RobinL Nov 6, 2023
8621d2d
exploding blocking rule class
RobinL Nov 6, 2023
4d40bd9
all logic now pushed into blocking rules classes
RobinL Nov 6, 2023
1117737
better names
RobinL Nov 6, 2023
7c71849
change all files to current master
RobinL Nov 30, 2023
d78bec1
Merge branch 'master' into refactor_ids_to_compare_creation
RobinL Nov 30, 2023
de972f7
Merge branch 'master' into refactor_ids_to_compare_creation
RobinL Nov 30, 2023
da5a499
initial working implementation
RobinL Nov 30, 2023
bba25dc
update
RobinL Nov 30, 2023
24cd2e7
fix spark
RobinL Dec 11, 2023
0b2f338
Merge branch 'master' into refactor_ids_to_compare_creation
RobinL Dec 11, 2023
1d63218
format better
RobinL Dec 11, 2023
12e0c90
fix tests
RobinL Dec 11, 2023
c2aea3c
make work with deterministic link
RobinL Dec 11, 2023
a58deca
put check supported logic in place most likely to be caught
RobinL Dec 11, 2023
4021d3f
gives correct error messages
RobinL Dec 11, 2023
0cede3d
fix tests
RobinL Dec 11, 2023
26bf082
add tests back in
RobinL Dec 11, 2023
c7307eb
fix link type issue and unique ids
RobinL Dec 12, 2023
8aecb27
lint
RobinL Dec 12, 2023
6798cb1
fix line length
RobinL Dec 12, 2023
9ef61f1
rename method for clarity
RobinL Dec 12, 2023
1ece5d7
Merge branch 'master' into refactor_ids_to_compare_creation
RobinL Jan 17, 2024
ca0e202
Move out of loop
RobinL Jan 17, 2024
merge in master
RobinL committed Nov 6, 2023
commit 46d834b6bfe2da4f50297a28c7e9bce11dd87dc3
55 changes: 55 additions & 0 deletions splink/blocking.py
@@ -360,6 +360,8 @@ def materialise_exploded_id_tables(linker: Linker):


def block_using_rules_sql(linker: Linker):
# flake8: noqa: C901
def block_using_rules_sqls(linker: Linker):
"""Use the blocking rules specified in the linker's settings object to
generate a SQL statement that will create pairwise record comparisons
according to the blocking rule(s).
@@ -368,6 +370,59 @@ def block_using_rules_sql(linker: Linker):
so that duplicate comparisons are not generated.
"""

sqls = []

# For the two dataset link only, rather than a self join of
# __splink__df_concat_with_tf, it's much faster to split the input
# into two tables and join them (because then Splink doesn't have to
# evaluate intra-dataset comparisons).
# see https://github.com/moj-analytical-services/splink/pull/1359
if (
linker._two_dataset_link_only
and not linker._find_new_matches_mode
and not linker._compare_two_records_mode
):
source_dataset_col = linker._source_dataset_column_name
# Need df_l to be the one with the lowest id to preserve the property
# that the left dataset is the one with the lowest concatenated id
keys = linker._input_tables_dict.keys()
keys = list(sorted(keys))
df_l = linker._input_tables_dict[keys[0]]
df_r = linker._input_tables_dict[keys[1]]

# This also needs to work for training u
if linker._train_u_using_random_sample_mode:
spl_switch = "_sample"
else:
spl_switch = ""

sql = f"""
select * from __splink__df_concat_with_tf{spl_switch}
where {source_dataset_col} = '{df_l.templated_name}'
"""
sqls.append(
{
"sql": sql,
"output_table_name": f"__splink__df_concat_with_tf{spl_switch}_left",
}
)

sql = f"""
select * from __splink__df_concat_with_tf{spl_switch}
where {source_dataset_col} = '{df_r.templated_name}'
"""
sqls.append(
{
"sql": sql,
"output_table_name": f"__splink__df_concat_with_tf{spl_switch}_right",
}
)

if type(linker).__name__ in ["SparkLinker"]:
apply_salt = True
else:
apply_salt = False

settings_obj = linker._settings_obj

columns_to_select = settings_obj._columns_to_select_for_blocking
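
As a rough, illustrative aside on the comment near the top of this hunk (hypothetical row counts, ignoring the blocking rules themselves): splitting the concatenated input means the join only ever produces inter-dataset pairs, whereas a self join of __splink__df_concat_with_tf also produces intra-dataset pairs that a two-dataset link-only job never needs.

# Hypothetical dataset sizes, for illustration only.
n_left, n_right = 100_000, 80_000

# Self join of the concatenated table: every unordered pair, including
# intra-dataset pairs that a link-only job never needs.
self_join_pairs = (n_left + n_right) * (n_left + n_right - 1) // 2

# Join of the split left/right tables: inter-dataset pairs only.
split_join_pairs = n_left * n_right

print(f"{self_join_pairs:,}")   # 16,199,910,000
print(f"{split_join_pairs:,}")  # 8,000,000,000

In practice the blocking rules cut both counts dramatically; the point is only that the intra-dataset portion never has to be evaluated at all.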
5 changes: 3 additions & 2 deletions splink/linker.py
@@ -1732,8 +1732,9 @@ def predict(

materialise_exploded_id_tables(self)

sql = block_using_rules_sql(self)
self._enqueue_sql(sql, "__splink__df_blocked")
sqls = block_using_rules_sqls(self)
for sql in sqls:
self._enqueue_sql(sql["sql"], sql["output_table_name"])

repartition_after_blocking = getattr(self, "repartition_after_blocking", False)
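
A standalone sketch of the calling convention the loop above relies on (DuckDB stands in for the linker's backend, and create-table statements stand in for Splink's _enqueue_sql pipeline): block_using_rules_sqls now returns a list of steps, each a dict with "sql" and "output_table_name", and each step is materialised under its output name so later steps can select from it.

import duckdb

con = duckdb.connect()
con.execute(
    """
    create table __splink__df_concat_with_tf as
    select * from (values
        (1, 'df_a'), (2, 'df_a'), (3, 'df_b')
    ) as t(unique_id, source_dataset)
    """
)

# Simplified stand-in for the list returned in the two-dataset
# link-only case shown in the blocking.py diff above.
sqls = [
    {
        "sql": "select * from __splink__df_concat_with_tf where source_dataset = 'df_a'",
        "output_table_name": "__splink__df_concat_with_tf_left",
    },
    {
        "sql": "select * from __splink__df_concat_with_tf where source_dataset = 'df_b'",
        "output_table_name": "__splink__df_concat_with_tf_right",
    },
]

# Run each step in order, registering its result under the output name
# so that subsequent SQL (here, the blocking SQL itself) can reference it.
for step in sqls:
    con.execute(f"create table {step['output_table_name']} as {step['sql']}")

print(con.execute("select count(*) from __splink__df_concat_with_tf_left").fetchone())  # (2,)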
