From ed6e47b78413a38b6d2c61bd83d3531f94cd1649 Mon Sep 17 00:00:00 2001 From: Nicholas Erskine Date: Mon, 23 Oct 2023 21:58:50 +1100 Subject: [PATCH 01/37] add tests for array-based blocking --- tests/test_array_based_blocking.py | 227 +++++++++++++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 tests/test_array_based_blocking.py diff --git a/tests/test_array_based_blocking.py b/tests/test_array_based_blocking.py new file mode 100644 index 0000000000..87d2b6fa65 --- /dev/null +++ b/tests/test_array_based_blocking.py @@ -0,0 +1,227 @@ +import random +import copy + +import pandas as pd + +from tests.decorator import mark_with_dialects_including + +from splink.spark.linker import SparkLinker + +@mark_with_dialects_including("duckdb", "spark", pass_dialect=True) +def test_simple_example_link_only(test_helpers, dialect): + data_l = pd.DataFrame.from_dict( + [ + {"unique_id": 1, "gender": "m", "postcode": ["2612", "2000"]}, + {"unique_id": 2, "gender": "m", "postcode": ["2612", "2617"]}, + {"unique_id": 3, "gender": "f", "postcode": ["2617"]}, + ] + ) + data_r = pd.DataFrame.from_dict( + [ + {"unique_id": 4, "gender": "m", "postcode": ["2617", "2600"]}, + {"unique_id": 5, "gender": "f", "postcode": ["2000"]}, + {"unique_id": 6, "gender": "m", "postcode": ["2617", "2612", "2000"]}, + ] + ) + helper = test_helpers[dialect] + settings = { + "link_type": "link_only", + "blocking_rules_to_generate_predictions": [ + { + "blocking_rule": "l.gender = r.gender and l.postcode = r.postcode", + "arrays_to_explode": ["postcode"], + }, + "l.gender = r.gender", + ], + "comparisons": [helper.cl.array_intersect_at_sizes("postcode", [1])], + } + ## the pairs returned by the first blocking rule are (1,6),(2,4),(2,6) + ## the additional pairs returned by the second blocking rule are (1,4),(3,5) + linker = helper.Linker([data_l, data_r], settings, **helper.extra_linker_args()) + linker.debug_mode = False + returned_triples = linker.predict().as_pandas_dataframe()[ + ["unique_id_l", "unique_id_r", "match_key"] + ] + returned_triples = { + (unique_id_l, unique_id_r, match_key) + for unique_id_l, unique_id_r, match_key in zip( + returned_triples.unique_id_l, + returned_triples.unique_id_r, + returned_triples.match_key, + ) + } + expected_triples = {(1, 6, "0"), (2, 4, "0"), (2, 6, "0"), (1, 4, "1"), (3, 5, "1")} + assert expected_triples == returned_triples + + +def generate_array_based_datasets_helper( + n_rows=1000, n_array_based_columns=3, n_distinct_values=1000, array_size=3, seed=1 +): + random.seed(seed) + datasets = [] + for _k in range(2): + results_dict = {} + results_dict["cluster"] = list(range(n_rows)) + for i in range(n_array_based_columns): + col = [] + for j in range(n_rows): + col.append(random.sample(range(n_distinct_values), array_size)) + if random.random() < 0.8 or i == n_array_based_columns - 1: + col[-1].append(j) + random.shuffle(col[-1]) + results_dict[f"array_column_{i}"] = col + datasets.append(pd.DataFrame.from_dict(results_dict)) + return datasets + + +@mark_with_dialects_including("duckdb", "spark", pass_dialect=True) +def test_array_based_blocking_with_random_data_dedupe(test_helpers, dialect): + helper = test_helpers[dialect] + input_data_l, input_data_r = generate_array_based_datasets_helper() + input_data_l = input_data_l.assign( + unique_id=[str(cluster_id) + "-0" for cluster_id in input_data_l.cluster] + ) + input_data_r = input_data_r.assign( + unique_id=[str(cluster_id) + "-1" for cluster_id in input_data_r.cluster] + ) + input_data = pd.concat([input_data_l, 
input_data_r]) + blocking_rules = [ + { + "blocking_rule": "l.array_column_0 = r.array_column_0 and l.array_column_1 = r.array_column_1", + "arrays_to_explode": ["array_column_0", "array_column_1"], + }, + { + "blocking_rule": "l.array_column_0 = r.array_column_0 and l.array_column_1 = r.array_column_1 and l.array_column_2 = r.array_column_2", + "arrays_to_explode": ["array_column_0", "array_column_1"], + }, + { + "blocking_rule": "l.array_column_2 = r.array_column_2", + "arrays_to_explode": ["array_column_2"], + }, + ] + settings = { + "link_type": "dedupe_only", + "blocking_rules_to_generate_predictions": blocking_rules, + "unique_id_column_name": "unique_id", + "additional_columns_to_retain": ["cluster"], + "comparisons": [helper.cl.array_intersect_at_sizes("array_column_1", [1])], + } + linker = helper.Linker(input_data, settings, **helper.extra_linker_args()) + linker.debug_mode = False + df_predict = linker.predict().as_pandas_dataframe() + ## check that there are no duplicates in the output + assert ( + df_predict.drop_duplicates(["unique_id_l", "unique_id_r"]).shape[0] + == df_predict.shape[0] + ) + + ## check that the output contains no links with match_key=1, + ## since all pairs returned by the second rule should also be + ## returned by the first rule and so should be filtered out + assert df_predict[df_predict.match_key == 1].shape[0] == 0 + + ## check that all 1000 true matches are in the output (this is guaranteed by how the data was generated) + assert sum(df_predict.cluster_l == df_predict.cluster_r) == 1000 + + + +@mark_with_dialects_including("duckdb", "spark", pass_dialect=True) +def test_array_based_blocking_with_random_data_link_only(test_helpers, dialect): + helper = test_helpers[dialect] + input_data_l, input_data_r = generate_array_based_datasets_helper() + blocking_rules = [ + { + "blocking_rule": "l.array_column_0 = r.array_column_0 and l.array_column_1 = r.array_column_1", + "arrays_to_explode": ["array_column_0", "array_column_1"], + }, + { + "blocking_rule": "l.array_column_0 = r.array_column_0 and l.array_column_1 = r.array_column_1 and l.array_column_2=r.array_column_2", + "arrays_to_explode": ["array_column_0", "array_column_1", "array_column_2"], + }, + { + "blocking_rule": "l.array_column_2 = r.array_column_2", + "arrays_to_explode": ["array_column_2"], + }, + ] + settings = { + "link_type": "link_only", + "blocking_rules_to_generate_predictions": blocking_rules, + "unique_id_column_name": "cluster", + "additional_columns_to_retain": ["cluster"], + "comparisons": [helper.cl.array_intersect_at_sizes("array_column_1", [1])], + } + linker = helper.Linker( + [input_data_l, input_data_r], settings, **helper.extra_linker_args() + ) + linker.debug_mode=False + df_predict = linker.predict().as_pandas_dataframe() + + ## check that we get no within-dataset links + within_dataset_links = df_predict[ + df_predict.source_dataset_l == df_predict.source_dataset_r + ].shape[0] + assert within_dataset_links == 0 + + ## check that no pair of ids appears twice in the output + assert ( + df_predict.drop_duplicates(["cluster_l", "cluster_r"]).shape[0] + == df_predict.shape[0] + ) + + ## check that the second blocking rule returns no matches, + ## since every pair matching the second rule will also match the first, and so should be filtered out + assert df_predict[df_predict.match_key == 1].shape[0] == 0 + + ## check that all 1000 true matches are returned + assert sum(df_predict.cluster_l == df_predict.cluster_r) == 1000 + 
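The randomised dedupe and link-only tests above, and the salted-rule test that follows, all exercise the dict form of a blocking rule in which "arrays_to_explode" names array columns to be unnested before the equi-join is applied. As a minimal sketch of the settings fragment involved (column names are illustrative, borrowed from the simple example at the top of this file), two records block together under the first rule when their gender matches and their postcode arrays share at least one element, and pairs already produced by an earlier rule are excluded from later ones:

    settings = {
        "link_type": "link_only",
        "blocking_rules_to_generate_predictions": [
            {
                # both sides are unnested on `postcode` before the join,
                # so the arrays only need to intersect, not be equal
                "blocking_rule": "l.gender = r.gender and l.postcode = r.postcode",
                "arrays_to_explode": ["postcode"],
            },
            # plain string rules can still be mixed in afterwards
            "l.gender = r.gender",
        ],
    }
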
+@mark_with_dialects_including('spark',pass_dialect=True) +def test_array_based_blocking_with_salted_rules(test_helpers,dialect): + helper = test_helpers[dialect] + input_data_l,input_data_r = generate_array_based_datasets_helper() + blocking_rules = [ + { + "blocking_rule": "l.array_column_0 = r.array_column_0 and l.array_column_1 = r.array_column_1", + "arrays_to_explode": ["array_column_0", "array_column_1"], + "salting_partitions": 3 + }, + { + "blocking_rule": "l.array_column_0 = r.array_column_0 and l.array_column_1 = r.array_column_1 and l.array_column_2=r.array_column_2", + "arrays_to_explode": ["array_column_0", "array_column_1", "array_column_2"], + "salting_partitions": 2 + }, + { + "blocking_rule": "l.array_column_2 = r.array_column_2", + "arrays_to_explode": ["array_column_2"], + "salting_partitions": 1 + }, + ] + settings = { + "link_type":"link_only", + "blocking_rules_to_generate_predictions":blocking_rules, + "unique_id_column_name":"cluster", + "additional_columns_to_retain":["cluster"], + "comparisons":[helper.cl.array_intersect_at_sizes("array_column_1",[1])] + } + + linker = helper.Linker( + [input_data_l, input_data_r], settings, **helper.extra_linker_args() + ) + linker.debug_mode=False + df_predict = linker.predict().as_pandas_dataframe() + + ## check that there are no duplicates in the output + assert df_predict.drop_duplicates(['cluster_l','cluster_r']).shape[0] == df_predict.shape[0] + + ## check that results include the same pairs (and with the same match keys) as an equivalent linkage with no salting + blocking_rules_no_salt = copy.deepcopy(blocking_rules) + settings_no_salt = copy.deepcopy(settings) + for br in blocking_rules_no_salt: + br.pop('salting_partitions') + settings_no_salt['blocking_rules_to_generate_predictions'] = blocking_rules_no_salt + linker_no_salt = helper.Linker([input_data_l,input_data_r],settings_no_salt,**helper.extra_linker_args()) + df_predict_no_salt = linker_no_salt.predict().as_pandas_dataframe() + predictions_no_salt = set(zip(df_predict_no_salt.cluster_l,df_predict_no_salt.cluster_r,df_predict_no_salt.match_key)) + predictions_with_salt = set(zip(df_predict.cluster_l,df_predict.cluster_r,df_predict.match_key)) + + assert predictions_no_salt == predictions_with_salt From f3024ef4b654590a9df99025fa1d46ce225eef9d Mon Sep 17 00:00:00 2001 From: Nicholas Erskine Date: Mon, 23 Oct 2023 22:40:16 +1100 Subject: [PATCH 02/37] Add logic for blocking on array intersections by unnesting tables --- splink/blocking.py | 161 ++++++++++++++++++++++++++++------------ splink/duckdb/linker.py | 18 ++++- splink/linker.py | 9 ++- splink/spark/linker.py | 15 ++++ 4 files changed, 152 insertions(+), 51 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 18e697b624..9981148036 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -8,6 +8,8 @@ from .misc import ensure_is_list from .unique_id_concat import _composite_unique_id_from_nodes_sql +from .pipeline import SQLPipeline +from .input_column import InputColumn logger = logging.getLogger(__name__) @@ -25,11 +27,13 @@ def blocking_rule_to_obj(br): raise ValueError("No blocking rule submitted...") sqlglot_dialect = br.get("sql_dialect", None) salting_partitions = br.get("salting_partitions", 1) + arrays_to_explode = br.get("arrays_to_explode", list()) return BlockingRule( blocking_rule, salting_partitions, sqlglot_dialect, + arrays_to_explode ) else: @@ -43,6 +47,7 @@ def __init__( blocking_rule: BlockingRule | dict | str, salting_partitions=1, sqlglot_dialect: str = None, + 
arrays_to_explode: list =[], ): if sqlglot_dialect: self._sql_dialect = sqlglot_dialect @@ -51,6 +56,8 @@ def __init__( self.preceding_rules = [] self.sqlglot_dialect = sqlglot_dialect self.salting_partitions = salting_partitions + self.arrays_to_explode = arrays_to_explode + self.ids_to_compare = [] @property def sql_dialect(self): @@ -69,16 +76,33 @@ def add_preceding_rules(self, rules): rules = ensure_is_list(rules) self.preceding_rules = rules - @property - def and_not_preceding_rules_sql(self): + def exclude_from_following_rules_sql(self, linker: Linker): + unique_id_column = linker._settings_obj._unique_id_column_name + + if self.ids_to_compare: + + ids_to_compare_sql = ' union all '.join([f'select * from {ids.physical_name}' for ids in self.ids_to_compare]) + #self.ids_to_compare[0].physical_name + + return f"""EXISTS ( + select 1 from ({ids_to_compare_sql}) as ids_to_compare + where ( + l.{unique_id_column} = ids_to_compare.{unique_id_column}_l and + r.{unique_id_column} = ids_to_compare.{unique_id_column}_r + ) + ) + """ + else: + # Note the coalesce function is important here - otherwise + # you filter out any records with nulls in the previous rules + # meaning these comparisons get lost + return f"coalesce(({self.blocking_rule}),false)" + + def and_not_preceding_rules_sql(self, linker: Linker): if not self.preceding_rules: return "" - - # Note the coalesce function is important here - otherwise - # you filter out any records with nulls in the previous rules - # meaning these comparisons get lost or_clauses = [ - f"coalesce(({r.blocking_rule}), false)" for r in self.preceding_rules + br.exclude_from_following_rules_sql(linker) for br in self.preceding_rules ] previous_rules = " OR ".join(or_clauses) return f"AND NOT ({previous_rules})" @@ -152,6 +176,9 @@ def as_dict(self): if self.salting_partitions > 1 and self.sql_dialect == "spark": output["salting_partitions"] = self.salting_partitions + + if self.arrays_to_explode: + output["arrays_to_explode"] = self.arrays_to_explode return output @@ -195,7 +222,6 @@ def _sql_gen_where_condition(link_type, unique_id_cols): return where_condition - # flake8: noqa: C901 def block_using_rules_sql(linker: Linker): """Use the blocking rules specified in the linker's settings object to @@ -243,6 +269,85 @@ def block_using_rules_sql(linker: Linker): " will not be implemented for this run." ) + # Cover the case where there are no blocking rules + # This is a bit of a hack where if you do a self-join on 'true' + # you create a cartesian product, rather than having separate code + # that generates a cross join for the case of no blocking rules + if not blocking_rules: + blocking_rules = [BlockingRule("1=1")] + + # For Blocking rules for deterministic rules, add a match probability + # column with all probabilities set to 1. + if linker._deterministic_link_mode: + probability = ", 1.00 as match_probability" + else: + probability = "" + + sqls = [] + for br in blocking_rules: + + # Apply our salted rules to resolve skew issues. If no salt was + # selected to be added, then apply the initial blocking rule. 
+ if apply_salt: + salted_blocking_rules = br.salted_blocking_rules + else: + salted_blocking_rules = [br.blocking_rule] + + for salted_br in salted_blocking_rules: + if not br.arrays_to_explode: + sql = f""" + select + {sql_select_expr} + , '{br.match_key}' as match_key + {probability} + from {linker._input_tablename_l} as l + inner join {linker._input_tablename_r} as r + on + ({salted_br}) + {where_condition} + {br.and_not_preceding_rules_sql(linker)} + """ + else: + try: + input_dataframe = linker._intermediate_table_cache[ + "__splink__df_concat_with_tf" + ] + except KeyError: + input_dataframe = linker._initialise_df_concat_with_tf() + input_colnames = {col.name() for col in input_dataframe.columns} + arrays_to_explode_quoted = [ + InputColumn(colname, sql_dialect=linker._sql_dialect).quote().name() + for colname in br.arrays_to_explode + ] + linker._enqueue_sql( + f"{linker._gen_explode_sql('__splink__df_concat_with_tf',br.arrays_to_explode,list(input_colnames.difference(arrays_to_explode_quoted)))}", + "unnested_input", + ) + unique_id_col = settings_obj._unique_id_column_name + + if link_type == "two_dataset_link_only": + where_condition = ( + where_condition + " and l.source_dataset < r.source_dataset" + ) + + linker._enqueue_sql( + f""" + select distinct l.{unique_id_col} as {unique_id_col}_l,r.{unique_id_col} as {unique_id_col}_r + from unnested_input as l inner join unnested_input as r on ({salted_br}) + {where_condition} {br.and_not_preceding_rules_sql(linker)}""", + f"ids_to_compare_blocking_rule_{br.match_key}", + ) + ids_to_compare = linker._execute_sql_pipeline([input_dataframe]) + br.ids_to_compare.append(ids_to_compare) + sql = f""" + select {sql_select_expr}, '{br.match_key}' as match_key + {probability} + from {ids_to_compare.physical_name} as pairs + left join {linker._input_tablename_l} as l on pairs.{unique_id_col}_l=l.{unique_id_col} + left join {linker._input_tablename_r} as r on pairs.{unique_id_col}_r=r.{unique_id_col} + """ + sqls.append(sql) + if ( linker._two_dataset_link_only and not linker._find_new_matches_mode @@ -273,45 +378,5 @@ def block_using_rules_sql(linker: Linker): """ linker._enqueue_sql(sql, f"__splink__df_concat_with_tf{sample_switch}_right") - # Cover the case where there are no blocking rules - # This is a bit of a hack where if you do a self-join on 'true' - # you create a cartesian product, rather than having separate code - # that generates a cross join for the case of no blocking rules - if not blocking_rules: - blocking_rules = [BlockingRule("1=1")] - - # For Blocking rules for deterministic rules, add a match probability - # column with all probabilities set to 1. - if linker._deterministic_link_mode: - probability = ", 1.00 as match_probability" - else: - probability = "" - - sqls = [] - for br in blocking_rules: - # Apply our salted rules to resolve skew issues. If no salt was - # selected to be added, then apply the initial blocking rule. 
- if apply_salt: - salted_blocking_rules = br.salted_blocking_rules - else: - salted_blocking_rules = [br.blocking_rule] - - for salted_br in salted_blocking_rules: - sql = f""" - select - {sql_select_expr} - , '{br.match_key}' as match_key - {probability} - from {linker._input_tablename_l} as l - inner join {linker._input_tablename_r} as r - on - ({salted_br}) - {br.and_not_preceding_rules_sql} - {where_condition} - """ - - sqls.append(sql) - sql = "union all".join(sqls) - return sql diff --git a/splink/duckdb/linker.py b/splink/duckdb/linker.py index 79fc0b43c1..34a7c8bd6c 100644 --- a/splink/duckdb/linker.py +++ b/splink/duckdb/linker.py @@ -218,7 +218,6 @@ def _execute_sql_against_backend(self, sql, templated_name, physical_name): ({sql}) """ self._log_and_run_sql_execution(sql, templated_name, physical_name) - return DuckDBDataFrame(templated_name, physical_name, self) def _run_sql_execution(self, final_sql, templated_name, physical_name): @@ -319,3 +318,20 @@ def export_to_duckdb_file(self, output_path, delete_intermediate_tables=False): new_con = duckdb.connect(database=output_path) new_con.execute(f"IMPORT DATABASE '{tmpdir}';") new_con.close() + + def _gen_explode_sql(self, tbl_name, columns_to_explode, other_columns_to_retain): + """Generated sql that explodes one or more columns in a table""" + columns_to_explode = columns_to_explode.copy() + other_columns_to_retain = other_columns_to_retain.copy() + # base case + if len(columns_to_explode) == 0: + return f"select {','.join(other_columns_to_retain)} from {tbl_name}" + else: + column_to_explode = columns_to_explode.pop() + cols_to_select = ( + [f"unnest({column_to_explode}) as {column_to_explode}"] + + other_columns_to_retain + + columns_to_explode + ) + other_columns_to_retain.append(column_to_explode) + return f"select {','.join(cols_to_select)} from ({self._gen_explode_sql(tbl_name,columns_to_explode,other_columns_to_retain)})" diff --git a/splink/linker.py b/splink/linker.py index 897dfc9899..d6a293c320 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -632,9 +632,9 @@ def _execute_sql_pipeline( start_time = time.time() output_tablename = task.output_table_name sql = task.sql - print("------") # noqa: T201 + print("------", flush=True) # noqa: T201 print( # noqa: T201 - f"--------Creating table: {output_tablename}--------" + f"--------Creating table: {output_tablename}--------", flush=True ) dataframe = self._sql_to_splink_dataframe_checking_cache( @@ -3696,3 +3696,8 @@ def _remove_splinkdataframe_from_cache(self, splink_dataframe: SplinkDataFrame): for k in keys_to_delete: del self._intermediate_table_cache[k] + + def _gen_explode_sql(self, tbl_name, columns_to_explode, other_columns_to_retain): + raise NotImplementedError( + f"Unnesting blocking rules are not supported for {type(self)}" + ) diff --git a/splink/spark/linker.py b/splink/spark/linker.py index 8cf7477671..3334a0caee 100644 --- a/splink/spark/linker.py +++ b/splink/spark/linker.py @@ -534,3 +534,18 @@ def _check_ansi_enabled_if_converting_dates(self): classed as comparison level = "ELSE". 
Ensure date strings are cleaned to remove bad dates \n""" ) + + def _gen_explode_sql(self, tbl_name, columns_to_explode, other_columns_to_retain): + """Generated sql that explodes one or more columns in a table""" + columns_to_explode = columns_to_explode.copy() + other_columns_to_retain = other_columns_to_retain.copy() + if len(columns_to_explode) == 0: + return f"select {','.join(other_columns_to_retain)} from {tbl_name}" + else: + column_to_explode = columns_to_explode.pop() + cols_to_select = ( + [f"explode({column_to_explode}) as {column_to_explode}"] + + other_columns_to_retain + + columns_to_explode + ) + return f"select {','.join(cols_to_select)} from ({self._gen_explode_sql(tbl_name,columns_to_explode,other_columns_to_retain+[column_to_explode])})" From bfea53c722bd112ecc548bedbc2256550741d3b4 Mon Sep 17 00:00:00 2001 From: Nicholas Erskine Date: Mon, 23 Oct 2023 23:34:39 +1100 Subject: [PATCH 03/37] update hardcoded hash in test_correctness_of_convergence.py --- tests/test_correctness_of_convergence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_correctness_of_convergence.py b/tests/test_correctness_of_convergence.py index 7b536a03a5..435c26d09b 100644 --- a/tests/test_correctness_of_convergence.py +++ b/tests/test_correctness_of_convergence.py @@ -68,7 +68,7 @@ def test_splink_converges_to_known_params(): # CREATE TABLE __splink__df_comparison_vectors_abc123 # and modify the following line to include the value of the hash (abc123 above) - cvv_hashed_tablename = "__splink__df_comparison_vectors_ee08ffa85" + cvv_hashed_tablename = "__splink__df_comparison_vectors_98aaa302a" linker.register_table(df, cvv_hashed_tablename) em_training_session = EMTrainingSession( From d3fe4dd47c29b189ded594f6db587b5b7b70b1ff Mon Sep 17 00:00:00 2001 From: Nicholas Erskine Date: Tue, 24 Oct 2023 11:19:09 +1100 Subject: [PATCH 04/37] linting/formatting --- splink/blocking.py | 20 +++--- splink/duckdb/linker.py | 3 +- splink/spark/linker.py | 3 +- tests/test_array_based_blocking.py | 98 +++++++++++++++++++----------- 4 files changed, 75 insertions(+), 49 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 9981148036..aca178425f 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -30,10 +30,7 @@ def blocking_rule_to_obj(br): arrays_to_explode = br.get("arrays_to_explode", list()) return BlockingRule( - blocking_rule, - salting_partitions, - sqlglot_dialect, - arrays_to_explode + blocking_rule, salting_partitions, sqlglot_dialect, arrays_to_explode ) else: @@ -47,7 +44,7 @@ def __init__( blocking_rule: BlockingRule | dict | str, salting_partitions=1, sqlglot_dialect: str = None, - arrays_to_explode: list =[], + arrays_to_explode: list = [], ): if sqlglot_dialect: self._sql_dialect = sqlglot_dialect @@ -80,10 +77,12 @@ def exclude_from_following_rules_sql(self, linker: Linker): unique_id_column = linker._settings_obj._unique_id_column_name if self.ids_to_compare: - - ids_to_compare_sql = ' union all '.join([f'select * from {ids.physical_name}' for ids in self.ids_to_compare]) - #self.ids_to_compare[0].physical_name - + + ids_to_compare_sql = " union all ".join( + [f"select * from {ids.physical_name}" for ids in self.ids_to_compare] + ) + # self.ids_to_compare[0].physical_name + return f"""EXISTS ( select 1 from ({ids_to_compare_sql}) as ids_to_compare where ( @@ -176,7 +175,7 @@ def as_dict(self): if self.salting_partitions > 1 and self.sql_dialect == "spark": output["salting_partitions"] = self.salting_partitions - + if self.arrays_to_explode: 
output["arrays_to_explode"] = self.arrays_to_explode @@ -222,6 +221,7 @@ def _sql_gen_where_condition(link_type, unique_id_cols): return where_condition + # flake8: noqa: C901 def block_using_rules_sql(linker: Linker): """Use the blocking rules specified in the linker's settings object to diff --git a/splink/duckdb/linker.py b/splink/duckdb/linker.py index 34a7c8bd6c..8b0e016df6 100644 --- a/splink/duckdb/linker.py +++ b/splink/duckdb/linker.py @@ -334,4 +334,5 @@ def _gen_explode_sql(self, tbl_name, columns_to_explode, other_columns_to_retain + columns_to_explode ) other_columns_to_retain.append(column_to_explode) - return f"select {','.join(cols_to_select)} from ({self._gen_explode_sql(tbl_name,columns_to_explode,other_columns_to_retain)})" + return f"""select {','.join(cols_to_select)} + from ({self._gen_explode_sql(tbl_name,columns_to_explode,other_columns_to_retain)})""" # noqa: E501 diff --git a/splink/spark/linker.py b/splink/spark/linker.py index 3334a0caee..c2cfcbc646 100644 --- a/splink/spark/linker.py +++ b/splink/spark/linker.py @@ -548,4 +548,5 @@ def _gen_explode_sql(self, tbl_name, columns_to_explode, other_columns_to_retain + other_columns_to_retain + columns_to_explode ) - return f"select {','.join(cols_to_select)} from ({self._gen_explode_sql(tbl_name,columns_to_explode,other_columns_to_retain+[column_to_explode])})" + return f"""select {','.join(cols_to_select)} + from ({self._gen_explode_sql(tbl_name,columns_to_explode,other_columns_to_retain+[column_to_explode])})""" # noqa: E501 diff --git a/tests/test_array_based_blocking.py b/tests/test_array_based_blocking.py index 87d2b6fa65..08c3cab017 100644 --- a/tests/test_array_based_blocking.py +++ b/tests/test_array_based_blocking.py @@ -1,11 +1,10 @@ +import copy import random -import copy import pandas as pd from tests.decorator import mark_with_dialects_including -from splink.spark.linker import SparkLinker @mark_with_dialects_including("duckdb", "spark", pass_dialect=True) def test_simple_example_link_only(test_helpers, dialect): @@ -87,11 +86,14 @@ def test_array_based_blocking_with_random_data_dedupe(test_helpers, dialect): input_data = pd.concat([input_data_l, input_data_r]) blocking_rules = [ { - "blocking_rule": "l.array_column_0 = r.array_column_0 and l.array_column_1 = r.array_column_1", + "blocking_rule": """l.array_column_0 = r.array_column_0 + and l.array_column_1 = r.array_column_1""", "arrays_to_explode": ["array_column_0", "array_column_1"], }, { - "blocking_rule": "l.array_column_0 = r.array_column_0 and l.array_column_1 = r.array_column_1 and l.array_column_2 = r.array_column_2", + "blocking_rule": """l.array_column_0 = r.array_column_0 + and l.array_column_1 = r.array_column_1 + and l.array_column_2 = r.array_column_2""", "arrays_to_explode": ["array_column_0", "array_column_1"], }, { @@ -115,27 +117,30 @@ def test_array_based_blocking_with_random_data_dedupe(test_helpers, dialect): == df_predict.shape[0] ) - ## check that the output contains no links with match_key=1, - ## since all pairs returned by the second rule should also be + ## check that the output contains no links with match_key=1, + ## since all pairs returned by the second rule should also be ## returned by the first rule and so should be filtered out assert df_predict[df_predict.match_key == 1].shape[0] == 0 - ## check that all 1000 true matches are in the output (this is guaranteed by how the data was generated) + ## check that all 1000 true matches are in the output + ## (this is guaranteed by how the data was generated) assert 
sum(df_predict.cluster_l == df_predict.cluster_r) == 1000 - @mark_with_dialects_including("duckdb", "spark", pass_dialect=True) def test_array_based_blocking_with_random_data_link_only(test_helpers, dialect): helper = test_helpers[dialect] input_data_l, input_data_r = generate_array_based_datasets_helper() blocking_rules = [ { - "blocking_rule": "l.array_column_0 = r.array_column_0 and l.array_column_1 = r.array_column_1", + "blocking_rule": """l.array_column_0 = r.array_column_0 + and l.array_column_1 = r.array_column_1""", "arrays_to_explode": ["array_column_0", "array_column_1"], }, { - "blocking_rule": "l.array_column_0 = r.array_column_0 and l.array_column_1 = r.array_column_1 and l.array_column_2=r.array_column_2", + "blocking_rule": """l.array_column_0 = r.array_column_0 + and l.array_column_1 = r.array_column_1 + and l.array_column_2=r.array_column_2""", "arrays_to_explode": ["array_column_0", "array_column_1", "array_column_2"], }, { @@ -153,7 +158,7 @@ def test_array_based_blocking_with_random_data_link_only(test_helpers, dialect): linker = helper.Linker( [input_data_l, input_data_r], settings, **helper.extra_linker_args() ) - linker.debug_mode=False + linker.debug_mode = False df_predict = linker.predict().as_pandas_dataframe() ## check that we get no within-dataset links @@ -169,59 +174,78 @@ def test_array_based_blocking_with_random_data_link_only(test_helpers, dialect): ) ## check that the second blocking rule returns no matches, - ## since every pair matching the second rule will also match the first, and so should be filtered out + ## since every pair matching the second rule will also match the first, + ## and so should be filtered out assert df_predict[df_predict.match_key == 1].shape[0] == 0 ## check that all 1000 true matches are returned assert sum(df_predict.cluster_l == df_predict.cluster_r) == 1000 -@mark_with_dialects_including('spark',pass_dialect=True) -def test_array_based_blocking_with_salted_rules(test_helpers,dialect): + +@mark_with_dialects_including("spark", pass_dialect=True) +def test_array_based_blocking_with_salted_rules(test_helpers, dialect): helper = test_helpers[dialect] - input_data_l,input_data_r = generate_array_based_datasets_helper() + input_data_l, input_data_r = generate_array_based_datasets_helper() blocking_rules = [ { - "blocking_rule": "l.array_column_0 = r.array_column_0 and l.array_column_1 = r.array_column_1", + "blocking_rule": """l.array_column_0 = r.array_column_0 + and l.array_column_1 = r.array_column_1""", "arrays_to_explode": ["array_column_0", "array_column_1"], - "salting_partitions": 3 + "salting_partitions": 3, }, { - "blocking_rule": "l.array_column_0 = r.array_column_0 and l.array_column_1 = r.array_column_1 and l.array_column_2=r.array_column_2", + "blocking_rule": """l.array_column_0 = r.array_column_0 + and l.array_column_1 = r.array_column_1 + and l.array_column_2=r.array_column_2""", "arrays_to_explode": ["array_column_0", "array_column_1", "array_column_2"], - "salting_partitions": 2 + "salting_partitions": 2, }, { "blocking_rule": "l.array_column_2 = r.array_column_2", "arrays_to_explode": ["array_column_2"], - "salting_partitions": 1 + "salting_partitions": 1, }, ] settings = { - "link_type":"link_only", - "blocking_rules_to_generate_predictions":blocking_rules, - "unique_id_column_name":"cluster", - "additional_columns_to_retain":["cluster"], - "comparisons":[helper.cl.array_intersect_at_sizes("array_column_1",[1])] + "link_type": "link_only", + "blocking_rules_to_generate_predictions": blocking_rules, + 
"unique_id_column_name": "cluster", + "additional_columns_to_retain": ["cluster"], + "comparisons": [helper.cl.array_intersect_at_sizes("array_column_1", [1])], } - + linker = helper.Linker( [input_data_l, input_data_r], settings, **helper.extra_linker_args() ) - linker.debug_mode=False + linker.debug_mode = False df_predict = linker.predict().as_pandas_dataframe() - + ## check that there are no duplicates in the output - assert df_predict.drop_duplicates(['cluster_l','cluster_r']).shape[0] == df_predict.shape[0] - - ## check that results include the same pairs (and with the same match keys) as an equivalent linkage with no salting + assert ( + df_predict.drop_duplicates(["cluster_l", "cluster_r"]).shape[0] + == df_predict.shape[0] + ) + + ## check that results include the same pairs (and with the same match keys) + ## as an equivalent linkage with no salting blocking_rules_no_salt = copy.deepcopy(blocking_rules) settings_no_salt = copy.deepcopy(settings) for br in blocking_rules_no_salt: - br.pop('salting_partitions') - settings_no_salt['blocking_rules_to_generate_predictions'] = blocking_rules_no_salt - linker_no_salt = helper.Linker([input_data_l,input_data_r],settings_no_salt,**helper.extra_linker_args()) + br.pop("salting_partitions") + settings_no_salt["blocking_rules_to_generate_predictions"] = blocking_rules_no_salt + linker_no_salt = helper.Linker( + [input_data_l, input_data_r], settings_no_salt, **helper.extra_linker_args() + ) df_predict_no_salt = linker_no_salt.predict().as_pandas_dataframe() - predictions_no_salt = set(zip(df_predict_no_salt.cluster_l,df_predict_no_salt.cluster_r,df_predict_no_salt.match_key)) - predictions_with_salt = set(zip(df_predict.cluster_l,df_predict.cluster_r,df_predict.match_key)) - + predictions_no_salt = set( + zip( + df_predict_no_salt.cluster_l, + df_predict_no_salt.cluster_r, + df_predict_no_salt.match_key, + ) + ) + predictions_with_salt = set( + zip(df_predict.cluster_l, df_predict.cluster_r, df_predict.match_key) + ) + assert predictions_no_salt == predictions_with_salt From a48b0eccfd8bf12cdaf9bc297494ef781797f265 Mon Sep 17 00:00:00 2001 From: Nicholas Erskine Date: Tue, 24 Oct 2023 16:17:05 +1100 Subject: [PATCH 05/37] update table names for consistency with splink conventions --- splink/blocking.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index aca178425f..74b3ecd483 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -321,7 +321,7 @@ def block_using_rules_sql(linker: Linker): ] linker._enqueue_sql( f"{linker._gen_explode_sql('__splink__df_concat_with_tf',br.arrays_to_explode,list(input_colnames.difference(arrays_to_explode_quoted)))}", - "unnested_input", + "__splink__df_concat_with_tf_unnested", ) unique_id_col = settings_obj._unique_id_column_name @@ -333,7 +333,7 @@ def block_using_rules_sql(linker: Linker): linker._enqueue_sql( f""" select distinct l.{unique_id_col} as {unique_id_col}_l,r.{unique_id_col} as {unique_id_col}_r - from unnested_input as l inner join unnested_input as r on ({salted_br}) + from __splink__df_concat_with_tf_unnested as l inner join __splink__df_concat_with_tf_unnested as r on ({salted_br}) {where_condition} {br.and_not_preceding_rules_sql(linker)}""", f"ids_to_compare_blocking_rule_{br.match_key}", ) From 3b8222a8063955181f48d58819b5e7e440cdff23 Mon Sep 17 00:00:00 2001 From: Nicholas Erskine Date: Tue, 24 Oct 2023 18:47:36 +1100 Subject: [PATCH 06/37] Update tests --- tests/test_array_based_blocking.py | 33 
++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/tests/test_array_based_blocking.py b/tests/test_array_based_blocking.py index 08c3cab017..c8d0c57f73 100644 --- a/tests/test_array_based_blocking.py +++ b/tests/test_array_based_blocking.py @@ -3,6 +3,12 @@ import pandas as pd +from pyspark import SparkContext, SparkConf +from pyspark.sql import SparkSession +from splink.spark.linker import SparkLinker +import splink.spark.comparison_library as cl + + from tests.decorator import mark_with_dialects_including @@ -182,9 +188,9 @@ def test_array_based_blocking_with_random_data_link_only(test_helpers, dialect): assert sum(df_predict.cluster_l == df_predict.cluster_r) == 1000 -@mark_with_dialects_including("spark", pass_dialect=True) -def test_array_based_blocking_with_salted_rules(test_helpers, dialect): - helper = test_helpers[dialect] + +@mark_with_dialects_including("spark") +def test_array_based_blocking_with_salted_rules(): input_data_l, input_data_r = generate_array_based_datasets_helper() blocking_rules = [ { @@ -211,13 +217,18 @@ def test_array_based_blocking_with_salted_rules(test_helpers, dialect): "blocking_rules_to_generate_predictions": blocking_rules, "unique_id_column_name": "cluster", "additional_columns_to_retain": ["cluster"], - "comparisons": [helper.cl.array_intersect_at_sizes("array_column_1", [1])], + "comparisons": [cl.array_intersect_at_sizes("array_column_1", [1])], } + + conf = SparkConf() + sc = SparkContext.getOrCreate(conf=conf) + spark = SparkSession(sc) + input_l_spark = spark.createDataFrame(input_data_l) + input_r_spark = spark.createDataFrame(input_data_r) - linker = helper.Linker( - [input_data_l, input_data_r], settings, **helper.extra_linker_args() - ) - linker.debug_mode = False + linker = SparkLinker( + [input_l_spark, input_r_spark], settings + ) df_predict = linker.predict().as_pandas_dataframe() ## check that there are no duplicates in the output @@ -233,8 +244,8 @@ def test_array_based_blocking_with_salted_rules(test_helpers, dialect): for br in blocking_rules_no_salt: br.pop("salting_partitions") settings_no_salt["blocking_rules_to_generate_predictions"] = blocking_rules_no_salt - linker_no_salt = helper.Linker( - [input_data_l, input_data_r], settings_no_salt, **helper.extra_linker_args() + linker_no_salt = SparkLinker( + [input_l_spark, input_r_spark], settings_no_salt ) df_predict_no_salt = linker_no_salt.predict().as_pandas_dataframe() predictions_no_salt = set( @@ -247,5 +258,5 @@ def test_array_based_blocking_with_salted_rules(test_helpers, dialect): predictions_with_salt = set( zip(df_predict.cluster_l, df_predict.cluster_r, df_predict.match_key) ) - + assert predictions_no_salt == predictions_with_salt From 1f1ea76b581c51b32119b2c01df9d77989783959 Mon Sep 17 00:00:00 2001 From: Nicholas Erskine Date: Tue, 24 Oct 2023 18:48:04 +1100 Subject: [PATCH 07/37] ensure that tables names are unique --- splink/blocking.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/splink/blocking.py b/splink/blocking.py index 74b3ecd483..01ecf4d268 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -1,5 +1,6 @@ from __future__ import annotations +import hashlib from sqlglot import parse_one from sqlglot.expressions import Join, Column from sqlglot.optimizer.eliminate_joins import join_condition @@ -330,12 +331,19 @@ def block_using_rules_sql(linker: Linker): where_condition + " and l.source_dataset < r.source_dataset" ) + # ensure that table names are unique + if apply_salt: + to_hash = 
(salted_br + linker._cache_uid).encode("utf-8") + salt_id = "salt_id_" + hashlib.sha256(to_hash).hexdigest()[:9] + else: + salt_id = "" + linker._enqueue_sql( f""" select distinct l.{unique_id_col} as {unique_id_col}_l,r.{unique_id_col} as {unique_id_col}_r from __splink__df_concat_with_tf_unnested as l inner join __splink__df_concat_with_tf_unnested as r on ({salted_br}) {where_condition} {br.and_not_preceding_rules_sql(linker)}""", - f"ids_to_compare_blocking_rule_{br.match_key}", + f"ids_to_compare_blocking_rule_{br.match_key}{salt_id}", ) ids_to_compare = linker._execute_sql_pipeline([input_dataframe]) br.ids_to_compare.append(ids_to_compare) From eacdc04d3ccfea7994801945ea3876e31f05ed70 Mon Sep 17 00:00:00 2001 From: Nicholas Erskine Date: Tue, 24 Oct 2023 19:10:06 +1100 Subject: [PATCH 08/37] lint --- splink/blocking.py | 2 +- tests/test_array_based_blocking.py | 21 +++++++-------------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 01ecf4d268..1aa79dad65 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -331,7 +331,7 @@ def block_using_rules_sql(linker: Linker): where_condition + " and l.source_dataset < r.source_dataset" ) - # ensure that table names are unique + # ensure that table names are unique if apply_salt: to_hash = (salted_br + linker._cache_uid).encode("utf-8") salt_id = "salt_id_" + hashlib.sha256(to_hash).hexdigest()[:9] diff --git a/tests/test_array_based_blocking.py b/tests/test_array_based_blocking.py index c8d0c57f73..1da579609c 100644 --- a/tests/test_array_based_blocking.py +++ b/tests/test_array_based_blocking.py @@ -2,13 +2,11 @@ import random import pandas as pd - -from pyspark import SparkContext, SparkConf +from pyspark import SparkConf, SparkContext from pyspark.sql import SparkSession -from splink.spark.linker import SparkLinker -import splink.spark.comparison_library as cl - +import splink.spark.comparison_library as cl +from splink.spark.linker import SparkLinker from tests.decorator import mark_with_dialects_including @@ -188,7 +186,6 @@ def test_array_based_blocking_with_random_data_link_only(test_helpers, dialect): assert sum(df_predict.cluster_l == df_predict.cluster_r) == 1000 - @mark_with_dialects_including("spark") def test_array_based_blocking_with_salted_rules(): input_data_l, input_data_r = generate_array_based_datasets_helper() @@ -219,16 +216,14 @@ def test_array_based_blocking_with_salted_rules(): "additional_columns_to_retain": ["cluster"], "comparisons": [cl.array_intersect_at_sizes("array_column_1", [1])], } - + conf = SparkConf() sc = SparkContext.getOrCreate(conf=conf) spark = SparkSession(sc) input_l_spark = spark.createDataFrame(input_data_l) input_r_spark = spark.createDataFrame(input_data_r) - linker = SparkLinker( - [input_l_spark, input_r_spark], settings - ) + linker = SparkLinker([input_l_spark, input_r_spark], settings) df_predict = linker.predict().as_pandas_dataframe() ## check that there are no duplicates in the output @@ -244,9 +239,7 @@ def test_array_based_blocking_with_salted_rules(): for br in blocking_rules_no_salt: br.pop("salting_partitions") settings_no_salt["blocking_rules_to_generate_predictions"] = blocking_rules_no_salt - linker_no_salt = SparkLinker( - [input_l_spark, input_r_spark], settings_no_salt - ) + linker_no_salt = SparkLinker([input_l_spark, input_r_spark], settings_no_salt) df_predict_no_salt = linker_no_salt.predict().as_pandas_dataframe() predictions_no_salt = set( zip( @@ -258,5 +251,5 @@ def 
test_array_based_blocking_with_salted_rules(): predictions_with_salt = set( zip(df_predict.cluster_l, df_predict.cluster_r, df_predict.match_key) ) - + assert predictions_no_salt == predictions_with_salt From a9076ea5df5051c59e5d9b7d00fbae226737105d Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 1 Nov 2023 14:46:11 +0000 Subject: [PATCH 09/37] wip --- splink/blocking.py | 157 +++++++++++++++++++++++++-------------------- 1 file changed, 87 insertions(+), 70 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 1aa79dad65..a81674bff5 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -55,7 +55,7 @@ def __init__( self.sqlglot_dialect = sqlglot_dialect self.salting_partitions = salting_partitions self.arrays_to_explode = arrays_to_explode - self.ids_to_compare = [] + self.ids_to_compare = [] # list of SplinkDataFrames representing ids to compare @property def sql_dialect(self): @@ -78,17 +78,15 @@ def exclude_from_following_rules_sql(self, linker: Linker): unique_id_column = linker._settings_obj._unique_id_column_name if self.ids_to_compare: - ids_to_compare_sql = " union all ".join( [f"select * from {ids.physical_name}" for ids in self.ids_to_compare] ) - # self.ids_to_compare[0].physical_name return f"""EXISTS ( select 1 from ({ids_to_compare_sql}) as ids_to_compare where ( - l.{unique_id_column} = ids_to_compare.{unique_id_column}_l and - r.{unique_id_column} = ids_to_compare.{unique_id_column}_r + l.{unique_id_column} = ids_to_compare.{unique_id_column}_l and + r.{unique_id_column} = ids_to_compare.{unique_id_column}_r ) ) """ @@ -223,8 +221,59 @@ def _sql_gen_where_condition(link_type, unique_id_cols): return where_condition +def materialise_array_exploded_id_lookup(linker: Linker, br, link_type, apply_salt): + try: + input_dataframe = linker._intermediate_table_cache[ + "__splink__df_concat_with_tf" + ] + except KeyError: + input_dataframe = linker._initialise_df_concat_with_tf() + + input_colnames = {col.name() for col in input_dataframe.columns} + arrays_to_explode_quoted = [ + InputColumn(colname, sql_dialect=linker._sql_dialect).quote().name() + for colname in br.arrays_to_explode + ] + + explode_sql = linker._gen_explode_sql( + "__splink__df_concat_with_tf", + br.arrays_to_explode, + list(input_colnames.difference(arrays_to_explode_quoted)), + ) + + linker._enqueue_sql( + f"{explode_sql}", + "__splink__df_concat_with_tf_unnested", + ) + unique_id_col = linker.settings_obj._unique_id_column_name + + if link_type == "two_dataset_link_only": + where_condition = where_condition + " and l.source_dataset < r.source_dataset" + + # ensure that table names are unique + if apply_salt: + to_hash = (br + linker._cache_uid).encode("utf-8") + salt_id = "salt_id_" + hashlib.sha256(to_hash).hexdigest()[:9] + else: + salt_id = "" + + linker._enqueue_sql( + f""" + select distinct + l.{unique_id_col} as {unique_id_col}_l, + r.{unique_id_col} as {unique_id_col}_r + from __splink__df_concat_with_tf_unnested as l + inner join __splink__df_concat_with_tf_unnested as r + on ({br}) + {where_condition} {br.and_not_preceding_rules_sql(linker)}""", + f"ids_to_compare_blocking_rule_{br.match_key}{salt_id}", + ) + ids_to_compare = linker._execute_sql_pipeline([input_dataframe]) + br.ids_to_compare.append(ids_to_compare) + + # flake8: noqa: C901 -def block_using_rules_sql(linker: Linker): +def block_using_rules(linker: Linker): """Use the blocking rules specified in the linker's settings object to generate a SQL statement that will create pairwise record comparions according 
to the blocking rule(s). @@ -285,76 +334,44 @@ def block_using_rules_sql(linker: Linker): probability = "" sqls = [] - for br in blocking_rules: + all_blocking_rules = [] + for br in blocking_rules: # Apply our salted rules to resolve skew issues. If no salt was # selected to be added, then apply the initial blocking rule. if apply_salt: - salted_blocking_rules = br.salted_blocking_rules + all_blocking_rules.extend(br.salted_blocking_rules) else: - salted_blocking_rules = [br.blocking_rule] - - for salted_br in salted_blocking_rules: - if not br.arrays_to_explode: - sql = f""" - select - {sql_select_expr} - , '{br.match_key}' as match_key + all_blocking_rules.append(br.blocking_rule) + + for br in all_blocking_rules: + materialise_array_exploded_id_lookup(linker, br) + + for br in all_blocking_rules: + if not br.arrays_to_explode: + sql = f""" + select + {sql_select_expr} + , '{br.match_key}' as match_key + {probability} + from {linker._input_tablename_l} as l + inner join {linker._input_tablename_r} as r + on + ({br}) + {where_condition} + {br.and_not_preceding_rules_sql(linker)} + """ + else: + sql = f""" + select {sql_select_expr}, '{br.match_key}' as match_key {probability} - from {linker._input_tablename_l} as l - inner join {linker._input_tablename_r} as r - on - ({salted_br}) - {where_condition} - {br.and_not_preceding_rules_sql(linker)} - """ - else: - try: - input_dataframe = linker._intermediate_table_cache[ - "__splink__df_concat_with_tf" - ] - except KeyError: - input_dataframe = linker._initialise_df_concat_with_tf() - input_colnames = {col.name() for col in input_dataframe.columns} - arrays_to_explode_quoted = [ - InputColumn(colname, sql_dialect=linker._sql_dialect).quote().name() - for colname in br.arrays_to_explode - ] - linker._enqueue_sql( - f"{linker._gen_explode_sql('__splink__df_concat_with_tf',br.arrays_to_explode,list(input_colnames.difference(arrays_to_explode_quoted)))}", - "__splink__df_concat_with_tf_unnested", - ) - unique_id_col = settings_obj._unique_id_column_name - - if link_type == "two_dataset_link_only": - where_condition = ( - where_condition + " and l.source_dataset < r.source_dataset" - ) - - # ensure that table names are unique - if apply_salt: - to_hash = (salted_br + linker._cache_uid).encode("utf-8") - salt_id = "salt_id_" + hashlib.sha256(to_hash).hexdigest()[:9] - else: - salt_id = "" - - linker._enqueue_sql( - f""" - select distinct l.{unique_id_col} as {unique_id_col}_l,r.{unique_id_col} as {unique_id_col}_r - from __splink__df_concat_with_tf_unnested as l inner join __splink__df_concat_with_tf_unnested as r on ({salted_br}) - {where_condition} {br.and_not_preceding_rules_sql(linker)}""", - f"ids_to_compare_blocking_rule_{br.match_key}{salt_id}", - ) - ids_to_compare = linker._execute_sql_pipeline([input_dataframe]) - br.ids_to_compare.append(ids_to_compare) - sql = f""" - select {sql_select_expr}, '{br.match_key}' as match_key - {probability} - from {ids_to_compare.physical_name} as pairs - left join {linker._input_tablename_l} as l on pairs.{unique_id_col}_l=l.{unique_id_col} - left join {linker._input_tablename_r} as r on pairs.{unique_id_col}_r=r.{unique_id_col} - """ - sqls.append(sql) + from {ids_to_compare.physical_name} as pairs + left join {linker._input_tablename_l} as l + on pairs.{unique_id_col}_l=l.{unique_id_col} + left join {linker._input_tablename_r} as r + on pairs.{unique_id_col}_r=r.{unique_id_col} + """ + sqls.append(sql) if ( linker._two_dataset_link_only From 8ed833f83b8a55e4a2c7576bd9d54700f6804c85 Mon Sep 17 00:00:00 
2001 From: Robin Linacre Date: Wed, 1 Nov 2023 15:19:38 +0000 Subject: [PATCH 10/37] wip --- splink/blocking.py | 212 +++++++++++++++++++++++++++------------------ splink/linker.py | 3 + 2 files changed, 129 insertions(+), 86 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index a81674bff5..8adbb42814 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -1,16 +1,16 @@ from __future__ import annotations import hashlib +import logging +from typing import TYPE_CHECKING + from sqlglot import parse_one -from sqlglot.expressions import Join, Column +from sqlglot.expressions import Column, Join from sqlglot.optimizer.eliminate_joins import join_condition -from typing import TYPE_CHECKING, Union -import logging +from .input_column import InputColumn from .misc import ensure_is_list from .unique_id_concat import _composite_unique_id_from_nodes_sql -from .pipeline import SQLPipeline -from .input_column import InputColumn logger = logging.getLogger(__name__) @@ -55,7 +55,8 @@ def __init__( self.sqlglot_dialect = sqlglot_dialect self.salting_partitions = salting_partitions self.arrays_to_explode = arrays_to_explode - self.ids_to_compare = [] # list of SplinkDataFrames representing ids to compare + self.ids_to_compare = [] + self.ids_to_use = None @property def sql_dialect(self): @@ -81,6 +82,7 @@ def exclude_from_following_rules_sql(self, linker: Linker): ids_to_compare_sql = " union all ".join( [f"select * from {ids.physical_name}" for ids in self.ids_to_compare] ) + # self.ids_to_compare[0].physical_name return f"""EXISTS ( select 1 from ({ids_to_compare_sql}) as ids_to_compare @@ -221,59 +223,98 @@ def _sql_gen_where_condition(link_type, unique_id_cols): return where_condition -def materialise_array_exploded_id_lookup(linker: Linker, br, link_type, apply_salt): - try: - input_dataframe = linker._intermediate_table_cache[ - "__splink__df_concat_with_tf" - ] - except KeyError: - input_dataframe = linker._initialise_df_concat_with_tf() - - input_colnames = {col.name() for col in input_dataframe.columns} - arrays_to_explode_quoted = [ - InputColumn(colname, sql_dialect=linker._sql_dialect).quote().name() - for colname in br.arrays_to_explode - ] - - explode_sql = linker._gen_explode_sql( - "__splink__df_concat_with_tf", - br.arrays_to_explode, - list(input_colnames.difference(arrays_to_explode_quoted)), - ) +def materialise_exploded_id_tables(linker: Linker): + if type(linker).__name__ in ["SparkLinker"]: + apply_salt = True + else: + apply_salt = False - linker._enqueue_sql( - f"{explode_sql}", - "__splink__df_concat_with_tf_unnested", - ) - unique_id_col = linker.settings_obj._unique_id_column_name + settings_obj = linker._settings_obj - if link_type == "two_dataset_link_only": - where_condition = where_condition + " and l.source_dataset < r.source_dataset" + link_type = settings_obj._link_type - # ensure that table names are unique - if apply_salt: - to_hash = (br + linker._cache_uid).encode("utf-8") - salt_id = "salt_id_" + hashlib.sha256(to_hash).hexdigest()[:9] - else: - salt_id = "" - - linker._enqueue_sql( - f""" - select distinct - l.{unique_id_col} as {unique_id_col}_l, - r.{unique_id_col} as {unique_id_col}_r - from __splink__df_concat_with_tf_unnested as l - inner join __splink__df_concat_with_tf_unnested as r - on ({br}) - {where_condition} {br.and_not_preceding_rules_sql(linker)}""", - f"ids_to_compare_blocking_rule_{br.match_key}{salt_id}", + if linker._two_dataset_link_only: + link_type = "two_dataset_link_only" + + if linker._self_link_mode: + link_type = 
"self_link" + + where_condition = _sql_gen_where_condition( + link_type, settings_obj._unique_id_input_columns ) - ids_to_compare = linker._execute_sql_pipeline([input_dataframe]) - br.ids_to_compare.append(ids_to_compare) + blocking_rules = settings_obj._blocking_rules_to_generate_predictions -# flake8: noqa: C901 -def block_using_rules(linker: Linker): + if settings_obj.salting_required and apply_salt is False: + logger.warning( + "WARNING: Salting is not currently supported by this linker backend and" + " will not be implemented for this run." + ) + + for br in blocking_rules: + # Apply our salted rules to resolve skew issues. If no salt was + # selected to be added, then apply the initial blocking rule. + if apply_salt: + salted_blocking_rules = br.salted_blocking_rules + else: + salted_blocking_rules = [br.blocking_rule] + + for salted_br in salted_blocking_rules: + if br.arrays_to_explode: + try: + input_dataframe = linker._intermediate_table_cache[ + "__splink__df_concat_with_tf" + ] + except KeyError: + input_dataframe = linker._initialise_df_concat_with_tf() + input_colnames = {col.name() for col in input_dataframe.columns} + arrays_to_explode_quoted = [ + InputColumn(colname, sql_dialect=linker._sql_dialect).quote().name() + for colname in br.arrays_to_explode + ] + expl_sql = linker._gen_explode_sql( + "__splink__df_concat_with_tf", + br.arrays_to_explode, + list(input_colnames.difference(arrays_to_explode_quoted)), + ) + + linker._enqueue_sql( + expl_sql, + "__splink__df_concat_with_tf_unnested", + ) + + unique_id_col = settings_obj._unique_id_column_name + + if link_type == "two_dataset_link_only": + where_condition = ( + where_condition + " and l.source_dataset < r.source_dataset" + ) + + # ensure that table names are unique + if apply_salt: + to_hash = (salted_br + linker._cache_uid).encode("utf-8") + salt_id = "salt_id_" + hashlib.sha256(to_hash).hexdigest()[:9] + else: + salt_id = "" + + linker._enqueue_sql( + f""" + select distinct + l.{unique_id_col} as {unique_id_col}_l, + r.{unique_id_col} as {unique_id_col}_r + from __splink__df_concat_with_tf_unnested as l + inner join __splink__df_concat_with_tf_unnested as r + on ({salted_br}) + {where_condition} {br.and_not_preceding_rules_sql(linker)}""", + f"ids_to_compare_blocking_rule_{br.match_key}{salt_id}", + ) + + ids_to_compare = linker._execute_sql_pipeline([input_dataframe]) + br.ids_to_compare.append(ids_to_compare) + br.ids_to_use = ids_to_compare + + +def block_using_rules_sql(linker: Linker): """Use the blocking rules specified in the linker's settings object to generate a SQL statement that will create pairwise record comparions according to the blocking rule(s). @@ -313,7 +354,7 @@ def block_using_rules(linker: Linker): else: blocking_rules = settings_obj._blocking_rules_to_generate_predictions - if settings_obj.salting_required and apply_salt == False: + if settings_obj.salting_required and apply_salt is False: logger.warning( "WARNING: Salting is not currently supported by this linker backend and" " will not be implemented for this run." @@ -334,44 +375,43 @@ def block_using_rules(linker: Linker): probability = "" sqls = [] - - all_blocking_rules = [] for br in blocking_rules: # Apply our salted rules to resolve skew issues. If no salt was # selected to be added, then apply the initial blocking rule. 
if apply_salt: - all_blocking_rules.extend(br.salted_blocking_rules) + salted_blocking_rules = br.salted_blocking_rules else: - all_blocking_rules.append(br.blocking_rule) - - for br in all_blocking_rules: - materialise_array_exploded_id_lookup(linker, br) - - for br in all_blocking_rules: - if not br.arrays_to_explode: - sql = f""" - select - {sql_select_expr} - , '{br.match_key}' as match_key - {probability} - from {linker._input_tablename_l} as l - inner join {linker._input_tablename_r} as r - on - ({br}) - {where_condition} - {br.and_not_preceding_rules_sql(linker)} - """ - else: - sql = f""" - select {sql_select_expr}, '{br.match_key}' as match_key + salted_blocking_rules = [br.blocking_rule] + + for salted_br in salted_blocking_rules: + if not br.arrays_to_explode: + sql = f""" + select + {sql_select_expr} + , '{br.match_key}' as match_key {probability} - from {ids_to_compare.physical_name} as pairs - left join {linker._input_tablename_l} as l - on pairs.{unique_id_col}_l=l.{unique_id_col} - left join {linker._input_tablename_r} as r - on pairs.{unique_id_col}_r=r.{unique_id_col} - """ - sqls.append(sql) + from {linker._input_tablename_l} as l + inner join {linker._input_tablename_r} as r + on + ({salted_br}) + {where_condition} + {br.and_not_preceding_rules_sql(linker)} + """ + else: + ids_to_compare = br.ids_to_use + unique_id_col = settings_obj._unique_id_column_name + sql = f""" + select + {sql_select_expr}, + '{br.match_key}' as match_key + {probability} + from {ids_to_compare.physical_name} as pairs + left join {linker._input_tablename_l} as l + on pairs.{unique_id_col}_l=l.{unique_id_col} + left join {linker._input_tablename_r} as r + on pairs.{unique_id_col}_r=r.{unique_id_col} + """ + sqls.append(sql) if ( linker._two_dataset_link_only diff --git a/splink/linker.py b/splink/linker.py index d6a293c320..3d12283cac 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -35,6 +35,7 @@ BlockingRule, block_using_rules_sql, blocking_rule_to_obj, + materialise_exploded_id_tables, ) from .cache_dict_with_logging import CacheDictWithLogging from .charts import ( @@ -1728,6 +1729,8 @@ def predict( if nodes_with_tf: input_dataframes.append(nodes_with_tf) + materialise_exploded_id_tables(self) + sql = block_using_rules_sql(self) self._enqueue_sql(sql, "__splink__df_blocked") From 217f633f14c5af68b86aaa06368c15a02ccce8a4 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 2 Nov 2023 11:38:12 +0000 Subject: [PATCH 11/37] move materialisation logic to separate function --- splink/blocking.py | 50 +++++++++----------------------------- splink/logging_messages.py | 4 +++ 2 files changed, 16 insertions(+), 38 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 8adbb42814..665aaf37ec 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -56,7 +56,6 @@ def __init__( self.salting_partitions = salting_partitions self.arrays_to_explode = arrays_to_explode self.ids_to_compare = [] - self.ids_to_use = None @property def sql_dialect(self): @@ -224,11 +223,6 @@ def _sql_gen_where_condition(link_type, unique_id_cols): def materialise_exploded_id_tables(linker: Linker): - if type(linker).__name__ in ["SparkLinker"]: - apply_salt = True - else: - apply_salt = False - settings_obj = linker._settings_obj link_type = settings_obj._link_type @@ -245,20 +239,12 @@ def materialise_exploded_id_tables(linker: Linker): blocking_rules = settings_obj._blocking_rules_to_generate_predictions - if settings_obj.salting_required and apply_salt is False: - logger.warning( - "WARNING: Salting 
is not currently supported by this linker backend and" - " will not be implemented for this run." - ) - for br in blocking_rules: # Apply our salted rules to resolve skew issues. If no salt was # selected to be added, then apply the initial blocking rule. - if apply_salt: - salted_blocking_rules = br.salted_blocking_rules - else: - salted_blocking_rules = [br.blocking_rule] + salted_blocking_rules = br.salted_blocking_rules + salt_counter = 0 for salted_br in salted_blocking_rules: if br.arrays_to_explode: try: @@ -291,11 +277,10 @@ def materialise_exploded_id_tables(linker: Linker): ) # ensure that table names are unique - if apply_salt: - to_hash = (salted_br + linker._cache_uid).encode("utf-8") - salt_id = "salt_id_" + hashlib.sha256(to_hash).hexdigest()[:9] - else: - salt_id = "" + + to_hash = (salted_br + linker._cache_uid).encode("utf-8") + salt_id = hashlib.sha256(to_hash).hexdigest()[:9] + salt_id = f"salt_{salt_counter}_{salt_id}" linker._enqueue_sql( f""" @@ -311,7 +296,7 @@ def materialise_exploded_id_tables(linker: Linker): ids_to_compare = linker._execute_sql_pipeline([input_dataframe]) br.ids_to_compare.append(ids_to_compare) - br.ids_to_use = ids_to_compare + salt_counter += 1 def block_using_rules_sql(linker: Linker): @@ -323,11 +308,6 @@ def block_using_rules_sql(linker: Linker): so that duplicate comparisons are not generated. """ - if type(linker).__name__ in ["SparkLinker"]: - apply_salt = True - else: - apply_salt = False - settings_obj = linker._settings_obj columns_to_select = settings_obj._columns_to_select_for_blocking @@ -354,12 +334,6 @@ def block_using_rules_sql(linker: Linker): else: blocking_rules = settings_obj._blocking_rules_to_generate_predictions - if settings_obj.salting_required and apply_salt is False: - logger.warning( - "WARNING: Salting is not currently supported by this linker backend and" - " will not be implemented for this run." - ) - # Cover the case where there are no blocking rules # This is a bit of a hack where if you do a self-join on 'true' # you create a cartesian product, rather than having separate code @@ -378,11 +352,10 @@ def block_using_rules_sql(linker: Linker): for br in blocking_rules: # Apply our salted rules to resolve skew issues. If no salt was # selected to be added, then apply the initial blocking rule. 
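For orientation, a rough sketch of what the salting mentioned in the comment above produces: a single blocking rule fans out into one SQL string per salt partition. The rule text and partition count below are illustrative stand-ins, not values taken from this patch.

# Hypothetical inputs - real values come from the blocking rule settings.
blocking_rule_sql = "l.first_name = r.first_name"
salting_partitions = 3

# Mirrors the salted-rule construction shown later in this patch series:
# each variant restricts the join to one partition of __splink_salt,
# which is how a skewed block gets split into smaller chunks of work.
salted_rules = [
    f"{blocking_rule_sql} and "
    f"ceiling(l.__splink_salt * {salting_partitions}) = {n + 1}"
    for n in range(salting_partitions)
]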
- if apply_salt: - salted_blocking_rules = br.salted_blocking_rules - else: - salted_blocking_rules = [br.blocking_rule] + salted_blocking_rules = br.salted_blocking_rules + + salt_counter = 0 for salted_br in salted_blocking_rules: if not br.arrays_to_explode: sql = f""" @@ -398,7 +371,7 @@ def block_using_rules_sql(linker: Linker): {br.and_not_preceding_rules_sql(linker)} """ else: - ids_to_compare = br.ids_to_use + ids_to_compare = br.ids_to_compare[salt_counter] unique_id_col = settings_obj._unique_id_column_name sql = f""" select @@ -412,6 +385,7 @@ def block_using_rules_sql(linker: Linker): on pairs.{unique_id_col}_r=r.{unique_id_col} """ sqls.append(sql) + salt_counter += 1 if ( linker._two_dataset_link_only diff --git a/splink/logging_messages.py b/splink/logging_messages.py index 0761f81a01..780005fc25 100644 --- a/splink/logging_messages.py +++ b/splink/logging_messages.py @@ -1,3 +1,6 @@ +import sqlglot + + def execute_sql_logging_message_info(templated_name, physical_name): return ( f"Executing sql to create " @@ -7,4 +10,5 @@ def execute_sql_logging_message_info(templated_name, physical_name): def log_sql(sql): + # sql = sql).sql(pretty=True) return "\n------Start SQL---------\n" f"{sql}\n" "-------End SQL-----------\n" From 43298b1ad55002e179de5ed1b67e236ca5e4c755 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Fri, 3 Nov 2023 14:02:32 +0000 Subject: [PATCH 12/37] rename for clarity --- splink/blocking.py | 51 +++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 665aaf37ec..0ac36279c9 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -2,7 +2,7 @@ import hashlib import logging -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, List from sqlglot import parse_one from sqlglot.expressions import Column, Join @@ -11,6 +11,7 @@ from .input_column import InputColumn from .misc import ensure_is_list from .unique_id_concat import _composite_unique_id_from_nodes_sql +from .splink_dataframe import SplinkDataFrame logger = logging.getLogger(__name__) @@ -45,7 +46,7 @@ def __init__( blocking_rule: BlockingRule | dict | str, salting_partitions=1, sqlglot_dialect: str = None, - arrays_to_explode: list = [], + array_columns_to_explode: list = [], ): if sqlglot_dialect: self._sql_dialect = sqlglot_dialect @@ -54,8 +55,8 @@ def __init__( self.preceding_rules = [] self.sqlglot_dialect = sqlglot_dialect self.salting_partitions = salting_partitions - self.arrays_to_explode = arrays_to_explode - self.ids_to_compare = [] + self.array_columns_to_explode: List[str] = array_columns_to_explode + self.exploded_id_pair_tables: List[SplinkDataFrame] = [] @property def sql_dialect(self): @@ -74,12 +75,20 @@ def add_preceding_rules(self, rules): rules = ensure_is_list(rules) self.preceding_rules = rules - def exclude_from_following_rules_sql(self, linker: Linker): + def exclude_pairs_generated_by_this_rule_sql(self, linker: Linker): + """A SQL string specifying how to exclude the results + of THIS blocking rule from subseqent blocking statements, + so that subsequent statements do not produce duplicate pairs + """ + unique_id_column = linker._settings_obj._unique_id_column_name - if self.ids_to_compare: + if self.exploded_id_pair_tables: ids_to_compare_sql = " union all ".join( - [f"select * from {ids.physical_name}" for ids in self.ids_to_compare] + [ + f"select * from {ids.physical_name}" + for ids in self.exploded_id_pair_tables + ] ) # self.ids_to_compare[0].physical_name 
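To make the exclusion logic above concrete, a rough sketch of the SQL it assembles when materialised id-pair tables exist. The physical table names and the unique id column below are hypothetical stand-ins for whatever the linker creates at runtime.

# Hypothetical materialised pair tables and id column name.
exploded_id_pair_tables = ["__splink__ids_pairs_a", "__splink__ids_pairs_b"]
unique_id_column = "unique_id"

ids_to_compare_sql = " union all ".join(
    f"select * from {name}" for name in exploded_id_pair_tables
)

# The EXISTS clause lets later blocking rules skip any (l, r) pair that one of
# these tables has already generated, so duplicate comparisons are avoided.
exclusion_sql = f"""EXISTS (
    select 1 from ({ids_to_compare_sql}) as ids_to_compare
    where (
        l.{unique_id_column} = ids_to_compare.{unique_id_column}_l and
        r.{unique_id_column} = ids_to_compare.{unique_id_column}_r
    )
)
"""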
@@ -97,11 +106,15 @@ def exclude_from_following_rules_sql(self, linker: Linker): # meaning these comparisons get lost return f"coalesce(({self.blocking_rule}),false)" - def and_not_preceding_rules_sql(self, linker: Linker): + def exclude_pairs_generated_by_all_preceding_rules_sql(self, linker: Linker): + """A SQL string that excludes the results of ALL previous blocking rules from + the pairwise comparisons generated. + """ if not self.preceding_rules: return "" or_clauses = [ - br.exclude_from_following_rules_sql(linker) for br in self.preceding_rules + br.exclude_pairs_generated_by_this_rule_sql(linker) + for br in self.preceding_rules ] previous_rules = " OR ".join(or_clauses) return f"AND NOT ({previous_rules})" @@ -176,8 +189,8 @@ def as_dict(self): if self.salting_partitions > 1 and self.sql_dialect == "spark": output["salting_partitions"] = self.salting_partitions - if self.arrays_to_explode: - output["arrays_to_explode"] = self.arrays_to_explode + if self.array_columns_to_explode: + output["arrays_to_explode"] = self.array_columns_to_explode return output @@ -246,7 +259,7 @@ def materialise_exploded_id_tables(linker: Linker): salted_blocking_rules = br.salted_blocking_rules salt_counter = 0 for salted_br in salted_blocking_rules: - if br.arrays_to_explode: + if br.array_columns_to_explode: try: input_dataframe = linker._intermediate_table_cache[ "__splink__df_concat_with_tf" @@ -256,11 +269,11 @@ def materialise_exploded_id_tables(linker: Linker): input_colnames = {col.name() for col in input_dataframe.columns} arrays_to_explode_quoted = [ InputColumn(colname, sql_dialect=linker._sql_dialect).quote().name() - for colname in br.arrays_to_explode + for colname in br.array_columns_to_explode ] expl_sql = linker._gen_explode_sql( "__splink__df_concat_with_tf", - br.arrays_to_explode, + br.array_columns_to_explode, list(input_colnames.difference(arrays_to_explode_quoted)), ) @@ -290,12 +303,12 @@ def materialise_exploded_id_tables(linker: Linker): from __splink__df_concat_with_tf_unnested as l inner join __splink__df_concat_with_tf_unnested as r on ({salted_br}) - {where_condition} {br.and_not_preceding_rules_sql(linker)}""", + {where_condition} {br.exclude_pairs_generated_by_all_preceding_rules_sql(linker)}""", f"ids_to_compare_blocking_rule_{br.match_key}{salt_id}", ) ids_to_compare = linker._execute_sql_pipeline([input_dataframe]) - br.ids_to_compare.append(ids_to_compare) + br.exploded_id_pair_tables.append(ids_to_compare) salt_counter += 1 @@ -357,7 +370,7 @@ def block_using_rules_sql(linker: Linker): salt_counter = 0 for salted_br in salted_blocking_rules: - if not br.arrays_to_explode: + if not br.array_columns_to_explode: sql = f""" select {sql_select_expr} @@ -368,10 +381,10 @@ def block_using_rules_sql(linker: Linker): on ({salted_br}) {where_condition} - {br.and_not_preceding_rules_sql(linker)} + {br.exclude_pairs_generated_by_all_preceding_rules_sql(linker)} """ else: - ids_to_compare = br.ids_to_compare[salt_counter] + ids_to_compare = br.exploded_id_pair_tables[salt_counter] unique_id_col = settings_obj._unique_id_column_name sql = f""" select From 90fadb5a520c2e7590d5e6b871ab71f457fac1bb Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Fri, 3 Nov 2023 14:18:18 +0000 Subject: [PATCH 13/37] improve clairty of names --- splink/analyse_blocking.py | 2 +- splink/blocking.py | 35 ++++++++++--------- splink/blocking_rule_composition.py | 6 ++-- splink/em_training_session.py | 12 +++---- splink/linker.py | 4 +-- splink/settings.py | 2 +- 
.../settings_validation/settings_validator.py | 2 +- tests/test_blocking.py | 2 +- 8 files changed, 34 insertions(+), 31 deletions(-) diff --git a/splink/analyse_blocking.py b/splink/analyse_blocking.py index 201ec6e9c8..be19887180 100644 --- a/splink/analyse_blocking.py +++ b/splink/analyse_blocking.py @@ -117,7 +117,7 @@ def cumulative_comparisons_generated_by_blocking_rules( for row, br in zip(br_count, brs_as_objs): out_dict = { "row_count": row, - "rule": br.blocking_rule, + "rule": br.blocking_rule_sql, } if output_chart: cumulative_sum += row diff --git a/splink/blocking.py b/splink/blocking.py index 0ac36279c9..fb72de2f23 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -51,10 +51,10 @@ def __init__( if sqlglot_dialect: self._sql_dialect = sqlglot_dialect - self.blocking_rule = blocking_rule + self.blocking_rule_sql = blocking_rule self.preceding_rules = [] self.sqlglot_dialect = sqlglot_dialect - self.salting_partitions = salting_partitions + self.salting_partitions: int = salting_partitions self.array_columns_to_explode: List[str] = array_columns_to_explode self.exploded_id_pair_tables: List[SplinkDataFrame] = [] @@ -69,7 +69,7 @@ def match_key(self): @property def sql(self): # Wrapper to reveal the underlying SQL - return self.blocking_rule + return self.blocking_rule_sql def add_preceding_rules(self, rules): rules = ensure_is_list(rules) @@ -104,7 +104,7 @@ def exclude_pairs_generated_by_this_rule_sql(self, linker: Linker): # Note the coalesce function is important here - otherwise # you filter out any records with nulls in the previous rules # meaning these comparisons get lost - return f"coalesce(({self.blocking_rule}),false)" + return f"coalesce(({self.blocking_rule_sql}),false)" def exclude_pairs_generated_by_all_preceding_rules_sql(self, linker: Linker): """A SQL string that excludes the results of ALL previous blocking rules from @@ -120,16 +120,22 @@ def exclude_pairs_generated_by_all_preceding_rules_sql(self, linker: Linker): return f"AND NOT ({previous_rules})" @property - def salted_blocking_rules(self): + def salted_blocking_rules_as_sql_strings(self) -> List[str]: + """A list of sql strings""" + if self.salting_partitions == 1: - yield self.blocking_rule + yield self.blocking_rule_sql else: for n in range(self.salting_partitions): - yield f"{self.blocking_rule} and ceiling(l.__splink_salt * {self.salting_partitions}) = {n+1}" # noqa: E501 + yield ( + f"{self.blocking_rule_sql} and " + f"ceiling(l.__splink_salt * {self.salting_partitions}) " + f"= {n+1}" + ) @property def _parsed_join_condition(self): - br = self.blocking_rule + br = self.blocking_rule_sql return parse_one("INNER JOIN r", into=Join).on( br, dialect=self.sqlglot_dialect ) # using sqlglot==11.4.1 @@ -183,7 +189,7 @@ def as_dict(self): "The minimal representation of the blocking rule" output = {} - output["blocking_rule"] = self.blocking_rule + output["blocking_rule"] = self.blocking_rule_sql output["sql_dialect"] = self.sql_dialect if self.salting_partitions > 1 and self.sql_dialect == "spark": @@ -196,7 +202,7 @@ def as_dict(self): def _as_completed_dict(self): if not self.salting_partitions > 1 and self.sql_dialect == "spark": - return self.blocking_rule + return self.blocking_rule_sql else: return self.as_dict() @@ -205,7 +211,7 @@ def descr(self): return "Custom" if not hasattr(self, "_description") else self._description def _abbreviated_sql(self, cutoff=75): - sql = self.blocking_rule + sql = self.blocking_rule_sql return (sql[:cutoff] + "...") if len(sql) > cutoff else sql def 
__repr__(self): @@ -253,10 +259,7 @@ def materialise_exploded_id_tables(linker: Linker): blocking_rules = settings_obj._blocking_rules_to_generate_predictions for br in blocking_rules: - # Apply our salted rules to resolve skew issues. If no salt was - # selected to be added, then apply the initial blocking rule. - - salted_blocking_rules = br.salted_blocking_rules + salted_blocking_rules = br.salted_blocking_rules_as_sql_strings salt_counter = 0 for salted_br in salted_blocking_rules: if br.array_columns_to_explode: @@ -366,7 +369,7 @@ def block_using_rules_sql(linker: Linker): # Apply our salted rules to resolve skew issues. If no salt was # selected to be added, then apply the initial blocking rule. - salted_blocking_rules = br.salted_blocking_rules + salted_blocking_rules = br.salted_blocking_rules_as_sql_strings salt_counter = 0 for salted_br in salted_blocking_rules: diff --git a/splink/blocking_rule_composition.py b/splink/blocking_rule_composition.py index e6c4ce38f4..29cb01d9f6 100644 --- a/splink/blocking_rule_composition.py +++ b/splink/blocking_rule_composition.py @@ -295,7 +295,7 @@ def not_(*brls: BlockingRule | dict | str, salting_partitions: int = 1) -> Block brls, sql_dialect, salt = _parse_blocking_rules(*brls) br = brls[0] - blocking_rule = f"NOT ({br.blocking_rule})" + blocking_rule = f"NOT ({br.blocking_rule_sql})" return BlockingRule( blocking_rule, @@ -314,9 +314,9 @@ def _br_merge( brs, sql_dialect, salt = _parse_blocking_rules(*brls) if len(brs) > 1: - conditions = (f"({br.blocking_rule})" for br in brs) + conditions = (f"({br.blocking_rule_sql})" for br in brs) else: - conditions = (br.blocking_rule for br in brs) + conditions = (br.blocking_rule_sql for br in brs) blocking_rule = f" {clause} ".join(conditions) diff --git a/splink/em_training_session.py b/splink/em_training_session.py index 3f6df5244c..b8a43d7b23 100644 --- a/splink/em_training_session.py +++ b/splink/em_training_session.py @@ -135,7 +135,7 @@ def _training_log_message(self): else: mu = "m and u probabilities" - blocking_rule = self._blocking_rule_for_training.blocking_rule + blocking_rule = self._blocking_rule_for_training.blocking_rule_sql logger.info( f"Estimating the {mu} of the model by blocking on:\n" @@ -175,7 +175,7 @@ def _train(self): # check that the blocking rule actually generates _some_ record pairs, # if not give the user a helpful message if not cvv.as_record_dict(limit=1): - br_sql = f"`{self._blocking_rule_for_training.blocking_rule}`" + br_sql = f"`{self._blocking_rule_for_training.blocking_rule_sql}`" raise EMTrainingException( f"Training rule {br_sql} resulted in no record pairs. 
" "This means that in the supplied data set " @@ -194,7 +194,7 @@ def _train(self): # in the original (main) setting object expectation_maximisation(self, cvv) - rule = self._blocking_rule_for_training.blocking_rule + rule = self._blocking_rule_for_training.blocking_rule_sql training_desc = f"EM, blocked on: {rule}" # Add m and u values to original settings @@ -253,7 +253,7 @@ def _blocking_adjusted_probability_two_random_records_match(self): comp_levels = self._comparison_levels_to_reverse_blocking_rule if not comp_levels: comp_levels = self._original_settings_obj._get_comparison_levels_corresponding_to_training_blocking_rule( # noqa - self._blocking_rule_for_training.blocking_rule + self._blocking_rule_for_training.blocking_rule_sql ) for cl in comp_levels: @@ -270,7 +270,7 @@ def _blocking_adjusted_probability_two_random_records_match(self): logger.log( 15, f"\nProb two random records match adjusted for blocking on " - f"{self._blocking_rule_for_training.blocking_rule}: " + f"{self._blocking_rule_for_training.blocking_rule_sql}: " f"{adjusted_prop_m:.3f}", ) return adjusted_prop_m @@ -410,7 +410,7 @@ def __repr__(self): for cc in self._comparisons_that_cannot_be_estimated ] ) - blocking_rule = self._blocking_rule_for_training.blocking_rule + blocking_rule = self._blocking_rule_for_training.blocking_rule_sql return ( f"" diff --git a/splink/linker.py b/splink/linker.py index 3d12283cac..c0160ef1ff 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -1630,7 +1630,7 @@ def estimate_parameters_using_expectation_maximisation( self._initialise_df_concat_with_tf() # Extract the blocking rule - blocking_rule = blocking_rule_to_obj(blocking_rule).blocking_rule + blocking_rule = blocking_rule_to_obj(blocking_rule).blocking_rule_sql if comparisons_to_deactivate: # If user provided a string, convert to Comparison object @@ -3099,7 +3099,7 @@ def count_num_comparisons_from_blocking_rule( int: The number of comparisons generated by the blocking rule """ - blocking_rule = blocking_rule_to_obj(blocking_rule).blocking_rule + blocking_rule = blocking_rule_to_obj(blocking_rule).blocking_rule_sql sql = vertically_concatenate_sql(self) self._enqueue_sql(sql, "__splink__df_concat") diff --git a/splink/settings.py b/splink/settings.py index 79c53b45f3..acb5f28995 100644 --- a/splink/settings.py +++ b/splink/settings.py @@ -125,7 +125,7 @@ def _get_additional_columns_to_retain(self): used_by_brs = [] for br in self._blocking_rules_to_generate_predictions: used_by_brs.extend( - get_columns_used_from_sql(br.blocking_rule, br.sql_dialect) + get_columns_used_from_sql(br.blocking_rule_sql, br.sql_dialect) ) used_by_brs = [InputColumn(c) for c in used_by_brs] diff --git a/splink/settings_validation/settings_validator.py b/splink/settings_validation/settings_validator.py index 06e24acfb9..a4b84743f8 100644 --- a/splink/settings_validation/settings_validator.py +++ b/splink/settings_validation/settings_validator.py @@ -51,7 +51,7 @@ def uid(self): @property def blocking_rules(self): brs = self.settings_obj._blocking_rules_to_generate_predictions - return [br.blocking_rule for br in brs] + return [br.blocking_rule_sql for br in brs] @property def comparisons(self): diff --git a/tests/test_blocking.py b/tests/test_blocking.py index 0eb2113cc5..fd01645275 100644 --- a/tests/test_blocking.py +++ b/tests/test_blocking.py @@ -40,7 +40,7 @@ def test_binary_composition_internals_OR(test_helpers, dialect): brl.exact_match_rule("help4"), ] brs_as_objs = settings_tester._brs_as_objs(brs_as_strings) - brs_as_txt = 
[blocking_rule_to_obj(br).blocking_rule for br in brs_as_strings] + brs_as_txt = [blocking_rule_to_obj(br).blocking_rule_sql for br in brs_as_strings] assert brs_as_objs[0].preceding_rules == [] From efb72c3efad1edfd3f8b1fc93e0acd03e0f3964d Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Fri, 3 Nov 2023 14:40:35 +0000 Subject: [PATCH 14/37] pushing logic into blockingrule class --- splink/blocking.py | 67 ++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index fb72de2f23..4356b47d85 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -185,6 +185,39 @@ def _filter_conditions(self): else: return filter_condition.sql(self.sqlglot_dialect) + def exploded_id_pair_table_sql(self, linker: Linker, salted_br): + settings_obj = linker._settings_obj + unique_id_col = settings_obj._unique_id_column_name + + link_type = settings_obj._link_type + + if linker._two_dataset_link_only: + link_type = "two_dataset_link_only" + + if linker._self_link_mode: + link_type = "self_link" + + where_condition = _sql_gen_where_condition( + link_type, settings_obj._unique_id_input_columns + ) + + if link_type == "two_dataset_link_only": + where_condition = ( + where_condition + " and l.source_dataset < r.source_dataset" + ) + + sql = f""" + select distinct + l.{unique_id_col} as {unique_id_col}_l, + r.{unique_id_col} as {unique_id_col}_r + from __splink__df_concat_with_tf_unnested as l + inner join __splink__df_concat_with_tf_unnested as r + on ({salted_br}) + {where_condition} + {self.exclude_pairs_generated_by_all_preceding_rules_sql(linker)}""" + + return sql + def as_dict(self): "The minimal representation of the blocking rule" output = {} @@ -244,24 +277,11 @@ def _sql_gen_where_condition(link_type, unique_id_cols): def materialise_exploded_id_tables(linker: Linker): settings_obj = linker._settings_obj - link_type = settings_obj._link_type - - if linker._two_dataset_link_only: - link_type = "two_dataset_link_only" - - if linker._self_link_mode: - link_type = "self_link" - - where_condition = _sql_gen_where_condition( - link_type, settings_obj._unique_id_input_columns - ) - blocking_rules = settings_obj._blocking_rules_to_generate_predictions for br in blocking_rules: - salted_blocking_rules = br.salted_blocking_rules_as_sql_strings salt_counter = 0 - for salted_br in salted_blocking_rules: + for salted_br in br.salted_blocking_rules_as_sql_strings: if br.array_columns_to_explode: try: input_dataframe = linker._intermediate_table_cache[ @@ -269,6 +289,7 @@ def materialise_exploded_id_tables(linker: Linker): ] except KeyError: input_dataframe = linker._initialise_df_concat_with_tf() + input_colnames = {col.name() for col in input_dataframe.columns} arrays_to_explode_quoted = [ InputColumn(colname, sql_dialect=linker._sql_dialect).quote().name() @@ -285,28 +306,16 @@ def materialise_exploded_id_tables(linker: Linker): "__splink__df_concat_with_tf_unnested", ) - unique_id_col = settings_obj._unique_id_column_name - - if link_type == "two_dataset_link_only": - where_condition = ( - where_condition + " and l.source_dataset < r.source_dataset" - ) - # ensure that table names are unique to_hash = (salted_br + linker._cache_uid).encode("utf-8") salt_id = hashlib.sha256(to_hash).hexdigest()[:9] salt_id = f"salt_{salt_counter}_{salt_id}" + sql = br.exploded_id_pair_table_sql(linker, salted_br) + linker._enqueue_sql( - f""" - select distinct - l.{unique_id_col} as {unique_id_col}_l, - r.{unique_id_col} as 
{unique_id_col}_r - from __splink__df_concat_with_tf_unnested as l - inner join __splink__df_concat_with_tf_unnested as r - on ({salted_br}) - {where_condition} {br.exclude_pairs_generated_by_all_preceding_rules_sql(linker)}""", + sql, f"ids_to_compare_blocking_rule_{br.match_key}{salt_id}", ) From 4caa84709ef832e54bc4476ef07a6c3c0342548e Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 6 Nov 2023 06:57:30 +0000 Subject: [PATCH 15/37] better names --- splink/blocking.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 4356b47d85..79d8232239 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -185,7 +185,11 @@ def _filter_conditions(self): else: return filter_condition.sql(self.sqlglot_dialect) - def exploded_id_pair_table_sql(self, linker: Linker, salted_br): + def marginal_exploded_id_pairs_table_sql(self, linker: Linker, salted_br): + """generates a table of the marginal id pairs from the exploded blocking rule + i.e. pairs are only created that match this blocking rule and NOT any of + the preceding blocking rules + """ settings_obj = linker._settings_obj unique_id_col = settings_obj._unique_id_column_name @@ -306,21 +310,18 @@ def materialise_exploded_id_tables(linker: Linker): "__splink__df_concat_with_tf_unnested", ) - # ensure that table names are unique + if br.salting_partitions > 1: + salt_id = f"{salt_counter}" - to_hash = (salted_br + linker._cache_uid).encode("utf-8") - salt_id = hashlib.sha256(to_hash).hexdigest()[:9] - salt_id = f"salt_{salt_counter}_{salt_id}" + base_name = "__splink__marginal_exploded_ids_blocking_rule" + table_name = f"{base_name}_{br.match_key}_salt_{salt_id}" - sql = br.exploded_id_pair_table_sql(linker, salted_br) + sql = br.marginal_exploded_id_pairs_table_sql(linker, salted_br) - linker._enqueue_sql( - sql, - f"ids_to_compare_blocking_rule_{br.match_key}{salt_id}", - ) + linker._enqueue_sql(sql, table_name) - ids_to_compare = linker._execute_sql_pipeline([input_dataframe]) - br.exploded_id_pair_tables.append(ids_to_compare) + marginal_ids_table = linker._execute_sql_pipeline([input_dataframe]) + br.exploded_id_pair_tables.append(marginal_ids_table) salt_counter += 1 From 4c40ffd2c66dda6c873edc5850377cd243f6d423 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 6 Nov 2023 08:23:19 +0000 Subject: [PATCH 16/37] remove materialised tables after use --- splink/blocking.py | 16 ++++++++-------- splink/linker.py | 4 ++++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 79d8232239..4237fdd5c5 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -222,6 +222,10 @@ def marginal_exploded_id_pairs_table_sql(self, linker: Linker, salted_br): return sql + def drop_materialised_id_pairs_dataframes(self): + for df in self.exploded_id_pair_tables: + df.drop_table_from_database_and_remove_from_cache() + def as_dict(self): "The minimal representation of the blocking rule" output = {} @@ -287,12 +291,7 @@ def materialise_exploded_id_tables(linker: Linker): salt_counter = 0 for salted_br in br.salted_blocking_rules_as_sql_strings: if br.array_columns_to_explode: - try: - input_dataframe = linker._intermediate_table_cache[ - "__splink__df_concat_with_tf" - ] - except KeyError: - input_dataframe = linker._initialise_df_concat_with_tf() + input_dataframe = linker._initialise_df_concat_with_tf() input_colnames = {col.name() for col in input_dataframe.columns} arrays_to_explode_quoted = [ @@ -310,11 
+309,12 @@ def materialise_exploded_id_tables(linker: Linker): "__splink__df_concat_with_tf_unnested", ) + salt_name = "" if br.salting_partitions > 1: - salt_id = f"{salt_counter}" + salt_name = f"_salt_{salt_counter}" base_name = "__splink__marginal_exploded_ids_blocking_rule" - table_name = f"{base_name}_{br.match_key}_salt_{salt_id}" + table_name = f"{base_name}_mk_{br.match_key}{salt_name}" sql = br.marginal_exploded_id_pairs_table_sql(linker, salted_br) diff --git a/splink/linker.py b/splink/linker.py index c0160ef1ff..debc6f50b5 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -1754,6 +1754,10 @@ def predict( self._enqueue_sql(sql["sql"], sql["output_table_name"]) predictions = self._execute_sql_pipeline(input_dataframes) + + for br in self._settings_obj._blocking_rules_to_generate_predictions: + br.drop_materialised_id_pairs_dataframes() + self._predict_warning() return predictions From 836ea3604a1aa6b4e7b32184193e7ec5e7f30e33 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 6 Nov 2023 10:30:18 +0000 Subject: [PATCH 17/37] is salted --- splink/blocking.py | 178 ++++++++++++++++++++++++++------------------- 1 file changed, 104 insertions(+), 74 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 4237fdd5c5..5d272a87a8 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -66,6 +66,10 @@ def sql_dialect(self): def match_key(self): return len(self.preceding_rules) + @property + def is_salted(self): + return self.salting_partitions > 1 + @property def sql(self): # Wrapper to reveal the underlying SQL @@ -120,18 +124,27 @@ def exclude_pairs_generated_by_all_preceding_rules_sql(self, linker: Linker): return f"AND NOT ({previous_rules})" @property - def salted_blocking_rules_as_sql_strings(self) -> List[str]: + def salted_blocking_rules(self) -> List[SaltedBlockingRuleSegment]: """A list of sql strings""" - if self.salting_partitions == 1: - yield self.blocking_rule_sql - else: - for n in range(self.salting_partitions): - yield ( + for n in range(self.salting_partitions): + if self.is_salted: + rule_sql = ( f"{self.blocking_rule_sql} and " f"ceiling(l.__splink_salt * {self.salting_partitions}) " f"= {n+1}" ) + else: + rule_sql = self.blocking_rule_sql + + br = SaltedBlockingRuleSegment(self, rule_sql, n) + + try: # If it has a materialised id pairs table, grab it + br.exploded_id_pair_table = self.exploded_id_pair_tables[n] + except IndexError: + pass + + yield br @property def _parsed_join_condition(self): @@ -185,7 +198,9 @@ def _filter_conditions(self): else: return filter_condition.sql(self.sqlglot_dialect) - def marginal_exploded_id_pairs_table_sql(self, linker: Linker, salted_br): + def marginal_exploded_id_pairs_table_sql( + self, linker: Linker, salted_br: BlockingRule + ): """generates a table of the marginal id pairs from the exploded blocking rule i.e. 
pairs are only created that match this blocking rule and NOT any of the preceding blocking rules @@ -216,7 +231,7 @@ def marginal_exploded_id_pairs_table_sql(self, linker: Linker, salted_br): r.{unique_id_col} as {unique_id_col}_r from __splink__df_concat_with_tf_unnested as l inner join __splink__df_concat_with_tf_unnested as r - on ({salted_br}) + on ({salted_br.blocking_rule_sql}) {where_condition} {self.exclude_pairs_generated_by_all_preceding_rules_sql(linker)}""" @@ -264,6 +279,24 @@ def _human_readable_succinct(self): return f"{self.descr} blocking rule using SQL: {sql}" +class SaltedBlockingRuleSegment: + def __init__( + self, + parent_blocking_rule: BlockingRule, + blocking_rule_sql: str, + salt: int = None, + exploded_id_pairs_table: SplinkDataFrame = None, + ): + self.parent_blocking_rule = parent_blocking_rule + self.blocking_rule_sql = blocking_rule_sql + self.salt = salt + self.exploded_id_pairs_tables = exploded_id_pairs_table + + @property + def is_salted(self): + return self.parent_blocking_rule.is_salted + + def _sql_gen_where_condition(link_type, unique_id_cols): id_expr_l = _composite_unique_id_from_nodes_sql(unique_id_cols, "l") id_expr_r = _composite_unique_id_from_nodes_sql(unique_id_cols, "r") @@ -286,43 +319,44 @@ def materialise_exploded_id_tables(linker: Linker): settings_obj = linker._settings_obj blocking_rules = settings_obj._blocking_rules_to_generate_predictions + salted_blocking_rules = ( + salted_br for br in blocking_rules for salted_br in br.salted_blocking_rules + ) - for br in blocking_rules: - salt_counter = 0 - for salted_br in br.salted_blocking_rules_as_sql_strings: - if br.array_columns_to_explode: - input_dataframe = linker._initialise_df_concat_with_tf() - - input_colnames = {col.name() for col in input_dataframe.columns} - arrays_to_explode_quoted = [ - InputColumn(colname, sql_dialect=linker._sql_dialect).quote().name() - for colname in br.array_columns_to_explode - ] - expl_sql = linker._gen_explode_sql( - "__splink__df_concat_with_tf", - br.array_columns_to_explode, - list(input_colnames.difference(arrays_to_explode_quoted)), - ) + for salted_br in salted_blocking_rules: + parent_br = salted_br.parent_blocking_rule + if parent_br.array_columns_to_explode: + input_dataframe = linker._initialise_df_concat_with_tf() + + input_colnames = {col.name() for col in input_dataframe.columns} + arrays_to_explode_quoted = [ + InputColumn(colname, sql_dialect=linker._sql_dialect).quote().name() + for colname in parent_br.array_columns_to_explode + ] + expl_sql = linker._gen_explode_sql( + "__splink__df_concat_with_tf", + parent_br.array_columns_to_explode, + list(input_colnames.difference(arrays_to_explode_quoted)), + ) - linker._enqueue_sql( - expl_sql, - "__splink__df_concat_with_tf_unnested", - ) + linker._enqueue_sql( + expl_sql, + "__splink__df_concat_with_tf_unnested", + ) - salt_name = "" - if br.salting_partitions > 1: - salt_name = f"_salt_{salt_counter}" + salt_name = "" + if salted_br.is_salted: + salt_name = f"_salt_{salted_br.salt}" - base_name = "__splink__marginal_exploded_ids_blocking_rule" - table_name = f"{base_name}_mk_{br.match_key}{salt_name}" + base_name = "__splink__marginal_exploded_ids_blocking_rule" + table_name = f"{base_name}_mk_{parent_br.match_key}{salt_name}" - sql = br.marginal_exploded_id_pairs_table_sql(linker, salted_br) + sql = parent_br.marginal_exploded_id_pairs_table_sql(linker, salted_br) - linker._enqueue_sql(sql, table_name) + linker._enqueue_sql(sql, table_name) - marginal_ids_table = 
linker._execute_sql_pipeline([input_dataframe]) - br.exploded_id_pair_tables.append(marginal_ids_table) - salt_counter += 1 + marginal_ids_table = linker._execute_sql_pipeline([input_dataframe]) + parent_br.exploded_id_pair_tables.append(marginal_ids_table) def block_using_rules_sql(linker: Linker): @@ -375,43 +409,39 @@ def block_using_rules_sql(linker: Linker): probability = "" sqls = [] - for br in blocking_rules: - # Apply our salted rules to resolve skew issues. If no salt was - # selected to be added, then apply the initial blocking rule. - - salted_blocking_rules = br.salted_blocking_rules_as_sql_strings - - salt_counter = 0 - for salted_br in salted_blocking_rules: - if not br.array_columns_to_explode: - sql = f""" + salted_blocking_rules = ( + salted_br for br in blocking_rules for salted_br in br.salted_blocking_rules + ) + for salted_br in salted_blocking_rules: + parent_br = salted_br.parent_blocking_rule + if not parent_br.array_columns_to_explode: + sql = f""" + select + {sql_select_expr} + , '{parent_br.match_key}' as match_key + {probability} + from {linker._input_tablename_l} as l + inner join {linker._input_tablename_r} as r + on + ({salted_br.blocking_rule_sql}) + {where_condition} + {parent_br.exclude_pairs_generated_by_all_preceding_rules_sql(linker)} + """ + else: + ids_to_compare = salted_br.exploded_id_pair_table + unique_id_col = settings_obj._unique_id_column_name + sql = f""" select - {sql_select_expr} - , '{br.match_key}' as match_key - {probability} - from {linker._input_tablename_l} as l - inner join {linker._input_tablename_r} as r - on - ({salted_br}) - {where_condition} - {br.exclude_pairs_generated_by_all_preceding_rules_sql(linker)} - """ - else: - ids_to_compare = br.exploded_id_pair_tables[salt_counter] - unique_id_col = settings_obj._unique_id_column_name - sql = f""" - select - {sql_select_expr}, - '{br.match_key}' as match_key - {probability} - from {ids_to_compare.physical_name} as pairs - left join {linker._input_tablename_l} as l - on pairs.{unique_id_col}_l=l.{unique_id_col} - left join {linker._input_tablename_r} as r - on pairs.{unique_id_col}_r=r.{unique_id_col} - """ - sqls.append(sql) - salt_counter += 1 + {sql_select_expr}, + '{parent_br.match_key}' as match_key + {probability} + from {ids_to_compare.physical_name} as pairs + left join {linker._input_tablename_l} as l + on pairs.{unique_id_col}_l=l.{unique_id_col} + left join {linker._input_tablename_r} as r + on pairs.{unique_id_col}_r=r.{unique_id_col} + """ + sqls.append(sql) if ( linker._two_dataset_link_only From fa096cd6d34d79719427e6f999cd168f39ea31d2 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 6 Nov 2023 14:58:44 +0000 Subject: [PATCH 18/37] fix merge --- splink/blocking.py | 40 +++++----------------------------------- 1 file changed, 5 insertions(+), 35 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index c9daac00ef..082e1c5314 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -359,8 +359,6 @@ def materialise_exploded_id_tables(linker: Linker): parent_br.exploded_id_pair_tables.append(marginal_ids_table) -def block_using_rules_sql(linker: Linker): -# flake8: noqa: C901 def block_using_rules_sqls(linker: Linker): """Use the blocking rules specified in the linker's settings object to generate a SQL statement that will create pairwise record comparions @@ -463,7 +461,7 @@ def block_using_rules_sqls(linker: Linker): else: probability = "" - sqls = [] + br_sqls = [] salted_blocking_rules = ( salted_br for br in blocking_rules for salted_br in 
br.salted_blocking_rules ) @@ -496,37 +494,9 @@ def block_using_rules_sqls(linker: Linker): left join {linker._input_tablename_r} as r on pairs.{unique_id_col}_r=r.{unique_id_col} """ - sqls.append(sql) - - if ( - linker._two_dataset_link_only - and not linker._find_new_matches_mode - and not linker._compare_two_records_mode - ): - source_dataset_col = linker._source_dataset_column_name - # Need df_l to be the one with the lowest id to preeserve the property - # that the left dataset is the one with the lowest concatenated id - keys = linker._input_tables_dict.keys() - keys = list(sorted(keys)) - df_l = linker._input_tables_dict[keys[0]] - df_r = linker._input_tables_dict[keys[1]] - - if linker._train_u_using_random_sample_mode: - sample_switch = "_sample" - else: - sample_switch = "" - - sql = f""" - select * from __splink__df_concat_with_tf{sample_switch} - where {source_dataset_col} = '{df_l.templated_name}' - """ - linker._enqueue_sql(sql, f"__splink__df_concat_with_tf{sample_switch}_left") + br_sqls.append(sql) - sql = f""" - select * from __splink__df_concat_with_tf{sample_switch} - where {source_dataset_col} = '{df_r.templated_name}' - """ - linker._enqueue_sql(sql, f"__splink__df_concat_with_tf{sample_switch}_right") + sql = "union all".join(br_sqls) - sql = "union all".join(sqls) - return sql + sqls.append({"sql": sql, "output_table_name": "__splink__df_blocked"}) + return sqls From 8621d2d74bdca5bdce0a93321ded5525ecea4c0d Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 6 Nov 2023 15:26:01 +0000 Subject: [PATCH 19/37] exploding blocking rule class --- splink/blocking.py | 212 +++++++++++++++++++++++++-------------------- splink/linker.py | 4 +- 2 files changed, 121 insertions(+), 95 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 082e1c5314..f858f4623d 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -31,9 +31,12 @@ def blocking_rule_to_obj(br): salting_partitions = br.get("salting_partitions", 1) arrays_to_explode = br.get("arrays_to_explode", list()) - return BlockingRule( - blocking_rule, salting_partitions, sqlglot_dialect, arrays_to_explode - ) + if arrays_to_explode: + return ExplodingBlockingRule( + blocking_rule, salting_partitions, sqlglot_dialect, arrays_to_explode + ) + + return BlockingRule(blocking_rule, salting_partitions, sqlglot_dialect) else: br = BlockingRule(br) @@ -55,8 +58,6 @@ def __init__( self.preceding_rules = [] self.sqlglot_dialect = sqlglot_dialect self.salting_partitions: int = salting_partitions - self.array_columns_to_explode: List[str] = array_columns_to_explode - self.exploded_id_pair_tables: List[SplinkDataFrame] = [] @property def sql_dialect(self): @@ -85,30 +86,10 @@ def exclude_pairs_generated_by_this_rule_sql(self, linker: Linker): so that subsequent statements do not produce duplicate pairs """ - unique_id_column = linker._settings_obj._unique_id_column_name - - if self.exploded_id_pair_tables: - ids_to_compare_sql = " union all ".join( - [ - f"select * from {ids.physical_name}" - for ids in self.exploded_id_pair_tables - ] - ) - # self.ids_to_compare[0].physical_name - - return f"""EXISTS ( - select 1 from ({ids_to_compare_sql}) as ids_to_compare - where ( - l.{unique_id_column} = ids_to_compare.{unique_id_column}_l and - r.{unique_id_column} = ids_to_compare.{unique_id_column}_r - ) - ) - """ - else: - # Note the coalesce function is important here - otherwise - # you filter out any records with nulls in the previous rules - # meaning these comparisons get lost - return 
f"coalesce(({self.blocking_rule_sql}),false)" + # Note the coalesce function is important here - otherwise + # you filter out any records with nulls in the previous rules + # meaning these comparisons get lost + return f"coalesce(({self.blocking_rule_sql}),false)" def exclude_pairs_generated_by_all_preceding_rules_sql(self, linker: Linker): """A SQL string that excludes the results of ALL previous blocking rules from @@ -139,10 +120,13 @@ def salted_blocking_rules(self) -> List[SaltedBlockingRuleSegment]: br = SaltedBlockingRuleSegment(self, rule_sql, n) - try: # If it has a materialised id pairs table, grab it - br.exploded_id_pair_table = self.exploded_id_pair_tables[n] - except IndexError: - pass + # Exploding blocking rules may have a materialised exploded_id_pair_table + # If so, we want to associated it with the SaltedBlockingRuleSegment + if isinstance(self, ExplodingBlockingRule): + try: + br.exploded_id_pair_table = self.exploded_id_pair_tables[n] + except IndexError: + pass yield br @@ -198,6 +182,56 @@ def _filter_conditions(self): else: return filter_condition.sql(self.sqlglot_dialect) + def as_dict(self): + "The minimal representation of the blocking rule" + output = {} + + output["blocking_rule"] = self.blocking_rule_sql + output["sql_dialect"] = self.sql_dialect + + if self.salting_partitions > 1 and self.sql_dialect == "spark": + output["salting_partitions"] = self.salting_partitions + + if self.array_columns_to_explode: + output["arrays_to_explode"] = self.array_columns_to_explode + + return output + + def _as_completed_dict(self): + if not self.salting_partitions > 1 and self.sql_dialect == "spark": + return self.blocking_rule_sql + else: + return self.as_dict() + + @property + def descr(self): + return "Custom" if not hasattr(self, "_description") else self._description + + def _abbreviated_sql(self, cutoff=75): + sql = self.blocking_rule_sql + return (sql[:cutoff] + "...") if len(sql) > cutoff else sql + + def __repr__(self): + return f"<{self._human_readable_succinct}>" + + @property + def _human_readable_succinct(self): + sql = self._abbreviated_sql(75) + return f"{self.descr} blocking rule using SQL: {sql}" + + +class ExplodingBlockingRule(BlockingRule): + def __init__( + self, + blocking_rule: BlockingRule | dict | str, + salting_partitions=1, + sqlglot_dialect: str = None, + array_columns_to_explode: list = [], + ): + super().__init__(blocking_rule, salting_partitions, sqlglot_dialect) + self.array_columns_to_explode: List[str] = array_columns_to_explode + self.exploded_id_pair_tables: List[SplinkDataFrame] = [] + def marginal_exploded_id_pairs_table_sql( self, linker: Linker, salted_br: BlockingRule ): @@ -241,42 +275,29 @@ def drop_materialised_id_pairs_dataframes(self): for df in self.exploded_id_pair_tables: df.drop_table_from_database_and_remove_from_cache() - def as_dict(self): - "The minimal representation of the blocking rule" - output = {} - - output["blocking_rule"] = self.blocking_rule_sql - output["sql_dialect"] = self.sql_dialect - - if self.salting_partitions > 1 and self.sql_dialect == "spark": - output["salting_partitions"] = self.salting_partitions - - if self.array_columns_to_explode: - output["arrays_to_explode"] = self.array_columns_to_explode - - return output - - def _as_completed_dict(self): - if not self.salting_partitions > 1 and self.sql_dialect == "spark": - return self.blocking_rule_sql - else: - return self.as_dict() - - @property - def descr(self): - return "Custom" if not hasattr(self, "_description") else self._description + def 
exclude_pairs_generated_by_this_rule_sql(self, linker: Linker): + """A SQL string specifying how to exclude the results + of THIS blocking rule from subseqent blocking statements, + so that subsequent statements do not produce duplicate pairs + """ - def _abbreviated_sql(self, cutoff=75): - sql = self.blocking_rule_sql - return (sql[:cutoff] + "...") if len(sql) > cutoff else sql + unique_id_column = linker._settings_obj._unique_id_column_name - def __repr__(self): - return f"<{self._human_readable_succinct}>" + ids_to_compare_sql = " union all ".join( + [ + f"select * from {ids.physical_name}" + for ids in self.exploded_id_pair_tables + ] + ) - @property - def _human_readable_succinct(self): - sql = self._abbreviated_sql(75) - return f"{self.descr} blocking rule using SQL: {sql}" + return f"""EXISTS ( + select 1 from ({ids_to_compare_sql}) as ids_to_compare + where ( + l.{unique_id_column} = ids_to_compare.{unique_id_column}_l and + r.{unique_id_column} = ids_to_compare.{unique_id_column}_r + ) + ) + """ class SaltedBlockingRuleSegment: @@ -319,44 +340,47 @@ def materialise_exploded_id_tables(linker: Linker): settings_obj = linker._settings_obj blocking_rules = settings_obj._blocking_rules_to_generate_predictions - salted_blocking_rules = ( + blocking_rules = [ + br for br in blocking_rules if isinstance(br, ExplodingBlockingRule) + ] + salted_exploded_blocking_rules = ( salted_br for br in blocking_rules for salted_br in br.salted_blocking_rules ) - for salted_br in salted_blocking_rules: + for salted_br in salted_exploded_blocking_rules: parent_br = salted_br.parent_blocking_rule - if parent_br.array_columns_to_explode: - input_dataframe = linker._initialise_df_concat_with_tf() - input_colnames = {col.name() for col in input_dataframe.columns} - arrays_to_explode_quoted = [ - InputColumn(colname, sql_dialect=linker._sql_dialect).quote().name() - for colname in parent_br.array_columns_to_explode - ] - expl_sql = linker._gen_explode_sql( - "__splink__df_concat_with_tf", - parent_br.array_columns_to_explode, - list(input_colnames.difference(arrays_to_explode_quoted)), - ) + input_dataframe = linker._initialise_df_concat_with_tf() - linker._enqueue_sql( - expl_sql, - "__splink__df_concat_with_tf_unnested", - ) + input_colnames = {col.name() for col in input_dataframe.columns} + arrays_to_explode_quoted = [ + InputColumn(colname, sql_dialect=linker._sql_dialect).quote().name() + for colname in parent_br.array_columns_to_explode + ] + expl_sql = linker._gen_explode_sql( + "__splink__df_concat_with_tf", + parent_br.array_columns_to_explode, + list(input_colnames.difference(arrays_to_explode_quoted)), + ) + + linker._enqueue_sql( + expl_sql, + "__splink__df_concat_with_tf_unnested", + ) - salt_name = "" - if salted_br.is_salted: - salt_name = f"_salt_{salted_br.salt}" + salt_name = "" + if salted_br.is_salted: + salt_name = f"_salt_{salted_br.salt}" - base_name = "__splink__marginal_exploded_ids_blocking_rule" - table_name = f"{base_name}_mk_{parent_br.match_key}{salt_name}" + base_name = "__splink__marginal_exploded_ids_blocking_rule" + table_name = f"{base_name}_mk_{parent_br.match_key}{salt_name}" - sql = parent_br.marginal_exploded_id_pairs_table_sql(linker, salted_br) + sql = parent_br.marginal_exploded_id_pairs_table_sql(linker, salted_br) - linker._enqueue_sql(sql, table_name) + linker._enqueue_sql(sql, table_name) - marginal_ids_table = linker._execute_sql_pipeline([input_dataframe]) - parent_br.exploded_id_pair_tables.append(marginal_ids_table) + marginal_ids_table = 
linker._execute_sql_pipeline([input_dataframe]) + parent_br.exploded_id_pair_tables.append(marginal_ids_table) def block_using_rules_sqls(linker: Linker): @@ -467,7 +491,7 @@ def block_using_rules_sqls(linker: Linker): ) for salted_br in salted_blocking_rules: parent_br = salted_br.parent_blocking_rule - if not parent_br.array_columns_to_explode: + if not isinstance(parent_br, ExplodingBlockingRule): sql = f""" select {sql_select_expr} diff --git a/splink/linker.py b/splink/linker.py index 24292995a8..33427adea9 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -33,6 +33,7 @@ ) from .blocking import ( BlockingRule, + ExplodingBlockingRule, block_using_rules_sqls, blocking_rule_to_obj, materialise_exploded_id_tables, @@ -1758,7 +1759,8 @@ def predict( predictions = self._execute_sql_pipeline(input_dataframes) for br in self._settings_obj._blocking_rules_to_generate_predictions: - br.drop_materialised_id_pairs_dataframes() + if isinstance(br, ExplodingBlockingRule): + br.drop_materialised_id_pairs_dataframes() self._predict_warning() return predictions From 4d40bd9d50f877853db5aa82fe445478a8055ee1 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 6 Nov 2023 15:50:04 +0000 Subject: [PATCH 20/37] all logic now pushed into blocking rules classes --- splink/blocking.py | 82 ++++++++++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 31 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index f858f4623d..e6e4679b97 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -1,6 +1,5 @@ from __future__ import annotations -import hashlib import logging from typing import TYPE_CHECKING, List @@ -10,8 +9,8 @@ from .input_column import InputColumn from .misc import ensure_is_list -from .unique_id_concat import _composite_unique_id_from_nodes_sql from .splink_dataframe import SplinkDataFrame +from .unique_id_concat import _composite_unique_id_from_nodes_sql logger = logging.getLogger(__name__) @@ -104,6 +103,28 @@ def exclude_pairs_generated_by_all_preceding_rules_sql(self, linker: Linker): previous_rules = " OR ".join(or_clauses) return f"AND NOT ({previous_rules})" + def create_pairwise_comparisons_sql( + self, + linker: Linker, + sql_select_expr: str, + salted_br: SaltedBlockingRuleSegment, + probability: str, + where_condition: str, + ): + sql = f""" + select + {sql_select_expr} + , '{self.match_key}' as match_key + {probability} + from {linker._input_tablename_l} as l + inner join {linker._input_tablename_r} as r + on + ({salted_br.blocking_rule_sql}) + {where_condition} + {self.exclude_pairs_generated_by_all_preceding_rules_sql(linker)} + """ + return sql + @property def salted_blocking_rules(self) -> List[SaltedBlockingRuleSegment]: """A list of sql strings""" @@ -299,6 +320,29 @@ def exclude_pairs_generated_by_this_rule_sql(self, linker: Linker): ) """ + def create_pairwise_comparisons_sql( + self, + linker: Linker, + sql_select_expr: str, + salted_br: SaltedBlockingRuleSegment, + probability: str, + where_condition: str, + ): + exploded_id_pair_table = salted_br.exploded_id_pair_table + unique_id_col = linker._settings_obj._unique_id_column_name + sql = f""" + select + {sql_select_expr}, + '{self.match_key}' as match_key + {probability} + from {exploded_id_pair_table.physical_name} as pairs + left join {linker._input_tablename_l} as l + on pairs.{unique_id_col}_l=l.{unique_id_col} + left join {linker._input_tablename_r} as r + on pairs.{unique_id_col}_r=r.{unique_id_col} + """ + return sql + class SaltedBlockingRuleSegment: def __init__( @@ 
-441,9 +485,9 @@ def block_using_rules_sqls(linker: Linker):
     )
 
     if type(linker).__name__ in ["SparkLinker"]:
-        apply_salt = True
+        pass
     else:
-        apply_salt = False
+        pass
 
     settings_obj = linker._settings_obj
@@ -491,33 +535,9 @@ def block_using_rules_sqls(linker: Linker):
     )
 
     for salted_br in salted_blocking_rules:
         parent_br = salted_br.parent_blocking_rule
-        if not isinstance(parent_br, ExplodingBlockingRule):
-            sql = f"""
-            select
-            {sql_select_expr}
-            , '{parent_br.match_key}' as match_key
-            {probability}
-            from {linker._input_tablename_l} as l
-            inner join {linker._input_tablename_r} as r
-            on
-            ({salted_br.blocking_rule_sql})
-            {where_condition}
-            {parent_br.exclude_pairs_generated_by_all_preceding_rules_sql(linker)}
-            """
-        else:
-            ids_to_compare = salted_br.exploded_id_pair_table
-            unique_id_col = settings_obj._unique_id_column_name
-            sql = f"""
-            select
-            {sql_select_expr},
-            '{parent_br.match_key}' as match_key
-            {probability}
-            from {ids_to_compare.physical_name} as pairs
-            left join {linker._input_tablename_l} as l
-            on pairs.{unique_id_col}_l=l.{unique_id_col}
-            left join {linker._input_tablename_r} as r
-            on pairs.{unique_id_col}_r=r.{unique_id_col}
-            """
+        sql = parent_br.create_pairwise_comparisons_sql(
+            linker, sql_select_expr, salted_br, probability, where_condition
+        )
 
         br_sqls.append(sql)
 
     sql = "union all".join(br_sqls)

From 1117737bf66777498f2fc6b67beaa4f6f7c33629 Mon Sep 17 00:00:00 2001
From: Robin Linacre
Date: Mon, 6 Nov 2023 15:51:51 +0000
Subject: [PATCH 21/37] better names

---
 splink/blocking.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/splink/blocking.py b/splink/blocking.py
index e6e4679b97..0eff876136 100644
--- a/splink/blocking.py
+++ b/splink/blocking.py
@@ -48,7 +48,6 @@ def __init__(
         blocking_rule: BlockingRule | dict | str,
         salting_partitions=1,
         sqlglot_dialect: str = None,
-        array_columns_to_explode: list = [],
     ):
         if sqlglot_dialect:
             self._sql_dialect = sqlglot_dialect
@@ -126,7 +125,7 @@ def create_pairwise_comparisons_sql(
         return sql
 
     @property
-    def salted_blocking_rules(self) -> List[SaltedBlockingRuleSegment]:
+    def salted_blocking_rule_segments(self) -> List[SaltedBlockingRuleSegment]:
         """A list of sql strings"""
 
         for n in range(self.salting_partitions):
@@ -139,17 +138,17 @@ def salted_blocking_rules(self) -> List[SaltedBlockingRuleSegment]:
             else:
                 rule_sql = self.blocking_rule_sql
 
-            br = SaltedBlockingRuleSegment(self, rule_sql, n)
+            br_seg = SaltedBlockingRuleSegment(self, rule_sql, n)
 
             # Exploding blocking rules may have a materialised exploded_id_pair_table
             # If so, we want to associated it with the SaltedBlockingRuleSegment
             if isinstance(self, ExplodingBlockingRule):
                 try:
-                    br.exploded_id_pair_table = self.exploded_id_pair_tables[n]
+                    br_seg.exploded_id_pair_table = self.exploded_id_pair_tables[n]
                 except IndexError:
                     pass
 
-            yield br
+            yield br_seg
 
     @property
     def _parsed_join_condition(self):
@@ -388,7 +387,9 @@ def materialise_exploded_id_tables(linker: Linker):
         br for br in blocking_rules if isinstance(br, ExplodingBlockingRule)
     ]
     salted_exploded_blocking_rules = (
-        salted_br for br in blocking_rules for salted_br in br.salted_blocking_rules
+        salted_br
+        for br in blocking_rules
+        for salted_br in br.salted_blocking_rule_segments
     )
 
     for salted_br in salted_exploded_blocking_rules:
@@ -531,7 +532,9 @@ def block_using_rules_sqls(linker: Linker):
 
     br_sqls = []
     salted_blocking_rules = (
-        salted_br for br in blocking_rules for salted_br in br.salted_blocking_rules
+        salted_br
+        for br in blocking_rules
+        for salted_br in br.salted_blocking_rule_segments
    )
 
     for salted_br in salted_blocking_rules:
         parent_br = salted_br.parent_blocking_rule

From 7c71849d9e68858c90dd339687f4b5f5a989cfd5 Mon Sep 17 00:00:00 2001
From: Robin Linacre
Date: Thu, 30 Nov 2023 10:16:42 +0000
Subject: [PATCH 22/37] change all files to current master

---
 CHANGELOG.md | 17 +-
 benchmarking/time_series.json | 22 ++
 docs/demos/tutorials/03_Blocking.ipynb | 8 +-
 docs/dev_guides/changing_splink/lint.md | 24 --
 docs/dev_guides/index.md | 2 +-
 .../extending_settings_validator.md | 187 +++++++++
 .../settings_validation_overview.md | 55 +++
 docs/img/settings_validation/error_logger.png | Bin 0 -> 8360 bytes
 .../splink_fundamentals/backends.md | 101 -----
 .../backends/backends.md | 0
 .../backends/postgres.md | 0
 docs/topic_guides/topic_guides_index.md | 2 +-
 mkdocs.yml | 18 +-
 poetry.lock | 208 +++++-----
 pyproject.toml | 2 +-
 splink/__init__.py | 2 +-
 splink/accuracy.py | 9 +-
 .../athena_blocking_rule_imports.py | 9 +-
 splink/athena/linker.py | 1 +
 splink/block_from_labels.py | 2 +-
 splink/blocking.py | 331 ++++------------
 splink/blocking_rule_composition.py | 36 +-
 splink/blocking_rules_library.py | 129 ++----
 splink/cluster_metrics.py | 65 +++
 splink/comparison.py | 16 +-
 splink/comparison_level.py | 18 +-
 splink/comparison_level_library.py | 26 +-
 splink/cost_of_blocking_rules.py | 114 ++++++
 .../duckdb_blocking_rule_imports.py | 9 +-
 splink/duckdb/linker.py | 19 +-
 ..._with_comparison_counts_below_threshold.py | 250 ++++++++++++
 splink/find_matches_to_new_records.py | 10 +-
 splink/input_column.py | 147 ++++---
 splink/labelling_tool.py | 2 +-
 splink/linker.py | 372 ++++++++++++++----
 splink/logging_messages.py | 4 -
 splink/lower_id_on_lhs.py | 2 +-
 splink/misc.py | 9 -
 splink/missingness.py | 8 +-
 splink/optimise_cost_of_brs.py | 214 ++++++++++
 .../postgres_blocking_rule_imports.py | 9 +-
 splink/profile_data.py | 54 ++-
 splink/settings.py | 50 ++-
 .../settings_validation/settings_validator.py | 3 +-
 splink/spark/linker.py | 24 +-
 .../spark_blocking_rule_imports.py | 9 +-
 splink/splink_comparison_viewer.py | 4 +-
 splink/splink_dataframe.py | 2 +-
 .../sqlite_blocking_rule_imports.py | 9 +-
 splink/term_frequencies.py | 14 +-
 splink/unique_id_concat.py | 8 +-
 splink/vertically_concatenate.py | 10 +-
 splink/waterfall_chart.py | 8 +-
 tests/conftest.py | 6 +-
 tests/helpers.py | 2 +-
 tests/test_analyse_blocking.py | 150 +++++--
 tests/test_array_based_blocking.py | 255 ------------
 tests/test_blocking.py | 7 +-
 tests/test_blocking_rule_composition.py | 6 +-
 tests/test_cluster_metrics.py | 50 +++
 tests/test_correctness_of_convergence.py | 2 +-
 tests/test_input_column.py | 29 +-
 tests/test_link_only_verification.py | 48 ---
 tests/test_missingness.py | 35 ++
 tests/test_profile_data.py | 20 +
 tests/test_sql_transform.py | 22 +-
 tests/test_u_train.py | 2 +-
 67 files changed, 1984 insertions(+), 1304 deletions(-)
 delete mode 100644 docs/dev_guides/changing_splink/lint.md
 create mode 100644 docs/dev_guides/settings_validation/extending_settings_validator.md
 create mode 100644 docs/dev_guides/settings_validation/settings_validation_overview.md
 create mode 100644 docs/img/settings_validation/error_logger.png
 delete mode 100644 docs/topic_guides/splink_fundamentals/backends.md
 rename docs/topic_guides/{ => splink_fundamentals}/backends/backends.md (100%)
 rename docs/topic_guides/{ => splink_fundamentals}/backends/postgres.md (100%)
 create mode 100644 splink/cluster_metrics.py
 create mode 100644 splink/cost_of_blocking_rules.py
 create mode 100644 splink/find_brs_with_comparison_counts_below_threshold.py
 create mode 100644 splink/optimise_cost_of_brs.py
 delete mode 100644 tests/test_array_based_blocking.py
 create mode 100644 tests/test_cluster_metrics.py
 delete mode 100644 tests/test_link_only_verification.py
 create mode 100644 tests/test_missingness.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8fdf396cd4..83509bdacb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,12 +5,19 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [Unreleased]
+## Unreleased
+
+### Changed
+
+### Fixed
+
+
+## [3.9.9] - 2023-11-14
 
 ### Changed
 
 - Upgraded [sqlglot](https://github.com/tobymao/sqlglot) to versions >= 13.0.0 ([#1642](https://github.com/moj-analytical-services/splink/pull/1642))
-- Improved logging output from settings validation ([#1636](https://github.com/moj-analytical-services/splink/pull/1636))
+- Improved logging output from settings validation ([#1636](https://github.com/moj-analytical-services/splink/pull/1636)) and corresponding documentation ([#1674](https://github.com/moj-analytical-services/splink/pull/1674))
 - Emit a warning when using a default (i.e. non-trained) value for `probability_two_random_records_match` ([#1653](https://github.com/moj-analytical-services/splink/pull/1653))
 
 ### Fixed
@@ -19,6 +26,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fixed issue where comparison vector grid not synced with corresponding histogram values in comparison viewer dashboard ([#1652](https://github.com/moj-analytical-services/splink/pull/1652))
 - Fixed issue where composing null levels would mistakenly sometimes result in a non-null level ([#1672](https://github.com/moj-analytical-services/splink/pull/1672))
 - Labelling tool correctly works even when offline ([#1646](https://github.com/moj-analytical-services/splink/pull/1646))
+- Explicitly cast values when using the postgres linker ([#1693](https://github.com/moj-analytical-services/splink/pull/1693))
+- Fixed issue where parameters to `completeness_chart` were not being applied ([#1662](https://github.com/moj-analytical-services/splink/pull/1662))
+- Fixed issue passing boto3_session into the Athena linker ([#1733](https://github.com/moj-analytical-services/splink/pull/1733/files))
 
 ## [3.9.8] - 2023-10-05
 
@@ -36,5 +46,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Corrected path for Spark `.jar` file containing UDFs to work correctly for Spark < 3.0 ([#1622](https://github.com/moj-analytical-services/splink/pull/1622))
 - Spark UDF `damerau_levensthein` is now only registered for Spark >= 3.0, as it is not compatible with earlier versions ([#1622](https://github.com/moj-analytical-services/splink/pull/1622))
 
-[unreleased]: https://github.com/moj-analytical-services/splink/compare/v3.9.8...HEAD
+[unreleased]: https://github.com/moj-analytical-services/splink/compare/3.9.9...HEAD
+[3.9.9]: https://github.com/moj-analytical-services/splink/compare/v3.9.8...3.9.9
 [3.9.8]: https://github.com/moj-analytical-services/splink/compare/v3.9.7...v3.9.8
diff --git a/benchmarking/time_series.json b/benchmarking/time_series.json
index 91e2355680..b929d511ab 100644
--- a/benchmarking/time_series.json
+++ b/benchmarking/time_series.json
@@ -711,3 +711,25 @@
 {"machine_info": {"node": "fv-az165-740", "processor": "x86_64",
"machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 2, "arch_string_raw": "x86_64", "vendor_id_raw": "GenuineIntel", "brand_raw": "Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz", "hz_advertised_friendly": "2.6000 GHz", "hz_actual_friendly": "2.0952 GHz", "hz_advertised": [2600000000, 0], "hz_actual": [2095185000, 0], "stepping": 4, "model": 85, "family": 6, "flags": ["3dnowprefetch", "abm", "adx", "aes", "apic", "avx", "avx2", "avx512bw", "avx512cd", "avx512dq", "avx512f", "avx512vl", "bmi1", "bmi2", "clflush", "clflushopt", "cmov", "constant_tsc", "cpuid", "cx16", "cx8", "de", "erms", "f16c", "fma", "fpu", "fsgsbase", "fxsr", "hle", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "md_clear", "mmx", "movbe", "msr", "mtrr", "nopl", "nx", "osxsave", "pae", "pat", "pcid", "pclmulqdq", "pdpe1gb", "pge", "pni", "popcnt", "pse", "pse36", "pti", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "rtm", "sep", "smap", "smep", "ss", "sse", "sse2", "sse4_1", "sse4_2", "ssse3", "syscall", "tsc", "vme", "xsave", "xsavec", "xsaveopt", "xsaves", "xtopology"], "l3_cache_size": 37486592, "l2_cache_size": 2097152, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 256, "l2_cache_associativity": 6}}, "commit_info": {"id": "a0240cf2cefec4491edb3ea4638c2b967ee50126", "time": "2023-11-03T13:34:43+00:00", "author_time": "2023-11-03T13:34:43+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.3599004839999225, "max": 1.384163209999997, "mean": 1.3720318469999597, "stddev": 0.017156338084723843, "rounds": 2, "median": 1.3720318469999597, "iqr": 0.024262726000074508, "q1": 1.3599004839999225, "q3": 1.384163209999997, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.3599004839999225, "hd15iqr": 1.384163209999997, "ops": 0.728846055714062, "total": 2.7440636939999195, "data": [1.384163209999997, 1.3599004839999225], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.4042946920000077, "max": 1.4228623420000304, "mean": 1.413578517000019, "stddev": 0.013129311225714448, "rounds": 2, "median": 1.413578517000019, "iqr": 0.018567650000022695, "q1": 1.4042946920000077, "q3": 1.4228623420000304, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.4042946920000077, "hd15iqr": 1.4228623420000304, "ops": 0.7074244465190798, "total": 2.827157034000038, "data": [1.4228623420000304, 1.4042946920000077], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": 
"benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 3.644396762000042, "max": 3.7966250789999094, "mean": 3.720510920499976, "stddev": 0.10764167523922155, "rounds": 2, "median": 3.720510920499976, "iqr": 0.15222831699986727, "q1": 3.644396762000042, "q3": 3.7966250789999094, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 3.644396762000042, "hd15iqr": 3.7966250789999094, "ops": 0.2687802888818336, "total": 7.441021840999952, "data": [3.644396762000042, 3.7966250789999094], "iterations": 1}}], "datetime": "2023-11-03T13:36:06.933676", "version": "4.0.0"} {"machine_info": {"node": "fv-az502-812", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 2, "arch_string_raw": "x86_64", "vendor_id_raw": "GenuineIntel", "brand_raw": "Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz", "hz_advertised_friendly": "2.6000 GHz", "hz_actual_friendly": "2.0952 GHz", "hz_advertised": [2600000000, 0], "hz_actual": [2095210000, 0], "stepping": 4, "model": 85, "family": 6, "flags": ["3dnowprefetch", "abm", "adx", "aes", "apic", "avx", "avx2", "avx512bw", "avx512cd", "avx512dq", "avx512f", "avx512vl", "bmi1", "bmi2", "clflush", "clflushopt", "cmov", "constant_tsc", "cpuid", "cx16", "cx8", "de", "erms", "f16c", "fma", "fpu", "fsgsbase", "fxsr", "hle", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "md_clear", "mmx", "movbe", "msr", "mtrr", "nopl", "nx", "osxsave", "pae", "pat", "pcid", "pclmulqdq", "pdpe1gb", "pge", "pni", "popcnt", "pse", "pse36", "pti", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "rtm", "sep", "smap", "smep", "ss", "sse", "sse2", "sse4_1", "sse4_2", "ssse3", "syscall", "tsc", "vme", "xsave", "xsavec", "xsaveopt", "xsaves", "xtopology"], "l3_cache_size": 37486592, "l2_cache_size": 2097152, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 256, "l2_cache_associativity": 6}}, "commit_info": {"id": "a924aa902d73d197fc1736d736dde148ae9afeb0", "time": "2023-11-06T13:15:15+00:00", "author_time": "2023-11-06T13:15:15+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.4697954879999884, "max": 1.488425057000029, "mean": 1.4791102725000087, "stddev": 0.013173094570511431, "rounds": 2, "median": 1.4791102725000087, "iqr": 0.018629569000040647, "q1": 1.4697954879999884, "q3": 1.488425057000029, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.4697954879999884, "hd15iqr": 1.488425057000029, "ops": 0.6760821140872674, "total": 2.9582205450000174, "data": [1.488425057000029, 1.4697954879999884], "iterations": 1}}, {"group": null, "name": 
"test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.5181381870000337, "max": 1.5407626510000227, "mean": 1.5294504190000282, "stddev": 0.015997911915103198, "rounds": 2, "median": 1.5294504190000282, "iqr": 0.022624463999989075, "q1": 1.5181381870000337, "q3": 1.5407626510000227, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.5181381870000337, "hd15iqr": 1.5407626510000227, "ops": 0.6538296289812462, "total": 3.0589008380000564, "data": [1.5407626510000227, 1.5181381870000337], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 3.800174019999986, "max": 3.9249604589999763, "mean": 3.862567239499981, "stddev": 0.08823733721701468, "rounds": 2, "median": 3.862567239499981, "iqr": 0.12478643899999042, "q1": 3.800174019999986, "q3": 3.9249604589999763, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 3.800174019999986, "hd15iqr": 3.9249604589999763, "ops": 0.25889516945456525, "total": 7.725134478999962, "data": [3.800174019999986, 3.9249604589999763], "iterations": 1}}], "datetime": "2023-11-06T13:16:39.417659", "version": "4.0.0"} {"machine_info": {"node": "fv-az1431-141", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 4, "arch_string_raw": "x86_64", "vendor_id_raw": "AuthenticAMD", "brand_raw": "AMD EPYC 7763 64-Core Processor", "hz_advertised_friendly": "2.4454 GHz", "hz_actual_friendly": "2.4454 GHz", "hz_advertised": [2445435000, 0], "hz_actual": [2445435000, 0], "stepping": 1, "model": 1, "family": 25, "flags": ["3dnowext", "3dnowprefetch", "abm", "adx", "aes", "aperfmperf", "apic", "arat", "avx", "avx2", "bmi1", "bmi2", "clflush", "clflushopt", "clwb", "clzero", "cmov", "cmp_legacy", "constant_tsc", "cpuid", "cr8_legacy", "cx16", "cx8", "de", "decodeassists", "erms", "extd_apicid", "f16c", "flushbyasid", "fma", "fpu", "fsgsbase", "fsrm", "fxsr", "fxsr_opt", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "misalignsse", "mmx", "mmxext", "movbe", "msr", "mtrr", "nonstop_tsc", "nopl", "npt", "nrip_save", "nx", "osvw", "osxsave", "pae", "pat", "pausefilter", "pcid", "pclmulqdq", "pdpe1gb", "pfthreshold", "pge", "pni", "popcnt", "pse", "pse36", "rdpid", "rdpru", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "sep", "sha", "sha_ni", "smap", "smep", "sse", "sse2", "sse4_1", "sse4_2", "sse4a", "ssse3", "svm", "syscall", "topoext", "tsc", "tsc_reliable", "tsc_scale", "umip", "v_vmsave_vmload", "vaes", "vmcb_clean", "vme", "vmmcall", "vpclmulqdq", "xgetbv1", "xsave", "xsavec", "xsaveerptr", "xsaveopt", "xsaves"], "l3_cache_size": 524288, "l2_cache_size": 1048576, 
"l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 512, "l2_cache_associativity": 6}}, "commit_info": {"id": "e575a4e1874b7d4624d28b853e80d6cb41c045c6", "time": "2023-11-06T13:40:35+00:00", "author_time": "2023-11-06T13:40:35+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 0.9561546780000185, "max": 0.9879204030000039, "mean": 0.9720375405000112, "stddev": 0.022461759556796735, "rounds": 2, "median": 0.9720375405000112, "iqr": 0.03176572499998542, "q1": 0.9561546780000185, "q3": 0.9879204030000039, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 0.9561546780000185, "hd15iqr": 0.9879204030000039, "ops": 1.0287668514176984, "total": 1.9440750810000225, "data": [0.9879204030000039, 0.9561546780000185], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 0.9849128150000013, "max": 0.9895065880000118, "mean": 0.9872097015000065, "stddev": 0.003248288039539105, "rounds": 2, "median": 0.9872097015000065, "iqr": 0.004593773000010515, "q1": 0.9849128150000013, "q3": 0.9895065880000118, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 0.9849128150000013, "hd15iqr": 0.9895065880000118, "ops": 1.0129560097318324, "total": 1.974419403000013, "data": [0.9895065880000118, 0.9849128150000013], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 2.3401754460000177, "max": 2.4327504759999954, "mean": 2.3864629610000065, "stddev": 0.0654604314815323, "rounds": 2, "median": 2.3864629610000065, "iqr": 0.09257502999997769, "q1": 2.3401754460000177, "q3": 2.4327504759999954, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 2.3401754460000177, "hd15iqr": 2.4327504759999954, "ops": 0.4190301782772975, "total": 4.772925922000013, "data": [2.3401754460000177, 2.4327504759999954], "iterations": 1}}], "datetime": "2023-11-06T13:41:33.139736", "version": "4.0.0"} +{"machine_info": {"node": "fv-az200-592", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 2, "arch_string_raw": "x86_64", "vendor_id_raw": "GenuineIntel", "brand_raw": "Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz", "hz_advertised_friendly": "2.4000 GHz", "hz_actual_friendly": "2.3945 GHz", "hz_advertised": [2400000000, 0], 
"hz_actual": [2394452000, 0], "stepping": 2, "model": 63, "family": 6, "flags": ["abm", "aes", "apic", "avx", "avx2", "bmi1", "bmi2", "clflush", "cmov", "constant_tsc", "cpuid", "cx16", "cx8", "de", "erms", "f16c", "fma", "fpu", "fsgsbase", "fxsr", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "md_clear", "mmx", "movbe", "msr", "mtrr", "nopl", "nx", "osxsave", "pae", "pat", "pcid", "pclmulqdq", "pdpe1gb", "pge", "pni", "popcnt", "pse", "pse36", "pti", "rdrand", "rdrnd", "rdtscp", "rep_good", "sep", "smep", "ss", "sse", "sse2", "sse4_1", "sse4_2", "ssse3", "syscall", "tsc", "vme", "xsave", "xsaveopt", "xtopology"], "l3_cache_size": 31457280, "l2_cache_size": 524288, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 256, "l2_cache_associativity": 6}}, "commit_info": {"id": "eb697e5cb6a6640676dbde8164a1464d4e4f78b9", "time": "2023-11-06T17:50:03+00:00", "author_time": "2023-11-06T17:50:03+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.6033809460000157, "max": 1.6872883610000144, "mean": 1.645334653500015, "stddev": 0.05933150213833296, "rounds": 2, "median": 1.645334653500015, "iqr": 0.08390741499999876, "q1": 1.6033809460000157, "q3": 1.6872883610000144, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.6033809460000157, "hd15iqr": 1.6872883610000144, "ops": 0.6077790909422373, "total": 3.29066930700003, "data": [1.6872883610000144, 1.6033809460000157], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.6994018249999954, "max": 1.6997110539999767, "mean": 1.699556439499986, "stddev": 0.00021865792282632242, "rounds": 2, "median": 1.699556439499986, "iqr": 0.0003092289999813147, "q1": 1.6994018249999954, "q3": 1.6997110539999767, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.6994018249999954, "hd15iqr": 1.6997110539999767, "ops": 0.5883888153159554, "total": 3.399112878999972, "data": [1.6994018249999954, 1.6997110539999767], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 3.910154970999997, "max": 4.116282279000018, "mean": 4.0132186250000075, "stddev": 0.14575401727454262, "rounds": 2, "median": 4.0132186250000075, "iqr": 0.20612730800002055, "q1": 3.910154970999997, "q3": 4.116282279000018, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 3.910154970999997, "hd15iqr": 4.116282279000018, "ops": 0.2491765571331161, "total": 8.026437250000015, "data": [3.910154970999997, 4.116282279000018], "iterations": 1}}], "datetime": "2023-11-06T17:51:29.496527", "version": "4.0.0"} +{"machine_info": {"node": 
"fv-az457-102", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 2, "arch_string_raw": "x86_64", "vendor_id_raw": "GenuineIntel", "brand_raw": "Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz", "hz_advertised_friendly": "2.8000 GHz", "hz_actual_friendly": "2.7934 GHz", "hz_advertised": [2800000000, 0], "hz_actual": [2793438000, 0], "stepping": 6, "model": 106, "family": 6, "flags": ["3dnowprefetch", "abm", "adx", "aes", "apic", "avx", "avx2", "avx512bw", "avx512cd", "avx512dq", "avx512f", "avx512vl", "bmi1", "bmi2", "clflush", "clflushopt", "cmov", "constant_tsc", "cpuid", "cx16", "cx8", "de", "erms", "f16c", "fma", "fpu", "fsgsbase", "fxsr", "hle", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "md_clear", "mmx", "movbe", "msr", "mtrr", "nopl", "nx", "osxsave", "pae", "pat", "pcid", "pclmulqdq", "pdpe1gb", "pge", "pni", "popcnt", "pse", "pse36", "pti", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "rtm", "sep", "smap", "smep", "ss", "sse", "sse2", "sse4_1", "sse4_2", "ssse3", "syscall", "tsc", "vme", "xsave", "xsavec", "xsaveopt", "xsaves", "xtopology"], "l3_cache_size": 50331648, "l2_cache_size": "2.5 MiB", "l1_data_cache_size": 98304, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 256, "l2_cache_associativity": 6}}, "commit_info": {"id": "f91af6ae8233eab2e5c444643f1135e94c05b231", "time": "2023-11-06T17:51:33+00:00", "author_time": "2023-11-06T17:51:33+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.1987115019999806, "max": 1.2181845909999822, "mean": 1.2084480464999814, "stddev": 0.013769553282550343, "rounds": 2, "median": 1.2084480464999814, "iqr": 0.019473089000001664, "q1": 1.1987115019999806, "q3": 1.2181845909999822, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.1987115019999806, "hd15iqr": 1.2181845909999822, "ops": 0.827507647429521, "total": 2.4168960929999628, "data": [1.2181845909999822, 1.1987115019999806], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.264409201999996, "max": 1.267484995000018, "mean": 1.265947098500007, "stddev": 0.002174914087841693, "rounds": 2, "median": 1.265947098500007, "iqr": 0.0030757930000220313, "q1": 1.264409201999996, "q3": 1.267484995000018, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.264409201999996, "hd15iqr": 1.267484995000018, "ops": 0.7899224234447696, "total": 2.531894197000014, "data": [1.267484995000018, 1.264409201999996], "iterations": 1}}, {"group": null, "name": 
"test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 3.073732800999977, "max": 3.168990464999979, "mean": 3.121361632999978, "stddev": 0.06735734017439093, "rounds": 2, "median": 3.121361632999978, "iqr": 0.0952576640000018, "q1": 3.073732800999977, "q3": 3.168990464999979, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 3.073732800999977, "hd15iqr": 3.168990464999979, "ops": 0.3203730030598499, "total": 6.242723265999956, "data": [3.073732800999977, 3.168990464999979], "iterations": 1}}], "datetime": "2023-11-07T13:50:05.499393", "version": "4.0.0"} +{"machine_info": {"node": "fv-az664-608", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 4, "arch_string_raw": "x86_64", "vendor_id_raw": "AuthenticAMD", "brand_raw": "AMD EPYC 7763 64-Core Processor", "hz_advertised_friendly": "3.2105 GHz", "hz_actual_friendly": "3.2105 GHz", "hz_advertised": [3210455000, 0], "hz_actual": [3210455000, 0], "stepping": 1, "model": 1, "family": 25, "flags": ["3dnowext", "3dnowprefetch", "abm", "adx", "aes", "aperfmperf", "apic", "arat", "avx", "avx2", "bmi1", "bmi2", "clflush", "clflushopt", "clwb", "clzero", "cmov", "cmp_legacy", "constant_tsc", "cpuid", "cr8_legacy", "cx16", "cx8", "de", "decodeassists", "erms", "extd_apicid", "f16c", "flushbyasid", "fma", "fpu", "fsgsbase", "fsrm", "fxsr", "fxsr_opt", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "misalignsse", "mmx", "mmxext", "movbe", "msr", "mtrr", "nonstop_tsc", "nopl", "npt", "nrip_save", "nx", "osvw", "osxsave", "pae", "pat", "pausefilter", "pcid", "pclmulqdq", "pdpe1gb", "pfthreshold", "pge", "pni", "popcnt", "pse", "pse36", "rdpid", "rdpru", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "sep", "sha", "sha_ni", "smap", "smep", "sse", "sse2", "sse4_1", "sse4_2", "sse4a", "ssse3", "svm", "syscall", "topoext", "tsc", "tsc_reliable", "tsc_scale", "umip", "v_vmsave_vmload", "vaes", "vmcb_clean", "vme", "vmmcall", "vpclmulqdq", "xgetbv1", "xsave", "xsavec", "xsaveerptr", "xsaveopt", "xsaves"], "l3_cache_size": 524288, "l2_cache_size": 1048576, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 512, "l2_cache_associativity": 6}}, "commit_info": {"id": "1c65b8ba118b213e715695cc5d0a07c3a63a8b4b", "time": "2023-11-07T13:50:08+00:00", "author_time": "2023-11-07T13:50:08+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 0.9548512529999869, "max": 0.95494924099998, "mean": 0.9549002469999834, "stddev": 6.928797927007465e-05, "rounds": 2, "median": 0.9549002469999834, "iqr": 9.798799999316543e-05, "q1": 
0.9548512529999869, "q3": 0.95494924099998, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 0.9548512529999869, "hd15iqr": 0.95494924099998, "ops": 1.047229805565248, "total": 1.9098004939999669, "data": [0.9548512529999869, 0.95494924099998], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.0176550980000059, "max": 1.5303969220000226, "mean": 1.2740260100000143, "stddev": 0.3625632207483711, "rounds": 2, "median": 1.2740260100000143, "iqr": 0.5127418240000168, "q1": 1.0176550980000059, "q3": 1.5303969220000226, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.0176550980000059, "hd15iqr": 1.5303969220000226, "ops": 0.7849133315574843, "total": 2.5480520200000285, "data": [1.5303969220000226, 1.0176550980000059], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 2.360819907000007, "max": 2.4638852029999896, "mean": 2.4123525549999982, "stddev": 0.07287816970658653, "rounds": 2, "median": 2.4123525549999982, "iqr": 0.10306529599998271, "q1": 2.360819907000007, "q3": 2.4638852029999896, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 2.360819907000007, "hd15iqr": 2.4638852029999896, "ops": 0.41453310708144014, "total": 4.8247051099999965, "data": [2.360819907000007, 2.4638852029999896], "iterations": 1}}], "datetime": "2023-11-07T18:02:58.141132", "version": "4.0.0"} +{"machine_info": {"node": "fv-az588-790", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 2, "arch_string_raw": "x86_64", "vendor_id_raw": "GenuineIntel", "brand_raw": "Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz", "hz_advertised_friendly": "2.6000 GHz", "hz_actual_friendly": "2.5939 GHz", "hz_advertised": [2600000000, 0], "hz_actual": [2593909000, 0], "stepping": 7, "model": 85, "family": 6, "flags": ["3dnowprefetch", "abm", "adx", "aes", "apic", "avx", "avx2", "avx512bw", "avx512cd", "avx512dq", "avx512f", "avx512vl", "bmi1", "bmi2", "clflush", "clflushopt", "cmov", "constant_tsc", "cpuid", "cx16", "cx8", "de", "erms", "f16c", "fma", "fpu", "fsgsbase", "fxsr", "hle", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "md_clear", "mmx", "movbe", "msr", "mtrr", "nopl", "nx", "osxsave", "pae", "pat", "pcid", "pclmulqdq", "pdpe1gb", "pge", "pni", "popcnt", "pse", "pse36", "pti", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "rtm", "sep", "smap", "smep", "ss", "sse", "sse2", "sse4_1", "sse4_2", "ssse3", "syscall", "tsc", "vme", "xsave", "xsavec", "xsaveopt", "xsaves", "xtopology"], "l3_cache_size": 37486592, "l2_cache_size": 2097152, 
"l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 256, "l2_cache_associativity": 6}}, "commit_info": {"id": "5104945b50d3ce63e36f067c0bbeaf53ca2b7cf2", "time": "2023-11-07T18:11:54+00:00", "author_time": "2023-11-07T18:11:54+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.2311240029999908, "max": 1.2352957990000277, "mean": 1.2332099010000093, "stddev": 0.002949905241353021, "rounds": 2, "median": 1.2332099010000093, "iqr": 0.00417179600003692, "q1": 1.2311240029999908, "q3": 1.2352957990000277, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.2311240029999908, "hd15iqr": 1.2352957990000277, "ops": 0.8108919650978317, "total": 2.4664198020000185, "data": [1.2352957990000277, 1.2311240029999908], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.272710396999969, "max": 1.3587639890000105, "mean": 1.3157371929999897, "stddev": 0.06084907844868984, "rounds": 2, "median": 1.3157371929999897, "iqr": 0.08605359200004159, "q1": 1.272710396999969, "q3": 1.3587639890000105, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.272710396999969, "hd15iqr": 1.3587639890000105, "ops": 0.7600301985230935, "total": 2.6314743859999794, "data": [1.272710396999969, 1.3587639890000105], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 3.1817582659999744, "max": 3.3074018369999862, "mean": 3.2445800514999803, "stddev": 0.08884342106660185, "rounds": 2, "median": 3.2445800514999803, "iqr": 0.1256435710000119, "q1": 3.1817582659999744, "q3": 3.3074018369999862, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 3.1817582659999744, "hd15iqr": 3.3074018369999862, "ops": 0.3082062960775761, "total": 6.489160102999961, "data": [3.1817582659999744, 3.3074018369999862], "iterations": 1}}], "datetime": "2023-11-07T18:13:01.623304", "version": "4.0.0"} +{"machine_info": {"node": "fv-az571-253", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 4, "arch_string_raw": "x86_64", "vendor_id_raw": "AuthenticAMD", "brand_raw": "AMD EPYC 7763 64-Core Processor", "hz_advertised_friendly": "2.4454 GHz", "hz_actual_friendly": "2.4454 GHz", "hz_advertised": [2445433000, 0], "hz_actual": 
[2445433000, 0], "stepping": 1, "model": 1, "family": 25, "flags": ["3dnowext", "3dnowprefetch", "abm", "adx", "aes", "aperfmperf", "apic", "arat", "avx", "avx2", "bmi1", "bmi2", "clflush", "clflushopt", "clwb", "clzero", "cmov", "cmp_legacy", "constant_tsc", "cpuid", "cr8_legacy", "cx16", "cx8", "de", "decodeassists", "erms", "extd_apicid", "f16c", "flushbyasid", "fma", "fpu", "fsgsbase", "fsrm", "fxsr", "fxsr_opt", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "misalignsse", "mmx", "mmxext", "movbe", "msr", "mtrr", "nonstop_tsc", "nopl", "npt", "nrip_save", "nx", "osvw", "osxsave", "pae", "pat", "pausefilter", "pcid", "pclmulqdq", "pdpe1gb", "pfthreshold", "pge", "pni", "popcnt", "pse", "pse36", "rdpid", "rdpru", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "sep", "sha", "sha_ni", "smap", "smep", "sse", "sse2", "sse4_1", "sse4_2", "sse4a", "ssse3", "svm", "syscall", "topoext", "tsc", "tsc_reliable", "tsc_scale", "umip", "v_vmsave_vmload", "vaes", "vmcb_clean", "vme", "vmmcall", "vpclmulqdq", "xgetbv1", "xsave", "xsavec", "xsaveerptr", "xsaveopt", "xsaves"], "l3_cache_size": 524288, "l2_cache_size": 1048576, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 512, "l2_cache_associativity": 6}}, "commit_info": {"id": "1cf6f546fec5c3f285b4e040c150f1a611a6058c", "time": "2023-11-08T14:34:54+00:00", "author_time": "2023-11-08T14:34:54+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 0.931415747999992, "max": 0.939872164999997, "mean": 0.9356439564999945, "stddev": 0.005979589805244729, "rounds": 2, "median": 0.9356439564999945, "iqr": 0.00845641700000499, "q1": 0.931415747999992, "q3": 0.939872164999997, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 0.931415747999992, "hd15iqr": 0.939872164999997, "ops": 1.0687826208387483, "total": 1.871287912999989, "data": [0.931415747999992, 0.939872164999997], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 0.97045514200002, "max": 1.3590523579999854, "mean": 1.1647537500000027, "stddev": 0.274779726583789, "rounds": 2, "median": 1.1647537500000027, "iqr": 0.38859721599996533, "q1": 0.97045514200002, "q3": 1.3590523579999854, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 0.97045514200002, "hd15iqr": 1.3590523579999854, "ops": 0.8585505734581217, "total": 2.3295075000000054, "data": [1.3590523579999854, 0.97045514200002], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 2.3393676510000034, "max": 2.4370354620000114, "mean": 2.3882015565000074, "stddev": 0.06906157146175172, 
"rounds": 2, "median": 2.3882015565000074, "iqr": 0.09766781100000799, "q1": 2.3393676510000034, "q3": 2.4370354620000114, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 2.3393676510000034, "hd15iqr": 2.4370354620000114, "ops": 0.4187251269802934, "total": 4.776403113000015, "data": [2.3393676510000034, 2.4370354620000114], "iterations": 1}}], "datetime": "2023-11-08T14:35:49.136885", "version": "4.0.0"} +{"machine_info": {"node": "fv-az345-568", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 2, "arch_string_raw": "x86_64", "vendor_id_raw": "GenuineIntel", "brand_raw": "Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz", "hz_advertised_friendly": "2.6000 GHz", "hz_actual_friendly": "2.0952 GHz", "hz_advertised": [2600000000, 0], "hz_actual": [2095173000, 0], "stepping": 4, "model": 85, "family": 6, "flags": ["3dnowprefetch", "abm", "adx", "aes", "apic", "avx", "avx2", "avx512bw", "avx512cd", "avx512dq", "avx512f", "avx512vl", "bmi1", "bmi2", "clflush", "clflushopt", "cmov", "constant_tsc", "cpuid", "cx16", "cx8", "de", "erms", "f16c", "fma", "fpu", "fsgsbase", "fxsr", "hle", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "md_clear", "mmx", "movbe", "msr", "mtrr", "nopl", "nx", "osxsave", "pae", "pat", "pcid", "pclmulqdq", "pdpe1gb", "pge", "pni", "popcnt", "pse", "pse36", "pti", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "rtm", "sep", "smap", "smep", "ss", "sse", "sse2", "sse4_1", "sse4_2", "ssse3", "syscall", "tsc", "vme", "xsave", "xsavec", "xsaveopt", "xsaves", "xtopology"], "l3_cache_size": 37486592, "l2_cache_size": 2097152, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 256, "l2_cache_associativity": 6}}, "commit_info": {"id": "3d215414e8826001ec851ffc63fb841ae144de44", "time": "2023-11-08T14:35:51+00:00", "author_time": "2023-11-08T14:35:51+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.4932050419999996, "max": 1.49962266, "mean": 1.4964138509999998, "stddev": 0.004537941206865134, "rounds": 2, "median": 1.4964138509999998, "iqr": 0.006417618000000402, "q1": 1.4932050419999996, "q3": 1.49962266, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.4932050419999996, "hd15iqr": 1.49962266, "ops": 0.6682643303065765, "total": 2.9928277019999996, "data": [1.4932050419999996, 1.49962266], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.5132854119999593, "max": 1.5525165409999886, "mean": 1.532900976499974, 
"stddev": 0.027740597349524967, "rounds": 2, "median": 1.532900976499974, "iqr": 0.03923112900002934, "q1": 1.5132854119999593, "q3": 1.5525165409999886, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.5132854119999593, "hd15iqr": 1.5525165409999886, "ops": 0.6523578595946031, "total": 3.065801952999948, "data": [1.5525165409999886, 1.5132854119999593], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 3.809388207999973, "max": 3.8346344409999915, "mean": 3.8220113244999823, "stddev": 0.01785178255372867, "rounds": 2, "median": 3.8220113244999823, "iqr": 0.025246233000018492, "q1": 3.809388207999973, "q3": 3.8346344409999915, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 3.809388207999973, "hd15iqr": 3.8346344409999915, "ops": 0.2616423435456005, "total": 7.6440226489999645, "data": [3.809388207999973, 3.8346344409999915], "iterations": 1}}], "datetime": "2023-11-08T14:47:23.593385", "version": "4.0.0"} +{"machine_info": {"node": "fv-az189-557", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 2, "arch_string_raw": "x86_64", "vendor_id_raw": "GenuineIntel", "brand_raw": "Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz", "hz_advertised_friendly": "2.6000 GHz", "hz_actual_friendly": "2.5939 GHz", "hz_advertised": [2600000000, 0], "hz_actual": [2593906000, 0], "stepping": 7, "model": 85, "family": 6, "flags": ["3dnowprefetch", "abm", "adx", "aes", "apic", "avx", "avx2", "avx512bw", "avx512cd", "avx512dq", "avx512f", "avx512vl", "bmi1", "bmi2", "clflush", "clflushopt", "cmov", "constant_tsc", "cpuid", "cx16", "cx8", "de", "erms", "f16c", "fma", "fpu", "fsgsbase", "fxsr", "hle", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "md_clear", "mmx", "movbe", "msr", "mtrr", "nopl", "nx", "osxsave", "pae", "pat", "pcid", "pclmulqdq", "pdpe1gb", "pge", "pni", "popcnt", "pse", "pse36", "pti", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "rtm", "sep", "smap", "smep", "ss", "sse", "sse2", "sse4_1", "sse4_2", "ssse3", "syscall", "tsc", "vme", "xsave", "xsavec", "xsaveopt", "xsaves", "xtopology"], "l3_cache_size": 37486592, "l2_cache_size": 2097152, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 256, "l2_cache_associativity": 6}}, "commit_info": {"id": "4b3d3658df58d8f3677755efc054add2ce7f50c7", "time": "2023-11-08T14:48:51+00:00", "author_time": "2023-11-08T14:48:51+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.2100502150000239, "max": 1.2277092350000203, "mean": 
1.218879725000022, "stddev": 0.012486812791106333, "rounds": 2, "median": 1.218879725000022, "iqr": 0.017659019999996417, "q1": 1.2100502150000239, "q3": 1.2277092350000203, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.2100502150000239, "hd15iqr": 1.2277092350000203, "ops": 0.8204254935818067, "total": 2.437759450000044, "data": [1.2277092350000203, 1.2100502150000239], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.2407188699999097, "max": 1.267324553000094, "mean": 1.2540217115000019, "stddev": 0.018813058867530038, "rounds": 2, "median": 1.2540217115000019, "iqr": 0.0266056830001844, "q1": 1.2407188699999097, "q3": 1.267324553000094, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.2407188699999097, "hd15iqr": 1.267324553000094, "ops": 0.7974343592535148, "total": 2.5080434230000037, "data": [1.267324553000094, 1.2407188699999097], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 3.1671885140000313, "max": 3.290464277999945, "mean": 3.228826395999988, "stddev": 0.08716912868029136, "rounds": 2, "median": 3.228826395999988, "iqr": 0.12327576399991358, "q1": 3.1671885140000313, "q3": 3.290464277999945, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 3.1671885140000313, "hd15iqr": 3.290464277999945, "ops": 0.30971005478611174, "total": 6.457652791999976, "data": [3.1671885140000313, 3.290464277999945], "iterations": 1}}], "datetime": "2023-11-08T14:49:47.701352", "version": "4.0.0"} +{"machine_info": {"node": "fv-az572-585", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 4, "arch_string_raw": "x86_64", "vendor_id_raw": "AuthenticAMD", "brand_raw": "AMD EPYC 7763 64-Core Processor", "hz_advertised_friendly": "2.4454 GHz", "hz_actual_friendly": "2.4454 GHz", "hz_advertised": [2445433000, 0], "hz_actual": [2445433000, 0], "stepping": 1, "model": 1, "family": 25, "flags": ["3dnowext", "3dnowprefetch", "abm", "adx", "aes", "aperfmperf", "apic", "arat", "avx", "avx2", "bmi1", "bmi2", "clflush", "clflushopt", "clwb", "clzero", "cmov", "cmp_legacy", "constant_tsc", "cpuid", "cr8_legacy", "cx16", "cx8", "de", "decodeassists", "erms", "extd_apicid", "f16c", "flushbyasid", "fma", "fpu", "fsgsbase", "fsrm", "fxsr", "fxsr_opt", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "misalignsse", "mmx", "mmxext", "movbe", "msr", "mtrr", "nonstop_tsc", "nopl", "npt", "nrip_save", "nx", "osvw", "osxsave", "pae", "pat", "pausefilter", "pcid", "pclmulqdq", "pdpe1gb", "pfthreshold", "pge", "pni", "popcnt", "pse", 
"pse36", "rdpid", "rdpru", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "sep", "sha", "sha_ni", "smap", "smep", "sse", "sse2", "sse4_1", "sse4_2", "sse4a", "ssse3", "svm", "syscall", "topoext", "tsc", "tsc_reliable", "tsc_scale", "umip", "v_vmsave_vmload", "vaes", "vmcb_clean", "vme", "vmmcall", "vpclmulqdq", "xgetbv1", "xsave", "xsavec", "xsaveerptr", "xsaveopt", "xsaves"], "l3_cache_size": 524288, "l2_cache_size": 1048576, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 512, "l2_cache_associativity": 6}}, "commit_info": {"id": "442810b2790a45aedf91f1e61f138b396d4347b2", "time": "2023-11-08T16:18:06+00:00", "author_time": "2023-11-08T16:18:06+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 0.9434332649999959, "max": 0.9548338640000082, "mean": 0.9491335645000021, "stddev": 0.008061440862497294, "rounds": 2, "median": 0.9491335645000021, "iqr": 0.011400599000012335, "q1": 0.9434332649999959, "q3": 0.9548338640000082, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 0.9434332649999959, "hd15iqr": 0.9548338640000082, "ops": 1.0535924946735964, "total": 1.8982671290000042, "data": [0.9434332649999959, 0.9548338640000082], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 0.9671726040000124, "max": 1.4133997509999858, "mean": 1.190286177499999, "stddev": 0.3155302415932076, "rounds": 2, "median": 1.190286177499999, "iqr": 0.44622714699997346, "q1": 0.9671726040000124, "q3": 1.4133997509999858, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 0.9671726040000124, "hd15iqr": 1.4133997509999858, "ops": 0.8401340945589538, "total": 2.380572354999998, "data": [1.4133997509999858, 0.9671726040000124], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 2.3621614810000153, "max": 2.461645940000011, "mean": 2.411903710500013, "stddev": 0.07034613558157202, "rounds": 2, "median": 2.411903710500013, "iqr": 0.0994844589999957, "q1": 2.3621614810000153, "q3": 2.461645940000011, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 2.3621614810000153, "hd15iqr": 2.461645940000011, "ops": 0.41461024983982026, "total": 4.823807421000026, "data": [2.3621614810000153, 2.461645940000011], "iterations": 1}}], "datetime": "2023-11-08T16:19:00.496182", "version": "4.0.0"} +{"machine_info": {"node": "fv-az1099-29", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": 
"5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 2, "arch_string_raw": "x86_64", "vendor_id_raw": "GenuineIntel", "brand_raw": "Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz", "hz_advertised_friendly": "2.6000 GHz", "hz_actual_friendly": "2.5939 GHz", "hz_advertised": [2600000000, 0], "hz_actual": [2593907000, 0], "stepping": 7, "model": 85, "family": 6, "flags": ["3dnowprefetch", "abm", "adx", "aes", "apic", "avx", "avx2", "avx512bw", "avx512cd", "avx512dq", "avx512f", "avx512vl", "bmi1", "bmi2", "clflush", "clflushopt", "cmov", "constant_tsc", "cpuid", "cx16", "cx8", "de", "erms", "f16c", "fma", "fpu", "fsgsbase", "fxsr", "hle", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "md_clear", "mmx", "movbe", "msr", "mtrr", "nopl", "nx", "osxsave", "pae", "pat", "pcid", "pclmulqdq", "pdpe1gb", "pge", "pni", "popcnt", "pse", "pse36", "pti", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "rtm", "sep", "smap", "smep", "ss", "sse", "sse2", "sse4_1", "sse4_2", "ssse3", "syscall", "tsc", "vme", "xsave", "xsavec", "xsaveopt", "xsaves", "xtopology"], "l3_cache_size": 37486592, "l2_cache_size": 2097152, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 256, "l2_cache_associativity": 6}}, "commit_info": {"id": "c9dc7110b2b66e037eda4c0fd3e404655cf9937d", "time": "2023-11-08T16:19:02+00:00", "author_time": "2023-11-08T16:19:02+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.2140923229999885, "max": 1.2301550080000254, "mean": 1.222123665500007, "stddev": 0.011358033487589557, "rounds": 2, "median": 1.222123665500007, "iqr": 0.016062685000036936, "q1": 1.2140923229999885, "q3": 1.2301550080000254, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.2140923229999885, "hd15iqr": 1.2301550080000254, "ops": 0.8182477994899726, "total": 2.444247331000014, "data": [1.2301550080000254, 1.2140923229999885], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.269475258, "max": 1.270498386999975, "mean": 1.2699868224999875, "stddev": 0.0007234614539110343, "rounds": 2, "median": 1.2699868224999875, "iqr": 0.0010231289999751425, "q1": 1.269475258, "q3": 1.270498386999975, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.269475258, "hd15iqr": 1.270498386999975, "ops": 0.7874097449542709, "total": 2.539973644999975, "data": [1.269475258, 1.270498386999975], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 
3.179296198000003, "max": 3.320312804999986, "mean": 3.2498045014999946, "stddev": 0.09971379906960648, "rounds": 2, "median": 3.2498045014999946, "iqr": 0.1410166069999832, "q1": 3.179296198000003, "q3": 3.320312804999986, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 3.179296198000003, "hd15iqr": 3.320312804999986, "ops": 0.307710817539466, "total": 6.499609002999989, "data": [3.179296198000003, 3.320312804999986], "iterations": 1}}], "datetime": "2023-11-08T16:31:20.935575", "version": "4.0.0"} +{"machine_info": {"node": "fv-az528-981", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 4, "arch_string_raw": "x86_64", "vendor_id_raw": "AuthenticAMD", "brand_raw": "AMD EPYC 7763 64-Core Processor", "hz_advertised_friendly": "2.4454 GHz", "hz_actual_friendly": "2.4454 GHz", "hz_advertised": [2445428000, 0], "hz_actual": [2445428000, 0], "stepping": 1, "model": 1, "family": 25, "flags": ["3dnowext", "3dnowprefetch", "abm", "adx", "aes", "aperfmperf", "apic", "arat", "avx", "avx2", "bmi1", "bmi2", "clflush", "clflushopt", "clwb", "clzero", "cmov", "cmp_legacy", "constant_tsc", "cpuid", "cr8_legacy", "cx16", "cx8", "de", "decodeassists", "erms", "extd_apicid", "f16c", "flushbyasid", "fma", "fpu", "fsgsbase", "fsrm", "fxsr", "fxsr_opt", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "misalignsse", "mmx", "mmxext", "movbe", "msr", "mtrr", "nonstop_tsc", "nopl", "npt", "nrip_save", "nx", "osvw", "osxsave", "pae", "pat", "pausefilter", "pcid", "pclmulqdq", "pdpe1gb", "pfthreshold", "pge", "pni", "popcnt", "pse", "pse36", "rdpid", "rdpru", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "sep", "sha", "sha_ni", "smap", "smep", "sse", "sse2", "sse4_1", "sse4_2", "sse4a", "ssse3", "svm", "syscall", "topoext", "tsc", "tsc_reliable", "tsc_scale", "umip", "v_vmsave_vmload", "vaes", "vmcb_clean", "vme", "vmmcall", "vpclmulqdq", "xgetbv1", "xsave", "xsavec", "xsaveerptr", "xsaveopt", "xsaves"], "l3_cache_size": 524288, "l2_cache_size": 1048576, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 512, "l2_cache_associativity": 6}}, "commit_info": {"id": "68aee62985b9e6aa02f252723fa3f622b9157a1d", "time": "2023-11-08T17:27:15+00:00", "author_time": "2023-11-08T17:27:15+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.0022991969999566, "max": 1.0028250700000285, "mean": 1.0025621334999926, "stddev": 0.00037184836439375213, "rounds": 2, "median": 1.0025621334999926, "iqr": 0.000525873000071897, "q1": 1.0022991969999566, "q3": 1.0028250700000285, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.0022991969999566, "hd15iqr": 1.0028250700000285, "ops": 0.9974444142518648, "total": 2.005124266999985, "data": [1.0028250700000285, 1.0022991969999566], "iterations": 1}}, 
{"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.034631657000034, "max": 1.2477009689999932, "mean": 1.1411663130000136, "stddev": 0.15066275537792334, "rounds": 2, "median": 1.1411663130000136, "iqr": 0.21306931199995915, "q1": 1.034631657000034, "q3": 1.2477009689999932, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.034631657000034, "hd15iqr": 1.2477009689999932, "ops": 0.8762964596905237, "total": 2.282332626000027, "data": [1.2477009689999932, 1.034631657000034], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 2.4006382750000057, "max": 2.4872832240000093, "mean": 2.4439607495000075, "stddev": 0.06126723099346514, "rounds": 2, "median": 2.4439607495000075, "iqr": 0.08664494900000363, "q1": 2.4006382750000057, "q3": 2.4872832240000093, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 2.4006382750000057, "hd15iqr": 2.4872832240000093, "ops": 0.40917187405918154, "total": 4.887921499000015, "data": [2.4006382750000057, 2.4872832240000093], "iterations": 1}}], "datetime": "2023-11-08T17:28:12.256531", "version": "4.0.0"} +{"machine_info": {"node": "fv-az1114-352", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 4, "arch_string_raw": "x86_64", "vendor_id_raw": "AuthenticAMD", "brand_raw": "AMD EPYC 7763 64-Core Processor", "hz_advertised_friendly": "2.4454 GHz", "hz_actual_friendly": "2.4454 GHz", "hz_advertised": [2445425000, 0], "hz_actual": [2445425000, 0], "stepping": 1, "model": 1, "family": 25, "flags": ["3dnowext", "3dnowprefetch", "abm", "adx", "aes", "aperfmperf", "apic", "arat", "avx", "avx2", "bmi1", "bmi2", "clflush", "clflushopt", "clwb", "clzero", "cmov", "cmp_legacy", "constant_tsc", "cpuid", "cr8_legacy", "cx16", "cx8", "de", "decodeassists", "erms", "extd_apicid", "f16c", "flushbyasid", "fma", "fpu", "fsgsbase", "fsrm", "fxsr", "fxsr_opt", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "misalignsse", "mmx", "mmxext", "movbe", "msr", "mtrr", "nonstop_tsc", "nopl", "npt", "nrip_save", "nx", "osvw", "osxsave", "pae", "pat", "pausefilter", "pcid", "pclmulqdq", "pdpe1gb", "pfthreshold", "pge", "pni", "popcnt", "pse", "pse36", "rdpid", "rdpru", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "sep", "sha", "sha_ni", "smap", "smep", "sse", "sse2", "sse4_1", "sse4_2", "sse4a", "ssse3", "svm", "syscall", "topoext", "tsc", "tsc_reliable", "tsc_scale", "umip", "v_vmsave_vmload", "vaes", "vmcb_clean", "vme", "vmmcall", "vpclmulqdq", "xgetbv1", "xsave", "xsavec", "xsaveerptr", "xsaveopt", "xsaves"], "l3_cache_size": 524288, 
"l2_cache_size": 1048576, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 512, "l2_cache_associativity": 6}}, "commit_info": {"id": "868fefba022483f4cb0d05822aa55d62ee9b92fa", "time": "2023-11-08T17:28:14+00:00", "author_time": "2023-11-08T17:28:14+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 0.9621416300000192, "max": 0.9656969050000157, "mean": 0.9639192675000174, "stddev": 0.0025139590614805473, "rounds": 2, "median": 0.9639192675000174, "iqr": 0.003555274999996527, "q1": 0.9621416300000192, "q3": 0.9656969050000157, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 0.9621416300000192, "hd15iqr": 0.9656969050000157, "ops": 1.0374312805195398, "total": 1.9278385350000349, "data": [0.9621416300000192, 0.9656969050000157], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 0.9901588589999903, "max": 1.0090765639999972, "mean": 0.9996177114999938, "stddev": 0.013376837489991529, "rounds": 2, "median": 0.9996177114999938, "iqr": 0.01891770500000689, "q1": 0.9901588589999903, "q3": 1.0090765639999972, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 0.9901588589999903, "hd15iqr": 1.0090765639999972, "ops": 1.0003824347003942, "total": 1.9992354229999876, "data": [1.0090765639999972, 0.9901588589999903], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 2.332413972000012, "max": 2.4353592070000047, "mean": 2.3838865895000083, "stddev": 0.07279327375933767, "rounds": 2, "median": 2.3838865895000083, "iqr": 0.10294523499999286, "q1": 2.332413972000012, "q3": 2.4353592070000047, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 2.332413972000012, "hd15iqr": 2.4353592070000047, "ops": 0.4194830426936283, "total": 4.767773179000017, "data": [2.332413972000012, 2.4353592070000047], "iterations": 1}}], "datetime": "2023-11-10T13:47:16.282100", "version": "4.0.0"} +{"machine_info": {"node": "fv-az198-997", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 2, "arch_string_raw": "x86_64", "vendor_id_raw": "GenuineIntel", "brand_raw": "Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz", "hz_advertised_friendly": "2.3000 GHz", "hz_actual_friendly": "2.2947 GHz", "hz_advertised": 
[2300000000, 0], "hz_actual": [2294686000, 0], "stepping": 1, "model": 79, "family": 6, "flags": ["3dnowprefetch", "abm", "adx", "aes", "apic", "avx", "avx2", "bmi1", "bmi2", "clflush", "cmov", "constant_tsc", "cpuid", "cx16", "cx8", "de", "erms", "f16c", "fma", "fpu", "fsgsbase", "fxsr", "hle", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "md_clear", "mmx", "movbe", "msr", "mtrr", "nopl", "nx", "osxsave", "pae", "pat", "pcid", "pclmulqdq", "pdpe1gb", "pge", "pni", "popcnt", "pse", "pse36", "pti", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "rtm", "sep", "smap", "smep", "ss", "sse", "sse2", "sse4_1", "sse4_2", "ssse3", "syscall", "tsc", "vme", "xsave", "xsaveopt", "xtopology"], "l3_cache_size": 52428800, "l2_cache_size": 524288, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 256, "l2_cache_associativity": 6}}, "commit_info": {"id": "05a3715a01e07091809c72cc840a9f0ecb1e1008", "time": "2023-11-10T13:47:18+00:00", "author_time": "2023-11-10T13:47:18+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.6431525759999914, "max": 1.6458846939999887, "mean": 1.64451863499999, "stddev": 0.0019318991647999085, "rounds": 2, "median": 1.64451863499999, "iqr": 0.0027321179999972856, "q1": 1.6431525759999914, "q3": 1.6458846939999887, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.6431525759999914, "hd15iqr": 1.6458846939999887, "ops": 0.6080806740143787, "total": 3.28903726999998, "data": [1.6431525759999914, 1.6458846939999887], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.7169424680000134, "max": 1.7179142769999913, "mean": 1.7174283725000024, "stddev": 0.0006871727339025274, "rounds": 2, "median": 1.7174283725000024, "iqr": 0.0009718089999779522, "q1": 1.7169424680000134, "q3": 1.7179142769999913, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.7169424680000134, "hd15iqr": 1.7179142769999913, "ops": 0.5822659133925532, "total": 3.4348567450000047, "data": [1.7169424680000134, 1.7179142769999913], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 4.029156957999987, "max": 4.095325204000005, "mean": 4.062241080999996, "stddev": 0.04678801544583232, "rounds": 2, "median": 4.062241080999996, "iqr": 0.06616824600001792, "q1": 4.029156957999987, "q3": 4.095325204000005, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 4.029156957999987, "hd15iqr": 4.095325204000005, "ops": 0.2461695355002986, "total": 8.124482161999993, "data": [4.029156957999987, 4.095325204000005], "iterations": 1}}], "datetime": 
"2023-11-13T09:26:33.907270", "version": "4.0.0"} +{"machine_info": {"node": "fv-az807-730", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 2, "arch_string_raw": "x86_64", "vendor_id_raw": "GenuineIntel", "brand_raw": "Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz", "hz_advertised_friendly": "2.6000 GHz", "hz_actual_friendly": "2.5939 GHz", "hz_advertised": [2600000000, 0], "hz_actual": [2593907000, 0], "stepping": 7, "model": 85, "family": 6, "flags": ["3dnowprefetch", "abm", "adx", "aes", "apic", "avx", "avx2", "avx512bw", "avx512cd", "avx512dq", "avx512f", "avx512vl", "bmi1", "bmi2", "clflush", "clflushopt", "cmov", "constant_tsc", "cpuid", "cx16", "cx8", "de", "erms", "f16c", "fma", "fpu", "fsgsbase", "fxsr", "hle", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "md_clear", "mmx", "movbe", "msr", "mtrr", "nopl", "nx", "osxsave", "pae", "pat", "pcid", "pclmulqdq", "pdpe1gb", "pge", "pni", "popcnt", "pse", "pse36", "pti", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "rtm", "sep", "smap", "smep", "ss", "sse", "sse2", "sse4_1", "sse4_2", "ssse3", "syscall", "tsc", "vme", "xsave", "xsavec", "xsaveopt", "xsaves", "xtopology"], "l3_cache_size": 37486592, "l2_cache_size": 2097152, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 256, "l2_cache_associativity": 6}}, "commit_info": {"id": "8c50b2aaae14fc0b20d4bb0f4719823394b43bf1", "time": "2023-11-13T09:26:37+00:00", "author_time": "2023-11-13T09:26:37+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.2160081230000515, "max": 1.224797943999988, "mean": 1.2204030335000198, "stddev": 0.006215342034471116, "rounds": 2, "median": 1.2204030335000198, "iqr": 0.008789820999936637, "q1": 1.2160081230000515, "q3": 1.224797943999988, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.2160081230000515, "hd15iqr": 1.224797943999988, "ops": 0.8194014375169806, "total": 2.4408060670000395, "data": [1.224797943999988, 1.2160081230000515], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.2509048089999624, "max": 1.2657038369999896, "mean": 1.258304322999976, "stddev": 0.010464493053788817, "rounds": 2, "median": 1.258304322999976, "iqr": 0.014799028000027192, "q1": 1.2509048089999624, "q3": 1.2657038369999896, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.2509048089999624, "hd15iqr": 1.2657038369999896, "ops": 0.7947203086896008, "total": 2.516608645999952, "data": 
[1.2509048089999624, 1.2657038369999896], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 3.2236348810000095, "max": 3.346483650000039, "mean": 3.285059265500024, "stddev": 0.08686719762034043, "rounds": 2, "median": 3.285059265500024, "iqr": 0.12284876900002928, "q1": 3.2236348810000095, "q3": 3.346483650000039, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 3.2236348810000095, "hd15iqr": 3.346483650000039, "ops": 0.30440851113466544, "total": 6.570118531000048, "data": [3.2236348810000095, 3.346483650000039], "iterations": 1}}], "datetime": "2023-11-13T09:36:55.782010", "version": "4.0.0"} +{"machine_info": {"node": "fv-az186-957", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 2, "arch_string_raw": "x86_64", "vendor_id_raw": "GenuineIntel", "brand_raw": "Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz", "hz_advertised_friendly": "2.6000 GHz", "hz_actual_friendly": "2.5939 GHz", "hz_advertised": [2600000000, 0], "hz_actual": [2593905000, 0], "stepping": 7, "model": 85, "family": 6, "flags": ["3dnowprefetch", "abm", "adx", "aes", "apic", "avx", "avx2", "avx512bw", "avx512cd", "avx512dq", "avx512f", "avx512vl", "bmi1", "bmi2", "clflush", "clflushopt", "cmov", "constant_tsc", "cpuid", "cx16", "cx8", "de", "erms", "f16c", "fma", "fpu", "fsgsbase", "fxsr", "hle", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "md_clear", "mmx", "movbe", "msr", "mtrr", "nopl", "nx", "osxsave", "pae", "pat", "pcid", "pclmulqdq", "pdpe1gb", "pge", "pni", "popcnt", "pse", "pse36", "pti", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "rtm", "sep", "smap", "smep", "ss", "sse", "sse2", "sse4_1", "sse4_2", "ssse3", "syscall", "tsc", "vme", "xsave", "xsavec", "xsaveopt", "xsaves", "xtopology"], "l3_cache_size": 37486592, "l2_cache_size": 2097152, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 256, "l2_cache_associativity": 6}}, "commit_info": {"id": "9ae301414adf7199e29bdde98c1c32e9ca0c7dcc", "time": "2023-11-13T10:12:56+00:00", "author_time": "2023-11-13T10:12:56+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.2505425419999767, "max": 1.28938504300001, "mean": 1.2699637924999934, "stddev": 0.027465795855368787, "rounds": 2, "median": 1.2699637924999934, "iqr": 0.03884250100003328, "q1": 1.2505425419999767, "q3": 1.28938504300001, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.2505425419999767, "hd15iqr": 1.28938504300001, "ops": 0.7874240241380781, "total": 
2.5399275849999867, "data": [1.28938504300001, 1.2505425419999767], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.6338690769999857, "max": 5.967250677999999, "mean": 3.8005598774999925, "stddev": 3.0641635155361278, "rounds": 2, "median": 3.8005598774999925, "iqr": 4.3333816010000135, "q1": 1.6338690769999857, "q3": 5.967250677999999, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.6338690769999857, "hd15iqr": 5.967250677999999, "ops": 0.2631191277685644, "total": 7.601119754999985, "data": [5.967250677999999, 1.6338690769999857], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 3.215292209000012, "max": 3.397409056999976, "mean": 3.306350632999994, "stddev": 0.1287760581890944, "rounds": 2, "median": 3.306350632999994, "iqr": 0.1821168479999642, "q1": 3.215292209000012, "q3": 3.397409056999976, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 3.215292209000012, "hd15iqr": 3.397409056999976, "ops": 0.302448261239812, "total": 6.612701265999988, "data": [3.215292209000012, 3.397409056999976], "iterations": 1}}], "datetime": "2023-11-13T10:14:25.059231", "version": "4.0.0"} +{"machine_info": {"node": "fv-az1132-333", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 2, "arch_string_raw": "x86_64", "vendor_id_raw": "GenuineIntel", "brand_raw": "Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz", "hz_advertised_friendly": "2.8000 GHz", "hz_actual_friendly": "2.7934 GHz", "hz_advertised": [2800000000, 0], "hz_actual": [2793437000, 0], "stepping": 6, "model": 106, "family": 6, "flags": ["3dnowprefetch", "abm", "adx", "aes", "apic", "avx", "avx2", "avx512bw", "avx512cd", "avx512dq", "avx512f", "avx512vl", "bmi1", "bmi2", "clflush", "clflushopt", "cmov", "constant_tsc", "cpuid", "cx16", "cx8", "de", "erms", "f16c", "fma", "fpu", "fsgsbase", "fxsr", "hle", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "md_clear", "mmx", "movbe", "msr", "mtrr", "nopl", "nx", "osxsave", "pae", "pat", "pcid", "pclmulqdq", "pdpe1gb", "pge", "pni", "popcnt", "pse", "pse36", "pti", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "rtm", "sep", "smap", "smep", "ss", "sse", "sse2", "sse4_1", "sse4_2", "ssse3", "syscall", "tsc", "vme", "xsave", "xsavec", "xsaveopt", "xsaves", "xtopology"], "l3_cache_size": 50331648, "l2_cache_size": "2.5 MiB", "l1_data_cache_size": 98304, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 256, "l2_cache_associativity": 6}}, "commit_info": {"id": "c3206c4d98781f97cf3dbed02a5a2ef499c0dda9", "time": 
"2023-11-13T10:14:28+00:00", "author_time": "2023-11-13T10:14:28+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.222971868000002, "max": 1.2295520209999893, "mean": 1.2262619444999956, "stddev": 0.004652870807536078, "rounds": 2, "median": 1.2262619444999956, "iqr": 0.0065801529999873765, "q1": 1.222971868000002, "q3": 1.2295520209999893, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.222971868000002, "hd15iqr": 1.2295520209999893, "ops": 0.8154864500893786, "total": 2.452523888999991, "data": [1.2295520209999893, 1.222971868000002], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.2591928180000025, "max": 1.9306485699999882, "mean": 1.5949206939999954, "stddev": 0.47479091550590263, "rounds": 2, "median": 1.5949206939999954, "iqr": 0.6714557519999858, "q1": 1.2591928180000025, "q3": 1.9306485699999882, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.2591928180000025, "hd15iqr": 1.9306485699999882, "ops": 0.6269904226347714, "total": 3.1898413879999907, "data": [1.9306485699999882, 1.2591928180000025], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 3.0543808450000256, "max": 3.1690542850000156, "mean": 3.1117175650000206, "stddev": 0.08108636704598163, "rounds": 2, "median": 3.1117175650000206, "iqr": 0.11467343999999002, "q1": 3.0543808450000256, "q3": 3.1690542850000156, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 3.0543808450000256, "hd15iqr": 3.1690542850000156, "ops": 0.3213659270519281, "total": 6.223435130000041, "data": [3.0543808450000256, 3.1690542850000156], "iterations": 1}}], "datetime": "2023-11-13T11:54:24.705571", "version": "4.0.0"} +{"machine_info": {"node": "fv-az1204-651", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 4, "arch_string_raw": "x86_64", "vendor_id_raw": "AuthenticAMD", "brand_raw": "AMD EPYC 7763 64-Core Processor", "hz_advertised_friendly": "2.4454 GHz", "hz_actual_friendly": "2.4454 GHz", "hz_advertised": [2445430000, 0], "hz_actual": [2445430000, 0], "stepping": 1, "model": 1, "family": 25, "flags": ["3dnowext", "3dnowprefetch", "abm", "adx", "aes", "aperfmperf", "apic", "arat", "avx", "avx2", "bmi1", "bmi2", "clflush", "clflushopt", 
"clwb", "clzero", "cmov", "cmp_legacy", "constant_tsc", "cpuid", "cr8_legacy", "cx16", "cx8", "de", "decodeassists", "erms", "extd_apicid", "f16c", "flushbyasid", "fma", "fpu", "fsgsbase", "fsrm", "fxsr", "fxsr_opt", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "misalignsse", "mmx", "mmxext", "movbe", "msr", "mtrr", "nonstop_tsc", "nopl", "npt", "nrip_save", "nx", "osvw", "osxsave", "pae", "pat", "pausefilter", "pcid", "pclmulqdq", "pdpe1gb", "pfthreshold", "pge", "pni", "popcnt", "pse", "pse36", "rdpid", "rdpru", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "sep", "sha", "sha_ni", "smap", "smep", "sse", "sse2", "sse4_1", "sse4_2", "sse4a", "ssse3", "svm", "syscall", "topoext", "tsc", "tsc_reliable", "tsc_scale", "umip", "v_vmsave_vmload", "vaes", "vmcb_clean", "vme", "vmmcall", "vpclmulqdq", "xgetbv1", "xsave", "xsavec", "xsaveerptr", "xsaveopt", "xsaves"], "l3_cache_size": 524288, "l2_cache_size": 1048576, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 512, "l2_cache_associativity": 6}}, "commit_info": {"id": "1409e7875ba1ae60423863b39029e28fbd772736", "time": "2023-11-13T11:54:27+00:00", "author_time": "2023-11-13T11:54:27+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 0.9412004809999956, "max": 0.9496664779999975, "mean": 0.9454334794999966, "stddev": 0.005986363888206265, "rounds": 2, "median": 0.9454334794999966, "iqr": 0.008465997000001835, "q1": 0.9412004809999956, "q3": 0.9496664779999975, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 0.9412004809999956, "hd15iqr": 0.9496664779999975, "ops": 1.0577158749750026, "total": 1.8908669589999931, "data": [0.9496664779999975, 0.9412004809999956], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 0.9642870280000011, "max": 0.9894374889999966, "mean": 0.9768622584999989, "stddev": 0.017784061523064627, "rounds": 2, "median": 0.9768622584999989, "iqr": 0.025150460999995516, "q1": 0.9642870280000011, "q3": 0.9894374889999966, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 0.9642870280000011, "hd15iqr": 0.9894374889999966, "ops": 1.0236857768827408, "total": 1.9537245169999977, "data": [0.9894374889999966, 0.9642870280000011], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 2.359381280000008, "max": 2.4417958969999916, "mean": 2.4005885885, "stddev": 0.05827593454958048, "rounds": 2, "median": 2.4005885885, "iqr": 0.08241461699998354, "q1": 2.359381280000008, "q3": 2.4417958969999916, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 
2.359381280000008, "hd15iqr": 2.4417958969999916, "ops": 0.4165645062175551, "total": 4.801177177, "data": [2.359381280000008, 2.4417958969999916], "iterations": 1}}], "datetime": "2023-11-13T13:47:37.621265", "version": "4.0.0"} +{"machine_info": {"node": "fv-az1150-775", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 4, "arch_string_raw": "x86_64", "vendor_id_raw": "AuthenticAMD", "brand_raw": "AMD EPYC 7763 64-Core Processor", "hz_advertised_friendly": "2.6229 GHz", "hz_actual_friendly": "2.6229 GHz", "hz_advertised": [2622946000, 0], "hz_actual": [2622946000, 0], "stepping": 1, "model": 1, "family": 25, "flags": ["3dnowext", "3dnowprefetch", "abm", "adx", "aes", "aperfmperf", "apic", "arat", "avx", "avx2", "bmi1", "bmi2", "clflush", "clflushopt", "clwb", "clzero", "cmov", "cmp_legacy", "constant_tsc", "cpuid", "cr8_legacy", "cx16", "cx8", "de", "decodeassists", "erms", "extd_apicid", "f16c", "flushbyasid", "fma", "fpu", "fsgsbase", "fsrm", "fxsr", "fxsr_opt", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "misalignsse", "mmx", "mmxext", "movbe", "msr", "mtrr", "nonstop_tsc", "nopl", "npt", "nrip_save", "nx", "osvw", "osxsave", "pae", "pat", "pausefilter", "pcid", "pclmulqdq", "pdpe1gb", "pfthreshold", "pge", "pni", "popcnt", "pse", "pse36", "rdpid", "rdpru", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "sep", "sha", "sha_ni", "smap", "smep", "sse", "sse2", "sse4_1", "sse4_2", "sse4a", "ssse3", "svm", "syscall", "topoext", "tsc", "tsc_reliable", "tsc_scale", "umip", "v_vmsave_vmload", "vaes", "vmcb_clean", "vme", "vmmcall", "vpclmulqdq", "xgetbv1", "xsave", "xsavec", "xsaveerptr", "xsaveopt", "xsaves"], "l3_cache_size": 524288, "l2_cache_size": 1048576, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 512, "l2_cache_associativity": 6}}, "commit_info": {"id": "3e41935e40fac0d901889084a7eeb28e739b61c1", "time": "2023-11-13T22:07:36+00:00", "author_time": "2023-11-13T22:07:36+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.001577931, "max": 1.0117544210000062, "mean": 1.006666176000003, "stddev": 0.007195865087681551, "rounds": 2, "median": 1.006666176000003, "iqr": 0.01017649000000631, "q1": 1.001577931, "q3": 1.0117544210000062, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.001577931, "hd15iqr": 1.0117544210000062, "ops": 0.9933779676332315, "total": 2.013332352000006, "data": [1.0117544210000062, 1.001577931], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, 
"warmup": false}, "stats": {"min": 1.0383666559999938, "max": 1.0405114490000074, "mean": 1.0394390525000006, "stddev": 0.0015165976745510402, "rounds": 2, "median": 1.0394390525000006, "iqr": 0.0021447930000135784, "q1": 1.0383666559999938, "q3": 1.0405114490000074, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.0383666559999938, "hd15iqr": 1.0405114490000074, "ops": 0.9620573689191839, "total": 2.078878105000001, "data": [1.0383666559999938, 1.0405114490000074], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 2.3833499370000055, "max": 2.4892771640000007, "mean": 2.436313550500003, "stddev": 0.0749018605239833, "rounds": 2, "median": 2.436313550500003, "iqr": 0.10592722699999513, "q1": 2.3833499370000055, "q3": 2.4892771640000007, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 2.3833499370000055, "hd15iqr": 2.4892771640000007, "ops": 0.4104561992009488, "total": 4.872627101000006, "data": [2.3833499370000055, 2.4892771640000007], "iterations": 1}}], "datetime": "2023-11-13T22:08:41.483989", "version": "4.0.0"} +{"machine_info": {"node": "fv-az1204-651", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 4, "arch_string_raw": "x86_64", "vendor_id_raw": "AuthenticAMD", "brand_raw": "AMD EPYC 7763 64-Core Processor", "hz_advertised_friendly": "2.4842 GHz", "hz_actual_friendly": "2.4842 GHz", "hz_advertised": [2484166000, 0], "hz_actual": [2484166000, 0], "stepping": 1, "model": 1, "family": 25, "flags": ["3dnowext", "3dnowprefetch", "abm", "adx", "aes", "aperfmperf", "apic", "arat", "avx", "avx2", "bmi1", "bmi2", "clflush", "clflushopt", "clwb", "clzero", "cmov", "cmp_legacy", "constant_tsc", "cpuid", "cr8_legacy", "cx16", "cx8", "de", "decodeassists", "erms", "extd_apicid", "f16c", "flushbyasid", "fma", "fpu", "fsgsbase", "fsrm", "fxsr", "fxsr_opt", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "misalignsse", "mmx", "mmxext", "movbe", "msr", "mtrr", "nonstop_tsc", "nopl", "npt", "nrip_save", "nx", "osvw", "osxsave", "pae", "pat", "pausefilter", "pcid", "pclmulqdq", "pdpe1gb", "pfthreshold", "pge", "pni", "popcnt", "pse", "pse36", "rdpid", "rdpru", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "sep", "sha", "sha_ni", "smap", "smep", "sse", "sse2", "sse4_1", "sse4_2", "sse4a", "ssse3", "svm", "syscall", "topoext", "tsc", "tsc_reliable", "tsc_scale", "umip", "v_vmsave_vmload", "vaes", "vmcb_clean", "vme", "vmmcall", "vpclmulqdq", "xgetbv1", "xsave", "xsavec", "xsaveerptr", "xsaveopt", "xsaves"], "l3_cache_size": 524288, "l2_cache_size": 1048576, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 512, "l2_cache_associativity": 6}}, "commit_info": {"id": "4374aa1ad3a4ecc0c196cacef69778b8081ca389", "time": "2023-11-14T11:14:26+00:00", "author_time": "2023-11-14T11:14:26+00:00", "dirty": false, "project": 
"splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 0.9616797179999139, "max": 0.9645071970001027, "mean": 0.9630934575000083, "stddev": 0.0019993295746960314, "rounds": 2, "median": 0.9630934575000083, "iqr": 0.0028274790001887595, "q1": 0.9616797179999139, "q3": 0.9645071970001027, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 0.9616797179999139, "hd15iqr": 0.9645071970001027, "ops": 1.038320831911571, "total": 1.9261869150000166, "data": [0.9645071970001027, 0.9616797179999139], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 0.9607199209999635, "max": 0.9642473939998126, "mean": 0.9624836574998881, "stddev": 0.002494300078645769, "rounds": 2, "median": 0.9624836574998881, "iqr": 0.003527472999849124, "q1": 0.9607199209999635, "q3": 0.9642473939998126, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 0.9607199209999635, "hd15iqr": 0.9642473939998126, "ops": 1.0389786800095526, "total": 1.9249673149997761, "data": [0.9607199209999635, 0.9642473939998126], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 2.3218471299999237, "max": 2.40942713000004, "mean": 2.365637129999982, "stddev": 0.061928411896400096, "rounds": 2, "median": 2.365637129999982, "iqr": 0.08758000000011634, "q1": 2.3218471299999237, "q3": 2.40942713000004, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 2.3218471299999237, "hd15iqr": 2.40942713000004, "ops": 0.4227191006255501, "total": 4.731274259999964, "data": [2.3218471299999237, 2.40942713000004], "iterations": 1}}], "datetime": "2023-11-14T11:15:26.367371", "version": "4.0.0"} +{"machine_info": {"node": "fv-az443-55", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 2, "arch_string_raw": "x86_64", "vendor_id_raw": "GenuineIntel", "brand_raw": "Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz", "hz_advertised_friendly": "2.8000 GHz", "hz_actual_friendly": "2.7934 GHz", "hz_advertised": [2800000000, 0], "hz_actual": [2793437000, 0], "stepping": 6, "model": 106, "family": 6, "flags": ["3dnowprefetch", "abm", "adx", "aes", "apic", "avx", "avx2", "avx512bw", "avx512cd", "avx512dq", "avx512f", "avx512vl", "bmi1", "bmi2", "clflush", "clflushopt", "cmov", "constant_tsc", "cpuid", "cx16", "cx8", "de", "erms", 
"f16c", "fma", "fpu", "fsgsbase", "fxsr", "hle", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "md_clear", "mmx", "movbe", "msr", "mtrr", "nopl", "nx", "osxsave", "pae", "pat", "pcid", "pclmulqdq", "pdpe1gb", "pge", "pni", "popcnt", "pse", "pse36", "pti", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "rtm", "sep", "smap", "smep", "ss", "sse", "sse2", "sse4_1", "sse4_2", "ssse3", "syscall", "tsc", "vme", "xsave", "xsavec", "xsaveopt", "xsaves", "xtopology"], "l3_cache_size": 50331648, "l2_cache_size": "2.5 MiB", "l1_data_cache_size": 98304, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 256, "l2_cache_associativity": 6}}, "commit_info": {"id": "c273a41de7525529da4b9a0513d9d91fef6a4ad0", "time": "2023-11-14T13:36:00+00:00", "author_time": "2023-11-14T13:36:00+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.2033099939999943, "max": 1.238169379999988, "mean": 1.2207396869999911, "stddev": 0.024649308228594957, "rounds": 2, "median": 1.2207396869999911, "iqr": 0.03485938599999372, "q1": 1.2033099939999943, "q3": 1.238169379999988, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.2033099939999943, "hd15iqr": 1.238169379999988, "ops": 0.8191754643920307, "total": 2.4414793739999823, "data": [1.238169379999988, 1.2033099939999943], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.2479037479999988, "max": 1.44233907200001, "mean": 1.3451214100000044, "stddev": 0.13748653610261138, "rounds": 2, "median": 1.3451214100000044, "iqr": 0.19443532400001118, "q1": 1.2479037479999988, "q3": 1.44233907200001, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.2479037479999988, "hd15iqr": 1.44233907200001, "ops": 0.7434273163490845, "total": 2.690242820000009, "data": [1.44233907200001, 1.2479037479999988], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 3.055246579999988, "max": 3.1523890929999823, "mean": 3.103817836499985, "stddev": 0.06869012968379833, "rounds": 2, "median": 3.103817836499985, "iqr": 0.09714251299999432, "q1": 3.055246579999988, "q3": 3.1523890929999823, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 3.055246579999988, "hd15iqr": 3.1523890929999823, "ops": 0.322183856359189, "total": 6.20763567299997, "data": [3.055246579999988, 3.1523890929999823], "iterations": 1}}], "datetime": "2023-11-14T13:37:13.034004", "version": "4.0.0"} +{"machine_info": {"node": "fv-az1209-479", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": 
"3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 4, "arch_string_raw": "x86_64", "vendor_id_raw": "AuthenticAMD", "brand_raw": "AMD EPYC 7763 64-Core Processor", "hz_advertised_friendly": "2.4454 GHz", "hz_actual_friendly": "2.4454 GHz", "hz_advertised": [2445435000, 0], "hz_actual": [2445435000, 0], "stepping": 1, "model": 1, "family": 25, "flags": ["3dnowext", "3dnowprefetch", "abm", "adx", "aes", "aperfmperf", "apic", "arat", "avx", "avx2", "bmi1", "bmi2", "clflush", "clflushopt", "clwb", "clzero", "cmov", "cmp_legacy", "constant_tsc", "cpuid", "cr8_legacy", "cx16", "cx8", "de", "decodeassists", "erms", "extd_apicid", "f16c", "flushbyasid", "fma", "fpu", "fsgsbase", "fsrm", "fxsr", "fxsr_opt", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "misalignsse", "mmx", "mmxext", "movbe", "msr", "mtrr", "nonstop_tsc", "nopl", "npt", "nrip_save", "nx", "osvw", "osxsave", "pae", "pat", "pausefilter", "pcid", "pclmulqdq", "pdpe1gb", "pfthreshold", "pge", "pni", "popcnt", "pse", "pse36", "rdpid", "rdpru", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "sep", "sha", "sha_ni", "smap", "smep", "sse", "sse2", "sse4_1", "sse4_2", "sse4a", "ssse3", "svm", "syscall", "topoext", "tsc", "tsc_reliable", "tsc_scale", "umip", "v_vmsave_vmload", "vaes", "vmcb_clean", "vme", "vmmcall", "vpclmulqdq", "xgetbv1", "xsave", "xsavec", "xsaveerptr", "xsaveopt", "xsaves"], "l3_cache_size": 524288, "l2_cache_size": 1048576, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 512, "l2_cache_associativity": 6}}, "commit_info": {"id": "89d4143e34d51500e9b6a12ce404149ab2a7a959", "time": "2023-11-15T08:16:12+00:00", "author_time": "2023-11-15T08:16:12+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 0.9436884730000088, "max": 1.0003212869999913, "mean": 0.9720048800000001, "stddev": 0.0400454468170641, "rounds": 2, "median": 0.9720048800000001, "iqr": 0.056632813999982545, "q1": 0.9436884730000088, "q3": 1.0003212869999913, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 0.9436884730000088, "hd15iqr": 1.0003212869999913, "ops": 1.0288014191862904, "total": 1.9440097600000001, "data": [1.0003212869999913, 0.9436884730000088], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 0.9924654269999849, "max": 0.9972088529999894, "mean": 0.9948371399999871, "stddev": 0.0033541086906597956, "rounds": 2, "median": 0.9948371399999871, "iqr": 0.004743426000004547, "q1": 0.9924654269999849, "q3": 0.9972088529999894, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 0.9924654269999849, "hd15iqr": 0.9972088529999894, 
"ops": 1.0051896534542457, "total": 1.9896742799999743, "data": [0.9972088529999894, 0.9924654269999849], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 2.3477884309999695, "max": 2.375456285000041, "mean": 2.3616223580000053, "stddev": 0.019564127184330014, "rounds": 2, "median": 2.3616223580000053, "iqr": 0.027667854000071657, "q1": 2.3477884309999695, "q3": 2.375456285000041, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 2.3477884309999695, "hd15iqr": 2.375456285000041, "ops": 0.423437725601003, "total": 4.723244716000011, "data": [2.375456285000041, 2.3477884309999695], "iterations": 1}}], "datetime": "2023-11-15T08:17:38.729309", "version": "4.0.0"} +{"machine_info": {"node": "fv-az133-888", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 2, "arch_string_raw": "x86_64", "vendor_id_raw": "GenuineIntel", "brand_raw": "Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz", "hz_advertised_friendly": "2.6000 GHz", "hz_actual_friendly": "2.0951 GHz", "hz_advertised": [2600000000, 0], "hz_actual": [2095078000, 0], "stepping": 4, "model": 85, "family": 6, "flags": ["3dnowprefetch", "abm", "adx", "aes", "apic", "avx", "avx2", "avx512bw", "avx512cd", "avx512dq", "avx512f", "avx512vl", "bmi1", "bmi2", "clflush", "clflushopt", "cmov", "constant_tsc", "cpuid", "cx16", "cx8", "de", "erms", "f16c", "fma", "fpu", "fsgsbase", "fxsr", "hle", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "md_clear", "mmx", "movbe", "msr", "mtrr", "nopl", "nx", "osxsave", "pae", "pat", "pcid", "pclmulqdq", "pdpe1gb", "pge", "pni", "popcnt", "pse", "pse36", "pti", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "rtm", "sep", "smap", "smep", "ss", "sse", "sse2", "sse4_1", "sse4_2", "ssse3", "syscall", "tsc", "vme", "xsave", "xsavec", "xsaveopt", "xsaves", "xtopology"], "l3_cache_size": 37486592, "l2_cache_size": 2097152, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 256, "l2_cache_associativity": 6}}, "commit_info": {"id": "5db11da16fe4737a055d342ae66c7573749da37e", "time": "2023-11-15T14:18:16+00:00", "author_time": "2023-11-15T14:18:16+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.4092843529999755, "max": 1.4329079739999884, "mean": 1.421096163499982, "stddev": 0.016704422605290065, "rounds": 2, "median": 1.421096163499982, "iqr": 0.023623621000012918, "q1": 1.4092843529999755, "q3": 1.4329079739999884, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.4092843529999755, "hd15iqr": 
1.4329079739999884, "ops": 0.7036821474045255, "total": 2.842192326999964, "data": [1.4329079739999884, 1.4092843529999755], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 1.457658572000014, "max": 1.4684790439999915, "mean": 1.4630688080000027, "stddev": 0.007651229126823324, "rounds": 2, "median": 1.4630688080000027, "iqr": 0.010820471999977599, "q1": 1.457658572000014, "q3": 1.4684790439999915, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 1.457658572000014, "hd15iqr": 1.4684790439999915, "ops": 0.6834948531005782, "total": 2.9261376160000054, "data": [1.4684790439999915, 1.457658572000014], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 3.7696876119999843, "max": 3.7856553129999497, "mean": 3.777671462499967, "stddev": 0.011290869657034802, "rounds": 2, "median": 3.777671462499967, "iqr": 0.015967700999965473, "q1": 3.7696876119999843, "q3": 3.7856553129999497, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 3.7696876119999843, "hd15iqr": 3.7856553129999497, "ops": 0.26471333198949637, "total": 7.555342924999934, "data": [3.7696876119999843, 3.7856553129999497], "iterations": 1}}], "datetime": "2023-11-15T14:19:34.287857", "version": "4.0.0"} +{"machine_info": {"node": "fv-az734-833", "processor": "x86_64", "machine": "x86_64", "python_compiler": "GCC 9.3.0", "python_implementation": "CPython", "python_implementation_version": "3.9.10", "python_version": "3.9.10", "python_build": ["main", "Feb 3 2022 07:33:39"], "release": "5.15.0-1050-azure", "system": "Linux", "cpu": {"python_version": "3.9.10.final.0 (64 bit)", "cpuinfo_version": [9, 0, 0], "cpuinfo_version_string": "9.0.0", "arch": "X86_64", "bits": 64, "count": 4, "arch_string_raw": "x86_64", "vendor_id_raw": "AuthenticAMD", "brand_raw": "AMD EPYC 7763 64-Core Processor", "hz_advertised_friendly": "2.4454 GHz", "hz_actual_friendly": "2.4454 GHz", "hz_advertised": [2445423000, 0], "hz_actual": [2445423000, 0], "stepping": 1, "model": 1, "family": 25, "flags": ["3dnowext", "3dnowprefetch", "abm", "adx", "aes", "aperfmperf", "apic", "arat", "avx", "avx2", "bmi1", "bmi2", "clflush", "clflushopt", "clwb", "clzero", "cmov", "cmp_legacy", "constant_tsc", "cpuid", "cr8_legacy", "cx16", "cx8", "de", "decodeassists", "erms", "extd_apicid", "f16c", "flushbyasid", "fma", "fpu", "fsgsbase", "fsrm", "fxsr", "fxsr_opt", "ht", "hypervisor", "invpcid", "invpcid_single", "lahf_lm", "lm", "mca", "mce", "misalignsse", "mmx", "mmxext", "movbe", "msr", "mtrr", "nonstop_tsc", "nopl", "npt", "nrip_save", "nx", "osvw", "osxsave", "pae", "pat", "pausefilter", "pcid", "pclmulqdq", "pdpe1gb", "pfthreshold", "pge", "pni", "popcnt", "pse", "pse36", "rdpid", "rdpru", "rdrand", "rdrnd", "rdseed", "rdtscp", "rep_good", "sep", "sha", "sha_ni", "smap", "smep", "sse", "sse2", "sse4_1", "sse4_2", "sse4a", "ssse3", "svm", "syscall", "topoext", "tsc", "tsc_reliable", "tsc_scale", "umip", "v_vmsave_vmload", 
"vaes", "vmcb_clean", "vme", "vmmcall", "vpclmulqdq", "xgetbv1", "xsave", "xsavec", "xsaveerptr", "xsaveopt", "xsaves"], "l3_cache_size": 524288, "l2_cache_size": 1048576, "l1_data_cache_size": 65536, "l1_instruction_cache_size": 65536, "l2_cache_line_size": 512, "l2_cache_associativity": 6}}, "commit_info": {"id": "94779baf85dd028c723165728dfd1ee9b75c2e6c", "time": "2023-11-15T16:26:53+00:00", "author_time": "2023-11-15T16:26:53+00:00", "dirty": false, "project": "splink", "branch": "master"}, "benchmarks": [{"group": null, "name": "test_2_rounds_1k_duckdb", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 0.9633971350000081, "max": 0.9723507479999967, "mean": 0.9678739415000024, "stddev": 0.006331160468411921, "rounds": 2, "median": 0.9678739415000024, "iqr": 0.008953612999988536, "q1": 0.9633971350000081, "q3": 0.9723507479999967, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 0.9633971350000081, "hd15iqr": 0.9723507479999967, "ops": 1.0331923994670307, "total": 1.9357478830000048, "data": [0.9723507479999967, 0.9633971350000081], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_duckdb_on_disk_performance", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_duckdb_on_disk_performance", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 0.9985922079999909, "max": 1.0012474090000012, "mean": 0.9999198084999961, "stddev": 0.001877510632520561, "rounds": 2, "median": 0.9999198084999961, "iqr": 0.0026552010000102655, "q1": 0.9985922079999909, "q3": 1.0012474090000012, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 0.9985922079999909, "hd15iqr": 1.0012474090000012, "ops": 1.0000801979311964, "total": 1.9998396169999921, "data": [1.0012474090000012, 0.9985922079999909], "iterations": 1}}, {"group": null, "name": "test_2_rounds_1k_sqlite", "fullname": "benchmarking/test_performance.py::test_2_rounds_1k_sqlite", "params": null, "param": null, "extra_info": {}, "options": {"disable_gc": false, "timer": "perf_counter", "min_rounds": 5, "max_time": 1.0, "min_time": 5e-06, "warmup": false}, "stats": {"min": 2.3476611810000065, "max": 2.365693077000003, "mean": 2.356677129000005, "stddev": 0.012750475939248142, "rounds": 2, "median": 2.356677129000005, "iqr": 0.01803189599999655, "q1": 2.3476611810000065, "q3": 2.365693077000003, "iqr_outliers": 0, "stddev_outliers": 0, "outliers": "0;0", "ld15iqr": 2.3476611810000065, "hd15iqr": 2.365693077000003, "ops": 0.4243262633198822, "total": 4.71335425800001, "data": [2.365693077000003, 2.3476611810000065], "iterations": 1}}], "datetime": "2023-11-15T16:27:57.640466", "version": "4.0.0"} diff --git a/docs/demos/tutorials/03_Blocking.ipynb b/docs/demos/tutorials/03_Blocking.ipynb index 929a13490a..83e86a4418 100644 --- a/docs/demos/tutorials/03_Blocking.ipynb +++ b/docs/demos/tutorials/03_Blocking.ipynb @@ -153,19 +153,19 @@ "\n", "blocking_rule_1 = block_on([\"substr(first_name, 1,1)\", \"surname\"])\n", "count = linker.count_num_comparisons_from_blocking_rule(blocking_rule_1)\n", - "print(f\"Number of comparisons generated by '{blocking_rule_1.sql}': {count:,.0f}\")\n", + "print(f\"Number of comparisons generated by 
'{blocking_rule_1.blocking_rule_sql}': {count:,.0f}\")\n", "\n", "blocking_rule_2 = block_on(\"surname\")\n", "count = linker.count_num_comparisons_from_blocking_rule(blocking_rule_2)\n", - "print(f\"Number of comparisons generated by '{blocking_rule_2.sql}': {count:,.0f}\")\n", + "print(f\"Number of comparisons generated by '{blocking_rule_2.blocking_rule_sql}': {count:,.0f}\")\n", "\n", "blocking_rule_3 = block_on(\"email\")\n", "count = linker.count_num_comparisons_from_blocking_rule(blocking_rule_3)\n", - "print(f\"Number of comparisons generated by '{blocking_rule_3.sql}': {count:,.0f}\")\n", + "print(f\"Number of comparisons generated by '{blocking_rule_3.blocking_rule_sql}': {count:,.0f}\")\n", "\n", "blocking_rule_4 = block_on([\"city\", \"first_name\"])\n", "count = linker.count_num_comparisons_from_blocking_rule(blocking_rule_4)\n", - "print(f\"Number of comparisons generated by '{blocking_rule_4.sql}': {count:,.0f}\")\n" + "print(f\"Number of comparisons generated by '{blocking_rule_4.blocking_rule_sql}': {count:,.0f}\")\n" ] }, { diff --git a/docs/dev_guides/changing_splink/lint.md b/docs/dev_guides/changing_splink/lint.md deleted file mode 100644 index 0c3bf670e7..0000000000 --- a/docs/dev_guides/changing_splink/lint.md +++ /dev/null @@ -1,24 +0,0 @@ -## Linting your code - -For linting, we currently make use of both [ruff](https://github.com/charliermarsh/ruff) and [black](https://github.com/psf/black). - -These are used to ensure we produce readable, maintainable, and more consistent code. - -To quickly run both linters, simply run this bash script. The *-f* flag is called to run an automatic fix with ruff. -If you simply wish for ruff to print the errors it finds to the console, remove this flag. - -```sh -source scripts/lint_and_format.sh -f # with the fix flag -``` -```sh -source scripts/lint_and_format.sh # without -``` - -Alternatively, you can run ruff and black separately from a terminal instance. - -For black, you need to run: -`black .` - -and ruff requires: -`ruff --fix .` for automatic fixes and error printouts -or `ruff --show-source .` for a more thorough breakdown. \ No newline at end of file diff --git a/docs/dev_guides/index.md b/docs/dev_guides/index.md index de3e98f83b..8c4b5b1826 100644 --- a/docs/dev_guides/index.md +++ b/docs/dev_guides/index.md @@ -29,4 +29,4 @@ Splink is quite a large, complex codebase. The guides in this section lay out so * [Comparison and Comparison Level Libraries](./comparisons/new_library_comparisons_and_levels.md) - demonstrates how `Comparison` Library and `ComparisonLevel` Library functions are structured within Splink, including how to add new functions and edit existing functions. * [Charts](./charts.ipynb) - demonstrates how charts are built in Splink, including how to add new charts and edit existing charts. * [User-Defined Functions](./udfs.md) - demonstrates how User Defined Functions (UDFs) are used to provide functionality within Splink that is not native to a given SQL backend. - +* [Settings Validation](./settings_validation/settings_validation_overview.md) - summarises how to use and expand the existing settings schema and validation functions. 
diff --git a/docs/dev_guides/settings_validation/extending_settings_validator.md b/docs/dev_guides/settings_validation/extending_settings_validator.md new file mode 100644 index 0000000000..ab6eb2c8bd --- /dev/null +++ b/docs/dev_guides/settings_validation/extending_settings_validator.md @@ -0,0 +1,187 @@ +## Expanding the Settings Validator + +If a validation check is currently missing, you might want to expand the existing validation codebase. + +Before adding any code, it's essential to determine whether the checks you want to include fit into any of the general validation categories already in place. + +In summary, the following validation checks are currently carried out: + +* Verifying that the user's blocking rules and comparison levels have been [imported from the correct library](https://github.com/moj-analytical-services/splink/pull/1579) and contain sufficient information for Splink model usage. +* [Performing column lookups](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py) to ensure that columns specified in the user's settings dictionary exist within **all** of the user's input dataframes. +* Various miscellaneous checks designed to generate more informative error messages for the user if they happen to employ Splink in an unintended manner. + +If you plan to introduce checks that differ from those currently in place, it's advisable to create a new script within `splink/settings_validation`. + +
+
+## Splink Exceptions and Warnings
+
+While working on extending the settings validation tool suite, it's important to consider how we notify users when they've included invalid settings or features.
+
+Exception handling and warnings should be integrated into your validation functions to either halt the program or inform the user when errors occur, raising informative error messages as needed.
+
+### Warnings in Splink
+
+Warnings should be employed when you want to alert the user that an included setting might lead to unintended consequences, allowing the user to decide if it warrants further action.
+
+This could be applicable in scenarios such as:
+
+* Parsing SQL where the potential for failure or incorrect column parsing exists.
+* Situations where the user is better positioned to determine whether the issue should be treated as an error, like when dealing with exceptionally high values for [probability_two_random_records_match](https://github.com/moj-analytical-services/splink/blob/master/splink/files/settings_jsonschema.json#L29).
+
+Implementing warnings is straightforward and involves creating a logger instance within your script, followed by a warning call.
+
+??? note "Warnings in practice:"
+    ```py
+    import logging
+    logger = logging.getLogger(__name__)
+
+    logger.warning("My warning message")
+    ```
+
+    Which will print:
+
+    > `My warning message`
+
+    to both the console and your log file.
+
+### Splink Exceptions
+
+Exceptions should be raised when you want the program to halt due to an unequivocal error.
+
+In addition to the built-in exception types, such as [SyntaxError](https://docs.python.org/3/library/exceptions.html#SyntaxError), we have several Splink-specific exceptions available for use.
+
+These exceptions serve to raise issues specific to Splink or to customize exception behavior. For instance, you can specify a message prefix by modifying the constructor of an exception, as exemplified in the [ComparisonSettingsException](https://github.com/moj-analytical-services/splink/blob/f7c155c27ccf3c906c92180411b527a4cfd1111b/splink/exceptions.py#L14).
+
+It's crucial to also consider how to inform the user that such behavior is not permitted. For guidelines on crafting effective error messages, refer to [How to Write Good Error Messages](https://uxplanet.org/how-to-write-good-error-messages-858e4551cd4).
+
+For a comprehensive list of exceptions native to Splink, visit [the exceptions.py script](https://github.com/moj-analytical-services/splink/blob/master/splink/exceptions.py).
+
+#### Raising Multiple Exceptions
+
+Raising multiple errors sequentially without disrupting the program is a feature we commonly wish to implement across the validation steps.
+
+In numerous instances, it makes sense to wait until all checks have been performed and then raise the captured exceptions to the user in one go.
+
+To enable the logging of multiple errors in a singular check, or across multiple checks, an [`ErrorLogger`](https://github.com/moj-analytical-services/splink/blob/settings_validation_refactor_and_improved_logging/splink/exceptions.py#L34) class is available for use.
+
+The `ErrorLogger` operates in a similar way to working with a list, allowing you to add additional errors using the `append` method. Once you've logged all of your errors, you can raise them with the `raise_and_log_all_errors` method.
+
+??? note "`ErrorLogger` in practice"
+    ```py
+    from splink.exceptions import ErrorLogger
+
+    # Create an error logger instance
+    e = ErrorLogger()
+
+    # Log your errors
+    e.append(SyntaxError("The syntax is wrong"))
+    e.append(NameError("Invalid name entered"))
+
+    # Raise your errors
+    e.raise_and_log_all_errors()
+    ```
+
+    ![](https://raw.githubusercontent.com/moj-analytical-services/splink/master/docs/img/settings_validation/error_logger.png)
+
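+Putting these pieces together, the sketch below illustrates roughly how a Splink-style exception with a message prefix could be combined with an `ErrorLogger`. The `MyValidationError` class and its prefix are purely illustrative and are not part of Splink; see [exceptions.py](https://github.com/moj-analytical-services/splink/blob/master/splink/exceptions.py) for the real implementations.
+
+??? note "Illustrative sketch: a prefixed exception collected by an `ErrorLogger`"
+    ```py
+    from splink.exceptions import ErrorLogger
+
+    # Hypothetical exception, for illustration only - not part of Splink.
+    # It mimics the "message prefix" pattern described above.
+    class MyValidationError(Exception):
+        message_prefix = "Invalid settings detected: "
+
+        def __init__(self, message):
+            super().__init__(self.message_prefix + message)
+
+    e = ErrorLogger()
+    e.append(MyValidationError("'first_name_l' is not a valid column"))
+    e.append(MyValidationError("'dob' is missing from one input dataframe"))
+
+    # Raise the collected errors to the user in one go
+    e.raise_and_log_all_errors()
+    ```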
+ +## Expanding our Miscellaneous Checks + +Miscellaneous checks should typically be added as standalone functions. These functions can then be integrated into the linker's startup process for validation. + +In most cases, you have more flexibility in how you structure your solutions. You can place the checks in a script that corresponds to the specific checks being performed, or, if one doesn't already exist, create a new script with a descriptive name. + +A prime example of a miscellaneous check is [`validate_dialect`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/valid_types.py#L31), which assesses whether the settings dialect aligns with the linker's dialect. + +
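+As a rough sketch of this pattern (the function name, signature and error message below are illustrative assumptions, not Splink's actual implementation), a standalone miscellaneous check might look something like:
+
+??? note "Illustrative sketch: a standalone miscellaneous check"
+    ```py
+    # Hypothetical sketch only - the real dialect check lives in
+    # splink/settings_validation/valid_types.py.
+    def validate_dialects_match(settings_dialect: str, linker_dialect: str, linker_type: str):
+        if settings_dialect != linker_dialect:
+            raise ValueError(
+                f"Incompatible SQL dialect! Your settings dictionary uses dialect "
+                f"{settings_dialect}, but your {linker_type} object uses "
+                f"{linker_dialect}."
+            )
+    ```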
+ +## Additional Comparison and Blocking Rule Checks + +If your checks pertain to comparisons or blocking rules, most of these checks are currently implemented within the [valid_types.py](https://github.com/moj-analytical-services/splink/blob/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation/valid_types.py) script. + +Currently, comparison and blocking rule checks are organised in a modular format. + +To expand the current suite of tests, you should: + +1. Create a function to inspect the presence of the error you're evaluating. +2. Define an error message that you intend to add to the `ErrorLogger` class. +3. Integrate these elements into either the [`validate_comparison_levels`](https://github.com/moj-analytical-services/splink/blob/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation/valid_types.py#L43) function (or something similar), which appends any detected errors to an `ErrorLogger`. +4. Finally, work out where this function should live in the setup process of the linker object. Typically, you should look to add these checks before any processing of the settings dictionary is performed. + +The above steps are set to change as we are looking to refactor our settings object. + +
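+As a sketch of steps 1-3 (the function names and error message below are hypothetical, not Splink's actual code), a new check might look something like:
+
+??? note "Illustrative sketch: appending comparison errors to an `ErrorLogger`"
+    ```py
+    from splink.exceptions import ErrorLogger
+
+    # Step 1 - a hypothetical check for a missing key in a comparison dictionary
+    def comparison_has_levels(comparison_dict: dict) -> bool:
+        return "comparison_levels" in comparison_dict
+
+    # Steps 2 and 3 - define the error message and append any errors found
+    def validate_comparisons(comparisons: list, error_logger: ErrorLogger) -> ErrorLogger:
+        for comparison_dict in comparisons:
+            if not comparison_has_levels(comparison_dict):
+                error_logger.append(
+                    SyntaxError("A comparison is missing the 'comparison_levels' key.")
+                )
+        return error_logger
+    ```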
+
+## Checking that columns exist
+
+Should you need to include extra checks to assess the validity of columns supplied by a user, your primary focus should be on the [column_lookups.py](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py) script.
+
+There are currently three classes employed to construct the log strings. These can be extended to perform additional column checks.
+
+??? note "`InvalidCols`"
+    `InvalidCols` is a NamedTuple, used to construct the bulk of our log strings. This accepts a list of columns and the type of error, producing a complete log string when requested.
+
+    In practice, this is used as follows:
+    ```py
+    # Store the invalid columns and why they're invalid
+    my_invalid_cols = InvalidCols("invalid_cols", ["first_col", "second_col"])
+    # Construct the corresponding log string
+    my_invalid_cols.construct_log_string()
+    ```
+
+??? note "`InvalidColValidator`"
+    `InvalidColValidator` houses a series of validation checks to evaluate whether the column(s) contained within either a SQL string or a user's raw input string are present within the underlying dataframes.
+
+    To achieve this, it employs a range of cleaning functions to standardise our column inputs and conducts a series of checks on these cleaned columns. It utilises `InvalidCols` tuples to log any identified invalid columns.
+
+    It inherits from the `SettingsValidator` class.
+
+??? note "`InvalidColumnsLogger`"
+    The principal logging class for our invalid column checks.
+
+    This class primarily calls our builder functions outlined in `InvalidColValidator`, constructing a series of log strings for output to both the console and the user's log file (if it exists).
+
+
+To extend the column checks, you simply need to add an additional validation method to the [`InvalidColValidator`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L15) class, followed by an extension of the [`InvalidColumnsLogger`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L164).
+
+### A Practical Example of a Column Check
+
+For an example of column checks in practice, see [`validate_uid`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L195).
+
+Here, we call `validate_settings_column`, checking whether the unique ID column submitted by the user is valid. The output of this call yields either an `InvalidCols` tuple, or `None`.
+
+From there, we can use the built-in log constructor [`construct_generic_settings_log_string`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L329C27-L329C27) to construct and print the required logs. Where the output above was `None`, nothing is logged.
+
+If your checks aren't part of the initial settings check (say you want to assess additional columns found in blocking rules supplied at a later stage by the user), you should add a new method to `InvalidColumnsLogger`, similar in functionality to [`construct_output_logs`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L319).
+
+However, it is worth noting that not all checks are performed on simple string columns. Where you require checks to be performed on SQL strings, there's an additional step required, outlined below.
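+Before moving on to those cases, the sketch below shows, in simplified standalone form, the kind of lookup such a validation method performs. The `find_missing_columns` helper and the stand-in tuple are hypothetical and exist only for illustration; the real methods live in `column_lookups.py`.
+
+??? note "Illustrative sketch: a simplified column presence check"
+    ```py
+    from typing import List, NamedTuple, Optional
+
+    # Stand-in for Splink's InvalidCols NamedTuple - field names are assumptions
+    class InvalidCols(NamedTuple):
+        invalid_type: str
+        invalid_columns: List[str]
+
+    # Hypothetical helper: flag any settings columns missing from the input data
+    def find_missing_columns(
+        settings_columns: List[str], input_dataframe_columns: List[str]
+    ) -> Optional[InvalidCols]:
+        missing = [c for c in settings_columns if c not in input_dataframe_columns]
+        if missing:
+            return InvalidCols("invalid_cols", missing)
+        return None  # nothing to log
+    ```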
+
+### Single Column Checks
+
+To review single columns, [`validate_settings_column`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L144) should be used. This takes in a `setting_id` (analogous to the title you want to give your log string) and a list of columns to be checked.
+
+A working example of this in practice can be found in the section above.
+
+### Checking Columns in SQL statements
+
+For raw SQL statements, you should make use of the [`validate_columns_in_sql_strings`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L102) method.
+
+This takes in a list of SQL strings and spits out a list of `InvalidCols` tuples, depending on the checks you ask it to perform.
+
+Should you need more control, the process is similar to that of the single column case, just with an additional parsing step.
+
+Parsing is handled by [`parse_columns_in_sql`](https://github.com/moj-analytical-services/splink/blob/master/splink/parse_sql.py#L45). This will spit out a list of column names that were identified by sqlglot.
+
+> Note that as this is handled by sqlglot, it's not always 100% accurate. For our purposes though, its flexibility is unparalleled and allows us to more easily and efficiently extract column names.
+
+Once your columns have been parsed, you can again run a series of lookups against your input dataframe(s). This is identical to the steps outlined in the **Single Column Checks** section.
+
+You may also wish to perform additional checks on the columns, to assess whether they contain valid prefixes, suffixes or some other quality of the column.
+
+Additional checks can be passed to [`validate_columns_in_sql_strings`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L102) and should be specified as methods in the `InvalidColValidator` class.
+
+See [validate_blocking_rules](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L209) for a practical example where we loop through each blocking rule, parse it and then assess whether it:
+
+1. Contains a valid list of columns.
+2. Contains a valid table prefix for each column.
diff --git a/docs/dev_guides/settings_validation/settings_validation_overview.md b/docs/dev_guides/settings_validation/settings_validation_overview.md
new file mode 100644
index 0000000000..3654b30a6a
--- /dev/null
+++ b/docs/dev_guides/settings_validation/settings_validation_overview.md
@@ -0,0 +1,55 @@
+## Settings Validation
+
+A common issue within Splink is users providing invalid settings dictionaries. To prevent this, the settings validator scans through a settings dictionary and provides user-friendly feedback on what needs to be fixed.
+
+At a high level, this includes:
+
+1. Assessing the structure of the settings dictionary. See the [Settings Schema Validation](#settings-schema-validation) section.
+2. Assessing the contents of the settings dictionary. See the [Settings Validator](#settings-validator) section.
+
+
+## Settings Schema Validation
+
+Our custom settings schema can be found within [settings_jsonschema.json](https://github.com/moj-analytical-services/splink/blob/master/splink/files/settings_jsonschema.json).
+
+This is a json file outlining the required data type, key and value(s) to be specified by the user while constructing their settings. Where values deviate from this specified schema, an error will be thrown.
+
+[Schema validation](https://github.com/moj-analytical-services/splink/blob/master/splink/validate_jsonschema.py) is currently performed inside the [settings.py](https://github.com/moj-analytical-services/splink/blob/master/splink/settings.py#L44C17-L44C17) script.
+
+You can modify the schema by manually editing the [json schema](https://github.com/moj-analytical-services/splink/blob/master/splink/files/settings_jsonschema.json).
+
+Modifications can be used to (amongst other uses):
+
+* Set or remove default values for schema keys.
+* Set the required data type for a given key.
+* Expand or refine previous titles and descriptions to help with clarity.
+
+Any updates you wish to make to the schema should be discussed with the wider team, to ensure they won't break backwards compatibility and make sense as a design decision.
+
+Detailed information on the arguments that can be supplied to the json schema can be found within the [json schema documentation](https://json-schema.org/learn/getting-started-step-by-step).
+
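+As a rough sketch of what that validation step amounts to (the file path and example settings below are assumptions for illustration; the real logic lives in `validate_jsonschema.py`):
+
+```py
+import json
+
+from jsonschema import ValidationError, validate
+
+# Load the schema shipped with Splink (path shown for illustration)
+with open("splink/files/settings_jsonschema.json") as f:
+    schema = json.load(f)
+
+# A minimal settings dictionary to check against the schema
+settings = {"link_type": "dedupe_only", "unique_id_column_name": "unique_id"}
+
+try:
+    validate(instance=settings, schema=schema)
+except ValidationError as e:
+    print(e.message)  # user-friendly description of the schema violation
+```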
+
+## Settings Validator
+
+The settings validation code currently resides in the [settings validation](https://github.com/moj-analytical-services/splink/tree/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation) directory of Splink. This code is responsible for executing a secondary series of tests to determine whether all values within the settings dictionary will generate valid SQL.
+
+Numerous inputs pass our initial schema checks before breaking other parts of the codebase. These breaks are typically due to the construction of invalid SQL that is then passed to the database engine, [commonly resulting in uninformative errors](https://github.com/moj-analytical-services/splink/issues/1362).
+
+Frequently encountered problems include:
+
+* Usage of invalid column names. For example, specifying a [`unique_id_column_name`](https://github.com/moj-analytical-services/splink/blob/settings_validation_docs/splink/files/settings_jsonschema.json#L61) that doesn't exist in the underlying dataframe(s). Such names satisfy the schema requirements as long as they are strings.
+* Users not updating default values in the settings schema, even when these values are inappropriate for their provided input dataframes.
+* Importing comparisons and blocking rules from incorrect sections of the codebase, or using an inappropriate data type (comparison level vs. comparison).
+* Using Splink for an invalid form of linkage. See the [following discussion](https://github.com/moj-analytical-services/splink/issues/1362).
+
+Currently, the [settings validation](https://github.com/moj-analytical-services/splink/tree/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation) scripts are set up in a modular fashion, to allow each to inherit the checks it needs.
+
+The folder comprises three scripts, each of which inspects the settings dictionary at a different stage of its journey:
+
+* [valid_types.py](https://github.com/moj-analytical-services/splink/blob/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation/valid_types.py) - This script includes various miscellaneous checks for comparison levels, blocking rules, and linker objects. These checks are primarily performed within settings.py.
+* [settings_validator.py](https://github.com/moj-analytical-services/splink/blob/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation/settings_validator.py) - This script includes the core `SettingsValidator` class and contains a series of methods that retrieve information on fields within the user's settings dictionary that specify the columns to be used in training and prediction. Additionally, it provides supplementary cleaning functions to assist in the removal of quotes, prefixes, and suffixes that may be present in a given column name.
+* [column_lookups.py](https://github.com/moj-analytical-services/splink/blob/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation/column_lookups.py) - This script contains helper functions that generate a series of log strings outlining invalid columns identified within your settings dictionary. It primarily consists of methods that run validation checks on either raw SQL or input columns and assess their presence in **all** dataframes supplied by the user.
+
+For information on expanding the range of checks available to the validator, see [Extending the Settings Validator](./extending_settings_validator.md).
diff --git a/docs/img/settings_validation/error_logger.png b/docs/img/settings_validation/error_logger.png new file mode 100644 index 0000000000000000000000000000000000000000..a5cf2a785f0aadf0ac6403b3901a4ba7b9c52ab1 GIT binary patch literal 8360 zcmZX31ymf(67FJ)I|P^DF2Ogr6Wl$>!r~Gf7Kh*-+})kvt^tB4xGfMMI3W-$fw#H$ zzW?6y-kj6Z)m`;fP1Q{Iobz?GhMEF41~~=*0KisKl+}XwkMJgg287>@PTczd00ezI z85s>F85vp)Hy4PVqcs4)lI)bU$u5IHIu-=3#OKpkMwP_)s6mJjN5)p={Jc>)Jk^?;jI=6iY-5AkeFDnn3#sOK0l{5>Oxj7 zH4YA_!UmG(IV)_N<}_GVX*f-T5g+~j!dPUCUhqWx+MvHJvMpVI_NGmLZ+F0ga`gFu zeGEU|TDQ}sd9%9waA1j%(&)3@iw4a0?*wmrW_M+iaXAx+?mxn^uu#KY!gCf&35Wv; zyyPaYWyvbE*^l*s>;W}8}$AF+%2E~-&XfvXC zBHkVRSC!Yh1YVzogs5xwwl~v0<{6WYV>+gPoCgb2q_0ph*16+kbM5%)YkTZhXy#bV zj)|SqEptmw_t^E_75>5z?d|DZzVfu$#5Dv%9T} zhttK@2He%6osNNrfvO4!?Bc{_VdY|J&E@0d`WFHa^#Q>}CutRNsQS^59K;Uh6RTMrLc z5I48Cw>OtJAD4@p4L7fdhzK{&EACgXIN=tY?!L|*7CxNL?)3k5@_+rvTDybY>|8zU zT%2kD`n9lh@$?X*qx&1^pX=YxY3*b8-$>5x|49p;AopJlH!l|t_dmYjP|?4$APqYo zYez#_J12N~;4#Dng+%{l{{Jcd8}NUC2LA=V65#z`RbLk-RmnC{T&1wF;IRSi#E_5bf3R+l1%UiOWTfns z8~Hg>MMl!Ok^DC=jMTNkl7!t`NaRRWe|6du(kFKIShkkSP)RUtnzs!`gk2ot@SNnxv%X;H79}%@`ihJ zv-3GPL3U*Av@5efXfqBfS@iDK>A;xz#WYN29+zfr^gDFPkoDK9DP`$O^l=C?J8*;P zmljz-rP8lBorPHipdaf2_04PO@KIrJ)(Yl>-#lO6R%*gUqU^>;!wOxTpf$JGF@jq$ z%~>va20{)%42!WwTF?e4wrd}|Q2&*UVp2j|f4>{El#!dbPsPjQ6d}IpZfZ4)TSq>A z?{`VLd&>c3CRiPEr#a$3t@;Cx@#6RE$1RS-+35hw-z?W~aeO3S~AB`bTK5K@muVK%E*1E}) zPI@Ava7wUi`&L3a7v)ajYX@>UQlVf3loqtT)tM=@pD4mZ2nS2N>cO@;bL9}Teg_c@ z3ohLIi%zVqVv#?A^r7i)IT$B~Es&x;1sn|>D!@N$`^0 z<&sK;()7v-a-WHJ#Ya!BC0xTpC-L(~;c^zOc6yGOJ8@p8~{F10{Cu!^q+~@#H^C4_Cde**8Z6C0_Mk0Nnp1Wsy$N56^Cwf|K9zCuPY6ch1(v-O8`NdSk4e^|E(b zg^hbpCPW{^eWhBE|H=kOL+K$KgBP!#-X!xkQu-Wyn6sA5mVub>Pq~(_;Y6krvc-Ui z_GyTn*{DPWW3muc;zy@AZBJM1!#v8sXX}-z=594%#IFp}kL`JuDElK_J*He6fWwxm0^DpxPOB2FUS72!gK$7kF* zN8Bj~JD$1O5@uR>ui`n#!jsPg6E7rvu%wg$xi*wilV%WXfUA&bu3nv?viHRpjSYj$ zv$8I!YwSu0nc^*t0o$|{>mwUQ|5}Tz=(v`l;V?I8?Jh~@=}*gXWyRp=!A5(1{2DVO zMJ>Xn42vUdX*S0sNOdADb)wb>-R!PV-&`xipnS50uAF4HEJQ<9)LcQre#S7DW?!jW z+R5opsT)ibjCtk|V6lGTSd5K2jFu;_YE~I?5J68wij_c!Aqm46S`oMW+ErX!T&dcv z#%64$%?jU3yxucA3M(2{CdwHevsgxs=q7~;mAuinqg6Lc&pi(X1+7)1t`FEVEltcq zDzc}11UFwP2^(9-+X>T?SSn#l!;AQWj|WgH4p0b-g&eV<|Gp3;Bru(wa7=A!IZ1R1lyQ1dW5S%g0M-f42<4_-^q zq{Bt_<7HC&h(<~{uVXRfNIy6p2kQq`($ebl1ac)ZH)!2G*df37RdkrBHTbWxrbgabVn>lGz_BUk6 z`3A0k4&N>mcJ}yP#seV=;cu-xBBn#b9}`5GaSiru-Pyyf{Vses`>hX<;0kz@cA@5b zYl^6$kaxdt9hp{6Yp0d+eK5=idmy)zy9y7E;&uqwp$)0CkS8P_zR^Idxfo4^x>uE5gFvc-=wt+BkRMaRK*>L$>-amAhpt1DpoI3f*YTW=er3Lba? 
zB#=98BbO$B`$w1#0KLgC_lOUxuoEb#?__urAJnB$Xj8(Q1mc0E3y6Cb%Fb$mVMimu zOus7JO>b)Xfz*R=SwW+ikRO<@2LC4qftGb=-+*EGSPa=K1*}?`SBJB=ieaFa_q=mm zx|r3s2PYHT?=VhysUAtOzCW_=PbJg8kKf=M_RLw4IGvB@$`z49y*)21c|I+JeU{26 zLvL?}>}wrw-2^1IdfRTRby4{-4&vIeVsoMX$uP$m6Dhm>Q55pW?+dC}cUh+0{o~`8 ze$-oyDlNto&nYrKNNLEQM~%*AZU%>b^*1)oy*8*elpr6#*-B>)$n!|5kIdh01r|b` zqwM=O-2HF{THDM2dvtZtVYx~-=U2Mm@X)S=Lo0*$j!^5iqa@iYt3nP#M7!19`|Cf6 ze71wdjIWL}4GzzT)z_~5{a3t;B?Bju#h}eNoVlg}Cjz$8sJK*WOMwNcGaD(Vp7y=+ z;v7nS**wwGE*pi)8C=?j&NSmD7$j`6%iIOZg=&FUn(CoCq05U=J7IE!EB#Q4O^nW| z>_IQKCx70m*C&TGax#EJr_JiljRH{McxZ|4XOb`R4kWT&MTuLz;we2D4mug)6aux} zvnZG^lAeILr#4j-#KcOd&mva$D0zuCOlAqxQmDTiE7JNoOB-RzSI`PO!YbpRiy38S zvyp*JdxM$%kC5VTi+*Ri^UkyuMXTod@+&pX@i*Z!b?P644Mc^89ut`>6GI19+e4ZqWazu2;!cj$&kQU#Uv9S-7BMS0 zA8WkoveB>VXbM$LdwrM&-zt!$Jsq5Qmd5l31h*o50jf1I6{357tki35K24PSX)}?X zClQeQHD9*mqK$t((nvY3#H}o5p`azm8UJxRM=(}p){3gg?Xdat-3Z%k1^VtF~Ik+z2lmeW?V>bY_2 zk-6RBOkVf3-FC=On)lfnJ+;ZK*dvzHW?P=u#g+_TVCI|EX6v$6_Mno=@3s$?8_3G0 z*GN{?1@HVt+z$$E3RtDNQ(0xeV+j@LgcWlEEwAmEAfi#{3%w3DMjbv?*FiVO0&hMC zUej6$XZZiNlK=%A#NkOb70((zOl9#g_v+qa$5=_Y#X%R3x67z87<9j8+QbkQjVcF~ z{4i55ZbT5DKL{lQ@>oH5yFp1plo(g*8IFMslP_C%4#qLHppD=5nLJf6rl#l=at zpJV&|`}g-SYSVo(A4KHfZAa0qGf}!4 zV7crLx}Y}g@|TNRF+h*S9SfBcAt7R*sBtSJ$(oy4c+I(8qWuZ?OVjY-e8tgLS`Swg7!&j{VwjwP3k)C6iVXO-rSXPy8cHkshU71m7bD5>bRd$x!JPTc_- zkBu62(J!a1jqlfsM4*AkG?%8b`XAACJ1x)V+@l!NK6>$(Sw90LW0f-ZB5_HXzJ|J+4?(8X^S zqP8_8;d-0uHm4&wg5kW`rHY-yUqQmIR|s@jOnV2$Wp!;^s?M!Dc|t;c|88|s+s2+qYpkd}) z2GrdZp|H4|!>6;l+Mibb^KhNQfhNqexf=#{QNu|1Eq~ zd0t5`6{Zm&k|fjN7n7 z40Ki~Memmo_Bx&ih^B$J7_O%PNk6?>t>LR0?e-$J-OHFgjf9@}a&-|gVn-+M(yxTq zMn*;!!kfS&D}wv$cIl!UE98l6!Jfc7pPW6vEI#0`3(hseK54lskNe5%<%fPo=J?2^ zwT)?d5?ewM-~6QVSt3>y`l+^P1q^b6K;u3wWEF=c4gKdnHU3IE6YnnvoDo{5PmW+3 z$?yBTwj7MHBqX3$dODw!z-wj{DtDDeWz7sVVzwN6-?um|xebej2MzSfJy=)24M$?p zadUHLKmAlurXgesppV71dv)-+f#axtBUj8uYyKr;S_paydXkVj^-k%M{pU5^&)0h)wR*HVj@mDl4Li8W#}7t`1U(`4 zzO6{F93?e4GZLs$XrICP6s3}jBtbE!FL(`14_pqDVF9s$qY$)*D{a}(cqLb-ac|0|@t~KwRl4;qeioDfvOXXw4sB#ASyu}3w z$@Z1-lDJtd%MA^Gcu!{Z-tQidRSX+lyY+-T%VGQV@-(yx`^D0%w+tS{+U>z@!HWeSlb!sdO7#gV9Wb zpgZRn(E?GwBc)HDWP_S~A!h3yE?4H1nKEAZ*lYI4;Jf02}~(w7=bxQsz>a9~o+ zdNZt$zzY#u5__w9S9*&-BCaPxU5({QsdxV2Iqwbr%k05xj2Bbt3W<4*js!N9!KtGs znu^Y(GIa^Pdv3(Bc}z(=)(?$_kOb!~#4-1cko_&PN{q<^X2 z61%9DCxmU$q5O2Bv~a%RZBsZRpG`SW8;Z_e%VWJ5v}*7$E9XfXa+jfhwDl!bxngWVT+`S z)})v4fpy21FYtlpUy86eT^UyoO#9=n?6Yx{O^(Eh@s}t3>UK0}kHtU@mkATQi8KlO zkS|k*)H@N4aVgE%L9wvV+cTDum%h9(rJioN)61e!ZkH|5bh2S)lBe%V=$8VYNHoT1 z`o!PZ0Ydp^5N_|td0C49K^-C72W!2{*{}j<%uC;<7BsGg<$5`!6#+hh&Po2nX3W**6%MuKaejy{a}84$_`a1`ufMOIpA(;Hmg0U z{Ob1l?pu?15Z1dr$UUPN5WRmo`qY4jo6S-c_Y9f>HFxu+QZGpUeW~JECTPWqK2e$^_4uFcB zkckH?dH`Q#sDF+9nm)0{-TC6M?}mX|3n$?e#eb1dN}HhAhQ8ESff8;H)UYTe5=u?1 z`kbDL#jS4nBxM-%l+@F^=B(h?F>#&et5*0rjx{i=7A>C$ltzE(XAD0alt&r(V2i2n zs&gQRVqWV3=T7CTvT!rt2ikr3MZ=C@?8#TwA6k?*If8ap(Sl*vh%SsM8ZxgxLe-;C z19zdfqpuJCroPrnQYC+r_1fF{2b#=dK41!A+jc!hquFQTl|_9F0^LVqlOTLyWEk4T z%ov0_6jStJt!G{9Ay{6kQPN+B&c^9)=}3X>dUZt)@!4+}(nBd@ozH8@78s6Svf)$4 zzR6%?O%OzQLZDnx)zfr{A3ll~cB!CY}e86H>l~SpXXZk>i3LIHffa@KABL zDzcL5Kw zPwHSS4&Ds4_*fyP_saS>nk&;Cjlv*;q>}(5rATebSO)>sfnXIK{rV>aBOpP?$>FXy1$_uX25TC*D%%o`Kd zWb=C!1q{@~kNj%3a(3TExb&oblkW|9#6{DzJbTFlYIR{hpGM0?i0E)%ruKWfFAkse zY^o1?xGid<>MpD{mDFMEh@hKOoJbixT8Spq=`~tJuAe)X*KqR`ZTpp9@x?dh z4^E}=3ts=!ls+A<%l*BBuvk6%dMIya7vmh2#ta3YvV#fYLy36ADV2R%?mD=;YGst z1Zsx=HrxvWJn=* zfk-B!P)8F-#IX~{9@=vf_Vjil_P4c}`&(%g@GRsLDVY!iCi`qb5rf`he9(8S8nX5I zivld)dp77aMjo&!beom+@%X~gi*Fc!JWMBOZ|W01ZEdv(<$oIOF1havfau5XOvV5F zKnHQul0BCMh|7xH2OfVz1~Tf-E_og_xR;(@U#MzY4&LJ>18uZX(DDn5r(rQI(|eMZ 
zCGRW6QLoQ9HJx8KXCJW*Fw=lP{ciMiP2^Fm2Yj_$c^!dG#xk#CzKGfpb9?sMD|Xgv zkuA+CKKDihy5(=d5G5N6NLm!hEM^91+Ap+}j!_RbA;*0tzRMF3LRKH_ll=ISr_t8F_A(3! zQt@8s`VWLCk}u`1J&@htp%B+YRmp`)TbQ4TKl3>vqG^``iqhinZOYZu?((V5R*+7_ zHD@$G;(=fP{igEmwM})qN*td1daZEKtixRmGiOiD%9V*16ME4YS;Ap?Xh?0|+X#a~ z3nrtlrB%!h#(RTm4K1SA&sjn-LtUvs{<1S8jVbh34K=}?mG_mPW3u^(7RUshp7)BQ zqn%nc$FZAsxPYWADe1EL+Lbll82ty`5KRm#KBROEPBNB`H9DJtVd#GeM`o*+*F5~d6ms}){{#7OS3rCS dGcwXMDuEJV%}>{d=D%mtN^)wl_0r~{{|BAue8K<# literal 0 HcmV?d00001 diff --git a/docs/topic_guides/splink_fundamentals/backends.md b/docs/topic_guides/splink_fundamentals/backends.md deleted file mode 100644 index 66e806ca18..0000000000 --- a/docs/topic_guides/splink_fundamentals/backends.md +++ /dev/null @@ -1,101 +0,0 @@ ---- -tags: - - Spark - - DuckDB - - Athena - - SQLite - - Postgres - - Backends ---- - -# Splink's SQL backends: Spark, DuckDB, etc - -Splink is a Python library. It implements all data linking computations by generating SQL, and submitting the SQL statements to a backend of the user's choosing for execution. - -For smaller input datasets of up to 1-2 million records, users can link data in Python on their laptop using the DuckDB backend. This is the recommended approach because the DuckDB backend is installed automatically when the user installs Splink using `pip install splink`. No additional configuration is needed. - -Linking larger datasets requires highly computationally intensive calculations, and generates datasets which are too large to be processed on a standard laptop. For these scenarios, we recommend using one of Splink's big data backend - currently Spark or AWS Athena. When these backends are used, the SQL generated by Splink is sent to the chosen backend for execution. - -The Splink code you write is almost identical between backends, so it's straightforward to migrate between backends. Often, it's a good idea to start working using DuckDB on a sample of data, because it will produce results very quickly. When you're comfortable with your model, you may wish to migrate to a big data backend to estimate/predict on the full dataset. - -## Choosing a backend - -Import the linker from the backend of your choosing, and the backend-specific comparison libraries. - -Once you have initialised the `linker` object, there is no difference in the subsequent code between backends. - -Note however, that not all comparison functions are available in all backends. -There are tables detailing the available functions for each backend on -the [comparison library API page](../../comparison_library.md) and the [comparison level library API page](../../comparison_level_library.md). 
- -=== ":simple-duckdb: DuckDB" - - ```python - from splink.duckdb.linker import DuckDBLinker - import splink.duckdb.comparison_library as cl - import splink.duckdb.comparison_level_library as cll - - linker = DuckDBLinker(your_args) - ``` - -=== ":simple-apachespark: Spark" - - ```python - from splink.spark.linker import SparkLinker - import splink.spark.comparison_library as cl - import splink.spark.comparison_level_library as cll - - linker = SparkLinker(your_args) - ``` - -=== ":simple-amazonaws: Athena" - - ```python - from splink.athena.linker import AthenaLinker - import splink.athena.comparison_library as cl - import splink.athena.comparison_level_library as cll - - linker = AthenaLinker(your_args) - ``` - -=== ":simple-sqlite: SQLite" - - ```python - from splink.sqlite.linker import SQLiteLinker - import splink.sqlite.comparison_library as cl - import splink.sqlite.comparison_level_library as cll - - linker = SQLiteLinker(your_args) - - ``` - -=== ":simple-postgresql: PostgreSql" - - ```python - from splink.postgres.linker import PostgresLinker - import splink.postgres.comparison_library as cl - import splink.postgres.comparison_level_library as cll - - linker = PostgresLinker(your_args) - - ``` - -## Information for specific backends - -### :simple-sqlite: SQLite - -[**SQLite**](https://www.sqlite.org/index.html) does not have native support for [fuzzy string-matching](../comparisons/comparators.html) functions. -However, some are available for Splink users as python [user-defined functions (UDFs)](../../dev_guides/udfs.html#sqlite): - -* [`levenshtein`](../../comparison_level_library.html#splink.comparison_level_library.LevenshteinLevelBase) -* [`damerau_levenshtein`](../../comparison_level_library.html#splink.comparison_level_library.DamerauLevenshteinLevelBase) -* [`jaro`](../../comparison_level_library.html#splink.comparison_level_libraryJaroLevelBase) -* [`jaro_winkler`](../../comparison_level_library.html#splink.comparison_level_library.JaroWinklerLevelBase) - -However, there are a couple of points to note: - -* These functions are implemented using the [rapidfuzz](https://maxbachmann.github.io/RapidFuzz/) package, which must be installed if you wish to make use of them, via e.g. `pip install rapidfuzz`. If you do not wish to do so you can disable the use of these functions when creating your linker: -```py -linker = SQLiteLinker(df, settings, ..., register_udfs=False) -``` -* As these functions are implemented in python they will be considerably slower than any native-SQL comparisons. If you find that your model-training or predictions are taking a large time to run, you may wish to consider instead switching to DuckDB (or some other backend). 
diff --git a/docs/topic_guides/backends/backends.md b/docs/topic_guides/splink_fundamentals/backends/backends.md similarity index 100% rename from docs/topic_guides/backends/backends.md rename to docs/topic_guides/splink_fundamentals/backends/backends.md diff --git a/docs/topic_guides/backends/postgres.md b/docs/topic_guides/splink_fundamentals/backends/postgres.md similarity index 100% rename from docs/topic_guides/backends/postgres.md rename to docs/topic_guides/splink_fundamentals/backends/postgres.md diff --git a/docs/topic_guides/topic_guides_index.md b/docs/topic_guides/topic_guides_index.md index 34e7439701..703e638a2a 100644 --- a/docs/topic_guides/topic_guides_index.md +++ b/docs/topic_guides/topic_guides_index.md @@ -5,7 +5,7 @@ This section contains in-depth guides on a variety of topics and concepts within The topic guides are broken up into the following categories: 1. [Record Linkage Theory](theory/record_linkage.md) - for an introduction to data linkage from a theoretical perspective, and to help build some intuition around the parameters being estimated in Splink models. -2. [Linkage Models in Splink](backends/backends.md) - for an introduction to the building blocks of a Splink model. Including the supported SQL Backends and how to define a model with a Splink Settings dictionary. +2. [Linkage Models in Splink](splink_fundamentals/backends/backends.md) - for an introduction to the building blocks of a Splink model. Including the supported SQL Backends and how to define a model with a Splink Settings dictionary. 3. [Data Preparation](data_preparation/feature_engineering.md) - for guidance on preparing your data for linkage. Including guidance on feature engineering to help improve Splink models. 4. [Blocking](blocking/blocking_rules.md) - for an introduction to Blocking Rules and their purpose within record linkage. Including how blocking rules are used in different contexts within Splink. 5. [Comparing Records](comparisons/customising_comparisons.ipynb) - for guidance on defining `Comparison`s withing a Splink model. Including how comparing records are structured within `Comparison`s, how to utilise string comparators for fuzzy matching and how deal with skewed data with Term Frequency Adjustments. 
diff --git a/mkdocs.yml b/mkdocs.yml index 376261992d..877d6bf1f0 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -52,7 +52,6 @@ plugins: python: rendering: show_source: false - custom_templates: templates - git-revision-date-localized: enable_creation_date: true type: timeago @@ -131,8 +130,8 @@ nav: - The Fellegi-Sunter Model: "topic_guides/theory/fellegi_sunter.md" - Linkage Models in Splink: - Splink's SQL backends - Spark, DuckDB etc: - - Backends overview: "topic_guides/backends/backends.md" - - PostgreSQL: "topic_guides/backends/postgres.md" + - Backends overview: "topic_guides/splink_fundamentals/backends/backends.md" + - PostgreSQL: "topic_guides/splink_fundamentals/backends/postgres.md" - Link type - linking vs deduping: "topic_guides/splink_fundamentals/link_type.md" - Defining Splink models: "topic_guides/splink_fundamentals/settings.md" - Retrieving and querying Splink results: "topic_guides/splink_fundamentals/querying_splink_results.md" @@ -140,7 +139,7 @@ nav: - Feature Engineering: "topic_guides/data_preparation/feature_engineering.md" - Blocking: - What are Blocking Rules?: "topic_guides/blocking/blocking_rules.md" - - Prediction Blocking Rules: "topic_guides/blocking/predictions.md" + - Prediction Blocking Rules: "topic_guides/blocking/predictions.md" - Model Training Blocking Rules: "topic_guides/blocking/model_training.md" - Computational Performance: "topic_guides/blocking/performance.md" - Comparing Records: @@ -182,13 +181,13 @@ nav: - Comparisons API: - Comparison: "comparison.md" - Comparison Level: "comparison_level.md" - - Charts Gallery: + - Charts Gallery: - "charts/index.md" - Exploratory Analysis: - completeness chart: "charts/completeness_chart.ipynb" - missingness chart: "charts/missingness_chart.ipynb" - profile columns: "charts/profile_columns.ipynb" - - Blocking: + - Blocking: - cumulative num comparisons from blocking rules chart: "charts/cumulative_num_comparisons_from_blocking_rules_chart.ipynb" - Comparison Helpers: - comparator score chart: "topic_guides/comparisons/choosing_comparators.html#comparing-string-similarity-and-distance-scores" @@ -231,12 +230,15 @@ nav: - Comparison and comparison level libraries: - Creating new comparisons and comparison levels: "dev_guides/comparisons/new_library_comparisons_and_levels.md" - Extending existing comparisons and comparison levels: "dev_guides/comparisons/extending_library_comparisons_and_levels.md" - - Charts: + - Charts: - Understanding and editing charts: "dev_guides/charts/understanding_and_editing_charts.md" - Building new charts: "dev_guides/charts/building_charts.ipynb" - User-Defined Functions: "dev_guides/udfs.md" + - Settings Validation: + - Settings Validation Overview: "dev_guides/settings_validation/settings_validation_overview.md" + - Extending the Settings Validator: "dev_guides/settings_validation/extending_settings_validator.md" - Blog: - - blog/index.md + - blog/index.md extra_css: - css/custom.css - css/neoteroi-mkdocs.css diff --git a/poetry.lock b/poetry.lock index 19a96f6b0f..c16d4920ed 100644 --- a/poetry.lock +++ b/poetry.lock @@ -25,98 +25,98 @@ files = [ [[package]] name = "aiohttp" -version = "3.8.5" +version = "3.8.6" description = "Async http client/server framework (asyncio)" -optional = true +optional = false python-versions = ">=3.6" files = [ - {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a94159871304770da4dd371f4291b20cac04e8c94f11bdea1c3478e557fbe0d8"}, - {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:13bf85afc99ce6f9ee3567b04501f18f9f8dbbb2ea11ed1a2e079670403a7c84"}, - {file = "aiohttp-3.8.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2ce2ac5708501afc4847221a521f7e4b245abf5178cf5ddae9d5b3856ddb2f3a"}, - {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96943e5dcc37a6529d18766597c491798b7eb7a61d48878611298afc1fca946c"}, - {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ad5c3c4590bb3cc28b4382f031f3783f25ec223557124c68754a2231d989e2b"}, - {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c413c633d0512df4dc7fd2373ec06cc6a815b7b6d6c2f208ada7e9e93a5061d"}, - {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df72ac063b97837a80d80dec8d54c241af059cc9bb42c4de68bd5b61ceb37caa"}, - {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c48c5c0271149cfe467c0ff8eb941279fd6e3f65c9a388c984e0e6cf57538e14"}, - {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:368a42363c4d70ab52c2c6420a57f190ed3dfaca6a1b19afda8165ee16416a82"}, - {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7607ec3ce4993464368505888af5beb446845a014bc676d349efec0e05085905"}, - {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0d21c684808288a98914e5aaf2a7c6a3179d4df11d249799c32d1808e79503b5"}, - {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:312fcfbacc7880a8da0ae8b6abc6cc7d752e9caa0051a53d217a650b25e9a691"}, - {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ad093e823df03bb3fd37e7dec9d4670c34f9e24aeace76808fc20a507cace825"}, - {file = "aiohttp-3.8.5-cp310-cp310-win32.whl", hash = "sha256:33279701c04351a2914e1100b62b2a7fdb9a25995c4a104259f9a5ead7ed4802"}, - {file = "aiohttp-3.8.5-cp310-cp310-win_amd64.whl", hash = "sha256:6e4a280e4b975a2e7745573e3fc9c9ba0d1194a3738ce1cbaa80626cc9b4f4df"}, - {file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ae871a964e1987a943d83d6709d20ec6103ca1eaf52f7e0d36ee1b5bebb8b9b9"}, - {file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:461908b2578955045efde733719d62f2b649c404189a09a632d245b445c9c975"}, - {file = "aiohttp-3.8.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:72a860c215e26192379f57cae5ab12b168b75db8271f111019509a1196dfc780"}, - {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc14be025665dba6202b6a71cfcdb53210cc498e50068bc088076624471f8bb9"}, - {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8af740fc2711ad85f1a5c034a435782fbd5b5f8314c9a3ef071424a8158d7f6b"}, - {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:841cd8233cbd2111a0ef0a522ce016357c5e3aff8a8ce92bcfa14cef890d698f"}, - {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ed1c46fb119f1b59304b5ec89f834f07124cd23ae5b74288e364477641060ff"}, - {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:84f8ae3e09a34f35c18fa57f015cc394bd1389bce02503fb30c394d04ee6b938"}, - {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = 
"sha256:62360cb771707cb70a6fd114b9871d20d7dd2163a0feafe43fd115cfe4fe845e"}, - {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:23fb25a9f0a1ca1f24c0a371523546366bb642397c94ab45ad3aedf2941cec6a"}, - {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:b0ba0d15164eae3d878260d4c4df859bbdc6466e9e6689c344a13334f988bb53"}, - {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5d20003b635fc6ae3f96d7260281dfaf1894fc3aa24d1888a9b2628e97c241e5"}, - {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0175d745d9e85c40dcc51c8f88c74bfbaef9e7afeeeb9d03c37977270303064c"}, - {file = "aiohttp-3.8.5-cp311-cp311-win32.whl", hash = "sha256:2e1b1e51b0774408f091d268648e3d57f7260c1682e7d3a63cb00d22d71bb945"}, - {file = "aiohttp-3.8.5-cp311-cp311-win_amd64.whl", hash = "sha256:043d2299f6dfdc92f0ac5e995dfc56668e1587cea7f9aa9d8a78a1b6554e5755"}, - {file = "aiohttp-3.8.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cae533195e8122584ec87531d6df000ad07737eaa3c81209e85c928854d2195c"}, - {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f21e83f355643c345177a5d1d8079f9f28b5133bcd154193b799d380331d5d3"}, - {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a7a75ef35f2df54ad55dbf4b73fe1da96f370e51b10c91f08b19603c64004acc"}, - {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e2e9839e14dd5308ee773c97115f1e0a1cb1d75cbeeee9f33824fa5144c7634"}, - {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44e65da1de4403d0576473e2344828ef9c4c6244d65cf4b75549bb46d40b8dd"}, - {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78d847e4cde6ecc19125ccbc9bfac4a7ab37c234dd88fbb3c5c524e8e14da543"}, - {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:c7a815258e5895d8900aec4454f38dca9aed71085f227537208057853f9d13f2"}, - {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:8b929b9bd7cd7c3939f8bcfffa92fae7480bd1aa425279d51a89327d600c704d"}, - {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:5db3a5b833764280ed7618393832e0853e40f3d3e9aa128ac0ba0f8278d08649"}, - {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:a0215ce6041d501f3155dc219712bc41252d0ab76474615b9700d63d4d9292af"}, - {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:fd1ed388ea7fbed22c4968dd64bab0198de60750a25fe8c0c9d4bef5abe13824"}, - {file = "aiohttp-3.8.5-cp36-cp36m-win32.whl", hash = "sha256:6e6783bcc45f397fdebc118d772103d751b54cddf5b60fbcc958382d7dd64f3e"}, - {file = "aiohttp-3.8.5-cp36-cp36m-win_amd64.whl", hash = "sha256:b5411d82cddd212644cf9360879eb5080f0d5f7d809d03262c50dad02f01421a"}, - {file = "aiohttp-3.8.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:01d4c0c874aa4ddfb8098e85d10b5e875a70adc63db91f1ae65a4b04d3344cda"}, - {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5980a746d547a6ba173fd5ee85ce9077e72d118758db05d229044b469d9029a"}, - {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a482e6da906d5e6e653be079b29bc173a48e381600161c9932d89dfae5942ef"}, - {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:80bd372b8d0715c66c974cf57fe363621a02f359f1ec81cba97366948c7fc873"}, - {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1161b345c0a444ebcf46bf0a740ba5dcf50612fd3d0528883fdc0eff578006a"}, - {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd56db019015b6acfaaf92e1ac40eb8434847d9bf88b4be4efe5bfd260aee692"}, - {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:153c2549f6c004d2754cc60603d4668899c9895b8a89397444a9c4efa282aaf4"}, - {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4a01951fabc4ce26ab791da5f3f24dca6d9a6f24121746eb19756416ff2d881b"}, - {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bfb9162dcf01f615462b995a516ba03e769de0789de1cadc0f916265c257e5d8"}, - {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:7dde0009408969a43b04c16cbbe252c4f5ef4574ac226bc8815cd7342d2028b6"}, - {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:4149d34c32f9638f38f544b3977a4c24052042affa895352d3636fa8bffd030a"}, - {file = "aiohttp-3.8.5-cp37-cp37m-win32.whl", hash = "sha256:68c5a82c8779bdfc6367c967a4a1b2aa52cd3595388bf5961a62158ee8a59e22"}, - {file = "aiohttp-3.8.5-cp37-cp37m-win_amd64.whl", hash = "sha256:2cf57fb50be5f52bda004b8893e63b48530ed9f0d6c96c84620dc92fe3cd9b9d"}, - {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:eca4bf3734c541dc4f374ad6010a68ff6c6748f00451707f39857f429ca36ced"}, - {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1274477e4c71ce8cfe6c1ec2f806d57c015ebf84d83373676036e256bc55d690"}, - {file = "aiohttp-3.8.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:28c543e54710d6158fc6f439296c7865b29e0b616629767e685a7185fab4a6b9"}, - {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:910bec0c49637d213f5d9877105d26e0c4a4de2f8b1b29405ff37e9fc0ad52b8"}, - {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5443910d662db951b2e58eb70b0fbe6b6e2ae613477129a5805d0b66c54b6cb7"}, - {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e460be6978fc24e3df83193dc0cc4de46c9909ed92dd47d349a452ef49325b7"}, - {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb1558def481d84f03b45888473fc5a1f35747b5f334ef4e7a571bc0dfcb11f8"}, - {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34dd0c107799dcbbf7d48b53be761a013c0adf5571bf50c4ecad5643fe9cfcd0"}, - {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:aa1990247f02a54185dc0dff92a6904521172a22664c863a03ff64c42f9b5410"}, - {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0e584a10f204a617d71d359fe383406305a4b595b333721fa50b867b4a0a1548"}, - {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:a3cf433f127efa43fee6b90ea4c6edf6c4a17109d1d037d1a52abec84d8f2e42"}, - {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:c11f5b099adafb18e65c2c997d57108b5bbeaa9eeee64a84302c0978b1ec948b"}, - {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:84de26ddf621d7ac4c975dbea4c945860e08cccde492269db4e1538a6a6f3c35"}, - {file = "aiohttp-3.8.5-cp38-cp38-win32.whl", hash = 
"sha256:ab88bafedc57dd0aab55fa728ea10c1911f7e4d8b43e1d838a1739f33712921c"}, - {file = "aiohttp-3.8.5-cp38-cp38-win_amd64.whl", hash = "sha256:5798a9aad1879f626589f3df0f8b79b3608a92e9beab10e5fda02c8a2c60db2e"}, - {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a6ce61195c6a19c785df04e71a4537e29eaa2c50fe745b732aa937c0c77169f3"}, - {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:773dd01706d4db536335fcfae6ea2440a70ceb03dd3e7378f3e815b03c97ab51"}, - {file = "aiohttp-3.8.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f83a552443a526ea38d064588613aca983d0ee0038801bc93c0c916428310c28"}, - {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f7372f7341fcc16f57b2caded43e81ddd18df53320b6f9f042acad41f8e049a"}, - {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea353162f249c8097ea63c2169dd1aa55de1e8fecbe63412a9bc50816e87b761"}, - {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5d47ae48db0b2dcf70bc8a3bc72b3de86e2a590fc299fdbbb15af320d2659de"}, - {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d827176898a2b0b09694fbd1088c7a31836d1a505c243811c87ae53a3f6273c1"}, - {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3562b06567c06439d8b447037bb655ef69786c590b1de86c7ab81efe1c9c15d8"}, - {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4e874cbf8caf8959d2adf572a78bba17cb0e9d7e51bb83d86a3697b686a0ab4d"}, - {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6809a00deaf3810e38c628e9a33271892f815b853605a936e2e9e5129762356c"}, - {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:33776e945d89b29251b33a7e7d006ce86447b2cfd66db5e5ded4e5cd0340585c"}, - {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:eaeed7abfb5d64c539e2db173f63631455f1196c37d9d8d873fc316470dfbacd"}, - {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e91d635961bec2d8f19dfeb41a539eb94bd073f075ca6dae6c8dc0ee89ad6f91"}, - {file = "aiohttp-3.8.5-cp39-cp39-win32.whl", hash = "sha256:00ad4b6f185ec67f3e6562e8a1d2b69660be43070bd0ef6fcec5211154c7df67"}, - {file = "aiohttp-3.8.5-cp39-cp39-win_amd64.whl", hash = "sha256:c0a9034379a37ae42dea7ac1e048352d96286626251862e448933c0f59cbd79c"}, - {file = "aiohttp-3.8.5.tar.gz", hash = "sha256:b9552ec52cc147dbf1944ac7ac98af7602e51ea2dcd076ed194ca3c0d1c7d0bc"}, + {file = "aiohttp-3.8.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:41d55fc043954cddbbd82503d9cc3f4814a40bcef30b3569bc7b5e34130718c1"}, + {file = "aiohttp-3.8.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1d84166673694841d8953f0a8d0c90e1087739d24632fe86b1a08819168b4566"}, + {file = "aiohttp-3.8.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:253bf92b744b3170eb4c4ca2fa58f9c4b87aeb1df42f71d4e78815e6e8b73c9e"}, + {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3fd194939b1f764d6bb05490987bfe104287bbf51b8d862261ccf66f48fb4096"}, + {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6c5f938d199a6fdbdc10bbb9447496561c3a9a565b43be564648d81e1102ac22"}, + {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:2817b2f66ca82ee699acd90e05c95e79bbf1dc986abb62b61ec8aaf851e81c93"}, + {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fa375b3d34e71ccccf172cab401cd94a72de7a8cc01847a7b3386204093bb47"}, + {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9de50a199b7710fa2904be5a4a9b51af587ab24c8e540a7243ab737b45844543"}, + {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e1d8cb0b56b3587c5c01de3bf2f600f186da7e7b5f7353d1bf26a8ddca57f965"}, + {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:8e31e9db1bee8b4f407b77fd2507337a0a80665ad7b6c749d08df595d88f1cf5"}, + {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:7bc88fc494b1f0311d67f29fee6fd636606f4697e8cc793a2d912ac5b19aa38d"}, + {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ec00c3305788e04bf6d29d42e504560e159ccaf0be30c09203b468a6c1ccd3b2"}, + {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ad1407db8f2f49329729564f71685557157bfa42b48f4b93e53721a16eb813ed"}, + {file = "aiohttp-3.8.6-cp310-cp310-win32.whl", hash = "sha256:ccc360e87341ad47c777f5723f68adbb52b37ab450c8bc3ca9ca1f3e849e5fe2"}, + {file = "aiohttp-3.8.6-cp310-cp310-win_amd64.whl", hash = "sha256:93c15c8e48e5e7b89d5cb4613479d144fda8344e2d886cf694fd36db4cc86865"}, + {file = "aiohttp-3.8.6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6e2f9cc8e5328f829f6e1fb74a0a3a939b14e67e80832975e01929e320386b34"}, + {file = "aiohttp-3.8.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e6a00ffcc173e765e200ceefb06399ba09c06db97f401f920513a10c803604ca"}, + {file = "aiohttp-3.8.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:41bdc2ba359032e36c0e9de5a3bd00d6fb7ea558a6ce6b70acedf0da86458321"}, + {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:14cd52ccf40006c7a6cd34a0f8663734e5363fd981807173faf3a017e202fec9"}, + {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2d5b785c792802e7b275c420d84f3397668e9d49ab1cb52bd916b3b3ffcf09ad"}, + {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1bed815f3dc3d915c5c1e556c397c8667826fbc1b935d95b0ad680787896a358"}, + {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:96603a562b546632441926cd1293cfcb5b69f0b4159e6077f7c7dbdfb686af4d"}, + {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d76e8b13161a202d14c9584590c4df4d068c9567c99506497bdd67eaedf36403"}, + {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e3f1e3f1a1751bb62b4a1b7f4e435afcdade6c17a4fd9b9d43607cebd242924a"}, + {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:76b36b3124f0223903609944a3c8bf28a599b2cc0ce0be60b45211c8e9be97f8"}, + {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:a2ece4af1f3c967a4390c284797ab595a9f1bc1130ef8b01828915a05a6ae684"}, + {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:16d330b3b9db87c3883e565340d292638a878236418b23cc8b9b11a054aaa887"}, + {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:42c89579f82e49db436b69c938ab3e1559e5a4409eb8639eb4143989bc390f2f"}, + {file = "aiohttp-3.8.6-cp311-cp311-win32.whl", hash = 
"sha256:efd2fcf7e7b9d7ab16e6b7d54205beded0a9c8566cb30f09c1abe42b4e22bdcb"}, + {file = "aiohttp-3.8.6-cp311-cp311-win_amd64.whl", hash = "sha256:3b2ab182fc28e7a81f6c70bfbd829045d9480063f5ab06f6e601a3eddbbd49a0"}, + {file = "aiohttp-3.8.6-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:fdee8405931b0615220e5ddf8cd7edd8592c606a8e4ca2a00704883c396e4479"}, + {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d25036d161c4fe2225d1abff2bd52c34ed0b1099f02c208cd34d8c05729882f0"}, + {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d791245a894be071d5ab04bbb4850534261a7d4fd363b094a7b9963e8cdbd31"}, + {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0cccd1de239afa866e4ce5c789b3032442f19c261c7d8a01183fd956b1935349"}, + {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f13f60d78224f0dace220d8ab4ef1dbc37115eeeab8c06804fec11bec2bbd07"}, + {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a9b5a0606faca4f6cc0d338359d6fa137104c337f489cd135bb7fbdbccb1e39"}, + {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:13da35c9ceb847732bf5c6c5781dcf4780e14392e5d3b3c689f6d22f8e15ae31"}, + {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:4d4cbe4ffa9d05f46a28252efc5941e0462792930caa370a6efaf491f412bc66"}, + {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:229852e147f44da0241954fc6cb910ba074e597f06789c867cb7fb0621e0ba7a"}, + {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:713103a8bdde61d13490adf47171a1039fd880113981e55401a0f7b42c37d071"}, + {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:45ad816b2c8e3b60b510f30dbd37fe74fd4a772248a52bb021f6fd65dff809b6"}, + {file = "aiohttp-3.8.6-cp36-cp36m-win32.whl", hash = "sha256:2b8d4e166e600dcfbff51919c7a3789ff6ca8b3ecce16e1d9c96d95dd569eb4c"}, + {file = "aiohttp-3.8.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0912ed87fee967940aacc5306d3aa8ba3a459fcd12add0b407081fbefc931e53"}, + {file = "aiohttp-3.8.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e2a988a0c673c2e12084f5e6ba3392d76c75ddb8ebc6c7e9ead68248101cd446"}, + {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ebf3fd9f141700b510d4b190094db0ce37ac6361a6806c153c161dc6c041ccda"}, + {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3161ce82ab85acd267c8f4b14aa226047a6bee1e4e6adb74b798bd42c6ae1f80"}, + {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d95fc1bf33a9a81469aa760617b5971331cdd74370d1214f0b3109272c0e1e3c"}, + {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c43ecfef7deaf0617cee936836518e7424ee12cb709883f2c9a1adda63cc460"}, + {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca80e1b90a05a4f476547f904992ae81eda5c2c85c66ee4195bb8f9c5fb47f28"}, + {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:90c72ebb7cb3a08a7f40061079817133f502a160561d0675b0a6adf231382c92"}, + {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:bb54c54510e47a8c7c8e63454a6acc817519337b2b78606c4e840871a3e15349"}, + {file = 
"aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:de6a1c9f6803b90e20869e6b99c2c18cef5cc691363954c93cb9adeb26d9f3ae"}, + {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:a3628b6c7b880b181a3ae0a0683698513874df63783fd89de99b7b7539e3e8a8"}, + {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:fc37e9aef10a696a5a4474802930079ccfc14d9f9c10b4662169671ff034b7df"}, + {file = "aiohttp-3.8.6-cp37-cp37m-win32.whl", hash = "sha256:f8ef51e459eb2ad8e7a66c1d6440c808485840ad55ecc3cafefadea47d1b1ba2"}, + {file = "aiohttp-3.8.6-cp37-cp37m-win_amd64.whl", hash = "sha256:b2fe42e523be344124c6c8ef32a011444e869dc5f883c591ed87f84339de5976"}, + {file = "aiohttp-3.8.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:9e2ee0ac5a1f5c7dd3197de309adfb99ac4617ff02b0603fd1e65b07dc772e4b"}, + {file = "aiohttp-3.8.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:01770d8c04bd8db568abb636c1fdd4f7140b284b8b3e0b4584f070180c1e5c62"}, + {file = "aiohttp-3.8.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3c68330a59506254b556b99a91857428cab98b2f84061260a67865f7f52899f5"}, + {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89341b2c19fb5eac30c341133ae2cc3544d40d9b1892749cdd25892bbc6ac951"}, + {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71783b0b6455ac8f34b5ec99d83e686892c50498d5d00b8e56d47f41b38fbe04"}, + {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f628dbf3c91e12f4d6c8b3f092069567d8eb17814aebba3d7d60c149391aee3a"}, + {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b04691bc6601ef47c88f0255043df6f570ada1a9ebef99c34bd0b72866c217ae"}, + {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ee912f7e78287516df155f69da575a0ba33b02dd7c1d6614dbc9463f43066e3"}, + {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9c19b26acdd08dd239e0d3669a3dddafd600902e37881f13fbd8a53943079dbc"}, + {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:99c5ac4ad492b4a19fc132306cd57075c28446ec2ed970973bbf036bcda1bcc6"}, + {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:f0f03211fd14a6a0aed2997d4b1c013d49fb7b50eeb9ffdf5e51f23cfe2c77fa"}, + {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:8d399dade330c53b4106160f75f55407e9ae7505263ea86f2ccca6bfcbdb4921"}, + {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ec4fd86658c6a8964d75426517dc01cbf840bbf32d055ce64a9e63a40fd7b771"}, + {file = "aiohttp-3.8.6-cp38-cp38-win32.whl", hash = "sha256:33164093be11fcef3ce2571a0dccd9041c9a93fa3bde86569d7b03120d276c6f"}, + {file = "aiohttp-3.8.6-cp38-cp38-win_amd64.whl", hash = "sha256:bdf70bfe5a1414ba9afb9d49f0c912dc524cf60141102f3a11143ba3d291870f"}, + {file = "aiohttp-3.8.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:d52d5dc7c6682b720280f9d9db41d36ebe4791622c842e258c9206232251ab2b"}, + {file = "aiohttp-3.8.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4ac39027011414dbd3d87f7edb31680e1f430834c8cef029f11c66dad0670aa5"}, + {file = "aiohttp-3.8.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3f5c7ce535a1d2429a634310e308fb7d718905487257060e5d4598e29dc17f0b"}, + {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:b30e963f9e0d52c28f284d554a9469af073030030cef8693106d918b2ca92f54"}, + {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:918810ef188f84152af6b938254911055a72e0f935b5fbc4c1a4ed0b0584aed1"}, + {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:002f23e6ea8d3dd8d149e569fd580c999232b5fbc601c48d55398fbc2e582e8c"}, + {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4fcf3eabd3fd1a5e6092d1242295fa37d0354b2eb2077e6eb670accad78e40e1"}, + {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:255ba9d6d5ff1a382bb9a578cd563605aa69bec845680e21c44afc2670607a95"}, + {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d67f8baed00870aa390ea2590798766256f31dc5ed3ecc737debb6e97e2ede78"}, + {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:86f20cee0f0a317c76573b627b954c412ea766d6ada1a9fcf1b805763ae7feeb"}, + {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:39a312d0e991690ccc1a61f1e9e42daa519dcc34ad03eb6f826d94c1190190dd"}, + {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e827d48cf802de06d9c935088c2924e3c7e7533377d66b6f31ed175c1620e05e"}, + {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bd111d7fc5591ddf377a408ed9067045259ff2770f37e2d94e6478d0f3fc0c17"}, + {file = "aiohttp-3.8.6-cp39-cp39-win32.whl", hash = "sha256:caf486ac1e689dda3502567eb89ffe02876546599bbf915ec94b1fa424eeffd4"}, + {file = "aiohttp-3.8.6-cp39-cp39-win_amd64.whl", hash = "sha256:3f0e27e5b733803333bb2371249f41cf42bae8884863e8e8965ec69bebe53132"}, + {file = "aiohttp-3.8.6.tar.gz", hash = "sha256:b0cf2a4501bff9330a8a5248b4ce951851e415bdcce9dc158e76cfd55e15085c"}, ] [package.dependencies] @@ -137,7 +137,7 @@ speedups = ["Brotli", "aiodns", "cchardet"] name = "aiosignal" version = "1.3.1" description = "aiosignal: a list of registered asynchronous callbacks" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, @@ -310,7 +310,7 @@ files = [ name = "async-timeout" version = "4.0.3" description = "Timeout context manager for asyncio programs" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, @@ -324,7 +324,7 @@ typing-extensions = {version = ">=3.6.5", markers = "python_version < \"3.8\""} name = "asynctest" version = "0.13.0" description = "Enhance the standard unittest package with features for testing asyncio libraries" -optional = true +optional = false python-versions = ">=3.5" files = [ {file = "asynctest-0.13.0-py3-none-any.whl", hash = "sha256:5da6118a7e6d6b54d83a8f7197769d046922a44d2a99c21382f0a6e4fadae676"}, @@ -969,7 +969,7 @@ cached-property = {version = ">=1.3.0", markers = "python_version < \"3.8\""} name = "frozenlist" version = "1.3.3" description = "A list-like structure which implements collections.abc.MutableSequence" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "frozenlist-1.3.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ff8bf625fe85e119553b5383ba0fb6aa3d0ec2ae980295aaefa552374926b3f4"}, @@ -1907,7 +1907,7 @@ files = [ name = "multidict" version = "6.0.4" 
description = "multidict implementation" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"}, @@ -3522,6 +3522,7 @@ files = [ {file = "SQLAlchemy-1.4.49-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:03db81b89fe7ef3857b4a00b63dedd632d6183d4ea5a31c5d8a92e000a41fc71"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:95b9df9afd680b7a3b13b38adf6e3a38995da5e162cc7524ef08e3be4e5ed3e1"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a63e43bf3f668c11bb0444ce6e809c1227b8f067ca1068898f3008a273f52b09"}, + {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca46de16650d143a928d10842939dab208e8d8c3a9a8757600cae9b7c579c5cd"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f835c050ebaa4e48b18403bed2c0fda986525896efd76c245bdd4db995e51a4c"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c21b172dfb22e0db303ff6419451f0cac891d2e911bb9fbf8003d717f1bcf91"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-win32.whl", hash = "sha256:5fb1ebdfc8373b5a291485757bd6431de8d7ed42c27439f543c81f6c8febd729"}, @@ -3531,26 +3532,35 @@ files = [ {file = "SQLAlchemy-1.4.49-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5debe7d49b8acf1f3035317e63d9ec8d5e4d904c6e75a2a9246a119f5f2fdf3d"}, {file = "SQLAlchemy-1.4.49-cp311-cp311-win32.whl", hash = "sha256:82b08e82da3756765c2e75f327b9bf6b0f043c9c3925fb95fb51e1567fa4ee87"}, {file = "SQLAlchemy-1.4.49-cp311-cp311-win_amd64.whl", hash = "sha256:171e04eeb5d1c0d96a544caf982621a1711d078dbc5c96f11d6469169bd003f1"}, + {file = "SQLAlchemy-1.4.49-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f23755c384c2969ca2f7667a83f7c5648fcf8b62a3f2bbd883d805454964a800"}, + {file = "SQLAlchemy-1.4.49-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8396e896e08e37032e87e7fbf4a15f431aa878c286dc7f79e616c2feacdb366c"}, + {file = "SQLAlchemy-1.4.49-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66da9627cfcc43bbdebd47bfe0145bb662041472393c03b7802253993b6b7c90"}, + {file = "SQLAlchemy-1.4.49-cp312-cp312-win32.whl", hash = "sha256:9a06e046ffeb8a484279e54bda0a5abfd9675f594a2e38ef3133d7e4d75b6214"}, + {file = "SQLAlchemy-1.4.49-cp312-cp312-win_amd64.whl", hash = "sha256:7cf8b90ad84ad3a45098b1c9f56f2b161601e4670827d6b892ea0e884569bd1d"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:36e58f8c4fe43984384e3fbe6341ac99b6b4e083de2fe838f0fdb91cebe9e9cb"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b31e67ff419013f99ad6f8fc73ee19ea31585e1e9fe773744c0f3ce58c039c30"}, + {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc22807a7e161c0d8f3da34018ab7c97ef6223578fcdd99b1d3e7ed1100a5db"}, {file = 
"SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c14b29d9e1529f99efd550cd04dbb6db6ba5d690abb96d52de2bff4ed518bc95"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c40f3470e084d31247aea228aa1c39bbc0904c2b9ccbf5d3cfa2ea2dac06f26d"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-win32.whl", hash = "sha256:706bfa02157b97c136547c406f263e4c6274a7b061b3eb9742915dd774bbc264"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-win_amd64.whl", hash = "sha256:a7f7b5c07ae5c0cfd24c2db86071fb2a3d947da7bd487e359cc91e67ac1c6d2e"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:4afbbf5ef41ac18e02c8dc1f86c04b22b7a2125f2a030e25bbb4aff31abb224b"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:24e300c0c2147484a002b175f4e1361f102e82c345bf263242f0449672a4bccf"}, + {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:393cd06c3b00b57f5421e2133e088df9cabcececcea180327e43b937b5a7caa5"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:201de072b818f8ad55c80d18d1a788729cccf9be6d9dc3b9d8613b053cd4836d"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7653ed6817c710d0c95558232aba799307d14ae084cc9b1f4c389157ec50df5c"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-win32.whl", hash = "sha256:647e0b309cb4512b1f1b78471fdaf72921b6fa6e750b9f891e09c6e2f0e5326f"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-win_amd64.whl", hash = "sha256:ab73ed1a05ff539afc4a7f8cf371764cdf79768ecb7d2ec691e3ff89abbc541e"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:37ce517c011560d68f1ffb28af65d7e06f873f191eb3a73af5671e9c3fada08a"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1878ce508edea4a879015ab5215546c444233881301e97ca16fe251e89f1c55"}, + {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95ab792ca493891d7a45a077e35b418f68435efb3e1706cb8155e20e86a9013c"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:0e8e608983e6f85d0852ca61f97e521b62e67969e6e640fe6c6b575d4db68557"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccf956da45290df6e809ea12c54c02ace7f8ff4d765d6d3dfb3655ee876ce58d"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-win32.whl", hash = "sha256:f167c8175ab908ce48bd6550679cc6ea20ae169379e73c7720a28f89e53aa532"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-win_amd64.whl", hash = "sha256:45806315aae81a0c202752558f0df52b42d11dd7ba0097bf71e253b4215f34f4"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:b6d0c4b15d65087738a6e22e0ff461b407533ff65a73b818089efc8eb2b3e1de"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a843e34abfd4c797018fd8d00ffffa99fd5184c421f190b6ca99def4087689bd"}, + {file = 
"SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:738d7321212941ab19ba2acf02a68b8ee64987b248ffa2101630e8fccb549e0d"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1c890421651b45a681181301b3497e4d57c0d01dc001e10438a40e9a9c25ee77"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d26f280b8f0a8f497bc10573849ad6dc62e671d2468826e5c748d04ed9e670d5"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-win32.whl", hash = "sha256:ec2268de67f73b43320383947e74700e95c6770d0c68c4e615e9897e46296294"}, @@ -3783,17 +3793,17 @@ dev = ["flake8", "flake8-annotations", "flake8-bandit", "flake8-bugbear", "flake [[package]] name = "urllib3" -version = "1.26.16" +version = "1.26.18" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ - {file = "urllib3-1.26.16-py2.py3-none-any.whl", hash = "sha256:8d36afa7616d8ab714608411b4a3b13e58f463aee519024578e062e141dce20f"}, - {file = "urllib3-1.26.16.tar.gz", hash = "sha256:8f135f6502756bde6b2a9b28989df5fbe87c9970cecaa69041edcce7f0589b14"}, + {file = "urllib3-1.26.18-py2.py3-none-any.whl", hash = "sha256:34b97092d7e0a3a8cf7cd10e386f401b3737364026c45e622aa02903dffe0f07"}, + {file = "urllib3-1.26.18.tar.gz", hash = "sha256:f8ecc1bba5667413457c529ab955bf8c67b45db799d159066261719e328580a0"}, ] [package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] @@ -3940,7 +3950,7 @@ files = [ name = "yarl" version = "1.9.2" description = "Yet another URL library" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "yarl-1.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8c2ad583743d16ddbdf6bb14b5cd76bf43b0d0006e918809d5d4ddf7bde8dd82"}, diff --git a/pyproject.toml b/pyproject.toml index 412c35203d..13a5fca5cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "splink" -version = "3.9.8" +version = "3.9.9" description = "Fast probabilistic data linkage at scale" authors = ["Robin Linacre ", "Sam Lindsay", "Theodore Manassis", "Tom Hepworth", "Andy Bond", "Ross Kennedy"] license = "MIT" diff --git a/splink/__init__.py b/splink/__init__.py index 046dce5ac1..c691b0029a 100644 --- a/splink/__init__.py +++ b/splink/__init__.py @@ -1 +1 @@ -__version__ = "3.9.8" +__version__ = "3.9.9" diff --git a/splink/accuracy.py b/splink/accuracy.py index d2b140ff08..d8a92a0491 100644 --- a/splink/accuracy.py +++ b/splink/accuracy.py @@ -1,4 +1,5 @@ from copy import deepcopy +from typing import TYPE_CHECKING from .block_from_labels import block_from_labels from .blocking import BlockingRule @@ -6,6 +7,9 @@ from .predict import predict_from_comparison_vectors_sqls from .sql_transform import move_l_r_table_prefix_to_column_suffix +if TYPE_CHECKING: + from .linker import Linker + def truth_space_table_from_labels_with_predictions_sqls( threshold_actual=0.5, match_weight_round_to_nearest=None @@ -143,10 +147,11 
@@ def truth_space_table_from_labels_with_predictions_sqls( return sqls -def _select_found_by_blocking_rules(linker): +def _select_found_by_blocking_rules(linker: "Linker"): brs = linker._settings_obj._blocking_rules_to_generate_predictions + if brs: - brs = [move_l_r_table_prefix_to_column_suffix(b.blocking_rule) for b in brs] + brs = [move_l_r_table_prefix_to_column_suffix(b.blocking_rule_sql) for b in brs] brs = [f"(coalesce({b}, false))" for b in brs] brs = " OR ".join(brs) br_col = f" ({brs}) " diff --git a/splink/athena/athena_helpers/athena_blocking_rule_imports.py b/splink/athena/athena_helpers/athena_blocking_rule_imports.py index 6f3340077c..7c76a32db1 100644 --- a/splink/athena/athena_helpers/athena_blocking_rule_imports.py +++ b/splink/athena/athena_helpers/athena_blocking_rule_imports.py @@ -1,5 +1,7 @@ from __future__ import annotations +from functools import partial + from ...blocking_rules_library import ( BlockingRule, exact_match_rule, @@ -7,13 +9,8 @@ from ...blocking_rules_library import ( block_on as _block_on_, ) -from .athena_base import ( - AthenaBase, -) - -class exact_match_rule(AthenaBase, exact_match_rule): - pass +exact_match_rule = partial(exact_match_rule, _sql_dialect="presto") def block_on( diff --git a/splink/athena/linker.py b/splink/athena/linker.py index acb97c9754..b455f629fc 100644 --- a/splink/athena/linker.py +++ b/splink/athena/linker.py @@ -270,6 +270,7 @@ def check_table_exists(self, db, tb): table_exists = wr.catalog.does_table_exist( database=db, table=tb, + boto3_session=self.boto3_session, ) if not table_exists: raise wr.exceptions.InvalidTable( diff --git a/splink/block_from_labels.py b/splink/block_from_labels.py index 97fae40fb7..b88be6cdc5 100644 --- a/splink/block_from_labels.py +++ b/splink/block_from_labels.py @@ -32,7 +32,7 @@ def block_from_labels( unique_id_col = linker._settings_obj._unique_id_column_name - source_dataset_col = linker._settings_obj._source_dataset_input_column + source_dataset_col = linker._settings_obj._source_dataset_column_name sql = lower_id_to_left_hand_side(df, source_dataset_col, unique_id_col) diff --git a/splink/blocking.py b/splink/blocking.py index 0eff876136..47160a499c 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -7,9 +7,7 @@ from sqlglot.expressions import Column, Join from sqlglot.optimizer.eliminate_joins import join_condition -from .input_column import InputColumn from .misc import ensure_is_list -from .splink_dataframe import SplinkDataFrame from .unique_id_concat import _composite_unique_id_from_nodes_sql logger = logging.getLogger(__name__) @@ -27,16 +25,15 @@ def blocking_rule_to_obj(br): if blocking_rule is None: raise ValueError("No blocking rule submitted...") sqlglot_dialect = br.get("sql_dialect", None) - salting_partitions = br.get("salting_partitions", 1) - arrays_to_explode = br.get("arrays_to_explode", list()) - if arrays_to_explode: - return ExplodingBlockingRule( - blocking_rule, salting_partitions, sqlglot_dialect, arrays_to_explode + salting_partitions = br.get("salting_partitions", None) + if salting_partitions is None: + return BlockingRule(blocking_rule, sqlglot_dialect) + else: + return SaltedBlockingRule( + blocking_rule, sqlglot_dialect, salting_partitions ) - return BlockingRule(blocking_rule, salting_partitions, sqlglot_dialect) - else: br = BlockingRule(br) return br @@ -45,17 +42,20 @@ def blocking_rule_to_obj(br): class BlockingRule: def __init__( self, - blocking_rule: BlockingRule | dict | str, - salting_partitions=1, + blocking_rule_sql: str, 
sqlglot_dialect: str = None, ): if sqlglot_dialect: self._sql_dialect = sqlglot_dialect - self.blocking_rule_sql = blocking_rule - self.preceding_rules = [] + # Temporarily just to see if tests still pass + if not isinstance(blocking_rule_sql, str): + raise ValueError( + f"Blocking rule must be a string, not {type(blocking_rule_sql)}" + ) + self.blocking_rule_sql = blocking_rule_sql + self.preceding_rules: List[BlockingRule] = [] self.sqlglot_dialect = sqlglot_dialect - self.salting_partitions: int = salting_partitions @property def sql_dialect(self): @@ -65,20 +65,12 @@ def sql_dialect(self): def match_key(self): return len(self.preceding_rules) - @property - def is_salted(self): - return self.salting_partitions > 1 - - @property - def sql(self): - # Wrapper to reveal the underlying SQL - return self.blocking_rule_sql - def add_preceding_rules(self, rules): rules = ensure_is_list(rules) self.preceding_rules = rules - def exclude_pairs_generated_by_this_rule_sql(self, linker: Linker): + @property + def exclude_pairs_generated_by_this_rule_sql(self): """A SQL string specifying how to exclude the results of THIS blocking rule from subseqent blocking statements, so that subsequent statements do not produce duplicate pairs @@ -89,27 +81,23 @@ def exclude_pairs_generated_by_this_rule_sql(self, linker: Linker): # meaning these comparisons get lost return f"coalesce(({self.blocking_rule_sql}),false)" - def exclude_pairs_generated_by_all_preceding_rules_sql(self, linker: Linker): + @property + def exclude_pairs_generated_by_all_preceding_rules_sql(self): """A SQL string that excludes the results of ALL previous blocking rules from the pairwise comparisons generated. """ if not self.preceding_rules: return "" or_clauses = [ - br.exclude_pairs_generated_by_this_rule_sql(linker) - for br in self.preceding_rules + br.exclude_pairs_generated_by_this_rule_sql for br in self.preceding_rules ] previous_rules = " OR ".join(or_clauses) return f"AND NOT ({previous_rules})" - def create_pairwise_comparisons_sql( - self, - linker: Linker, - sql_select_expr: str, - salted_br: SaltedBlockingRuleSegment, - probability: str, - where_condition: str, - ): + def create_blocked_pairs_sql(self, linker: Linker, where_condition, probability): + columns_to_select = linker._settings_obj._columns_to_select_for_blocking + sql_select_expr = ", ".join(columns_to_select) + sql = f""" select {sql_select_expr} @@ -118,38 +106,12 @@ def create_pairwise_comparisons_sql( from {linker._input_tablename_l} as l inner join {linker._input_tablename_r} as r on - ({salted_br.blocking_rule_sql}) + ({self.blocking_rule_sql}) + {self.exclude_pairs_generated_by_all_preceding_rules_sql} {where_condition} - {self.exclude_pairs_generated_by_all_preceding_rules_sql(linker)} """ return sql - @property - def salted_blocking_rule_segments(self) -> List[SaltedBlockingRuleSegment]: - """A list of sql strings""" - - for n in range(self.salting_partitions): - if self.is_salted: - rule_sql = ( - f"{self.blocking_rule_sql} and " - f"ceiling(l.__splink_salt * {self.salting_partitions}) " - f"= {n+1}" - ) - else: - rule_sql = self.blocking_rule_sql - - br_seg = SaltedBlockingRuleSegment(self, rule_sql, n) - - # Exploding blocking rules may have a materialised exploded_id_pair_table - # If so, we want to associated it with the SaltedBlockingRuleSegment - if isinstance(self, ExplodingBlockingRule): - try: - br_seg.exploded_id_pair_table = self.exploded_id_pair_tables[n] - except IndexError: - pass - - yield br_seg - @property def _parsed_join_condition(self): 
br = self.blocking_rule_sql @@ -209,19 +171,10 @@ def as_dict(self): output["blocking_rule"] = self.blocking_rule_sql output["sql_dialect"] = self.sql_dialect - if self.salting_partitions > 1 and self.sql_dialect == "spark": - output["salting_partitions"] = self.salting_partitions - - if self.array_columns_to_explode: - output["arrays_to_explode"] = self.array_columns_to_explode - return output def _as_completed_dict(self): - if not self.salting_partitions > 1 and self.sql_dialect == "spark": - return self.blocking_rule_sql - else: - return self.as_dict() + return self.blocking_rule_sql @property def descr(self): @@ -240,125 +193,52 @@ def _human_readable_succinct(self): return f"{self.descr} blocking rule using SQL: {sql}" -class ExplodingBlockingRule(BlockingRule): +class SaltedBlockingRule(BlockingRule): def __init__( self, - blocking_rule: BlockingRule | dict | str, - salting_partitions=1, + blocking_rule: str, sqlglot_dialect: str = None, - array_columns_to_explode: list = [], + salting_partitions: int = 1, ): - super().__init__(blocking_rule, salting_partitions, sqlglot_dialect) - self.array_columns_to_explode: List[str] = array_columns_to_explode - self.exploded_id_pair_tables: List[SplinkDataFrame] = [] - - def marginal_exploded_id_pairs_table_sql( - self, linker: Linker, salted_br: BlockingRule - ): - """generates a table of the marginal id pairs from the exploded blocking rule - i.e. pairs are only created that match this blocking rule and NOT any of - the preceding blocking rules - """ - settings_obj = linker._settings_obj - unique_id_col = settings_obj._unique_id_column_name + if salting_partitions is None or salting_partitions <= 1: + raise ValueError("Salting partitions must be specified and > 1") - link_type = settings_obj._link_type + super().__init__(blocking_rule, sqlglot_dialect) + self.salting_partitions = salting_partitions - if linker._two_dataset_link_only: - link_type = "two_dataset_link_only" - - if linker._self_link_mode: - link_type = "self_link" - - where_condition = _sql_gen_where_condition( - link_type, settings_obj._unique_id_input_columns - ) - - if link_type == "two_dataset_link_only": - where_condition = ( - where_condition + " and l.source_dataset < r.source_dataset" - ) - - sql = f""" - select distinct - l.{unique_id_col} as {unique_id_col}_l, - r.{unique_id_col} as {unique_id_col}_r - from __splink__df_concat_with_tf_unnested as l - inner join __splink__df_concat_with_tf_unnested as r - on ({salted_br.blocking_rule_sql}) - {where_condition} - {self.exclude_pairs_generated_by_all_preceding_rules_sql(linker)}""" - - return sql - - def drop_materialised_id_pairs_dataframes(self): - for df in self.exploded_id_pair_tables: - df.drop_table_from_database_and_remove_from_cache() - - def exclude_pairs_generated_by_this_rule_sql(self, linker: Linker): - """A SQL string specifying how to exclude the results - of THIS blocking rule from subseqent blocking statements, - so that subsequent statements do not produce duplicate pairs - """ + def as_dict(self): + output = super().as_dict() + output["salting_partitions"] = self.salting_partitions + return output - unique_id_column = linker._settings_obj._unique_id_column_name + def _as_completed_dict(self): + return self.as_dict() - ids_to_compare_sql = " union all ".join( - [ - f"select * from {ids.physical_name}" - for ids in self.exploded_id_pair_tables - ] - ) + def _salting_condition(self, salt): + return f"AND ceiling(l.__splink_salt * {self.salting_partitions}) = {salt + 1}" - return f"""EXISTS ( - select 1 from 
({ids_to_compare_sql}) as ids_to_compare - where ( - l.{unique_id_column} = ids_to_compare.{unique_id_column}_l and - r.{unique_id_column} = ids_to_compare.{unique_id_column}_r - ) - ) - """ + def create_blocked_pairs_sql(self, linker: Linker, where_condition, probability): + columns_to_select = linker._settings_obj._columns_to_select_for_blocking + sql_select_expr = ", ".join(columns_to_select) - def create_pairwise_comparisons_sql( - self, - linker: Linker, - sql_select_expr: str, - salted_br: SaltedBlockingRuleSegment, - probability: str, - where_condition: str, - ): - exploded_id_pair_table = salted_br.exploded_id_pair_table - unique_id_col = linker._settings_obj._unique_id_column_name - sql = f""" + sqls = [] + for salt in range(self.salting_partitions): + salt_condition = self._salting_condition(salt) + sql = f""" select - {sql_select_expr}, - '{self.match_key}' as match_key - {probability} - from {exploded_id_pair_table.physical_name} as pairs - left join {linker._input_tablename_l} as l - on pairs.{unique_id_col}_l=l.{unique_id_col} - left join {linker._input_tablename_r} as r - on pairs.{unique_id_col}_r=r.{unique_id_col} - """ - return sql - - -class SaltedBlockingRuleSegment: - def __init__( - self, - parent_blocking_rule: BlockingRule, - blocking_rule_sql: str, - salt: int = None, - exploded_id_pairs_table: SplinkDataFrame = None, - ): - self.parent_blocking_rule = parent_blocking_rule - self.blocking_rule_sql = blocking_rule_sql - self.salt = salt - self.exploded_id_pairs_tables = exploded_id_pairs_table + {sql_select_expr} + , '{self.match_key}' as match_key + {probability} + from {linker._input_tablename_l} as l + inner join {linker._input_tablename_r} as r + on + ({self.blocking_rule_sql} {salt_condition}) + {self.exclude_pairs_generated_by_all_preceding_rules_sql} + {where_condition} + """ - @property - def is_salted(self): - return self.parent_blocking_rule.is_salted + sqls.append(sql) + return " UNION ALL ".join(sqls) def _sql_gen_where_condition(link_type, unique_id_cols): @@ -373,61 +253,12 @@ def _sql_gen_where_condition(link_type, unique_id_cols): source_dataset_col = unique_id_cols[0] where_condition = ( f"where {id_expr_l} < {id_expr_r} " - f"and l.{source_dataset_col.name()} != r.{source_dataset_col.name()}" + f"and l.{source_dataset_col.name} != r.{source_dataset_col.name}" ) return where_condition -def materialise_exploded_id_tables(linker: Linker): - settings_obj = linker._settings_obj - - blocking_rules = settings_obj._blocking_rules_to_generate_predictions - blocking_rules = [ - br for br in blocking_rules if isinstance(br, ExplodingBlockingRule) - ] - salted_exploded_blocking_rules = ( - salted_br - for br in blocking_rules - for salted_br in br.salted_blocking_rule_segments - ) - - for salted_br in salted_exploded_blocking_rules: - parent_br = salted_br.parent_blocking_rule - - input_dataframe = linker._initialise_df_concat_with_tf() - - input_colnames = {col.name() for col in input_dataframe.columns} - arrays_to_explode_quoted = [ - InputColumn(colname, sql_dialect=linker._sql_dialect).quote().name() - for colname in parent_br.array_columns_to_explode - ] - expl_sql = linker._gen_explode_sql( - "__splink__df_concat_with_tf", - parent_br.array_columns_to_explode, - list(input_colnames.difference(arrays_to_explode_quoted)), - ) - - linker._enqueue_sql( - expl_sql, - "__splink__df_concat_with_tf_unnested", - ) - - salt_name = "" - if salted_br.is_salted: - salt_name = f"_salt_{salted_br.salt}" - - base_name = "__splink__marginal_exploded_ids_blocking_rule" - 
table_name = f"{base_name}_mk_{parent_br.match_key}{salt_name}" - - sql = parent_br.marginal_exploded_id_pairs_table_sql(linker, salted_br) - - linker._enqueue_sql(sql, table_name) - - marginal_ids_table = linker._execute_sql_pipeline([input_dataframe]) - parent_br.exploded_id_pair_tables.append(marginal_ids_table) - - def block_using_rules_sqls(linker: Linker): """Use the blocking rules specified in the linker's settings object to generate a SQL statement that will create pairwise record comparions @@ -449,13 +280,11 @@ def block_using_rules_sqls(linker: Linker): and not linker._find_new_matches_mode and not linker._compare_two_records_mode ): - source_dataset_col = linker._source_dataset_column_name + source_dataset_col = ( + source_dataset_col + ) = linker._settings_obj._source_dataset_column_name # Need df_l to be the one with the lowest id to preeserve the property # that the left dataset is the one with the lowest concatenated id - keys = linker._input_tables_dict.keys() - keys = list(sorted(keys)) - df_l = linker._input_tables_dict[keys[0]] - df_r = linker._input_tables_dict[keys[1]] # This also needs to work for training u if linker._train_u_using_random_sample_mode: @@ -463,9 +292,12 @@ def block_using_rules_sqls(linker: Linker): else: spl_switch = "" + df_concat_tf = linker._intermediate_table_cache["__splink__df_concat_with_tf"] + sql = f""" select * from __splink__df_concat_with_tf{spl_switch} - where {source_dataset_col} = '{df_l.templated_name}' + where {source_dataset_col} = + (select min({source_dataset_col}) from {df_concat_tf.physical_name}) """ sqls.append( { @@ -476,7 +308,8 @@ def block_using_rules_sqls(linker: Linker): sql = f""" select * from __splink__df_concat_with_tf{spl_switch} - where {source_dataset_col} = '{df_r.templated_name}' + where {source_dataset_col} = + (select max({source_dataset_col}) from {df_concat_tf.physical_name}) """ sqls.append( { @@ -485,16 +318,8 @@ def block_using_rules_sqls(linker: Linker): } ) - if type(linker).__name__ in ["SparkLinker"]: - pass - else: - pass - settings_obj = linker._settings_obj - columns_to_select = settings_obj._columns_to_select_for_blocking - sql_select_expr = ", ".join(columns_to_select) - link_type = settings_obj._link_type if linker._two_dataset_link_only: @@ -531,19 +356,13 @@ def block_using_rules_sqls(linker: Linker): probability = "" br_sqls = [] - salted_blocking_rules = ( - salted_br - for br in blocking_rules - for salted_br in br.salted_blocking_rule_segments - ) - for salted_br in salted_blocking_rules: - parent_br = salted_br.parent_blocking_rule - sql = parent_br.create_pairwise_comparisons_sql( - linker, sql_select_expr, salted_br, probability, where_condition - ) + + for br in blocking_rules: + sql = br.create_blocked_pairs_sql(linker, where_condition, probability) br_sqls.append(sql) - sql = "union all".join(br_sqls) + sql = " UNION ALL ".join(br_sqls) sqls.append({"sql": sql, "output_table_name": "__splink__df_blocked"}) + return sqls diff --git a/splink/blocking_rule_composition.py b/splink/blocking_rule_composition.py index 29cb01d9f6..620c33b1b9 100644 --- a/splink/blocking_rule_composition.py +++ b/splink/blocking_rule_composition.py @@ -297,17 +297,23 @@ def not_(*brls: BlockingRule | dict | str, salting_partitions: int = 1) -> Block br = brls[0] blocking_rule = f"NOT ({br.blocking_rule_sql})" - return BlockingRule( - blocking_rule, - salting_partitions=salting_partitions if salting_partitions > 1 else salt, - sqlglot_dialect=sql_dialect, - ) + br_dict = { + "blocking_rule": blocking_rule, + 
"sql_dialect": sql_dialect, + } + + if salting_partitions > 1: + salt = salting_partitions + if salt > 1: + br_dict["salting_partitions"] = salt + + return blocking_rule_to_obj(br_dict) def _br_merge( *brls: BlockingRule | dict | str, clause: str, - salting_partitions: int = 1, + salting_partitions: int = None, ) -> BlockingRule: if len(brls) == 0: raise ValueError("You must provide at least one BlockingRule") @@ -320,11 +326,17 @@ def _br_merge( blocking_rule = f" {clause} ".join(conditions) - return BlockingRule( - blocking_rule, - salting_partitions=salting_partitions if salting_partitions > 1 else salt, - sqlglot_dialect=sql_dialect, - ) + br_dict = { + "blocking_rule": blocking_rule, + "sql_dialect": sql_dialect, + } + + if salting_partitions > 1: + salt = salting_partitions + if salt > 1: + br_dict["salting_partitions"] = salt + + return blocking_rule_to_obj(br_dict) def _parse_blocking_rules( @@ -332,7 +344,7 @@ def _parse_blocking_rules( ) -> tuple[list[BlockingRule], str | None]: brs = [_to_blocking_rule(br) for br in brs] sql_dialect = _unify_sql_dialects(brs) - salting_partitions = max([br.salting_partitions for br in brs]) + salting_partitions = max([getattr(br, "salting_partitions", 1) for br in brs]) return brs, sql_dialect, salting_partitions diff --git a/splink/blocking_rules_library.py b/splink/blocking_rules_library.py index 0d8a9d99e4..f23ab9c615 100644 --- a/splink/blocking_rules_library.py +++ b/splink/blocking_rules_library.py @@ -4,99 +4,52 @@ import sqlglot -from .blocking import BlockingRule +from .blocking import BlockingRule, blocking_rule_to_obj from .blocking_rule_composition import and_ from .misc import ensure_is_list from .sql_transform import add_quotes_and_table_prefix -class exact_match_rule(BlockingRule): - def __init__( - self, - col_name: str, - salting_partitions: int = 1, - ) -> BlockingRule: - """Represents an exact match blocking rule. - - **DEPRECATED:** - `exact_match_rule` is deprecated. Please use `block_on` - instead, which acts as a wrapper with additional functionality. - - Args: - col_name (str): Input column name, or a str represent a sql - statement you'd like to match on. For example, `surname` or - `"substr(surname,1,2)"` are both valid. - salting_partitions (optional, int): Whether to add salting - to the blocking rule. More information on salting can - be found within the docs. Salting is currently only valid - for Spark. 
- - Examples: - === ":simple-duckdb: DuckDB" - Simple Exact match level - ``` python - import splink.duckdb.blocking_rule_library as brl - brl.exact_match_rule("name") - - sql = "substr(surname,1,2)" - brl.exact_match_rule(sql) - ``` - === ":simple-apachespark: Spark" - Simple Exact match level - ``` python - import splink.spark.blocking_rule_library as brl - brl.exact_match_rule("name", salting_partitions=1) - - sql = "substr(surname,1,2)" - brl.exact_match_rule(sql) - ``` - === ":simple-amazonaws: Athena" - Simple Exact match level - ``` python - import splink.athena.blocking_rule_library as brl - brl.exact_match_rule("name") - - sql = "substr(surname,1,2)" - brl.exact_match_rule(sql) - ``` - === ":simple-sqlite: SQLite" - Simple Exact match level - ``` python - import splink.sqlite.blocking_rule_library as brl - brl.exact_match_rule("name") - - sql = "substr(surname,1,2)" - brl.exact_match_rule(sql) - ``` - === "PostgreSQL" - Simple Exact match level - ``` python - import splink.postgres.blocking_rule_library as brl - brl.exact_match_rule("name") - - sql = "substr(surname,1,2)" - brl.exact_match_rule(sql) - ``` - """ - - warnings.warn( - "`exact_match_rule` is deprecated; use `block_on`", - DeprecationWarning, - stacklevel=2, - ) - - syntax_tree = sqlglot.parse_one(col_name, read=self._sql_dialect) - - l_col = add_quotes_and_table_prefix(syntax_tree, "l").sql(self._sql_dialect) - r_col = add_quotes_and_table_prefix(syntax_tree, "r").sql(self._sql_dialect) - - blocking_rule = f"{l_col} = {r_col}" - self._description = "Exact match" - - super().__init__( - blocking_rule, - salting_partitions=salting_partitions, - ) +def exact_match_rule( + col_name: str, + _sql_dialect: str, + salting_partitions: int = None, +) -> BlockingRule: + """Represents an exact match blocking rule. + + **DEPRECATED:** + `exact_match_rule` is deprecated. Please use `block_on` + instead, which acts as a wrapper with additional functionality. + + Args: + col_name (str): Input column name, or a str represent a sql + statement you'd like to match on. For example, `surname` or + `"substr(surname,1,2)"` are both valid. + salting_partitions (optional, int): Whether to add salting + to the blocking rule. More information on salting can + be found within the docs. Salting is currently only valid + for Spark. + """ + warnings.warn( + "`exact_match_rule` is deprecated; use `block_on`", + DeprecationWarning, + stacklevel=2, + ) + + syntax_tree = sqlglot.parse_one(col_name, read=_sql_dialect) + + l_col = add_quotes_and_table_prefix(syntax_tree, "l").sql(_sql_dialect) + r_col = add_quotes_and_table_prefix(syntax_tree, "r").sql(_sql_dialect) + + blocking_rule = f"{l_col} = {r_col}" + + return blocking_rule_to_obj( + { + "blocking_rule": blocking_rule, + "salting_partitions": salting_partitions, + "sql_dialect": _sql_dialect, + } + ) def block_on( diff --git a/splink/cluster_metrics.py b/splink/cluster_metrics.py new file mode 100644 index 0000000000..b68308b7a6 --- /dev/null +++ b/splink/cluster_metrics.py @@ -0,0 +1,65 @@ +from splink.input_column import InputColumn + + +def _size_density_sql( + df_predict, df_clustered, threshold_match_probability, _unique_id_col +): + """Generates sql for computing cluster size and density at a given threshold. 
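+
+    Density is the number of edges observed within a cluster divided by the
+    maximum possible number of edges for that cluster. For example
+    (illustrative figures only), a cluster of 4 nodes with 3 edges at or above
+    the threshold has density (3 * 2) / (4 * (4 - 1)) = 0.5.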
+ + Args: + df_predict (SplinkDataFrame): The results of `linker.predict()` + df_clustered (SplinkDataFrame): The outputs of + `linker.cluster_pairwise_predictions_at_threshold()` + threshold_match_probability (float): Filter the pairwise match + predictions to include only pairwise comparisons with a + match_probability above this threshold. + _unique_id_col (string): name of unique id column in settings dict + + Returns: + sql string for computing cluster size and density + """ + + # Get physical table names from Splink dataframes + edges_table = df_predict.physical_name + clusters_table = df_clustered.physical_name + + input_col = InputColumn(_unique_id_col) + unique_id_col_l = input_col.name_l + + sqls = [] + sql = f""" + SELECT + {unique_id_col_l}, + COUNT(*) AS count_edges + FROM {edges_table} + WHERE match_probability >= {threshold_match_probability} + GROUP BY {unique_id_col_l} + """ + + sql = {"sql": sql, "output_table_name": "__splink__count_edges"} + sqls.append(sql) + + sql = f""" + SELECT + c.cluster_id, + count(*) AS n_nodes, + sum(e.count_edges) AS n_edges + FROM {clusters_table} AS c + LEFT JOIN __splink__count_edges e ON c.{_unique_id_col} = e.{unique_id_col_l} + GROUP BY c.cluster_id + """ + sql = {"sql": sql, "output_table_name": "__splink__counts_per_cluster"} + sqls.append(sql) + + sql = """ + SELECT + cluster_id, + n_nodes, + n_edges, + (n_edges * 2)/(n_nodes * (n_nodes-1)) AS density + FROM __splink__counts_per_cluster + """ + sql = {"sql": sql, "output_table_name": "__splink__cluster_metrics_clusters"} + sqls.append(sql) + + return sqls diff --git a/splink/comparison.py b/splink/comparison.py index 3e3d0f3986..b0254fcafd 100644 --- a/splink/comparison.py +++ b/splink/comparison.py @@ -210,14 +210,14 @@ def _columns_to_select_for_comparison_vector_values(self): output_cols = [] for col in input_cols: if self._settings_obj._retain_matching_columns: - output_cols.extend(col.names_l_r()) + output_cols.extend(col.names_l_r) output_cols.append(self._case_statement) for cl in self.comparison_levels: if cl._has_tf_adjustments: col = cl._tf_adjustment_input_column - output_cols.extend(col.tf_name_l_r()) + output_cols.extend(col.tf_name_l_r) return dedupe_preserving_order(output_cols) @@ -230,7 +230,7 @@ def _columns_to_select_for_bayes_factor_parts(self): output_cols = [] for col in input_cols: if self._settings_obj._retain_matching_columns: - output_cols.extend(col.names_l_r()) + output_cols.extend(col.names_l_r) output_cols.append(self._gamma_column_name) @@ -240,7 +240,7 @@ def _columns_to_select_for_bayes_factor_parts(self): and self._settings_obj._retain_intermediate_calculation_columns ): col = cl._tf_adjustment_input_column - output_cols.extend(col.tf_name_l_r()) + output_cols.extend(col.tf_name_l_r) # Bayes factor case when statement sqls = [cl._bayes_factor_sql for cl in self.comparison_levels] @@ -268,7 +268,7 @@ def _columns_to_select_for_predict(self): output_cols = [] for col in input_cols: if self._settings_obj._retain_matching_columns: - output_cols.extend(col.names_l_r()) + output_cols.extend(col.names_l_r) if ( self._settings_obj._training_mode @@ -282,7 +282,7 @@ def _columns_to_select_for_predict(self): and self._settings_obj._retain_intermediate_calculation_columns ): col = cl._tf_adjustment_input_column - output_cols.extend(col.tf_name_l_r()) + output_cols.extend(col.tf_name_l_r) for _col in input_cols: if self._settings_obj._retain_intermediate_calculation_columns: @@ -445,7 +445,7 @@ def _comparison_level_description_list(self): @property def 
_human_readable_description_succinct(self): input_cols = join_list_with_commas_final_and( - [c.name() for c in self._input_columns_used_by_case_statement] + [c.name for c in self._input_columns_used_by_case_statement] ) comp_levels = self._comparison_level_description_list @@ -463,7 +463,7 @@ def _human_readable_description_succinct(self): @property def human_readable_description(self): input_cols = join_list_with_commas_final_and( - [c.name() for c in self._input_columns_used_by_case_statement] + [c.name for c in self._input_columns_used_by_case_statement] ) comp_levels = self._comparison_level_description_list diff --git a/splink/comparison_level.py b/splink/comparison_level.py index 9011be8d9e..8b44d9036f 100644 --- a/splink/comparison_level.py +++ b/splink/comparison_level.py @@ -202,7 +202,7 @@ def _tf_adjustment_input_column(self): def _tf_adjustment_input_column_name(self): input_column = self._tf_adjustment_input_column if input_column: - return input_column.unquote().name() + return input_column.unquote().name @property def _has_comparison(self): @@ -465,11 +465,9 @@ def _columns_to_select_for_blocking(self): cols = self._input_columns_used_by_sql_condition for c in cols: - output_cols.extend(c.l_r_names_as_l_r()) + output_cols.extend(c.l_r_names_as_l_r) if self._tf_adjustment_input_column: - output_cols.extend( - self._tf_adjustment_input_column.l_r_tf_names_as_l_r() - ) + output_cols.extend(self._tf_adjustment_input_column.l_r_tf_names_as_l_r) return dedupe_preserving_order(output_cols) @@ -577,12 +575,8 @@ def _tf_adjustment_sql(self): else: tf_adj_col = self._tf_adjustment_input_column - coalesce_l_r = ( - f"coalesce({tf_adj_col.tf_name_l()}, {tf_adj_col.tf_name_r()})" - ) - coalesce_r_l = ( - f"coalesce({tf_adj_col.tf_name_r()}, {tf_adj_col.tf_name_l()})" - ) + coalesce_l_r = f"coalesce({tf_adj_col.tf_name_l}, {tf_adj_col.tf_name_r})" + coalesce_r_l = f"coalesce({tf_adj_col.tf_name_r}, {tf_adj_col.tf_name_l})" tf_adjustment_exists = f"{coalesce_l_r} is not null" u_prob_exact_match = self._u_probability_corresponding_to_exact_match @@ -730,7 +724,7 @@ def _human_readable_succinct(self): @property def human_readable_description(self): input_cols = join_list_with_commas_final_and( - [c.name() for c in self._input_columns_used_by_sql_condition] + [c.name for c in self._input_columns_used_by_sql_condition] ) desc = ( f"Comparison level: {self.label_for_charts} of {input_cols}\n" diff --git a/splink/comparison_level_library.py b/splink/comparison_level_library.py index d7807b99c4..6f1744fe09 100644 --- a/splink/comparison_level_library.py +++ b/splink/comparison_level_library.py @@ -98,7 +98,7 @@ def __init__( valid_string_pattern = valid_string_regex col = InputColumn(col_name, sql_dialect=self._sql_dialect) - col_name_l, col_name_r = col.name_l(), col.name_r() + col_name_l, col_name_r = col.name_l, col.name_r if invalid_dates_as_null: col_name_l = self._valid_date_function(col_name_l, valid_string_pattern) @@ -231,7 +231,7 @@ def __init__( else: label_suffix = "" - col_name_l, col_name_r = col.name_l(), col.name_r() + col_name_l, col_name_r = col.name_l, col.name_r if set_to_lowercase: col_name_l = f"lower({col_name_l})" @@ -395,7 +395,7 @@ def __init__( else: operator = "<=" - col_name_l, col_name_r = col.name_l(), col.name_r() + col_name_l, col_name_r = col.name_l, col.name_r if set_to_lowercase: col_name_l = f"lower({col_name_l})" @@ -938,8 +938,8 @@ def __init__( col_1 = InputColumn(col_name_1, sql_dialect=self._sql_dialect) col_2 = InputColumn(col_name_2, 
sql_dialect=self._sql_dialect) - col_1_l, col_1_r = col_1.name_l(), col_1.name_r() - col_2_l, col_2_r = col_2.name_l(), col_2.name_r() + col_1_l, col_1_r = col_1.name_l, col_1.name_r + col_2_l, col_2_r = col_2.name_l, col_2.name_r if set_to_lowercase: col_1_l = f"lower({col_1_l})" @@ -1030,8 +1030,8 @@ def __init__( lat = InputColumn(lat_col, sql_dialect=self._sql_dialect) long = InputColumn(long_col, sql_dialect=self._sql_dialect) - lat_l, lat_r = lat.names_l_r() - long_l, long_r = long.names_l_r() + lat_l, lat_r = lat.names_l_r + long_l, long_r = long.names_l_r distance_km_sql = f""" {great_circle_distance_km_sql(lat_l, lat_r, long_l, long_r)} <= {km_threshold} @@ -1108,11 +1108,11 @@ def __init__( """ col = InputColumn(col_name, sql_dialect=self._sql_dialect) - s = f"""(abs({col.name_l()} - {col.name_r()})/ + s = f"""(abs({col.name_l} - {col.name_r})/ (case - when {col.name_r()} > {col.name_l()} - then {col.name_r()} - else {col.name_l()} + when {col.name_r} > {col.name_l} + then {col.name_r} + else {col.name_l} end)) < {percentage_distance_threshold}""" @@ -1178,7 +1178,7 @@ def __init__( col = InputColumn(col_name, sql_dialect=self._sql_dialect) size_array_intersection = ( - f"{self._size_array_intersect_function(col.name_l(), col.name_r())}" + f"{self._size_array_intersect_function(col.name_l, col.name_r)}" ) sql = f"{size_array_intersection} >= {min_intersection}" @@ -1359,7 +1359,7 @@ def __init__( """ date = InputColumn(date_col, sql_dialect=self._sql_dialect) - date_l, date_r = date.names_l_r() + date_l, date_r = date.names_l_r datediff_sql = self._datediff_function( date_l, diff --git a/splink/cost_of_blocking_rules.py b/splink/cost_of_blocking_rules.py new file mode 100644 index 0000000000..8117a67e92 --- /dev/null +++ b/splink/cost_of_blocking_rules.py @@ -0,0 +1,114 @@ +import logging +from typing import Dict, List, Union + +logger = logging.getLogger(__name__) + + +def calculate_field_freedom_cost(combination_of_brs: List[Dict]) -> int: + """ + We want a higher score (lower cost) for combinations of blocking rules that allow + as much variation in each field as possible + + e.g. we don't like combinations of four rules + that hold first_name and surname constant in 3 out of 4 + and only allow them to vary in one, even if that affords greater + variance to other fields. + + That is, we would prefer a spread in which fields are held fixed across the blocking + rules + + Calculates the field cost for a given combination of brs. It counts the number of + times each field is allowed to vary (i.e., not included in the blocking rules). + + Args: + combination_of_brs (List[Dict]): The combination_of_brs rows + + Returns: + int: The field freedom cost. + """ + + total_cost = 0 + field_names = [c for c in combination_of_brs[0].keys() if c.startswith("__fixed__")] + + # This lookup is somewhat arbitrary but its purpose is to assign a very high + # cost to combinations of blocking rules where a field is not allowed to vary + # much + # TODO: Could incorporate information about how many other fields are allowed + # to vary i.e.
it's not just the count of other blocking rules that allow this + # field to matter, it's also how strict they are + costs_by_count = {0: 20, 1: 10, 2: 2, 3: 1, 4: 1} + + for field in field_names: + field_can_vary_count = sum(row[field] == 0 for row in combination_of_brs) + + cost = costs_by_count.get(field_can_vary_count, 0) / 10 + + total_cost = total_cost + cost + + return total_cost + + +def calculate_cost_of_combination_of_brs( + br_combination: List[Dict], + max_comparison_count: int, + num_equi_join_weight: Union[int, float] = 1, + field_freedom_weight: Union[int, float] = 1, + num_brs_weight: Union[int, float] = 1, + num_comparison_weight: Union[int, float] = 1, +) -> dict: + """ + Calculates the cost for a given combination of blocking rules. + + The cost is a weighted sum of the number of equi joins in the rules, the count of + rules, the number of fields that are allowed to vary, and the number of rows. + + Args: + br_combination (List[Dict]): The combination of rows outputted by + find_blocking_rules_below_threshold_comparison_count. + max_comparison_count (int): The maximum comparison count amongst the rules. + This is needed to normalise the cost of more or fewer comparison rows. + num_equi_join_weight (Union[int, float], optional): The weight for num_equi_join. + Defaults to 1. + field_freedom_weight (Union[int, float], optional): The weight for field + freedom. Defaults to 1. + num_brs_weight (Union[int, float], optional): The weight for the number of + blocking rules found. Defaults to 1. + num_comparison_weight (Union[int, float], optional): The weight for the + number of comparison rows. Defaults to 1. + + Returns: + dict: The calculated cost and individual component costs. + """ + + num_equi_join_cost = sum(row["num_equi_joins"] for row in br_combination) + total_row_count = sum(row["comparison_count"] for row in br_combination) + normalised_row_count = total_row_count / max_comparison_count + + # We want a better score for br_combinations that allow each field to + # vary as much as possible.
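+    # Illustrative worked example (field names assumed for this comment, not
+    # taken from any particular dataset): for a combination of two rules over
+    # first_name, surname and dob, where rule 1 blocks on first_name and
+    # rule 2 blocks on surname, first_name and surname can each vary in
+    # exactly one rule (cost 10 / 10 = 1.0 each) and dob can vary in both
+    # (cost 2 / 10 = 0.2), giving a field freedom cost of 2.2.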
+ field_freedom_cost = calculate_field_freedom_cost(br_combination) + num_brs_cost = len(br_combination) + + num_equi_join_cost_weighted = num_equi_join_weight * num_equi_join_cost + field_freedom_cost_weighted = field_freedom_weight * field_freedom_cost + num_brs_cost_weighted = num_brs_weight * num_brs_cost + num_comparison_rows_cost_weighted = num_comparison_weight * normalised_row_count + + total_cost = ( + num_equi_join_cost_weighted + + field_freedom_cost_weighted + + num_brs_cost_weighted + + num_comparison_rows_cost_weighted + ) + + return { + "cost": total_cost, + "num_equi_join_cost_weighted": num_equi_join_cost_weighted, + "field_freedom_cost_weighted": field_freedom_cost_weighted, + "num_brs_cost_weighted": num_brs_cost_weighted, + "num_comparison_rows_cost_weighted": num_comparison_rows_cost_weighted, + "num_equi_join_cost": num_equi_join_cost, + "field_freedom_cost": field_freedom_cost, + "num_brs_cost": num_brs_cost, + "num_comparison_rows_cost": normalised_row_count, + } diff --git a/splink/duckdb/duckdb_helpers/duckdb_blocking_rule_imports.py b/splink/duckdb/duckdb_helpers/duckdb_blocking_rule_imports.py index 431db488b0..6afd8c8767 100644 --- a/splink/duckdb/duckdb_helpers/duckdb_blocking_rule_imports.py +++ b/splink/duckdb/duckdb_helpers/duckdb_blocking_rule_imports.py @@ -1,5 +1,7 @@ from __future__ import annotations +from functools import partial + from ...blocking_rules_library import ( BlockingRule, exact_match_rule, @@ -7,13 +9,8 @@ from ...blocking_rules_library import ( block_on as _block_on_, ) -from .duckdb_base import ( - DuckDBBase, -) - -class exact_match_rule(DuckDBBase, exact_match_rule): - pass +exact_match_rule = partial(exact_match_rule, _sql_dialect="duckdb") def block_on( diff --git a/splink/duckdb/linker.py b/splink/duckdb/linker.py index 8b0e016df6..79fc0b43c1 100644 --- a/splink/duckdb/linker.py +++ b/splink/duckdb/linker.py @@ -218,6 +218,7 @@ def _execute_sql_against_backend(self, sql, templated_name, physical_name): ({sql}) """ self._log_and_run_sql_execution(sql, templated_name, physical_name) + return DuckDBDataFrame(templated_name, physical_name, self) def _run_sql_execution(self, final_sql, templated_name, physical_name): @@ -318,21 +319,3 @@ def export_to_duckdb_file(self, output_path, delete_intermediate_tables=False): new_con = duckdb.connect(database=output_path) new_con.execute(f"IMPORT DATABASE '{tmpdir}';") new_con.close() - - def _gen_explode_sql(self, tbl_name, columns_to_explode, other_columns_to_retain): - """Generated sql that explodes one or more columns in a table""" - columns_to_explode = columns_to_explode.copy() - other_columns_to_retain = other_columns_to_retain.copy() - # base case - if len(columns_to_explode) == 0: - return f"select {','.join(other_columns_to_retain)} from {tbl_name}" - else: - column_to_explode = columns_to_explode.pop() - cols_to_select = ( - [f"unnest({column_to_explode}) as {column_to_explode}"] - + other_columns_to_retain - + columns_to_explode - ) - other_columns_to_retain.append(column_to_explode) - return f"""select {','.join(cols_to_select)} - from ({self._gen_explode_sql(tbl_name,columns_to_explode,other_columns_to_retain)})""" # noqa: E501 diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py new file mode 100644 index 0000000000..1d71125606 --- /dev/null +++ b/splink/find_brs_with_comparison_counts_below_threshold.py @@ -0,0 +1,250 @@ +import logging +import string +from typing import TYPE_CHECKING, Dict, List, Set + 
+import pandas as pd + +from .blocking import BlockingRule +from .input_column import InputColumn + +if TYPE_CHECKING: + from .linker import Linker +logger = logging.getLogger(__name__) + + +def sanitise_column_name_for_one_hot_encoding(column_name) -> str: + allowed_chars = string.ascii_letters + string.digits + "_" + sanitised_name = "".join(c for c in column_name if c in allowed_chars) + return sanitised_name + + +def _generate_output_combinations_table_row( + blocking_columns, splink_blocking_rule, comparison_count, all_columns +) -> dict: + row = {} + + blocking_columns = [ + sanitise_column_name_for_one_hot_encoding(c) for c in blocking_columns + ] + all_columns = [sanitise_column_name_for_one_hot_encoding(c) for c in all_columns] + + row["blocking_columns_sanitised"] = blocking_columns + row["splink_blocking_rule"] = splink_blocking_rule + row["comparison_count"] = comparison_count + row["num_equi_joins"] = len(blocking_columns) + + for col in all_columns: + row[f"__fixed__{col}"] = 1 if col in blocking_columns else 0 + + return row + + +def _generate_combinations( + all_columns, current_combination, already_visited: Set[frozenset] +) -> list: + """Generate combinations of columns to visit that haven't been visited already + irrespective of order + """ + + combinations = [] + for col in all_columns: + if col not in current_combination: + next_combination = current_combination + [col] + if frozenset(next_combination) not in already_visited: + combinations.append(next_combination) + + return combinations + + +def _generate_blocking_rule( + linker: "Linker", cols_as_string: List[str] +) -> BlockingRule: + """Generate a Splink blocking rule given a list of column names which + are provided as as string""" + + # TODO: Refactor in Splink4 + dialect = linker._sql_dialect + + module_mapping = { + "presto": "splink.athena.blocking_rule_library", + "duckdb": "splink.duckdb.blocking_rule_library", + "postgres": "splink.postgres.blocking_rule_library", + "spark": "splink.spark.blocking_rule_library", + "sqlite": "splink.sqlite.blocking_rule_library", + } + + if dialect not in module_mapping: + raise ValueError(f"Unsupported SQL dialect: {dialect}") + + module_name = module_mapping[dialect] + block_on_module = __import__(module_name, fromlist=["block_on"]) + block_on = block_on_module.block_on + + if len(cols_as_string) == 0: + return block_on("1") + + br = block_on(cols_as_string) + + return br + + +def _search_tree_for_blocking_rules_below_threshold_count( + linker: "Linker", + all_columns: List[str], + threshold: float, + current_combination: List[str] = None, + already_visited: Set[frozenset] = None, + results: List[Dict[str, str]] = None, +) -> List[Dict[str, str]]: + """ + Recursively search combinations of fields to find ones that result in a count less + than the threshold. + + Uses the new, fast counting function + linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions + to count + + The full tree looks like this, where c1 c2 are columns: + c1 count_comparisons(c1) + ├── c2 count_comparisons(c1, c2) + │ └── c3 count_comparisons(c1, c2, c3) + ├── c3 count_comparisons(c1, c3) + │ └── c2 count_comparisons(c1, c3, c2) + c2 count_comparisons(c2) + ├── c1 count_comparisons(c2, c1) + │ └── c3 count_comparisons(c2, c1, c3) + ├── c3 count_comparisons(c2, c3) + │ └── c1 count_comparisons(c2, c3, c1) + + But many nodes do not need to be visited: + - Once the count is below the threshold, no branches from the node are explored. 
+    - If a combination has already been evaluated, it is not evaluated again. For
+      example, c2 -> c1 will not be evaluated because c1 -> c2 has already been
+      counted.
+
+    When a count is below the threshold, create a dictionary with the relevant stats
+    like:
+    {
+        'blocking_columns_sanitised': ['first_name'],
+        'splink_blocking_rule': <BlockingRule>,
+        'comparison_count': 4827,
+        'num_equi_joins': 1,
+        '__fixed__first_name': 1,
+        '__fixed__surname': 0,
+        '__fixed__dob': 0,
+        '__fixed__city': 0,
+        '__fixed__email': 0,
+        '__fixed__cluster': 0,
+    }
+
+    Return a list of these dicts.
+
+
+    Args:
+        linker: splink.Linker
+        all_columns (List[str]): List of fields to combine.
+        threshold (float): The count threshold.
+        current_combination (List[str], optional): Current combination of fields.
+        already_visited (Set[frozenset], optional): Set of visited combinations.
+        results (List[Dict[str, str]], optional): List of results. Defaults to [].
+
+    Returns:
+        List[Dict]: List of results. Each result is a dict with statistics like
+            the number of comparisons, the blocking rule etc.
+    """
+    if current_combination is None:
+        current_combination = []
+    if already_visited is None:
+        already_visited = set()
+    if results is None:
+        results = []
+
+    if len(current_combination) == len(all_columns):
+        return results  # All fields included, meaning we're at a leaf so exit recursion
+
+    br = _generate_blocking_rule(linker, current_combination)
+    comparison_count = (
+        linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions(br)
+    )
+
+    already_visited.add(frozenset(current_combination))
+
+    if comparison_count > threshold:
+        # Generate all valid combinations and continue the search
+        combinations = _generate_combinations(
+            all_columns, current_combination, already_visited
+        )
+        for next_combination in combinations:
+            _search_tree_for_blocking_rules_below_threshold_count(
+                linker,
+                all_columns,
+                threshold,
+                next_combination,
+                already_visited,
+                results,
+            )
+    else:
+        row = _generate_output_combinations_table_row(
+            current_combination,
+            br,
+            comparison_count,
+            all_columns,
+        )
+        results.append(row)
+
+    return results
+
+
+def find_blocking_rules_below_threshold_comparison_count(
+    linker: "Linker", max_comparisons_per_rule, column_expressions: List[str] = None
+) -> pd.DataFrame:
+    """
+    Finds blocking rules which return a comparison count below a given threshold.
+
+    In addition to returning blocking rules, returns the comparison count and
+    'num_equi_joins', which refers to the number of equi-joins used by the rule.
+
+    Also returns a one-hot encoding that describes which columns are __fixed__ by the
+    blocking rule
+
+    e.g. equality on first_name and surname has num_equi_joins of 2
+
+    Args:
+        linker (Linker): The Linker object
+        max_comparisons_per_rule (int): Max comparisons allowed per blocking rule.
+        column_expressions (List[str], optional): Algorithm will find combinations of
+            these column expressions to use as blocking rules. If None, uses all
+            columns used by the ComparisonLevels of the Linker. Column expressions can
+            be SQL expressions, not just column names i.e. 'substr(surname, 1,1)' is a
+            valid entry in this list.
+ + Returns: + pd.DataFrame: DataFrame with blocking rules, comparison_count and num_equi_joins + """ + + if not column_expressions: + column_expressions = linker._input_columns( + include_unique_id_col_names=False, + include_additional_columns_to_retain=False, + ) + + column_expressions_as_strings = [] + + for c in column_expressions: + if isinstance(c, InputColumn): + column_expressions_as_strings.append(c.quote().name) + else: + column_expressions_as_strings.append(c) + + results = _search_tree_for_blocking_rules_below_threshold_count( + linker, column_expressions_as_strings, max_comparisons_per_rule + ) + + if not results: + raise ValueError( + "No blocking rules could be found that produce a comparison count below " + "your chosen max_comparisons_per_rule threshold of " + f"{max_comparisons_per_rule}. Try increasing the threshold." + ) + + return pd.DataFrame(results) diff --git a/splink/find_matches_to_new_records.py b/splink/find_matches_to_new_records.py index 23bcd72820..83e633aaa5 100644 --- a/splink/find_matches_to_new_records.py +++ b/splink/find_matches_to_new_records.py @@ -11,17 +11,13 @@ def add_unique_id_and_source_dataset_cols_if_needed( linker: "Linker", new_records_df: "SplinkDataFrame" ): cols = new_records_df.columns - cols = [c.unquote().name() for c in cols] + cols = [c.unquote().name for c in cols] # Add source dataset column to new records if required and not exists sds_sel_sql = "" if linker._settings_obj._source_dataset_column_name_is_required: - sds_col = linker._settings_obj._source_dataset_input_column + sds_col = linker._settings_obj._source_dataset_column_name - # TODO: Shouldn't be necessary but the source dataset properties on settings - # are currently broken - sds_col = InputColumn(sds_col, linker._settings_obj) - sds_col = sds_col.unquote().name() if sds_col not in cols: sds_sel_sql = f", 'new_record' as {sds_col}" @@ -29,7 +25,7 @@ def add_unique_id_and_source_dataset_cols_if_needed( uid_sel_sql = "" uid_col = linker._settings_obj._unique_id_column_name uid_col = InputColumn(uid_col, linker._settings_obj) - uid_col = uid_col.unquote().name() + uid_col = uid_col.unquote().name if uid_col not in cols: uid_sel_sql = f", 'no_id_provided' as {uid_col}" diff --git a/splink/input_column.py b/splink/input_column.py index df4f2ad701..8d94c4e245 100644 --- a/splink/input_column.py +++ b/splink/input_column.py @@ -5,6 +5,7 @@ import sqlglot import sqlglot.expressions as exp from sqlglot.errors import ParseError +from sqlglot.expressions import Expression from .default_from_jsonschema import default_value_from_schema @@ -21,7 +22,7 @@ def sqlglot_tree_signature(tree): return " ".join(n[0].key for n in tree.walk()) -def add_suffix(tree, suffix): +def add_suffix(tree, suffix) -> Expression: tree = tree.copy() identifier_string = tree.find(exp.Identifier).this identifier_string = f"{identifier_string}{suffix}" @@ -29,7 +30,7 @@ def add_suffix(tree, suffix): return tree -def add_prefix(tree, prefix): +def add_prefix(tree, prefix) -> Expression: tree = tree.copy() identifier_string = tree.find(exp.Identifier).this identifier_string = f"{prefix}{identifier_string}" @@ -37,7 +38,7 @@ def add_prefix(tree, prefix): return tree -def add_table(tree, tablename): +def add_table(tree, tablename) -> Expression: tree = tree.copy() table_identifier = exp.Identifier(this=tablename, quoted=True) identifier = tree.find(exp.Column) @@ -45,7 +46,7 @@ def add_table(tree, tablename): return tree -def remove_quotes_from_identifiers(tree): +def remove_quotes_from_identifiers(tree) -> 
Expression: tree = tree.copy() for identifier in tree.find_all(exp.Identifier): identifier.args["quoted"] = False @@ -53,7 +54,26 @@ def remove_quotes_from_identifiers(tree): class InputColumn: - def __init__(self, name, settings_obj=None, sql_dialect=None): + """ + Represents a SQL column or column reference + Handles SQL dialect-specific issues such as identifier quoting. + + The input should be the raw identifier, without SQL-specific identifier quotes. + + For example, if a column is named 'first name' (with a space), the input should be + 'first name', not '"first name"'. + + Examples of valid inputs include: + - 'first_name' + - 'first name' + - 'coordinates['lat']' + - 'coordinates[1]' + + """ + + def __init__( + self, raw_column_name_or_column_reference, settings_obj=None, sql_dialect=None + ): # If settings_obj is None, then default values will be used # from the jsonschama self._settings_obj = settings_obj @@ -65,40 +85,48 @@ def __init__(self, name, settings_obj=None, sql_dialect=None): else: self._sql_dialect = None - self.input_name = self._quote_name(name) + self.input_name = self._quote_if_sql_keyword( + raw_column_name_or_column_reference + ) self.input_name_as_tree = self.parse_input_name_to_sqlglot_tree() for identifier in self.input_name_as_tree.find_all(exp.Identifier): identifier.args["quoted"] = True - def quote(self): + def quote(self) -> "InputColumn": self_copy = deepcopy(self) for identifier in self_copy.input_name_as_tree.find_all(exp.Identifier): identifier.args["quoted"] = True return self_copy - def unquote(self): + def unquote(self) -> "InputColumn": self_copy = deepcopy(self) for identifier in self_copy.input_name_as_tree.find_all(exp.Identifier): identifier.args["quoted"] = False return self_copy - def parse_input_name_to_sqlglot_tree(self): - # Cases that could occur for self.input_name: - # SUR name -> parses to 'alias column identifier identifier' - # first and surname -> parses to 'and column column identifier identifier' - # a b c -> parse error - # "SUR name" -> parses to 'column identifier' - # geocode['lat'] -> parsees to bracket column literal identifier - # geocode[1] -> parsees to bracket column literal identifier + def parse_input_name_to_sqlglot_tree(self) -> Expression: + """ + Parses the input name into a SQLglot expression tree. - # Note we don't expect SUR name[1] since the user should have quoted this + Fiddly because we need to deal with escaping issues. For example + the column name in the input dataset may be 'first and surname', but + if we naively parse this using sqlglot it will be interpreted as an AND + expression + + Note: We do not support inputs like 'SUR name[1]', in this case the user + would have to quote e.g. `SUR name`[1] + """ + + q_s, q_e = _get_dialect_quotes(self._sql_dialect) try: tree = sqlglot.parse_one(self.input_name, read=self._sql_dialect) except ParseError: - tree = sqlglot.parse_one(f'"{self.input_name}"', read=self._sql_dialect) + tree = sqlglot.parse_one( + f"{q_s}{self.input_name}{q_e}", read=self._sql_dialect + ) tree_signature = sqlglot_tree_signature(tree) valid_signatures = ["column identifier", "bracket column literal identifier"] @@ -108,7 +136,9 @@ def parse_input_name_to_sqlglot_tree(self): else: # e.g. 
SUR name parses to 'alias column identifier identifier' # but we want "SUR name" - tree = sqlglot.parse_one(f'"{self.input_name}"', read=self._sql_dialect) + tree = sqlglot.parse_one( + f"{q_s}{self.input_name}{q_e}", read=self._sql_dialect + ) return tree def from_settings_obj_else_default(self, key, schema_key=None): @@ -121,94 +151,108 @@ def from_settings_obj_else_default(self, key, schema_key=None): return default_value_from_schema(schema_key, "root") @property - def gamma_prefix(self): + def gamma_prefix(self) -> str: return self.from_settings_obj_else_default( "_gamma_prefix", "comparison_vector_value_column_prefix" ) @property - def bf_prefix(self): + def bf_prefix(self) -> str: return self.from_settings_obj_else_default( "_bf_prefix", "bayes_factor_column_prefix" ) @property - def tf_prefix(self): + def tf_prefix(self) -> str: return self.from_settings_obj_else_default( "_tf_prefix", "term_frequency_adjustment_column_prefix" ) - def name(self): + @property + def name(self) -> str: return self.input_name_as_tree.sql(dialect=self._sql_dialect) - def name_l(self): + @property + def name_l(self) -> str: return add_suffix(self.input_name_as_tree, suffix="_l").sql( dialect=self._sql_dialect ) - def name_r(self): + @property + def name_r(self) -> str: return add_suffix(self.input_name_as_tree, suffix="_r").sql( dialect=self._sql_dialect ) - def names_l_r(self): - return [self.name_l(), self.name_r()] + @property + def names_l_r(self) -> list[str]: + return [self.name_l, self.name_r] - def l_name_as_l(self): + @property + def l_name_as_l(self) -> str: name_with_l_table = add_table(self.input_name_as_tree, "l").sql( dialect=self._sql_dialect ) - return f"{name_with_l_table} as {self.name_l()}" + return f"{name_with_l_table} as {self.name_l}" - def r_name_as_r(self): + @property + def r_name_as_r(self) -> str: name_with_r_table = add_table(self.input_name_as_tree, "r").sql( dialect=self._sql_dialect ) - return f"{name_with_r_table} as {self.name_r()}" + return f"{name_with_r_table} as {self.name_r}" - def l_r_names_as_l_r(self): - return [self.l_name_as_l(), self.r_name_as_r()] + @property + def l_r_names_as_l_r(self) -> list[str]: + return [self.l_name_as_l, self.r_name_as_r] - def bf_name(self): + @property + def bf_name(self) -> str: return add_prefix(self.input_name_as_tree, prefix=self.bf_prefix).sql( dialect=self._sql_dialect ) - def tf_name(self): + @property + def tf_name(self) -> str: return add_prefix(self.input_name_as_tree, prefix=self.tf_prefix).sql( dialect=self._sql_dialect ) - def tf_name_l(self): + @property + def tf_name_l(self) -> str: tree = add_prefix(self.input_name_as_tree, prefix=self.tf_prefix) return add_suffix(tree, suffix="_l").sql(dialect=self._sql_dialect) - def tf_name_r(self): + @property + def tf_name_r(self) -> str: tree = add_prefix(self.input_name_as_tree, prefix=self.tf_prefix) return add_suffix(tree, suffix="_r").sql(dialect=self._sql_dialect) - def tf_name_l_r(self): - return [self.tf_name_l(), self.tf_name_r()] + @property + def tf_name_l_r(self) -> list[str]: + return [self.tf_name_l, self.tf_name_r] - def l_tf_name_as_l(self): + @property + def l_tf_name_as_l(self) -> str: tree = add_prefix(self.input_name_as_tree, prefix=self.tf_prefix) tf_name_with_l_table = add_table(tree, tablename="l").sql( dialect=self._sql_dialect ) - return f"{tf_name_with_l_table} as {self.tf_name_l()}" + return f"{tf_name_with_l_table} as {self.tf_name_l}" - def r_tf_name_as_r(self): + @property + def r_tf_name_as_r(self) -> str: tree = add_prefix(self.input_name_as_tree, 
prefix=self.tf_prefix) tf_name_with_r_table = add_table(tree, tablename="r").sql( dialect=self._sql_dialect ) - return f"{tf_name_with_r_table} as {self.tf_name_r()}" + return f"{tf_name_with_r_table} as {self.tf_name_r}" - def l_r_tf_names_as_l_r(self): - return [self.l_tf_name_as_l(), self.r_tf_name_as_r()] + @property + def l_r_tf_names_as_l_r(self) -> list[str]: + return [self.l_tf_name_as_l, self.r_tf_name_as_r] - def _quote_name(self, name: str) -> str: - # Quote column names that are also SQL keywords + def _quote_if_sql_keyword(self, name: str) -> str: if name not in {"group", "index"}: return name start, end = _get_dialect_quotes(self._sql_dialect) @@ -216,6 +260,15 @@ def _quote_name(self, name: str) -> str: def _get_dialect_quotes(dialect): + """ + Returns the appropriate quotation marks for identifiers based on the SQL dialect. + + For most SQL dialects, identifiers are quoted using double quotes. + For example, "first name" is a quoted identifier that + allows for a space in the column name. + + However, some SQL dialects, use other identifiers e.g. ` in Spark SQL + """ start = end = '"' if dialect is None: return start, end diff --git a/splink/labelling_tool.py b/splink/labelling_tool.py index 18bab14b0f..f39421945a 100644 --- a/splink/labelling_tool.py +++ b/splink/labelling_tool.py @@ -28,7 +28,7 @@ def generate_labelling_tool_comparisons( source_dataset_condition = "" if source_dataset is not None: - sds_col = settings._source_dataset_input_column + sds_col = settings._source_dataset_column_name source_dataset_condition = f""" and {sds_col} = '{source_dataset}' """ diff --git a/splink/linker.py b/splink/linker.py index 33427adea9..8731f2f3e8 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -33,10 +33,8 @@ ) from .blocking import ( BlockingRule, - ExplodingBlockingRule, block_using_rules_sqls, blocking_rule_to_obj, - materialise_exploded_id_tables, ) from .cache_dict_with_logging import CacheDictWithLogging from .charts import ( @@ -52,6 +50,7 @@ unlinkables_chart, waterfall_chart, ) +from .cluster_metrics import _size_density_sql from .cluster_studio import render_splink_cluster_studio_html from .comparison import Comparison from .comparison_level import ComparisonLevel @@ -66,6 +65,9 @@ from .em_training_session import EMTrainingSession from .estimate_u import estimate_u_values from .exceptions import SplinkDeprecated, SplinkException +from .find_brs_with_comparison_counts_below_threshold import ( + find_blocking_rules_below_threshold_comparison_count, +) from .find_matches_to_new_records import add_unique_id_and_source_dataset_cols_if_needed from .labelling_tool import ( generate_labelling_tool_comparisons, @@ -83,11 +85,11 @@ bayes_factor_to_prob, ensure_is_list, ensure_is_tuple, - find_unique_source_dataset, parse_duration, prob_to_bayes_factor, ) from .missingness import completeness_data, missingness_data +from .optimise_cost_of_brs import suggest_blocking_rules from .pipeline import SQLPipeline from .predict import predict_from_comparison_vectors_sqls from .profile_data import profile_columns @@ -250,19 +252,72 @@ def __init__( self.debug_mode = False - @property - def _get_input_columns( + def _input_columns( self, - as_list=True, - ): - """Retrieve the column names from the input dataset(s)""" - df_obj: SplinkDataFrame = next(iter(self._input_tables_dict.values())) + include_unique_id_col_names=True, + include_additional_columns_to_retain=True, + ) -> list[InputColumn]: + """Retrieve the column names from the input dataset(s) as InputColumns - column_names 
= ( - [col.name() for col in df_obj.columns] if as_list else df_obj.columns - ) + Args: + include_unique_id_col_names (bool, optional): Whether to include unique ID + column names. Defaults to True. + include_additional_columns_to_retain (bool, optional): Whether to include + additional columns to retain. Defaults to True. + + Raises: + SplinkException: If the input frames have different sets of columns. + + Returns: + list[InputColumn] + """ - return column_names + input_dfs = self._input_tables_dict.values() + + # get a list of the column names for each input frame + # sort it for consistent ordering, and give each frame's + # columns as a tuple so we can hash it + column_names_by_input_df = [ + tuple(sorted([col.name for col in input_df.columns])) + for input_df in input_dfs + ] + # check that the set of input columns is the same for each frame, + # fail if the sets are different + if len(set(column_names_by_input_df)) > 1: + common_cols = set.intersection( + *(set(col_names) for col_names in column_names_by_input_df) + ) + problem_names = { + col + for frame_col_names in column_names_by_input_df + for col in frame_col_names + if col not in common_cols + } + raise SplinkException( + "All linker input frames must have the same set of columns. " + "The following columns were not found in all input frames: " + + ", ".join(problem_names) + ) + + columns = next(iter(input_dfs)).columns + + remove_columns = [] + if not include_unique_id_col_names: + remove_columns.extend(self._settings_obj._unique_id_input_columns) + if not include_additional_columns_to_retain: + remove_columns.extend(self._settings_obj._additional_columns_to_retain) + + remove_id_cols = [c.unquote().name for c in remove_columns] + columns = [col for col in columns if col.unquote().name not in remove_id_cols] + + return columns + + @property + def _source_dataset_column_already_exists(self): + if self._settings_obj_ is None: + return False + input_cols = [c.unquote().name for c in self._input_columns()] + return self._settings_obj._source_dataset_column_name in input_cols @property def _cache_uid(self): @@ -338,21 +393,6 @@ def _input_tablename_r(self): return "__splink__df_concat_with_tf_right" return "__splink__df_concat_with_tf" - @property - def _source_dataset_column_name(self): - if self._settings_obj_ is None: - return None - - # Used throughout the scripts to feed our SQL - if self._settings_obj._source_dataset_column_name_is_required: - df_obj = next(iter(self._input_tables_dict.values())) - columns = df_obj.columns_escaped - - input_column, src_ds_col = self._settings_obj_._source_dataset_col - return "__splink_source_dataset" if src_ds_col in columns else input_column - else: - return None - @property def _two_dataset_link_only(self): # Two dataset link only join is a special case where an inner join of the @@ -395,34 +435,6 @@ def _random_sample_sql( ): raise NotImplementedError("Random sample sql not implemented for this linker") - @property - def _verify_link_only_job(self): - cache = self._intermediate_table_cache - if "__splink__df_concat_with_tf" not in cache: - return - - if self._settings_obj._link_type == "link_only": - # if input datasets > 1 then skip - if len(self._input_tables_dict) > 1: - return - - # else, check if source dataset column is populated... 
- src_ds = self._source_dataset_column_name - if src_ds == "__splink_source_dataset": - _, src_ds = self._settings_obj_._source_dataset_col - - sql = find_unique_source_dataset(src_ds) - self._enqueue_sql(sql, "source_ds_distinct") - src_ds_distinct = self._execute_sql_pipeline( - [cache["__splink__df_concat_with_tf"]] - ) - if len(src_ds_distinct.as_record_dict()) == 1: - raise SplinkException( - "if `link_type` is `link_only`, it should have at least two " - "input dataframes, or one dataframe with a `source_dataset` " - "column outlining which dataset each record belongs to." - ) - def _register_input_tables(self, input_tables, input_aliases, accepted_df_dtypes): # 'homogenised' means all entries are strings representing tables homogenised_tables = [] @@ -545,6 +557,11 @@ def _initialise_df_concat_with_tf(self, materialise=True): nodes_with_tf = cache.get_with_logging("__splink__df_concat_with_tf") else: + # In duckdb, calls to random() in a CTE pipeline cause problems: + # https://gist.github.com/RobinL/d329e7004998503ce91b68479aa41139 + if self._settings_obj.salting_required: + materialise = True + if materialise: # Clear the pipeline if we are materialising # There's no reason not to do this, since when @@ -562,10 +579,6 @@ def _initialise_df_concat_with_tf(self, materialise=True): nodes_with_tf = self._execute_sql_pipeline() cache["__splink__df_concat_with_tf"] = nodes_with_tf - # verify the link job - if self._settings_obj_ is not None: - self._verify_link_only_job - return nodes_with_tf def _table_to_splink_dataframe( @@ -634,9 +647,9 @@ def _execute_sql_pipeline( start_time = time.time() output_tablename = task.output_table_name sql = task.sql - print("------", flush=True) # noqa: T201 + print("------") # noqa: T201 print( # noqa: T201 - f"--------Creating table: {output_tablename}--------", flush=True + f"--------Creating table: {output_tablename}--------" ) dataframe = self._sql_to_splink_dataframe_checking_cache( @@ -996,7 +1009,7 @@ def _populate_probability_two_random_records_match_from_trained_values(self): 15, "\n" f"Probability two random records match from trained model blocking on " - f"{em_training_session._blocking_rule_for_training.blocking_rule}: " + f"{em_training_session._blocking_rule_for_training.blocking_rule_sql}: " f"{training_lambda:,.3f}", ) @@ -1731,8 +1744,6 @@ def predict( if nodes_with_tf: input_dataframes.append(nodes_with_tf) - materialise_exploded_id_tables(self) - sqls = block_using_rules_sqls(self) for sql in sqls: self._enqueue_sql(sql["sql"], sql["output_table_name"]) @@ -1757,11 +1768,6 @@ def predict( self._enqueue_sql(sql["sql"], sql["output_table_name"]) predictions = self._execute_sql_pipeline(input_dataframes) - - for br in self._settings_obj._blocking_rules_to_generate_predictions: - if isinstance(br, ExplodingBlockingRule): - br.drop_materialised_id_pairs_dataframes() - self._predict_warning() return predictions @@ -2046,15 +2052,15 @@ def cluster_pairwise_predictions_at_threshold( into groups of connected record using the connected components graph clustering algorithm - Records with an estimated `match_probability` above + Records with an estimated `match_probability` at or above `threshold_match_probability` are considered to be a match (i.e. they represent the same entity). Args: df_predict (SplinkDataFrame): The results of `linker.predict()` threshold_match_probability (float): Filter the pairwise match predictions - to include only pairwise comparisons with a match_probability above this - threshold. 
This dataframe is then fed into the clustering + to include only pairwise comparisons with a match_probability at or + above this threshold. This dataframe is then fed into the clustering algorithm. pairwise_formatting (bool): Whether to output the pairwise match predictions from linker.predict() with cluster IDs. @@ -2091,6 +2097,45 @@ def cluster_pairwise_predictions_at_threshold( return cc + def _compute_cluster_metrics( + self, + df_predict: SplinkDataFrame, + df_clustered: SplinkDataFrame, + threshold_match_probability: float = None, + ): + """Generates a table containing cluster metrics and returns a Splink dataframe + + Args: + df_predict (SplinkDataFrame): The results of `linker.predict()` + df_clustered (SplinkDataFrame): The outputs of + `linker.cluster_pairwise_predictions_at_threshold()` + threshold_match_probability (float): Filter the pairwise match predictions + to include only pairwise comparisons with a match_probability above this + threshold. + + Returns: + SplinkDataFrame: A SplinkDataFrame containing cluster IDs and selected + cluster metrics + + """ + + # Get unique row id column name from settings + unique_id_col = self._settings_obj._unique_id_column_name + + sqls = _size_density_sql( + df_predict, + df_clustered, + threshold_match_probability, + _unique_id_col=unique_id_col, + ) + + for sql in sqls: + self._enqueue_sql(sql["sql"], sql["output_table_name"]) + + df_cluster_metrics = self._execute_sql_pipeline() + + return df_cluster_metrics + def profile_columns( self, column_expressions: str | list[str] = None, top_n=10, bottom_n=10 ): @@ -3013,8 +3058,9 @@ def missingness_chart(self, input_dataset: str = None): Args: input_dataset (str, optional): Name of one of the input tables in the - database. If provided, missingness will be computed for this table alone. - Defaults to None. + database. If provided, missingness will be computed for + this table alone. + Defaults to None. Examples: ```py @@ -3594,13 +3640,13 @@ def invalidate_cache(self): # As a result, any previously cached tables will not be found self._cache_uid = ascii_uid(8) - # As a result, any previously cached tables will not be found - self._intermediate_table_cache.invalidate_cache() - # Drop any existing splink tables from the database # Note, this is not actually necessary, it's just good housekeeping self.delete_tables_created_by_splink_from_db() + # As a result, any previously cached tables will not be found + self._intermediate_table_cache.invalidate_cache() + def register_table_input_nodes_concat_with_tf(self, input_data, overwrite=False): """Register a pre-computed version of the input_nodes_concat_with_tf table that you want to re-use e.g. 
that you created in a previous run
@@ -3710,7 +3756,171 @@ def _remove_splinkdataframe_from_cache(self, splink_dataframe: SplinkDataFrame):
         for k in keys_to_delete:
             del self._intermediate_table_cache[k]
 
-    def _gen_explode_sql(self, tbl_name, columns_to_explode, other_columns_to_retain):
-        raise NotImplementedError(
-            f"Unnesting blocking rules are not supported for {type(self)}"
+    def _find_blocking_rules_below_threshold(
+        self, max_comparisons_per_rule, blocking_expressions=None
+    ):
+        return find_blocking_rules_below_threshold_comparison_count(
+            self, max_comparisons_per_rule, blocking_expressions
         )
+
+    def _detect_blocking_rules_for_prediction(
+        self,
+        max_comparisons_per_rule,
+        blocking_expressions=None,
+        min_freedom=1,
+        num_runs=200,
+        num_equi_join_weight=0,
+        field_freedom_weight=1,
+        num_brs_weight=10,
+        num_comparison_weight=10,
+        return_as_df=False,
+    ):
+        """Find blocking rules for prediction below some given threshold of the
+        maximum number of comparisons that can be generated per blocking rule
+        (max_comparisons_per_rule).
+        Uses a heuristic cost algorithm to identify the 'best' set of blocking rules
+        Args:
+            max_comparisons_per_rule (int): The maximum number of comparisons that
+                each blocking rule is allowed to generate
+            blocking_expressions: By default, blocking rules will be equi-joins
+                on the columns used by the Splink model. This allows you to manually
+                specify sql expressions from which combinations will be created. For
+                example, if you specify ["substr(dob, 1,4)", "surname", "dob"]
+                blocking rules will be chosen by blocking on combinations
+                of those expressions.
+            min_freedom (int, optional): The minimum amount of freedom any column
+                should be allowed. Defaults to 1.
+            num_runs (int, optional): Each run selects rows using a heuristic and costs
+                them. The more runs, the more likely you are to find the best rule.
+                Defaults to 200.
+            num_equi_join_weight (int, optional): Weight allocated to number of equi
+                joins in the blocking rules.
+                Defaults to 0 since this cost is better captured by other criteria.
+            field_freedom_weight (int, optional): Weight given to the cost of
+                having individual fields which don't have much flexibility. Assigning
+                a high weight here makes it more likely you'll generate combinations of
+                blocking rules for which most fields are allowed to vary more than
+                the minimum. Defaults to 1.
+            num_brs_weight (int, optional): Weight assigned to the cost of
+                additional blocking rules. Higher weight here will result in a
+                preference for fewer blocking rules. Defaults to 10.
+            num_comparison_weight (int, optional): Weight assigned to the cost of
+                larger numbers of comparisons, which happens when more of the blocking
+                rules are close to the max_comparisons_per_rule. A higher
+                weight here prefers sets of rules which generate lower total
+                comparisons. Defaults to 10.
+            return_as_df (bool, optional): If false, assign the recommendation to the
+                linker's settings. If true, return a dataframe containing details of
+                the weights. Defaults to False.
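+
+        Examples:
+            Illustrative sketch only; the threshold below is an example value and
+            `linker` is assumed to be an existing Linker instance:
+            ```py
+            # Assign a suggested set of blocking rules directly to the settings
+            linker._detect_blocking_rules_for_prediction(
+                max_comparisons_per_rule=1_000_000
+            )
+
+            # Or return the scored candidate rule sets as a dataframe to inspect
+            df_suggestions = linker._detect_blocking_rules_for_prediction(
+                max_comparisons_per_rule=1_000_000, return_as_df=True
+            )
+            ```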
+        """
+
+        df_br_below_thres = find_blocking_rules_below_threshold_comparison_count(
+            self, max_comparisons_per_rule, blocking_expressions
+        )
+
+        blocking_rule_suggestions = suggest_blocking_rules(
+            df_br_below_thres,
+            min_freedom=min_freedom,
+            num_runs=num_runs,
+            num_equi_join_weight=num_equi_join_weight,
+            field_freedom_weight=field_freedom_weight,
+            num_brs_weight=num_brs_weight,
+            num_comparison_weight=num_comparison_weight,
+        )
+
+        if return_as_df:
+            return blocking_rule_suggestions
+        else:
+            if blocking_rule_suggestions is None or len(blocking_rule_suggestions) == 0:
+                logger.warning("No set of blocking rules found within constraints")
+            else:
+                suggestion = blocking_rule_suggestions[
+                    "suggested_blocking_rules_as_splink_brs"
+                ].iloc[0]
+                self._settings_obj._blocking_rules_to_generate_predictions = suggestion
+
+                suggestion_str = blocking_rule_suggestions[
+                    "suggested_blocking_rules_for_prediction"
+                ].iloc[0]
+                msg = (
+                    "The following blocking_rules_to_generate_predictions were "
+                    "automatically detected and assigned to your settings:\n"
+                )
+                logger.info(f"{msg}{suggestion_str}")
+
+    def _detect_blocking_rules_for_em_training(
+        self,
+        max_comparisons_per_rule,
+        min_freedom=1,
+        num_runs=200,
+        num_equi_join_weight=0,
+        field_freedom_weight=1,
+        num_brs_weight=20,
+        num_comparison_weight=10,
+        return_as_df=False,
+    ):
+        """Find blocking rules for EM training below some given threshold of the
+        maximum number of comparisons that can be generated per blocking rule
+        (max_comparisons_per_rule).
+        Uses a heuristic cost algorithm to identify the 'best' set of blocking rules
+        Args:
+            max_comparisons_per_rule (int): The maximum number of comparisons that
+                each blocking rule is allowed to generate
+            min_freedom (int, optional): The minimum amount of freedom any column
+                should be allowed. Defaults to 1.
+            num_runs (int, optional): Each run selects rows using a heuristic and costs
+                them. The more runs, the more likely you are to find the best rule.
+                Defaults to 200.
+            num_equi_join_weight (int, optional): Weight allocated to number of equi
+                joins in the blocking rules.
+                Defaults to 0 since this cost is better captured by other criteria.
+            field_freedom_weight (int, optional): Weight given to the cost of
+                having individual fields which don't have much flexibility. Assigning
+                a high weight here makes it more likely you'll generate combinations of
+                blocking rules for which most fields are allowed to vary more than
+                the minimum. Defaults to 1.
+            num_brs_weight (int, optional): Weight assigned to the cost of
+                additional blocking rules. Higher weight here will result in a
+                preference for fewer blocking rules. Defaults to 20.
+            num_comparison_weight (int, optional): Weight assigned to the cost of
+                larger numbers of comparisons, which happens when more of the blocking
+                rules are close to the max_comparisons_per_rule. A higher
+                weight here prefers sets of rules which generate lower total
+                comparisons. Defaults to 10.
+            return_as_df (bool, optional): If false, return just the recommendation.
+                If true, return a dataframe containing details of the weights.
+                Defaults to False.
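+
+        Examples:
+            A sketch of one possible usage (assumes an existing Linker named
+            `linker`; the threshold is an example value):
+            ```py
+            suggested_brs = linker._detect_blocking_rules_for_em_training(
+                max_comparisons_per_rule=500_000
+            )
+            # Each suggested rule can then be used for an EM training session
+            for br in suggested_brs:
+                linker.estimate_parameters_using_expectation_maximisation(br)
+            ```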
+ """ + + df_br_below_thres = find_blocking_rules_below_threshold_comparison_count( + self, max_comparisons_per_rule + ) + + blocking_rule_suggestions = suggest_blocking_rules( + df_br_below_thres, + min_freedom=min_freedom, + num_runs=num_runs, + num_equi_join_weight=num_equi_join_weight, + field_freedom_weight=field_freedom_weight, + num_brs_weight=num_brs_weight, + num_comparison_weight=num_comparison_weight, + ) + + if return_as_df: + return blocking_rule_suggestions + else: + if blocking_rule_suggestions is None or len(blocking_rule_suggestions) == 0: + logger.warning("No set of blocking rules found within constraints") + return None + else: + suggestion_str = blocking_rule_suggestions[ + "suggested_EM_training_statements" + ].iloc[0] + msg = "The following EM training strategy was detected:\n" + msg = f"{msg}{suggestion_str}" + logger.info(msg) + suggestion = blocking_rule_suggestions[ + "suggested_blocking_rules_as_splink_brs" + ].iloc[0] + return suggestion diff --git a/splink/logging_messages.py b/splink/logging_messages.py index 780005fc25..0761f81a01 100644 --- a/splink/logging_messages.py +++ b/splink/logging_messages.py @@ -1,6 +1,3 @@ -import sqlglot - - def execute_sql_logging_message_info(templated_name, physical_name): return ( f"Executing sql to create " @@ -10,5 +7,4 @@ def execute_sql_logging_message_info(templated_name, physical_name): def log_sql(sql): - # sql = sql).sql(pretty=True) return "\n------Start SQL---------\n" f"{sql}\n" "-------End SQL-----------\n" diff --git a/splink/lower_id_on_lhs.py b/splink/lower_id_on_lhs.py index 6f4ad48c67..2e3d2d0e70 100644 --- a/splink/lower_id_on_lhs.py +++ b/splink/lower_id_on_lhs.py @@ -66,7 +66,7 @@ def lower_id_to_left_hand_side( """ # noqa cols = df.columns - cols = [c.unquote().name() for c in cols] + cols = [c.unquote().name for c in cols] l_cols = [c for c in cols if c.endswith("_l")] r_cols = [c for c in cols if c.endswith("_r")] diff --git a/splink/misc.py b/splink/misc.py index e8d634528c..a8d1b1b70a 100644 --- a/splink/misc.py +++ b/splink/misc.py @@ -160,15 +160,6 @@ def ascii_uid(len): return "".join(random.choices(string.ascii_lowercase + string.digits, k=len)) -def find_unique_source_dataset(src_ds): - sql = f""" - select distinct {src_ds} as src - from __splink__df_concat_with_tf - """ - - return sql - - def parse_duration(duration: float) -> str: # math.ceil to clean up our output for anything over a minute d = int(ceil(duration)) diff --git a/splink/missingness.py b/splink/missingness.py index 197c307f8d..8479d951c1 100644 --- a/splink/missingness.py +++ b/splink/missingness.py @@ -8,8 +8,8 @@ def missingness_sqls(columns, input_tablename): selects = [ col_template.format( - col_name_escaped=col.name(), - col_name=col.unquote().name(), + col_name_escaped=col.name, + col_name=col.unquote().name, input_tablename=input_tablename, ) for col in columns @@ -40,13 +40,13 @@ def missingness_sqls(columns, input_tablename): def missingness_data(linker, input_tablename): + columns = linker._input_columns() if input_tablename is None: splink_dataframe = linker._initialise_df_concat(materialise=True) else: splink_dataframe = linker._table_to_splink_dataframe( input_tablename, input_tablename ) - columns = splink_dataframe.columns sqls = missingness_sqls(columns, splink_dataframe.physical_name) @@ -69,7 +69,7 @@ def completeness_data(linker, input_tablename=None, cols=None): cols = linker._settings_obj._columns_used_by_comparisons if linker._settings_obj._source_dataset_column_name_is_required: - source_name = 
linker._source_dataset_column_name + source_name = linker._settings_obj._source_dataset_column_name else: # Set source dataset to a literal string if dedupe_only source_name = "'_a'" diff --git a/splink/optimise_cost_of_brs.py b/splink/optimise_cost_of_brs.py new file mode 100644 index 0000000000..18277a5967 --- /dev/null +++ b/splink/optimise_cost_of_brs.py @@ -0,0 +1,214 @@ +import logging +from random import randint + +import pandas as pd + +from .cost_of_blocking_rules import calculate_cost_of_combination_of_brs + +logger = logging.getLogger(__name__) + + +def localised_shuffle(lst: list, window_percent: float) -> list: + """ + Performs a localised shuffle on a list. + + This is used to choose semi-randomly from a list of + sorted rows, so you tend to pick from items towards the top + + Args: + lst (list): The list to shuffle. + window_percent (float): The window percent for shuffle e.g. 0.3 for shuffle + within 30% of orig position + + Returns: + list: A shuffled copy of the original list. + """ + window_size = max(1, int(window_percent * len(lst))) + return sorted(lst, key=lambda x: lst.index(x) + randint(-window_size, window_size)) + + +def check_field_freedom(candidate_set, field_names, min_field_freedom): + """ + Checks if each field in the candidate set is allowed to vary at least + 'min_field_freedom' times. + + Args: + candidate_set (list): The candidate set of rows. + field_names (list): The list of field names. + min_field_freedom (int): The minimum field freedom. + + Returns: + bool: True if each field can vary at least 'min_field_freedom' times, + False otherwise. + """ + covered_fields = {field: 0 for field in field_names} + for row in candidate_set: + for field in field_names: + if row[field] == 0: + covered_fields[field] += 1 + return all(count >= min_field_freedom for count in covered_fields.values()) + + +def heuristic_select_brs_that_have_min_freedom(data, field_names, min_field_freedom): + """ + A heuristic algorithm to select blocking rules that between them + ensure that each field is allowed to vary at least 'min_field_freedom' times. + + Args: + data (list): The data rows. + field_names (list): The list of field names. + min_field_freedom (int): The minimum field freedom. + + Returns: + list: The candidate set of rows. 
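+
+    Example:
+        A small, made-up illustration; only the ``__fixed__`` one-hot columns and
+        ``blocking_columns_sanitised`` are inspected by this function:
+        ```py
+        rows = [
+            {"blocking_columns_sanitised": ["first_name"],
+             "__fixed__first_name": 1, "__fixed__surname": 0},
+            {"blocking_columns_sanitised": ["surname"],
+             "__fixed__first_name": 0, "__fixed__surname": 1},
+        ]
+        # Both rows are needed before each field has been free to vary at least once
+        selected = heuristic_select_brs_that_have_min_freedom(
+            rows, ["__fixed__first_name", "__fixed__surname"], min_field_freedom=1
+        )
+        ```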
+ """ + data_sorted_randomised = localised_shuffle(data, 0.5) + candidate_rows = [] + + for row in data_sorted_randomised: + candidate_rows.append(row) + if check_field_freedom(candidate_rows, field_names, min_field_freedom): + break + + sorted_candidate_rows = sorted( + candidate_rows, key=lambda x: x["blocking_columns_sanitised"] + ) + + return sorted_candidate_rows + + +def get_block_on_string(br_rows): + block_on_strings = [] + + for row in br_rows: + quoted_args = [] + for arg in row["blocking_columns_sanitised"]: + quoted_arg = f'"{arg}"' + quoted_args.append(quoted_arg) + + block_on_args = ", ".join(quoted_args) + block_on_string = f"block_on([{block_on_args}])" + block_on_strings.append(block_on_string) + + return " \n".join(block_on_strings) + + +def get_em_training_string(br_rows): + block_on_strings = [] + + for row in br_rows: + quoted_args = [] + for arg in row["blocking_columns_sanitised"]: + quoted_arg = f'"{arg}"' + quoted_args.append(quoted_arg) + + block_on_args = ", ".join(quoted_args) + block_on_string = f"block_on([{block_on_args}])" + block_on_strings.append(block_on_string) + + training_statements = [] + for block_on_str in block_on_strings: + statement = ( + f"linker.estimate_parameters_using_expectation_maximisation({block_on_str})" + ) + training_statements.append(statement) + + return " \n".join(training_statements) + + +def suggest_blocking_rules( + df_block_stats, + min_freedom=1, + num_runs=100, + num_equi_join_weight=0, + field_freedom_weight=1, + num_brs_weight=10, + num_comparison_weight=10, +): + """Use a cost optimiser to suggest blocking rules + + Args: + df_block_stats: Dataframe returned by find_blocking_rules_below_threshold + min_freedom (int, optional): Each column should have at least this many + opportunities to vary amongst the blocking rules. Defaults to 1. + num_runs (int, optional): How many random combinations of + rules to try. The best will be selected. Defaults to 5. + num_equi_join_weight (int, optional): The weight for number of equi joins. + Defaults to 0. + field_freedom_weight (int, optional): The weight for field freedom. Defaults to + 10. + num_brs_weight (int, optional): The weight for the number of blocking rules + found. Defaults to 10. + + Returns: + pd.DataFrame: A DataFrame containing the results of the blocking rules + suggestion. 
It includes columns such as + 'suggested_blocking_rules_for_prediction', + 'suggested_EM_training_statements', and various cost information + + """ + if len(df_block_stats) == 0: + return None + + max_comparison_count = df_block_stats["comparison_count"].max() + + df_block_stats = df_block_stats.sort_values( + by=["num_equi_joins", "comparison_count"], ascending=[True, False] + ) + blocks_found_recs = df_block_stats.to_dict(orient="records") + + blocking_cols = list(blocks_found_recs[0].keys()) + blocking_cols = [c for c in blocking_cols if c.startswith("__fixed__")] + + results = [] + + for run in range(num_runs): + selected_rows = heuristic_select_brs_that_have_min_freedom( + blocks_found_recs, blocking_cols, min_field_freedom=min_freedom + ) + + cost_dict = { + "suggested_blocking_rules_for_prediction": get_block_on_string( + selected_rows + ), + "suggested_EM_training_statements": get_em_training_string(selected_rows), + } + + costs = calculate_cost_of_combination_of_brs( + selected_rows, + max_comparison_count, + num_equi_join_weight, + field_freedom_weight, + num_brs_weight, + num_comparison_weight, + ) + + cost_dict.update(costs) + cost_dict.update( + { + "run_num": run, + "minimum_freedom_for_each_column": min_freedom, + "suggested_blocking_rules_as_splink_brs": [ + row["splink_blocking_rule"] for row in selected_rows + ], + } + ) + results.append(cost_dict) + + results_df = pd.DataFrame(results) + # easier to read if we normalise the cost so the best is 0 + min_ = results_df["field_freedom_cost"].min() + results_df["field_freedom_cost"] = results_df["field_freedom_cost"] - min_ + + min_ = results_df["field_freedom_cost_weighted"].min() + results_df["field_freedom_cost_weighted"] = ( + results_df["field_freedom_cost_weighted"] - min_ + ) + results_df["cost"] = results_df["cost"] - min_ + + min_scores_df = results_df.sort_values("cost") + min_scores_df = min_scores_df.drop_duplicates( + "suggested_blocking_rules_for_prediction" + ) + + return min_scores_df diff --git a/splink/postgres/postgres_helpers/postgres_blocking_rule_imports.py b/splink/postgres/postgres_helpers/postgres_blocking_rule_imports.py index 135f2783bf..e99bc20199 100644 --- a/splink/postgres/postgres_helpers/postgres_blocking_rule_imports.py +++ b/splink/postgres/postgres_helpers/postgres_blocking_rule_imports.py @@ -1,5 +1,7 @@ from __future__ import annotations +from functools import partial + from ...blocking_rules_library import ( BlockingRule, exact_match_rule, @@ -7,13 +9,8 @@ from ...blocking_rules_library import ( block_on as _block_on_, ) -from .postgres_base import ( - PostgresBase, -) - -class exact_match_rule(PostgresBase, exact_match_rule): - pass +exact_match_rule = partial(exact_match_rule, _sql_dialect="postgres") def block_on( diff --git a/splink/profile_data.py b/splink/profile_data.py index 62b73546d6..93986dfb2f 100644 --- a/splink/profile_data.py +++ b/splink/profile_data.py @@ -1,9 +1,12 @@ +import logging import re from copy import deepcopy from .charts import altair_or_json, load_chart_definition from .misc import ensure_is_list +logger = logging.getLogger(__name__) + def _group_name(cols_or_expr): cols_or_expr = re.sub(r"[^0-9a-zA-Z_]", " ", cols_or_expr) @@ -229,7 +232,7 @@ def profile_columns(linker, column_expressions=None, top_n=10, bottom_n=10): """ if not column_expressions: - column_expressions = linker._get_input_columns + column_expressions = [col.name for col in linker._input_columns()] df_concat = linker._initialise_df_concat() @@ -270,21 +273,34 @@ def profile_columns(linker, 
column_expressions=None, top_n=10, bottom_n=10): percentile_rows = [ p for p in percentile_rows_all if p["group_name"] == _group_name(expression) ] - percentile_rows = _add_100_percentile_to_df_percentiles(percentile_rows) - top_n_rows = [ - p for p in top_n_rows_all if p["group_name"] == _group_name(expression) - ] - bottom_n_rows = [ - p for p in bottom_n_rows_all if p["group_name"] == _group_name(expression) - ] - # remove concat blank from expression title - expression = expression.replace(", ' '", "") - inner_chart = _get_inner_chart_spec_freq( - percentile_rows, top_n_rows, bottom_n_rows, expression - ) - inner_charts.append(inner_chart) - outer_spec = deepcopy(_outer_chart_spec_freq) - - outer_spec["vconcat"] = inner_charts - - return altair_or_json(outer_spec) + if percentile_rows == []: + logger.warning( + "Warning: No charts produced for " + f"{expression}" + " as the column only contains null values." + ) + else: + percentile_rows = _add_100_percentile_to_df_percentiles(percentile_rows) + top_n_rows = [ + p for p in top_n_rows_all if p["group_name"] == _group_name(expression) + ] + bottom_n_rows = [ + p + for p in bottom_n_rows_all + if p["group_name"] == _group_name(expression) + ] + # remove concat blank from expression title + expression = expression.replace(", ' '", "") + inner_chart = _get_inner_chart_spec_freq( + percentile_rows, top_n_rows, bottom_n_rows, expression + ) + inner_charts.append(inner_chart) + + if inner_charts != []: + outer_spec = deepcopy(_outer_chart_spec_freq) + outer_spec["vconcat"] = inner_charts + + return altair_or_json(outer_spec) + + else: + return None diff --git a/splink/settings.py b/splink/settings.py index 261eb9e886..df13bd4fae 100644 --- a/splink/settings.py +++ b/splink/settings.py @@ -2,8 +2,9 @@ import logging from copy import deepcopy +from typing import List -from .blocking import blocking_rule_to_obj +from .blocking import BlockingRule, SaltedBlockingRule, blocking_rule_to_obj from .charts import m_u_parameters_chart, match_weights_chart from .comparison import Comparison from .comparison_level import ComparisonLevel @@ -130,10 +131,10 @@ def _get_additional_columns_to_retain(self): used_by_brs = [InputColumn(c) for c in used_by_brs] - used_by_brs = [c.unquote().name() for c in used_by_brs] + used_by_brs = [c.unquote().name for c in used_by_brs] already_used = self._columns_used_by_comparisons already_used = [InputColumn(c) for c in already_used] - already_used = [c.unquote().name() for c in already_used] + already_used = [c.unquote().name for c in already_used] new_cols = list(set(used_by_brs) - set(already_used)) a_cols.extend(new_cols) @@ -159,25 +160,20 @@ def _source_dataset_column_name_is_required(self): return self._link_type not in ["dedupe_only"] @property - def _source_dataset_input_column(self): + def _source_dataset_column_name(self): if self._source_dataset_column_name_is_required: s_else_d = self._from_settings_dict_else_default return s_else_d("source_dataset_column_name") else: return None - @property - def _source_dataset_col(self): - input_column = self._source_dataset_input_column - return (input_column, InputColumn(input_column, self).name()) - @property def _unique_id_input_columns(self) -> list[InputColumn]: cols = [] if self._source_dataset_column_name_is_required: col = InputColumn( - self._source_dataset_input_column, + self._source_dataset_column_name, settings_obj=self, ) cols.append(col) @@ -209,11 +205,11 @@ def _needs_matchkey_column(self) -> bool: def _columns_used_by_comparisons(self): cols_used = [] if 
self._source_dataset_column_name_is_required: - cols_used.append(self._source_dataset_input_column) + cols_used.append(self._source_dataset_column_name) cols_used.append(self._unique_id_column_name) for cc in self.comparisons: cols = cc._input_columns_used_by_case_statement - cols = [c.name() for c in cols] + cols = [c.name for c in cols] cols_used.extend(cols) return dedupe_preserving_order(cols_used) @@ -223,14 +219,14 @@ def _columns_to_select_for_blocking(self): cols = [] for uid_col in self._unique_id_input_columns: - cols.append(uid_col.l_name_as_l()) - cols.append(uid_col.r_name_as_r()) + cols.append(uid_col.l_name_as_l) + cols.append(uid_col.r_name_as_r) for cc in self.comparisons: cols.extend(cc._columns_to_select_for_blocking) for add_col in self._additional_columns_to_retain: - cols.extend(add_col.l_r_names_as_l_r()) + cols.extend(add_col.l_r_names_as_l_r) return dedupe_preserving_order(cols) @@ -239,14 +235,14 @@ def _columns_to_select_for_comparison_vector_values(self): cols = [] for uid_col in self._unique_id_input_columns: - cols.append(uid_col.name_l()) - cols.append(uid_col.name_r()) + cols.append(uid_col.name_l) + cols.append(uid_col.name_r) for cc in self.comparisons: cols.extend(cc._columns_to_select_for_comparison_vector_values) for add_col in self._additional_columns_to_retain: - cols.extend(add_col.names_l_r()) + cols.extend(add_col.names_l_r) if self._needs_matchkey_column: cols.append("match_key") @@ -259,14 +255,14 @@ def _columns_to_select_for_bayes_factor_parts(self): cols = [] for uid_col in self._unique_id_input_columns: - cols.append(uid_col.name_l()) - cols.append(uid_col.name_r()) + cols.append(uid_col.name_l) + cols.append(uid_col.name_r) for cc in self.comparisons: cols.extend(cc._columns_to_select_for_bayes_factor_parts) for add_col in self._additional_columns_to_retain: - cols.extend(add_col.names_l_r()) + cols.extend(add_col.names_l_r) if self._needs_matchkey_column: cols.append("match_key") @@ -279,14 +275,14 @@ def _columns_to_select_for_predict(self): cols = [] for uid_col in self._unique_id_input_columns: - cols.append(uid_col.name_l()) - cols.append(uid_col.name_r()) + cols.append(uid_col.name_l) + cols.append(uid_col.name_r) for cc in self.comparisons: cols.extend(cc._columns_to_select_for_predict) for add_col in self._additional_columns_to_retain: - cols.extend(add_col.names_l_r()) + cols.extend(add_col.names_l_r) if self._needs_matchkey_column: cols.append("match_key") @@ -300,7 +296,7 @@ def _get_comparison_by_output_column_name(self, name): return cc raise ValueError(f"No comparison column with name {name}") - def _brs_as_objs(self, brs_as_strings): + def _brs_as_objs(self, brs_as_strings) -> List[BlockingRule]: brs_as_objs = [blocking_rule_to_obj(br) for br in brs_as_strings] for n, br in enumerate(brs_as_objs): br.add_preceding_rules(brs_as_objs[:n]) @@ -440,7 +436,7 @@ def _as_completed_dict(self): "comparisons": [cc._as_completed_dict() for cc in self.comparisons], "probability_two_random_records_match": rr_match, "unique_id_column_name": self._unique_id_column_name, - "source_dataset_column_name": self._source_dataset_input_column, + "source_dataset_column_name": self._source_dataset_column_name, } return {**self._settings_dict, **current_settings} @@ -517,6 +513,6 @@ def human_readable_description(self): @property def salting_required(self): for br in self._blocking_rules_to_generate_predictions: - if br.salting_partitions > 1: + if isinstance(br, SaltedBlockingRule): return True return False diff --git 
a/splink/settings_validation/settings_validator.py b/splink/settings_validation/settings_validator.py index a4b84743f8..ca7fe754c0 100644 --- a/splink/settings_validation/settings_validator.py +++ b/splink/settings_validation/settings_validator.py @@ -4,6 +4,7 @@ import re from functools import reduce from operator import and_ +from typing import List import sqlglot @@ -49,7 +50,7 @@ def uid(self): return self.clean_list_of_column_names(uid_as_tree) @property - def blocking_rules(self): + def blocking_rules(self) -> List[str]: brs = self.settings_obj._blocking_rules_to_generate_predictions return [br.blocking_rule_sql for br in brs] diff --git a/splink/spark/linker.py b/splink/spark/linker.py index c2cfcbc646..c19e09f716 100644 --- a/splink/spark/linker.py +++ b/splink/spark/linker.py @@ -95,6 +95,7 @@ def __init__( database=None, repartition_after_blocking=False, num_partitions_on_repartition=None, + register_udfs_automatically=True, ): """Initialise the linker object, which manages the data linkage process and holds the data linkage model. @@ -130,6 +131,10 @@ def __init__( num_partitions_on_repartition (int, optional): When saving out intermediate results, how many partitions to use? This should be set so that partitions are roughly 100Mb. Defaults to 100. + register_udfs_automatically (bool, optional): When True, distance metric + UDFs will be downloaded. In environments without internet access, or + where UDF registration is not whitelisted, this should be set to False. + Defaults to True. """ @@ -188,7 +193,8 @@ def __init__( self._set_default_break_lineage_method() - self._register_udfs_from_jar() + if register_udfs_automatically: + self._register_udfs_from_jar() def _get_spark_from_input_tables_if_not_provided(self, spark, input_tables): self.spark = spark @@ -534,19 +540,3 @@ def _check_ansi_enabled_if_converting_dates(self): classed as comparison level = "ELSE". 
Ensure date strings are cleaned to remove bad dates \n""" ) - - def _gen_explode_sql(self, tbl_name, columns_to_explode, other_columns_to_retain): - """Generated sql that explodes one or more columns in a table""" - columns_to_explode = columns_to_explode.copy() - other_columns_to_retain = other_columns_to_retain.copy() - if len(columns_to_explode) == 0: - return f"select {','.join(other_columns_to_retain)} from {tbl_name}" - else: - column_to_explode = columns_to_explode.pop() - cols_to_select = ( - [f"explode({column_to_explode}) as {column_to_explode}"] - + other_columns_to_retain - + columns_to_explode - ) - return f"""select {','.join(cols_to_select)} - from ({self._gen_explode_sql(tbl_name,columns_to_explode,other_columns_to_retain+[column_to_explode])})""" # noqa: E501 diff --git a/splink/spark/spark_helpers/spark_blocking_rule_imports.py b/splink/spark/spark_helpers/spark_blocking_rule_imports.py index bed4230391..04fbab7b95 100644 --- a/splink/spark/spark_helpers/spark_blocking_rule_imports.py +++ b/splink/spark/spark_helpers/spark_blocking_rule_imports.py @@ -1,5 +1,7 @@ from __future__ import annotations +from functools import partial + from ...blocking_rules_library import ( BlockingRule, exact_match_rule, @@ -7,13 +9,8 @@ from ...blocking_rules_library import ( block_on as _block_on_, ) -from .spark_base import ( - SparkBase, -) - -class exact_match_rule(SparkBase, exact_match_rule): - pass +exact_match_rule = partial(exact_match_rule, _sql_dialect="spark") def block_on( diff --git a/splink/splink_comparison_viewer.py b/splink/splink_comparison_viewer.py index f46435611e..d6ec3ef496 100644 --- a/splink/splink_comparison_viewer.py +++ b/splink/splink_comparison_viewer.py @@ -18,8 +18,8 @@ def row_examples(linker: Linker, example_rows_per_category=2): sqls = [] uid_cols = linker._settings_obj._unique_id_input_columns - uid_cols_l = [uid_col.name_l() for uid_col in uid_cols] - uid_cols_r = [uid_col.name_r() for uid_col in uid_cols] + uid_cols_l = [uid_col.name_l for uid_col in uid_cols] + uid_cols_r = [uid_col.name_r for uid_col in uid_cols] uid_cols = uid_cols_l + uid_cols_r uid_expr = " || '-' ||".join(uid_cols) diff --git a/splink/splink_dataframe.py b/splink/splink_dataframe.py index a561cd01be..5721d5f8e2 100644 --- a/splink/splink_dataframe.py +++ b/splink/splink_dataframe.py @@ -33,7 +33,7 @@ def columns(self): @property def columns_escaped(self): cols = self.columns - return [c.name() for c in cols] + return [c.name for c in cols] def validate(): pass diff --git a/splink/sqlite/sqlite_helpers/sqlite_blocking_rule_imports.py b/splink/sqlite/sqlite_helpers/sqlite_blocking_rule_imports.py index 3680d02b28..b5229b29f4 100644 --- a/splink/sqlite/sqlite_helpers/sqlite_blocking_rule_imports.py +++ b/splink/sqlite/sqlite_helpers/sqlite_blocking_rule_imports.py @@ -1,5 +1,7 @@ from __future__ import annotations +from functools import partial + from ...blocking_rules_library import ( BlockingRule, exact_match_rule, @@ -7,13 +9,8 @@ from ...blocking_rules_library import ( block_on as _block_on_, ) -from .sqlite_base import ( - SqliteBase, -) - -class exact_match_rule(SqliteBase, exact_match_rule): - pass +exact_match_rule = partial(exact_match_rule, _sql_dialect="sqlite") def block_on( diff --git a/splink/term_frequencies.py b/splink/term_frequencies.py index 38e0807ff2..dc0dd84d3d 100644 --- a/splink/term_frequencies.py +++ b/splink/term_frequencies.py @@ -31,13 +31,13 @@ def colname_to_tf_tablename(input_column: InputColumn): def term_frequencies_for_single_column_sql( 
input_column: InputColumn, table_name="__splink__df_concat" ): - col_name = input_column.name() + col_name = input_column.name sql = f""" select {col_name}, cast(count(*) as float8) / (select count({col_name}) as total from {table_name}) - as {input_column.tf_name()} + as {input_column.tf_name} from {table_name} where {col_name} is not null group by {col_name} @@ -56,7 +56,7 @@ def _join_tf_to_input_df_sql(linker: Linker): tbl = colname_to_tf_tablename(col) if tbl in linker._intermediate_table_cache: tbl = linker._intermediate_table_cache[tbl].physical_name - tf_col = col.tf_name() + tf_col = col.tf_name select_cols.append(f"{tbl}.{tf_col}") select_cols.insert(0, "__splink__df_concat.*") @@ -69,11 +69,11 @@ def _join_tf_to_input_df_sql(linker: Linker): tbl = colname_to_tf_tablename(col) if tbl in linker._intermediate_table_cache: tbl = linker._intermediate_table_cache[tbl].physical_name - sql = templ.format(tbl=tbl, col=col.name()) + sql = templ.format(tbl=tbl, col=col.name) left_joins.append(sql) # left_joins = [ - # templ.format(tbl=colname_to_tf_tablename(col), col=col.name()) + # templ.format(tbl=colname_to_tf_tablename(col), col=col.name) # for col in tf_cols # ] left_joins = " ".join(left_joins) @@ -90,8 +90,8 @@ def _join_tf_to_input_df_sql(linker: Linker): def term_frequencies_from_concat_with_tf(input_column): sql = f""" select - distinct {input_column.name()}, - {input_column.tf_name()} + distinct {input_column.name}, + {input_column.tf_name} from __splink__df_concat_with_tf """ diff --git a/splink/unique_id_concat.py b/splink/unique_id_concat.py index 6b74c9299b..f5b7cd9bc8 100644 --- a/splink/unique_id_concat.py +++ b/splink/unique_id_concat.py @@ -11,7 +11,7 @@ def _composite_unique_id_from_nodes_sql(unique_id_cols, table_prefix=None): else: table_prefix = "" - cols = [f"{table_prefix}{c.name()}" for c in unique_id_cols] + cols = [f"{table_prefix}{c.name}" for c in unique_id_cols] return f" || '{CONCAT_SEPARATOR}' || ".join(cols) @@ -28,10 +28,10 @@ def _composite_unique_id_from_edges_sql(unique_id_cols, l_or_r, table_prefix=Non table_prefix = "" if l_or_r == "l": - cols = [f"{table_prefix}{c.name_l()}" for c in unique_id_cols] + cols = [f"{table_prefix}{c.name_l}" for c in unique_id_cols] if l_or_r == "r": - cols = [f"{table_prefix}{c.name_r()}" for c in unique_id_cols] + cols = [f"{table_prefix}{c.name_r}" for c in unique_id_cols] if l_or_r is None: - cols = [f"{table_prefix}{c.name()}" for c in unique_id_cols] + cols = [f"{table_prefix}{c.name}" for c in unique_id_cols] return f" || '{CONCAT_SEPARATOR}' || ".join(cols) diff --git a/splink/vertically_concatenate.py b/splink/vertically_concatenate.py index ff69530884..7d3f69afb1 100644 --- a/splink/vertically_concatenate.py +++ b/splink/vertically_concatenate.py @@ -41,6 +41,7 @@ def vertically_concatenate_sql(linker: Linker) -> str: source_dataset_col_req = ( linker._settings_obj._source_dataset_column_name_is_required ) + salting_reqiured = linker._settings_obj.salting_required if salting_reqiured: @@ -50,10 +51,15 @@ def vertically_concatenate_sql(linker: Linker) -> str: if source_dataset_col_req: sqls_to_union = [] + + create_sds_if_needed = "" + for df_obj in linker._input_tables_dict.values(): - source_ds_col = linker._source_dataset_column_name + if not linker._source_dataset_column_already_exists: + create_sds_if_needed = f"'{df_obj.templated_name}' as source_dataset," sql = f""" - select '{df_obj.templated_name}' as {source_ds_col}, + select + {create_sds_if_needed} {select_columns_sql} {salt_sql} from 
{df_obj.physical_name} diff --git a/splink/waterfall_chart.py b/splink/waterfall_chart.py index e52822e214..03f1325d9b 100644 --- a/splink/waterfall_chart.py +++ b/splink/waterfall_chart.py @@ -60,8 +60,8 @@ def _comparison_records(record_as_dict, comparison: Comparison): waterfall_record["u_probability"] = cl.u_probability waterfall_record["bayes_factor_description"] = cl._bayes_factor_description input_cols_used = c._input_columns_used_by_case_statement - input_cols_l = [ic.unquote().name_l() for ic in input_cols_used] - input_cols_r = [ic.unquote().name_r() for ic in input_cols_used] + input_cols_l = [ic.unquote().name_l for ic in input_cols_used] + input_cols_r = [ic.unquote().name_r for ic in input_cols_used] waterfall_record["value_l"] = ", ".join( [str(record_as_dict[n]) for n in input_cols_l] ) @@ -78,10 +78,10 @@ def _comparison_records(record_as_dict, comparison: Comparison): if cl._tf_adjustment_input_column is not None: waterfall_record_2["value_l"] = str( - record_as_dict[cl._tf_adjustment_input_column.unquote().name_l()] + record_as_dict[cl._tf_adjustment_input_column.unquote().name_l] ) waterfall_record_2["value_r"] = str( - record_as_dict[cl._tf_adjustment_input_column.unquote().name_r()] + record_as_dict[cl._tf_adjustment_input_column.unquote().name_r] ) else: waterfall_record_2["value_l"] = "" diff --git a/tests/conftest.py b/tests/conftest.py index a97c67a013..8c40955eb1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -43,9 +43,9 @@ def _make_spark(): conf = SparkConf() - conf.set("spark.driver.memory", "4g") - conf.set("spark.sql.shuffle.partitions", "8") - conf.set("spark.default.parallelism", "8") + conf.set("spark.driver.memory", "6g") + conf.set("spark.sql.shuffle.partitions", "1") + conf.set("spark.default.parallelism", "1") # Add custom similarity functions, which are bundled with Splink # documented here: https://github.com/moj-analytical-services/splink_scalaudfs path = similarity_jar_location() diff --git a/tests/helpers.py b/tests/helpers.py index 45b0739434..dc0b63750c 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -113,7 +113,7 @@ def Linker(self): return SparkLinker def extra_linker_args(self): - return {"spark": self.spark} + return {"spark": self.spark, "num_partitions_on_repartition": 1} def convert_frame(self, df): spark_frame = self.spark.createDataFrame(df) diff --git a/tests/test_analyse_blocking.py b/tests/test_analyse_blocking.py index 52c693293e..17d7acf1dd 100644 --- a/tests/test_analyse_blocking.py +++ b/tests/test_analyse_blocking.py @@ -105,19 +105,38 @@ def test_blocking_records_accuracy(test_helpers, dialect): helper = test_helpers[dialect] Linker = helper.Linker brl = helper.brl - df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv") + # resolve an issue w/ pyspark nulls - df = df.fillna(nan).replace([nan], [None]) - linker_settings = Linker(df, get_settings_dict(), **helper.extra_linker_args()) + df = [ + {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"}, + {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"}, + {"unique_id": 3, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, + {"unique_id": 4, "first_name": "Kim", "surname": "Lee", "dob": None}, + ] + df = pd.DataFrame(df).fillna(nan).replace([nan], [None]) + settings = { + "link_type": "dedupe_only", + "blocking_rules_to_generate_predictions": [ + "l.first_name = r.first_name", + ], + "comparisons": [], + "retain_matching_columns": True, + "retain_intermediate_calculation_columns": True, + 
"em_convergence": 0.001, + "max_iterations": 20, + } + + linker_settings = Linker(df, settings, **helper.extra_linker_args()) + n = len(df) # dedupe only validate_blocking_output( linker_settings, expected_out={ - "row_count": [3167], - "cumulative_rows": [3167], - "cartesian": 499500, + "row_count": [1], + "cumulative_rows": [1], + "cartesian": n * (n - 1) / 2, }, blocking_rules=None, ) @@ -131,9 +150,9 @@ def test_blocking_records_accuracy(test_helpers, dialect): validate_blocking_output( linker_settings, expected_out={ - "row_count": [3167, 1654], - "cumulative_rows": [3167, 4821], - "cartesian": 499500, + "row_count": [1, 1], + "cumulative_rows": [1, 2], + "cartesian": n * (n - 1) / 2, }, blocking_rules=blocking_rules, ) @@ -147,74 +166,121 @@ def test_blocking_records_accuracy(test_helpers, dialect): validate_blocking_output( linker_settings, expected_out={ - "row_count": [2253, 0, 1244], - "cumulative_rows": [2253, 2253, 3497], - "cartesian": 499500, + "row_count": [1, 0, 1], + "cumulative_rows": [1, 1, 2], + "cartesian": n * (n - 1) / 2, }, blocking_rules=blocking_rules, ) - # link and dedupe + link only without settings + # link and dedupe + link only + df_l = [ + {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"}, + {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"}, + ] + + df_l = pd.DataFrame(df_l) + + df_r = [ + {"unique_id": 1, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, + {"unique_id": 2, "first_name": "Kim", "surname": "Lee", "dob": None}, + ] + + df_r = pd.DataFrame(df_r).fillna(nan).replace([nan], [None]) + blocking_rules = [ - "l.surname = r.surname", + "l.surname = r.surname", # 2l:2r, brl.or_( brl.exact_match_rule("first_name"), "substr(l.dob,1,4) = substr(r.dob,1,4)", - ), - "l.city = r.city", + ), # 1r:1r, 1l:2l, 1l:2r + "l.surname = r.surname", ] settings = {"link_type": "link_and_dedupe"} - linker_settings = Linker([df, df], settings, **helper.extra_linker_args()) + linker_settings = Linker([df_l, df_r], settings, **helper.extra_linker_args()) validate_blocking_output( linker_settings, expected_out={ - "row_count": [13591, 50245, 137280], - "cumulative_rows": [13591, 63836, 201116], - "cartesian": 1999000, + "row_count": [1, 3, 0], + "cumulative_rows": [1, 4, 4], + "cartesian": 1 + 1 + 4, # within, within, between }, blocking_rules=blocking_rules, ) + blocking_rules = [ + "l.surname = r.surname", # 2l:2r, + brl.or_( + brl.exact_match_rule("first_name"), + "substr(l.dob,1,4) = substr(r.dob,1,4)", + ), # 1l:1r, 1l:2r + "l.surname = r.surname", + ] + settings = {"link_type": "link_only"} - linker_settings = Linker([df, df], settings, **helper.extra_linker_args()) + linker_settings = Linker([df_l, df_r], settings, **helper.extra_linker_args()) validate_blocking_output( linker_settings, expected_out={ - "row_count": [7257, 25161, 68640], - "cumulative_rows": [7257, 32418, 101058], - "cartesian": 1000000, + "row_count": [1, 2, 0], + "cumulative_rows": [1, 3, 3], + "cartesian": 4, }, blocking_rules=blocking_rules, ) - # now multi-table - # still link only - linker_settings = Linker([df, df, df], settings, **helper.extra_linker_args()) + # link and dedupe + df_1 = [ + {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"}, + {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"}, + ] + + df_1 = pd.DataFrame(df_l) + + df_2 = [ + {"unique_id": 1, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, + {"unique_id": 2, "first_name": "Kim", "surname": "Lee", 
"dob": None}, + ] + + df_2 = pd.DataFrame(df_2).fillna(nan).replace([nan], [None]) + + df_3 = [ + {"unique_id": 1, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, + ] + + df_3 = pd.DataFrame(df_3) + + settings = {"link_type": "link_and_dedupe"} + blocking_rules = [ + "l.surname = r.surname", + "l.first_name = r.first_name", + ] + + linker_settings = Linker([df_1, df_2, df_3], settings, **helper.extra_linker_args()) validate_blocking_output( linker_settings, expected_out={ - # number of links per block simply related to two-frame case - "row_count": [3 * 7257, 3 * 25161, 3 * 68640], - "cumulative_rows": [ - 3 * 7257, - 3 * 7257 + 3 * 25161, - 3 * 7257 + 3 * 25161 + 3 * 68640, - ], - "cartesian": 3_000_000, + "row_count": [2, 2], + "cumulative_rows": [2, 4], + "cartesian": 5 * 4 / 2, }, blocking_rules=blocking_rules, ) - settings = {"link_type": "link_and_dedupe"} - linker_settings = Linker([df, df, df], settings, **helper.extra_linker_args()) + settings = {"link_type": "link_only"} + blocking_rules = [ + "l.surname = r.surname", + "l.first_name = r.first_name", + ] + + linker_settings = Linker([df_1, df_2, df_3], settings, **helper.extra_linker_args()) validate_blocking_output( linker_settings, expected_out={ - # and as above, - "row_count": [31272, 113109, 308880], - "cumulative_rows": [31272, 31272 + 113109, 31272 + 113109 + 308880], - "cartesian": (3000 * 2999) // 2, + "row_count": [2, 2], + "cumulative_rows": [2, 4], + "cartesian": 8, }, blocking_rules=blocking_rules, ) @@ -223,7 +289,7 @@ def test_blocking_records_accuracy(test_helpers, dialect): linker_settings, blocking_rules=blocking_rules, return_dataframe=True ) - expected_row_count = pd.DataFrame({"row_count": [31272, 113109, 308880]}) + expected_row_count = pd.DataFrame({"row_count": [2, 2]}) assert (blocking_rules_df["row_count"] == expected_row_count["row_count"]).all() diff --git a/tests/test_array_based_blocking.py b/tests/test_array_based_blocking.py deleted file mode 100644 index 1da579609c..0000000000 --- a/tests/test_array_based_blocking.py +++ /dev/null @@ -1,255 +0,0 @@ -import copy -import random - -import pandas as pd -from pyspark import SparkConf, SparkContext -from pyspark.sql import SparkSession - -import splink.spark.comparison_library as cl -from splink.spark.linker import SparkLinker -from tests.decorator import mark_with_dialects_including - - -@mark_with_dialects_including("duckdb", "spark", pass_dialect=True) -def test_simple_example_link_only(test_helpers, dialect): - data_l = pd.DataFrame.from_dict( - [ - {"unique_id": 1, "gender": "m", "postcode": ["2612", "2000"]}, - {"unique_id": 2, "gender": "m", "postcode": ["2612", "2617"]}, - {"unique_id": 3, "gender": "f", "postcode": ["2617"]}, - ] - ) - data_r = pd.DataFrame.from_dict( - [ - {"unique_id": 4, "gender": "m", "postcode": ["2617", "2600"]}, - {"unique_id": 5, "gender": "f", "postcode": ["2000"]}, - {"unique_id": 6, "gender": "m", "postcode": ["2617", "2612", "2000"]}, - ] - ) - helper = test_helpers[dialect] - settings = { - "link_type": "link_only", - "blocking_rules_to_generate_predictions": [ - { - "blocking_rule": "l.gender = r.gender and l.postcode = r.postcode", - "arrays_to_explode": ["postcode"], - }, - "l.gender = r.gender", - ], - "comparisons": [helper.cl.array_intersect_at_sizes("postcode", [1])], - } - ## the pairs returned by the first blocking rule are (1,6),(2,4),(2,6) - ## the additional pairs returned by the second blocking rule are (1,4),(3,5) - linker = helper.Linker([data_l, data_r], settings, 
**helper.extra_linker_args()) - linker.debug_mode = False - returned_triples = linker.predict().as_pandas_dataframe()[ - ["unique_id_l", "unique_id_r", "match_key"] - ] - returned_triples = { - (unique_id_l, unique_id_r, match_key) - for unique_id_l, unique_id_r, match_key in zip( - returned_triples.unique_id_l, - returned_triples.unique_id_r, - returned_triples.match_key, - ) - } - expected_triples = {(1, 6, "0"), (2, 4, "0"), (2, 6, "0"), (1, 4, "1"), (3, 5, "1")} - assert expected_triples == returned_triples - - -def generate_array_based_datasets_helper( - n_rows=1000, n_array_based_columns=3, n_distinct_values=1000, array_size=3, seed=1 -): - random.seed(seed) - datasets = [] - for _k in range(2): - results_dict = {} - results_dict["cluster"] = list(range(n_rows)) - for i in range(n_array_based_columns): - col = [] - for j in range(n_rows): - col.append(random.sample(range(n_distinct_values), array_size)) - if random.random() < 0.8 or i == n_array_based_columns - 1: - col[-1].append(j) - random.shuffle(col[-1]) - results_dict[f"array_column_{i}"] = col - datasets.append(pd.DataFrame.from_dict(results_dict)) - return datasets - - -@mark_with_dialects_including("duckdb", "spark", pass_dialect=True) -def test_array_based_blocking_with_random_data_dedupe(test_helpers, dialect): - helper = test_helpers[dialect] - input_data_l, input_data_r = generate_array_based_datasets_helper() - input_data_l = input_data_l.assign( - unique_id=[str(cluster_id) + "-0" for cluster_id in input_data_l.cluster] - ) - input_data_r = input_data_r.assign( - unique_id=[str(cluster_id) + "-1" for cluster_id in input_data_r.cluster] - ) - input_data = pd.concat([input_data_l, input_data_r]) - blocking_rules = [ - { - "blocking_rule": """l.array_column_0 = r.array_column_0 - and l.array_column_1 = r.array_column_1""", - "arrays_to_explode": ["array_column_0", "array_column_1"], - }, - { - "blocking_rule": """l.array_column_0 = r.array_column_0 - and l.array_column_1 = r.array_column_1 - and l.array_column_2 = r.array_column_2""", - "arrays_to_explode": ["array_column_0", "array_column_1"], - }, - { - "blocking_rule": "l.array_column_2 = r.array_column_2", - "arrays_to_explode": ["array_column_2"], - }, - ] - settings = { - "link_type": "dedupe_only", - "blocking_rules_to_generate_predictions": blocking_rules, - "unique_id_column_name": "unique_id", - "additional_columns_to_retain": ["cluster"], - "comparisons": [helper.cl.array_intersect_at_sizes("array_column_1", [1])], - } - linker = helper.Linker(input_data, settings, **helper.extra_linker_args()) - linker.debug_mode = False - df_predict = linker.predict().as_pandas_dataframe() - ## check that there are no duplicates in the output - assert ( - df_predict.drop_duplicates(["unique_id_l", "unique_id_r"]).shape[0] - == df_predict.shape[0] - ) - - ## check that the output contains no links with match_key=1, - ## since all pairs returned by the second rule should also be - ## returned by the first rule and so should be filtered out - assert df_predict[df_predict.match_key == 1].shape[0] == 0 - - ## check that all 1000 true matches are in the output - ## (this is guaranteed by how the data was generated) - assert sum(df_predict.cluster_l == df_predict.cluster_r) == 1000 - - -@mark_with_dialects_including("duckdb", "spark", pass_dialect=True) -def test_array_based_blocking_with_random_data_link_only(test_helpers, dialect): - helper = test_helpers[dialect] - input_data_l, input_data_r = generate_array_based_datasets_helper() - blocking_rules = [ - { - "blocking_rule": 
"""l.array_column_0 = r.array_column_0 - and l.array_column_1 = r.array_column_1""", - "arrays_to_explode": ["array_column_0", "array_column_1"], - }, - { - "blocking_rule": """l.array_column_0 = r.array_column_0 - and l.array_column_1 = r.array_column_1 - and l.array_column_2=r.array_column_2""", - "arrays_to_explode": ["array_column_0", "array_column_1", "array_column_2"], - }, - { - "blocking_rule": "l.array_column_2 = r.array_column_2", - "arrays_to_explode": ["array_column_2"], - }, - ] - settings = { - "link_type": "link_only", - "blocking_rules_to_generate_predictions": blocking_rules, - "unique_id_column_name": "cluster", - "additional_columns_to_retain": ["cluster"], - "comparisons": [helper.cl.array_intersect_at_sizes("array_column_1", [1])], - } - linker = helper.Linker( - [input_data_l, input_data_r], settings, **helper.extra_linker_args() - ) - linker.debug_mode = False - df_predict = linker.predict().as_pandas_dataframe() - - ## check that we get no within-dataset links - within_dataset_links = df_predict[ - df_predict.source_dataset_l == df_predict.source_dataset_r - ].shape[0] - assert within_dataset_links == 0 - - ## check that no pair of ids appears twice in the output - assert ( - df_predict.drop_duplicates(["cluster_l", "cluster_r"]).shape[0] - == df_predict.shape[0] - ) - - ## check that the second blocking rule returns no matches, - ## since every pair matching the second rule will also match the first, - ## and so should be filtered out - assert df_predict[df_predict.match_key == 1].shape[0] == 0 - - ## check that all 1000 true matches are returned - assert sum(df_predict.cluster_l == df_predict.cluster_r) == 1000 - - -@mark_with_dialects_including("spark") -def test_array_based_blocking_with_salted_rules(): - input_data_l, input_data_r = generate_array_based_datasets_helper() - blocking_rules = [ - { - "blocking_rule": """l.array_column_0 = r.array_column_0 - and l.array_column_1 = r.array_column_1""", - "arrays_to_explode": ["array_column_0", "array_column_1"], - "salting_partitions": 3, - }, - { - "blocking_rule": """l.array_column_0 = r.array_column_0 - and l.array_column_1 = r.array_column_1 - and l.array_column_2=r.array_column_2""", - "arrays_to_explode": ["array_column_0", "array_column_1", "array_column_2"], - "salting_partitions": 2, - }, - { - "blocking_rule": "l.array_column_2 = r.array_column_2", - "arrays_to_explode": ["array_column_2"], - "salting_partitions": 1, - }, - ] - settings = { - "link_type": "link_only", - "blocking_rules_to_generate_predictions": blocking_rules, - "unique_id_column_name": "cluster", - "additional_columns_to_retain": ["cluster"], - "comparisons": [cl.array_intersect_at_sizes("array_column_1", [1])], - } - - conf = SparkConf() - sc = SparkContext.getOrCreate(conf=conf) - spark = SparkSession(sc) - input_l_spark = spark.createDataFrame(input_data_l) - input_r_spark = spark.createDataFrame(input_data_r) - - linker = SparkLinker([input_l_spark, input_r_spark], settings) - df_predict = linker.predict().as_pandas_dataframe() - - ## check that there are no duplicates in the output - assert ( - df_predict.drop_duplicates(["cluster_l", "cluster_r"]).shape[0] - == df_predict.shape[0] - ) - - ## check that results include the same pairs (and with the same match keys) - ## as an equivalent linkage with no salting - blocking_rules_no_salt = copy.deepcopy(blocking_rules) - settings_no_salt = copy.deepcopy(settings) - for br in blocking_rules_no_salt: - br.pop("salting_partitions") - 
settings_no_salt["blocking_rules_to_generate_predictions"] = blocking_rules_no_salt - linker_no_salt = SparkLinker([input_l_spark, input_r_spark], settings_no_salt) - df_predict_no_salt = linker_no_salt.predict().as_pandas_dataframe() - predictions_no_salt = set( - zip( - df_predict_no_salt.cluster_l, - df_predict_no_salt.cluster_r, - df_predict_no_salt.match_key, - ) - ) - predictions_with_salt = set( - zip(df_predict.cluster_l, df_predict.cluster_r, df_predict.match_key) - ) - - assert predictions_no_salt == predictions_with_salt diff --git a/tests/test_blocking.py b/tests/test_blocking.py index fd01645275..5feec7bcbc 100644 --- a/tests/test_blocking.py +++ b/tests/test_blocking.py @@ -15,11 +15,8 @@ def test_binary_composition_internals_OR(test_helpers, dialect): br_surname = brl.exact_match_rule("surname", salting_partitions=4) q, _ = _get_dialect_quotes(dialect) em_rule = f"l.{q}surname{q} = r.{q}surname{q}" - exp_txt = "<{} blocking rule using SQL: {}>" - assert br_surname.__repr__() == exp_txt.format("Exact match", em_rule) - assert BlockingRule(em_rule).__repr__() == exp_txt.format("Custom", em_rule) - assert br_surname.blocking_rule == em_rule + assert br_surname.blocking_rule_sql == em_rule assert br_surname.salting_partitions == 4 assert br_surname.preceding_rules == [] @@ -46,7 +43,7 @@ def test_binary_composition_internals_OR(test_helpers, dialect): def assess_preceding_rules(settings_brs_index): br_prec = brs_as_objs[settings_brs_index].preceding_rules - br_prec_txt = [br.blocking_rule for br in br_prec] + br_prec_txt = [br.blocking_rule_sql for br in br_prec] assert br_prec_txt == brs_as_txt[:settings_brs_index] assess_preceding_rules(1) diff --git a/tests/test_blocking_rule_composition.py b/tests/test_blocking_rule_composition.py index fadc9d26bd..3e02c89d5b 100644 --- a/tests/test_blocking_rule_composition.py +++ b/tests/test_blocking_rule_composition.py @@ -11,7 +11,7 @@ def binary_composition_internals(clause, comp_fun, brl, dialect): # Test what happens when only one value is fed # It should just report the regular outputs of our comparison level func level = comp_fun(brl.exact_match_rule("tom")) - assert level.blocking_rule == f"l.{q}tom{q} = r.{q}tom{q}" + assert level.blocking_rule_sql == f"l.{q}tom{q} = r.{q}tom{q}" # Exact match and null level composition level = comp_fun( @@ -19,12 +19,12 @@ def binary_composition_internals(clause, comp_fun, brl, dialect): brl.exact_match_rule("surname"), ) exact_match_sql = f"(l.{q}first_name{q} = r.{q}first_name{q}) {clause} (l.{q}surname{q} = r.{q}surname{q})" # noqa: E501 - assert level.blocking_rule == exact_match_sql + assert level.blocking_rule_sql == exact_match_sql # brl.not_(or_(...)) composition level = brl.not_( comp_fun(brl.exact_match_rule("first_name"), brl.exact_match_rule("surname")), ) - assert level.blocking_rule == f"NOT ({exact_match_sql})" + assert level.blocking_rule_sql == f"NOT ({exact_match_sql})" # Check salting outputs # salting included in the composition function diff --git a/tests/test_cluster_metrics.py b/tests/test_cluster_metrics.py new file mode 100644 index 0000000000..da2fe0f5fb --- /dev/null +++ b/tests/test_cluster_metrics.py @@ -0,0 +1,50 @@ +import pandas as pd +from pandas.testing import assert_frame_equal + +from splink.duckdb.linker import DuckDBLinker + +# Dummy df +person_ids = [i + 1 for i in range(5)] +df = pd.DataFrame({"person_id": person_ids}) + +# Dummy edges df +edges_data = [ + # cluster A edges + {"person_id_l": 1, "person_id_r": 2, "match_probability": 0.99}, + 
{"person_id_l": 1, "person_id_r": 3, "match_probability": 0.99}, + # cluster B edge + {"person_id_l": 4, "person_id_r": 5, "match_probability": 0.99}, + # edges not in relevant clusters + {"person_id_l": 10, "person_id_r": 11, "match_probability": 0.99}, + {"person_id_l": 12, "person_id_r": 12, "match_probability": 0.95}, +] +edges = pd.DataFrame(edges_data) + +# Dummy clusters df +cluster_ids = ["A", "A", "A", "B", "B"] +clusters_data = {"cluster_id": cluster_ids, "person_id": person_ids} +clusters = pd.DataFrame(clusters_data) + +# Expected dataframe +expected_data = [ + {"cluster_id": "A", "n_nodes": 3, "n_edges": 2.0, "density": 2 / 3}, + {"cluster_id": "B", "n_nodes": 2, "n_edges": 1.0, "density": 1.0}, +] +df_expected = pd.DataFrame(expected_data) + + +def test_size_density(): + # Linker with basic settings + settings = {"link_type": "dedupe_only", "unique_id_column_name": "person_id"} + linker = DuckDBLinker(df, settings) + + # Register as Splink dataframes + df_predict = linker.register_table(edges, "df_predict", overwrite=True) + df_clustered = linker.register_table(clusters, "df_clustered", overwrite=True) + + df_cluster_metrics = linker._compute_cluster_metrics( + df_predict, df_clustered, threshold_match_probability=0.99 + ) + df_cluster_metrics = df_cluster_metrics.as_pandas_dataframe() + + assert_frame_equal(df_cluster_metrics, df_expected) diff --git a/tests/test_correctness_of_convergence.py b/tests/test_correctness_of_convergence.py index 435c26d09b..7b536a03a5 100644 --- a/tests/test_correctness_of_convergence.py +++ b/tests/test_correctness_of_convergence.py @@ -68,7 +68,7 @@ def test_splink_converges_to_known_params(): # CREATE TABLE __splink__df_comparison_vectors_abc123 # and modify the following line to include the value of the hash (abc123 above) - cvv_hashed_tablename = "__splink__df_comparison_vectors_98aaa302a" + cvv_hashed_tablename = "__splink__df_comparison_vectors_ee08ffa85" linker.register_table(df, cvv_hashed_tablename) em_training_session = EMTrainingSession( diff --git a/tests/test_input_column.py b/tests/test_input_column.py index 445aa4d52e..8a1dd794bd 100644 --- a/tests/test_input_column.py +++ b/tests/test_input_column.py @@ -3,30 +3,33 @@ def test_input_column(): c = InputColumn("my_col") - assert c.name() == '"my_col"' - assert c.unquote().name() == "my_col" + assert c.name == '"my_col"' + assert c.unquote().name == "my_col" - assert c.name_l() == '"my_col_l"' - assert c.tf_name_l() == '"tf_my_col_l"' - assert c.unquote().quote().l_tf_name_as_l() == '"l"."tf_my_col" as "tf_my_col_l"' - assert c.unquote().l_tf_name_as_l() == '"l".tf_my_col as tf_my_col_l' + assert c.name_l == '"my_col_l"' + assert c.tf_name_l == '"tf_my_col_l"' + assert c.unquote().quote().l_tf_name_as_l == '"l"."tf_my_col" as "tf_my_col_l"' + assert c.unquote().l_tf_name_as_l == '"l".tf_my_col as tf_my_col_l' c = InputColumn("SUR name") - assert c.name() == '"SUR name"' - assert c.name_r() == '"SUR name_r"' - assert c.r_name_as_r() == '"r"."SUR name" as "SUR name_r"' + assert c.name == '"SUR name"' + assert c.name_r == '"SUR name_r"' + assert c.r_name_as_r == '"r"."SUR name" as "SUR name_r"' c = InputColumn("col['lat']") name = """ "col"['lat'] """.strip() - assert c.name() == name + assert c.name == name l_tf_name_as_l = """ "l"."tf_col"['lat'] as "tf_col_l"['lat'] """.strip() - assert c.l_tf_name_as_l() == l_tf_name_as_l + assert c.l_tf_name_as_l == l_tf_name_as_l - assert c.unquote().name() == "col['lat']" - assert c.unquote().quote().name() == name + assert c.unquote().name == 
"col['lat']" + assert c.unquote().quote().name == name + + c = InputColumn("first name", sql_dialect="spark") + assert c.name == "`first name`" diff --git a/tests/test_link_only_verification.py b/tests/test_link_only_verification.py deleted file mode 100644 index a3733016ab..0000000000 --- a/tests/test_link_only_verification.py +++ /dev/null @@ -1,48 +0,0 @@ -import pandas as pd -import pytest - -from splink.duckdb.linker import DuckDBLinker -from splink.exceptions import SplinkException -from tests.basic_settings import get_settings_dict - -df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv") -df_l = df.copy() -df_r = df.copy() -df_l["source_dataset"] = "my_left_ds" -df_r["source_dataset"] = "my_right_ds" -df_final = pd.concat([df_l, df_r]) - -settings = get_settings_dict() -settings["link_type"] = "link_only" - - -def test_link_only_verification(): - # As `_initialise_df_concat_with_tf()` cannot be run without - # a setting object, we don't need to test that. - - # Two input dataframes + link only settings - linker = DuckDBLinker( - [df_l, df_r], - settings, - ) - linker._initialise_df_concat_with_tf() - - # A single dataframe with a source_dataset col - linker = DuckDBLinker( - df_final, - settings, - ) - linker._initialise_df_concat_with_tf() - - # A single df with no source_dataset col, despite - # calling link_only. Should fail w/ SplinkException - linker = DuckDBLinker( - df, - settings, - ) - # This should pass as concat_with_tf doesn't yet exist - linker._verify_link_only_job - with pytest.raises(SplinkException): - # Fails as only one df w/ no source_dataset col has - # been passed - linker._initialise_df_concat_with_tf() diff --git a/tests/test_missingness.py b/tests/test_missingness.py new file mode 100644 index 0000000000..e4c4110705 --- /dev/null +++ b/tests/test_missingness.py @@ -0,0 +1,35 @@ +import pandas as pd +from pytest import raises + +from splink.exceptions import SplinkException +from tests.decorator import mark_with_dialects_excluding + + +@mark_with_dialects_excluding() +def test_missingness_chart(dialect, test_helpers): + helper = test_helpers[dialect] + + df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv") + + linker = helper.Linker( + df, {"link_type": "dedupe_only"}, **helper.extra_linker_args() + ) + linker.missingness_chart() + + +@mark_with_dialects_excluding() +def test_missingness_chart_mismatched_columns(dialect, test_helpers): + helper = test_helpers[dialect] + + df_l = helper.load_frame_from_csv( + "./tests/datasets/fake_1000_from_splink_demos.csv" + ) + df_r = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv") + df_r.rename(columns={"surname": "SURNAME"}, inplace=True) + df_r = helper.convert_frame(df_r) + + linker = helper.Linker( + [df_l, df_r], {"link_type": "link_only"}, **helper.extra_linker_args() + ) + with raises(SplinkException): + linker.missingness_chart() diff --git a/tests/test_profile_data.py b/tests/test_profile_data.py index 98b0725ff0..dcc2123c99 100644 --- a/tests/test_profile_data.py +++ b/tests/test_profile_data.py @@ -176,3 +176,23 @@ def test_profile_using_spark(df_spark): ) assert len(generate_raw_profile_dataset([["first_name", "blank"]], linker)) == 0 + + +def test_profile_null_columns(caplog): + + df = pd.DataFrame( + [ + {"unique_id": 1, "test_1": 1, "test_2": None}, + ] + ) + + linker = DuckDBLinker(df) + + linker.profile_columns(["test_1", "test_2"]) + + captured_logs = caplog.text + + assert ( + "Warning: No charts produced for test_2 as the column only " + 
"contains null values." + ) in captured_logs diff --git a/tests/test_sql_transform.py b/tests/test_sql_transform.py index fb3f80e885..1fc67ceb74 100644 --- a/tests/test_sql_transform.py +++ b/tests/test_sql_transform.py @@ -92,36 +92,36 @@ def test_set_numeric_as_double(): def test_add_pref_and_suffix(): dull = InputColumn("dull") dull_l_r = ['"l"."dull" as "dull_l"', '"r"."dull" as "dull_r"'] - assert dull.l_r_names_as_l_r() == dull_l_r + assert dull.l_r_names_as_l_r == dull_l_r - assert dull.bf_name() == '"bf_dull"' - assert dull.tf_name_l() == '"tf_dull_l"' + assert dull.bf_name == '"bf_dull"' + assert dull.tf_name_l == '"tf_dull_l"' tf_dull_l_r = ['"l"."tf_dull" as "tf_dull_l"', '"r"."tf_dull" as "tf_dull_r"'] - assert dull.l_r_tf_names_as_l_r() == tf_dull_l_r + assert dull.l_r_tf_names_as_l_r == tf_dull_l_r ll = InputColumn("lat['long']") - assert ll.name_l() == "\"lat_l\"['long']" + assert ll.name_l == "\"lat_l\"['long']" ll_tf_l_r = [ '"l"."tf_lat"[\'long\'] as "tf_lat_l"[\'long\']', '"r"."tf_lat"[\'long\'] as "tf_lat_r"[\'long\']', ] - assert ll.l_r_tf_names_as_l_r() == ll_tf_l_r + assert ll.l_r_tf_names_as_l_r == ll_tf_l_r group = InputColumn("cluster") - assert group.name_l() == '"cluster_l"' - assert group.bf_name() == '"bf_cluster"' + assert group.name_l == '"cluster_l"' + assert group.bf_name == '"bf_cluster"' group_l_r_names = ['"l"."cluster" as "cluster_l"', '"r"."cluster" as "cluster_r"'] - assert group.l_r_names_as_l_r() == group_l_r_names + assert group.l_r_names_as_l_r == group_l_r_names group_tf_l_r = [ '"l"."tf_cluster" as "tf_cluster_l"', '"r"."tf_cluster" as "tf_cluster_r"', ] - assert group.l_r_tf_names_as_l_r() == group_tf_l_r + assert group.l_r_tf_names_as_l_r == group_tf_l_r cols = ["unique_id", "SUR name", "cluster"] out_cols = ['"unique_id"', '"SUR name"', '"cluster"'] cols_class = [InputColumn(c) for c in cols] - assert [c.name() for c in cols_class] == out_cols + assert [c.name for c in cols_class] == out_cols diff --git a/tests/test_u_train.py b/tests/test_u_train.py index 453e13beae..274701ebcd 100644 --- a/tests/test_u_train.py +++ b/tests/test_u_train.py @@ -39,7 +39,7 @@ def test_u_train(test_helpers, dialect): assert cl_no.u_probability == (denom - 2) / denom br = linker._settings_obj._blocking_rules_to_generate_predictions[0] - assert br.blocking_rule == "l.name = r.name" + assert br.blocking_rule_sql == "l.name = r.name" @mark_with_dialects_excluding() From da5a4991f64d4b29d4d31b3669fe22efcdc67908 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 30 Nov 2023 15:07:57 +0000 Subject: [PATCH 23/37] initial working implementation --- splink/blocking.py | 160 +++++++++++++++++++++++++++++++++++++--- splink/duckdb/linker.py | 18 +++++ splink/linker.py | 5 ++ splink/spark/linker.py | 16 ++++ 4 files changed, 190 insertions(+), 9 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 47160a499c..c9bbe9646e 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -7,7 +7,9 @@ from sqlglot.expressions import Column, Join from sqlglot.optimizer.eliminate_joins import join_condition +from .input_column import InputColumn from .misc import ensure_is_list +from .splink_dataframe import SplinkDataFrame from .unique_id_concat import _composite_unique_id_from_nodes_sql logger = logging.getLogger(__name__) @@ -27,13 +29,26 @@ def blocking_rule_to_obj(br): sqlglot_dialect = br.get("sql_dialect", None) salting_partitions = br.get("salting_partitions", None) - if salting_partitions is None: - return BlockingRule(blocking_rule, sqlglot_dialect) - 
else: + arrays_to_explode = br.get("arrays_to_explode", None) + + if arrays_to_explode is not None and salting_partitions is not None: + raise ValueError( + "Splink does not support blocking rules that are " + " both salted and exploding" + ) + + if salting_partitions is not None: return SaltedBlockingRule( blocking_rule, sqlglot_dialect, salting_partitions ) + if arrays_to_explode is not None: + return ExplodingBlockingRule( + blocking_rule, sqlglot_dialect, arrays_to_explode + ) + + return BlockingRule(blocking_rule, sqlglot_dialect) + else: br = BlockingRule(br) return br @@ -69,8 +84,7 @@ def add_preceding_rules(self, rules): rules = ensure_is_list(rules) self.preceding_rules = rules - @property - def exclude_pairs_generated_by_this_rule_sql(self): + def exclude_pairs_generated_by_this_rule_sql(self, linker: Linker): """A SQL string specifying how to exclude the results of THIS blocking rule from subseqent blocking statements, so that subsequent statements do not produce duplicate pairs @@ -81,15 +95,15 @@ def exclude_pairs_generated_by_this_rule_sql(self): # meaning these comparisons get lost return f"coalesce(({self.blocking_rule_sql}),false)" - @property - def exclude_pairs_generated_by_all_preceding_rules_sql(self): + def exclude_pairs_generated_by_all_preceding_rules_sql(self, linker: Linker): """A SQL string that excludes the results of ALL previous blocking rules from the pairwise comparisons generated. """ if not self.preceding_rules: return "" or_clauses = [ - br.exclude_pairs_generated_by_this_rule_sql for br in self.preceding_rules + br.exclude_pairs_generated_by_this_rule_sql(linker) + for br in self.preceding_rules ] previous_rules = " OR ".join(or_clauses) return f"AND NOT ({previous_rules})" @@ -107,7 +121,7 @@ def create_blocked_pairs_sql(self, linker: Linker, where_condition, probability) inner join {linker._input_tablename_r} as r on ({self.blocking_rule_sql}) - {self.exclude_pairs_generated_by_all_preceding_rules_sql} + {self.exclude_pairs_generated_by_all_preceding_rules_sql(linker)} {where_condition} """ return sql @@ -241,6 +255,134 @@ def create_blocked_pairs_sql(self, linker: Linker, where_condition, probability) return " UNION ALL ".join(sqls) +class ExplodingBlockingRule(BlockingRule): + def __init__( + self, + blocking_rule: BlockingRule | dict | str, + sqlglot_dialect: str = None, + array_columns_to_explode: list = [], + ): + super().__init__(blocking_rule, sqlglot_dialect) + self.array_columns_to_explode: List[str] = array_columns_to_explode + self.exploded_id_pair_table: SplinkDataFrame = None + + def marginal_exploded_id_pairs_table_sql(self, linker: Linker, br: BlockingRule): + """generates a table of the marginal id pairs from the exploded blocking rule + i.e. 
pairs are only created that match this blocking rule and NOT any of + the preceding blocking rules + """ + settings_obj = linker._settings_obj + unique_id_col = settings_obj._unique_id_column_name + + link_type = settings_obj._link_type + + if linker._two_dataset_link_only: + link_type = "two_dataset_link_only" + + if linker._self_link_mode: + link_type = "self_link" + + where_condition = _sql_gen_where_condition( + link_type, settings_obj._unique_id_input_columns + ) + + if link_type == "two_dataset_link_only": + where_condition = ( + where_condition + " and l.source_dataset < r.source_dataset" + ) + + sql = f""" + select distinct + l.{unique_id_col} as {unique_id_col}_l, + r.{unique_id_col} as {unique_id_col}_r + from __splink__df_concat_with_tf_unnested as l + inner join __splink__df_concat_with_tf_unnested as r + on ({br.blocking_rule_sql}) + {where_condition} + {self.exclude_pairs_generated_by_all_preceding_rules_sql(linker)}""" + + return sql + + def drop_materialised_id_pairs_dataframe(self): + self.exploded_id_pair_table.drop_table_from_database_and_remove_from_cache() + + def exclude_pairs_generated_by_this_rule_sql(self, linker: Linker): + """A SQL string specifying how to exclude the results + of THIS blocking rule from subseqent blocking statements, + so that subsequent statements do not produce duplicate pairs + """ + + unique_id_column = linker._settings_obj._unique_id_column_name + splink_df = self.exploded_id_pair_table + ids_to_compare_sql = f"select * from {splink_df.physical_name}" + + return f"""EXISTS ( + select 1 from ({ids_to_compare_sql}) as ids_to_compare + where ( + l.{unique_id_column} = ids_to_compare.{unique_id_column}_l and + r.{unique_id_column} = ids_to_compare.{unique_id_column}_r + ) + ) + """ + + def create_blocked_pairs_sql(self, linker: Linker, where_condition, probability): + columns_to_select = linker._settings_obj._columns_to_select_for_blocking + sql_select_expr = ", ".join(columns_to_select) + exploded_id_pair_table = self.exploded_id_pair_table + unique_id_col = linker._settings_obj._unique_id_column_name + sql = f""" + select + {sql_select_expr}, + '{self.match_key}' as match_key + {probability} + from {exploded_id_pair_table.physical_name} as pairs + left join {linker._input_tablename_l} as l + on pairs.{unique_id_col}_l=l.{unique_id_col} + left join {linker._input_tablename_r} as r + on pairs.{unique_id_col}_r=r.{unique_id_col} + """ + return sql + + +def materialise_exploded_id_tables(linker: Linker): + settings_obj = linker._settings_obj + + blocking_rules = settings_obj._blocking_rules_to_generate_predictions + exploding_blocking_rules = [ + br for br in blocking_rules if isinstance(br, ExplodingBlockingRule) + ] + + for br in exploding_blocking_rules: + input_dataframe = linker._initialise_df_concat_with_tf() + + input_colnames = {col.name for col in input_dataframe.columns} + + arrays_to_explode_quoted = [ + InputColumn(colname, sql_dialect=linker._sql_dialect).quote().name + for colname in br.array_columns_to_explode + ] + expl_sql = linker._gen_explode_sql( + "__splink__df_concat_with_tf", + br.array_columns_to_explode, + list(input_colnames.difference(arrays_to_explode_quoted)), + ) + + linker._enqueue_sql( + expl_sql, + "__splink__df_concat_with_tf_unnested", + ) + + base_name = "__splink__marginal_exploded_ids_blocking_rule" + table_name = f"{base_name}_mk_{br.match_key}" + + sql = br.marginal_exploded_id_pairs_table_sql(linker, br) + + linker._enqueue_sql(sql, table_name) + + marginal_ids_table = 
linker._execute_sql_pipeline([input_dataframe]) + br.exploded_id_pair_table = marginal_ids_table + + def _sql_gen_where_condition(link_type, unique_id_cols): id_expr_l = _composite_unique_id_from_nodes_sql(unique_id_cols, "l") id_expr_r = _composite_unique_id_from_nodes_sql(unique_id_cols, "r") diff --git a/splink/duckdb/linker.py b/splink/duckdb/linker.py index 79fc0b43c1..2e6a2fe889 100644 --- a/splink/duckdb/linker.py +++ b/splink/duckdb/linker.py @@ -319,3 +319,21 @@ def export_to_duckdb_file(self, output_path, delete_intermediate_tables=False): new_con = duckdb.connect(database=output_path) new_con.execute(f"IMPORT DATABASE '{tmpdir}';") new_con.close() + + def _gen_explode_sql(self, tbl_name, columns_to_explode, other_columns_to_retain): + """Generated sql that explodes one or more columns in a table""" + columns_to_explode = columns_to_explode.copy() + other_columns_to_retain = other_columns_to_retain.copy() + # base case + if len(columns_to_explode) == 0: + return f"select {','.join(other_columns_to_retain)} from {tbl_name}" + else: + column_to_explode = columns_to_explode.pop() + cols_to_select = ( + [f"unnest({column_to_explode}) as {column_to_explode}"] + + other_columns_to_retain + + columns_to_explode + ) + other_columns_to_retain.append(column_to_explode) + return f"""select {','.join(cols_to_select)} + from ({self._gen_explode_sql(tbl_name,columns_to_explode,other_columns_to_retain)})""" # noqa: E501 diff --git a/splink/linker.py b/splink/linker.py index db676d6362..996a78b8cb 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -35,6 +35,7 @@ BlockingRule, block_using_rules_sqls, blocking_rule_to_obj, + materialise_exploded_id_tables, ) from .cache_dict_with_logging import CacheDictWithLogging from .charts import ( @@ -1744,6 +1745,10 @@ def predict( if nodes_with_tf: input_dataframes.append(nodes_with_tf) + # If exploded blocking rules exist, we need to materialise + # the tables of ID pairs + materialise_exploded_id_tables(self) + sqls = block_using_rules_sqls(self) for sql in sqls: self._enqueue_sql(sql["sql"], sql["output_table_name"]) diff --git a/splink/spark/linker.py b/splink/spark/linker.py index c19e09f716..df74e6801e 100644 --- a/splink/spark/linker.py +++ b/splink/spark/linker.py @@ -540,3 +540,19 @@ def _check_ansi_enabled_if_converting_dates(self): classed as comparison level = "ELSE". 
Ensure date strings are cleaned to remove bad dates \n""" ) + + def _gen_explode_sql(self, tbl_name, columns_to_explode, other_columns_to_retain): + """Generated sql that explodes one or more columns in a table""" + columns_to_explode = columns_to_explode.copy() + other_columns_to_retain = other_columns_to_retain.copy() + if len(columns_to_explode) == 0: + return f"select {','.join(other_columns_to_retain)} from {tbl_name}" + else: + column_to_explode = columns_to_explode.pop() + cols_to_select = ( + [f"explode({column_to_explode}) as {column_to_explode}"] + + other_columns_to_retain + + columns_to_explode + ) + return f"""select {','.join(cols_to_select)} + from ({self._gen_explode_sql(tbl_name,columns_to_explode,other_columns_to_retain+[column_to_explode])})""" # noqa: E501 From bba25dcc1f7d06453c4d212c6baa86b5459adb4d Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 30 Nov 2023 16:52:49 +0000 Subject: [PATCH 24/37] update --- splink/blocking.py | 4 +++- splink/linker.py | 10 +++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index c9bbe9646e..0fc3c03a91 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -351,7 +351,7 @@ def materialise_exploded_id_tables(linker: Linker): exploding_blocking_rules = [ br for br in blocking_rules if isinstance(br, ExplodingBlockingRule) ] - + exploded_tables = [] for br in exploding_blocking_rules: input_dataframe = linker._initialise_df_concat_with_tf() @@ -381,6 +381,8 @@ def materialise_exploded_id_tables(linker: Linker): marginal_ids_table = linker._execute_sql_pipeline([input_dataframe]) br.exploded_id_pair_table = marginal_ids_table + exploded_tables.append(marginal_ids_table) + return exploded_tables def _sql_gen_where_condition(link_type, unique_id_cols): diff --git a/splink/linker.py b/splink/linker.py index 996a78b8cb..1dae751cd7 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -1747,7 +1747,7 @@ def predict( # If exploded blocking rules exist, we need to materialise # the tables of ID pairs - materialise_exploded_id_tables(self) + exploded_tables = materialise_exploded_id_tables(self) sqls = block_using_rules_sqls(self) for sql in sqls: @@ -1774,6 +1774,9 @@ def predict( predictions = self._execute_sql_pipeline(input_dataframes) self._predict_warning() + + [t.drop_table_from_database_and_remove_from_cache() for t in exploded_tables] + return predictions def find_matches_to_new_records( @@ -3929,3 +3932,8 @@ def _detect_blocking_rules_for_em_training( "suggested_blocking_rules_as_splink_brs" ].iloc[0] return suggestion + + def _gen_explode_sql(self, tbl_name, columns_to_explode, other_columns_to_retain): + raise NotImplementedError( + f"Unnesting blocking rules are not supported for {type(self)}" + ) From 24cd2e75daed9f48dfe6480682177666085ebd66 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 11 Dec 2023 10:50:47 +0000 Subject: [PATCH 25/37] fix spark --- splink/blocking.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 0fc3c03a91..eb4357a387 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -121,8 +121,9 @@ def create_blocked_pairs_sql(self, linker: Linker, where_condition, probability) inner join {linker._input_tablename_r} as r on ({self.blocking_rule_sql}) - {self.exclude_pairs_generated_by_all_preceding_rules_sql(linker)} {where_condition} + + {self.exclude_pairs_generated_by_all_preceding_rules_sql(linker)} """ return sql @@ -247,8 +248,10 @@ def 
create_blocked_pairs_sql(self, linker: Linker, where_condition, probability) inner join {linker._input_tablename_r} as r on ({self.blocking_rule_sql} {salt_condition}) - {self.exclude_pairs_generated_by_all_preceding_rules_sql} {where_condition} + + {self.exclude_pairs_generated_by_all_preceding_rules_sql} + """ sqls.append(sql) @@ -299,6 +302,7 @@ def marginal_exploded_id_pairs_table_sql(self, linker: Linker, br: BlockingRule) inner join __splink__df_concat_with_tf_unnested as r on ({br.blocking_rule_sql}) {where_condition} + {self.exclude_pairs_generated_by_all_preceding_rules_sql(linker)}""" return sql From 1d632187652443b676457bc3f67f4aae6cab6eef Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 11 Dec 2023 11:19:11 +0000 Subject: [PATCH 26/37] format better --- splink/blocking.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index eb4357a387..0377f1f969 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -122,7 +122,6 @@ def create_blocked_pairs_sql(self, linker: Linker, where_condition, probability) on ({self.blocking_rule_sql}) {where_condition} - {self.exclude_pairs_generated_by_all_preceding_rules_sql(linker)} """ return sql @@ -249,8 +248,7 @@ def create_blocked_pairs_sql(self, linker: Linker, where_condition, probability) on ({self.blocking_rule_sql} {salt_condition}) {where_condition} - - {self.exclude_pairs_generated_by_all_preceding_rules_sql} + {self.exclude_pars_generated_by_all_preceding_rules_sql} """ @@ -302,8 +300,8 @@ def marginal_exploded_id_pairs_table_sql(self, linker: Linker, br: BlockingRule) inner join __splink__df_concat_with_tf_unnested as r on ({br.blocking_rule_sql}) {where_condition} - - {self.exclude_pairs_generated_by_all_preceding_rules_sql(linker)}""" + {self.exclude_pairs_generated_by_all_preceding_rules_sql(linker)} + """ return sql From 12e0c90d863c76466091e34959444c713eac7952 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 11 Dec 2023 11:46:53 +0000 Subject: [PATCH 27/37] fix tests --- tests/test_correctness_of_convergence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_correctness_of_convergence.py b/tests/test_correctness_of_convergence.py index 4adcecec30..00ff41b8d2 100644 --- a/tests/test_correctness_of_convergence.py +++ b/tests/test_correctness_of_convergence.py @@ -68,7 +68,7 @@ def test_splink_converges_to_known_params(): # CREATE TABLE __splink__df_comparison_vectors_abc123 # and modify the following line to include the value of the hash (abc123 above) - cvv_hashed_tablename = "__splink__df_comparison_vectors_f9bd31158" + cvv_hashed_tablename = "__splink__df_comparison_vectors_cf129c9c9" linker.register_table(df, cvv_hashed_tablename) em_training_session = EMTrainingSession( From c2aea3ca15e2848f0885f19824fb613ab6f76d7a Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 11 Dec 2023 15:11:14 +0000 Subject: [PATCH 28/37] make work with deterministic link --- splink/blocking.py | 16 +++++++++++++--- splink/linker.py | 11 ++++++++--- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 0377f1f969..14e4775091 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -248,8 +248,7 @@ def create_blocked_pairs_sql(self, linker: Linker, where_condition, probability) on ({self.blocking_rule_sql} {salt_condition}) {where_condition} - {self.exclude_pars_generated_by_all_preceding_rules_sql} - + {self.exclude_pairs_generated_by_all_preceding_rules_sql} """ 
sqls.append(sql) @@ -345,6 +344,11 @@ def create_blocked_pairs_sql(self, linker: Linker, where_condition, probability) """ return sql + def as_dict(self): + output = super().as_dict() + output["arrays_to_explode"] = self.array_columns_to_explode + return output + def materialise_exploded_id_tables(linker: Linker): settings_obj = linker._settings_obj @@ -405,7 +409,7 @@ def _sql_gen_where_condition(link_type, unique_id_cols): return where_condition -def block_using_rules_sqls(linker: Linker): +def block_using_rules_sqls(linker: Linker, allow_exploding=False): """Use the blocking rules specified in the linker's settings object to generate a SQL statement that will create pairwise record comparions according to the blocking rule(s). @@ -494,6 +498,12 @@ def block_using_rules_sqls(linker: Linker): if not blocking_rules: blocking_rules = [BlockingRule("1=1")] + has_exploding = any(isinstance(br, ExplodingBlockingRule) for br in blocking_rules) + if has_exploding and not allow_exploding: + raise ValueError( + "Exploding blocking rules are not currently supported for this function" + ) + # For Blocking rules for deterministic rules, add a match probability # column with all probabilities set to 1. if linker._deterministic_link_mode: diff --git a/splink/linker.py b/splink/linker.py index b048ccf63e..935478ede9 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -1429,10 +1429,15 @@ def deterministic_link(self) -> SplinkDataFrame: self._deterministic_link_mode = True concat_with_tf = self._initialise_df_concat_with_tf() - sqls = block_using_rules_sqls(self) + exploded_tables = materialise_exploded_id_tables(self) + + sqls = block_using_rules_sqls(self, allow_exploding=True) for sql in sqls: self._enqueue_sql(sql["sql"], sql["output_table_name"]) - return self._execute_sql_pipeline([concat_with_tf]) + + deterministic_link_df = self._execute_sql_pipeline([concat_with_tf]) + [t.drop_table_from_database_and_remove_from_cache() for t in exploded_tables] + return deterministic_link_df def estimate_u_using_random_sampling( self, max_pairs: int = None, seed: int = None, *, target_rows=None @@ -1756,7 +1761,7 @@ def predict( # the tables of ID pairs exploded_tables = materialise_exploded_id_tables(self) - sqls = block_using_rules_sqls(self) + sqls = block_using_rules_sqls(self, allow_exploding=True) for sql in sqls: self._enqueue_sql(sql["sql"], sql["output_table_name"]) From a58deca4d54dfe5a4a66258f58dbadaf76df7988 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 11 Dec 2023 15:29:57 +0000 Subject: [PATCH 29/37] put check supported logic in place most likely to be caught --- splink/blocking.py | 3 ++- splink/linker.py | 24 ++++++++++++++++++------ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 14e4775091..ae507d9b2f 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -271,6 +271,7 @@ def marginal_exploded_id_pairs_table_sql(self, linker: Linker, br: BlockingRule) i.e. 
pairs are only created that match this blocking rule and NOT any of the preceding blocking rules """ + settings_obj = linker._settings_obj unique_id_col = settings_obj._unique_id_column_name @@ -388,7 +389,7 @@ def materialise_exploded_id_tables(linker: Linker): marginal_ids_table = linker._execute_sql_pipeline([input_dataframe]) br.exploded_id_pair_table = marginal_ids_table exploded_tables.append(marginal_ids_table) - return exploded_tables + return exploding_blocking_rules def _sql_gen_where_condition(link_type, unique_id_cols): diff --git a/splink/linker.py b/splink/linker.py index 935478ede9..5620adb3dc 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -1429,14 +1429,14 @@ def deterministic_link(self) -> SplinkDataFrame: self._deterministic_link_mode = True concat_with_tf = self._initialise_df_concat_with_tf() - exploded_tables = materialise_exploded_id_tables(self) + exploding_br_with_id_tables = materialise_exploded_id_tables(self) sqls = block_using_rules_sqls(self, allow_exploding=True) for sql in sqls: self._enqueue_sql(sql["sql"], sql["output_table_name"]) deterministic_link_df = self._execute_sql_pipeline([concat_with_tf]) - [t.drop_table_from_database_and_remove_from_cache() for t in exploded_tables] + [br.exploding_br_with_id_tables() for br in exploding_br_with_id_tables] return deterministic_link_df def estimate_u_using_random_sampling( @@ -1658,7 +1658,16 @@ def estimate_parameters_using_expectation_maximisation( self._initialise_df_concat_with_tf() # Extract the blocking rule - blocking_rule = blocking_rule_to_obj(blocking_rule).blocking_rule_sql + # Check it's a BlockingRule (not a SaltedBlockingRule, ExlpodingBlockingRule) + # and raise error if not specfically a BlockingRule + br = blocking_rule_to_obj(blocking_rule) + if type(br) is not BlockingRule: + raise TypeError( + "EM blocking rules must be plain blocking rules, not " + "salted or exploding blocking rules" + ) + + blocking_rule_sql = br.blocking_rule_sql if comparisons_to_deactivate: # If user provided a string, convert to Comparison object @@ -1682,7 +1691,7 @@ def estimate_parameters_using_expectation_maximisation( em_training_session = EMTrainingSession( self, - blocking_rule, + blocking_rule_sql, fix_u_probabilities=fix_u_probabilities, fix_m_probabilities=fix_m_probabilities, fix_probability_two_random_records_match=fix_probability_two_random_records_match, # noqa 501 @@ -1759,7 +1768,7 @@ def predict( # If exploded blocking rules exist, we need to materialise # the tables of ID pairs - exploded_tables = materialise_exploded_id_tables(self) + exploding_br_with_id_tables = materialise_exploded_id_tables(self) sqls = block_using_rules_sqls(self, allow_exploding=True) for sql in sqls: @@ -1787,7 +1796,10 @@ def predict( predictions = self._execute_sql_pipeline(input_dataframes) self._predict_warning() - [t.drop_table_from_database_and_remove_from_cache() for t in exploded_tables] + [ + br.drop_materialised_id_pairs_dataframe() + for br in exploding_br_with_id_tables + ] return predictions From 4021d3f0c44dca524c38bc523f09a380586af9a1 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 11 Dec 2023 15:42:38 +0000 Subject: [PATCH 30/37] gives correct error messages --- splink/blocking.py | 15 ++++++++------- splink/linker.py | 11 ++++------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index ae507d9b2f..164fa16534 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -307,6 +307,7 @@ def marginal_exploded_id_pairs_table_sql(self, linker: 
Linker, br: BlockingRule) def drop_materialised_id_pairs_dataframe(self): self.exploded_id_pair_table.drop_table_from_database_and_remove_from_cache() + self.exploded_id_pair_table = None def exclude_pairs_generated_by_this_rule_sql(self, linker: Linker): """A SQL string specifying how to exclude the results @@ -330,6 +331,12 @@ def exclude_pairs_generated_by_this_rule_sql(self, linker: Linker): def create_blocked_pairs_sql(self, linker: Linker, where_condition, probability): columns_to_select = linker._settings_obj._columns_to_select_for_blocking sql_select_expr = ", ".join(columns_to_select) + + if self.exploded_id_pair_table is None: + raise ValueError( + "Exploding blocking rules are not supported for the function you have" + " called." + ) exploded_id_pair_table = self.exploded_id_pair_table unique_id_col = linker._settings_obj._unique_id_column_name sql = f""" @@ -410,7 +417,7 @@ def _sql_gen_where_condition(link_type, unique_id_cols): return where_condition -def block_using_rules_sqls(linker: Linker, allow_exploding=False): +def block_using_rules_sqls(linker: Linker): """Use the blocking rules specified in the linker's settings object to generate a SQL statement that will create pairwise record comparions according to the blocking rule(s). @@ -499,12 +506,6 @@ def block_using_rules_sqls(linker: Linker, allow_exploding=False): if not blocking_rules: blocking_rules = [BlockingRule("1=1")] - has_exploding = any(isinstance(br, ExplodingBlockingRule) for br in blocking_rules) - if has_exploding and not allow_exploding: - raise ValueError( - "Exploding blocking rules are not currently supported for this function" - ) - # For Blocking rules for deterministic rules, add a match probability # column with all probabilities set to 1. if linker._deterministic_link_mode: diff --git a/splink/linker.py b/splink/linker.py index 5620adb3dc..012588113d 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -1431,12 +1431,12 @@ def deterministic_link(self) -> SplinkDataFrame: concat_with_tf = self._initialise_df_concat_with_tf() exploding_br_with_id_tables = materialise_exploded_id_tables(self) - sqls = block_using_rules_sqls(self, allow_exploding=True) + sqls = block_using_rules_sqls(self) for sql in sqls: self._enqueue_sql(sql["sql"], sql["output_table_name"]) deterministic_link_df = self._execute_sql_pipeline([concat_with_tf]) - [br.exploding_br_with_id_tables() for br in exploding_br_with_id_tables] + [b.drop_materialised_id_pairs_dataframe() for b in exploding_br_with_id_tables] return deterministic_link_df def estimate_u_using_random_sampling( @@ -1770,7 +1770,7 @@ def predict( # the tables of ID pairs exploding_br_with_id_tables = materialise_exploded_id_tables(self) - sqls = block_using_rules_sqls(self, allow_exploding=True) + sqls = block_using_rules_sqls(self) for sql in sqls: self._enqueue_sql(sql["sql"], sql["output_table_name"]) @@ -1796,10 +1796,7 @@ def predict( predictions = self._execute_sql_pipeline(input_dataframes) self._predict_warning() - [ - br.drop_materialised_id_pairs_dataframe() - for br in exploding_br_with_id_tables - ] + [b.drop_materialised_id_pairs_dataframe() for b in exploding_br_with_id_tables] return predictions From 0cede3d199852f22f201ce69a0471f22d9238e1e Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 11 Dec 2023 16:10:38 +0000 Subject: [PATCH 31/37] fix tests --- splink/blocking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/splink/blocking.py b/splink/blocking.py index 164fa16534..c02d10317e 100644 --- a/splink/blocking.py +++ 
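The EXISTS clause above is what lets later blocking rules skip pairs already produced by an exploding rule: the materialised ID-pair table is probed for each candidate pair before it is emitted. A simplified sketch of that condition as it stands at this point in the series (before the later switch to composite IDs); the table and column names follow the diff, and it is assumed the surrounding blocking SQL wraps the predicate in "and not (...)":

def exclude_pairs_in_exploded_table_sql(id_pair_table_name, unique_id_col="unique_id"):
    # Returns an EXISTS(...) predicate over the materialised ID pairs; the SQL
    # that stitches blocking rules together is assumed to negate it so matching
    # pairs are filtered out rather than selected.
    return f"""EXISTS (
        select 1 from {id_pair_table_name} as ids_to_compare
        where l.{unique_id_col} = ids_to_compare.{unique_id_col}_l
          and r.{unique_id_col} = ids_to_compare.{unique_id_col}_r
    )"""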
b/splink/blocking.py @@ -248,7 +248,7 @@ def create_blocked_pairs_sql(self, linker: Linker, where_condition, probability) on ({self.blocking_rule_sql} {salt_condition}) {where_condition} - {self.exclude_pairs_generated_by_all_preceding_rules_sql} + {self.exclude_pairs_generated_by_all_preceding_rules_sql(linker)} """ sqls.append(sql) From 26bf082d3d9eb8d92932ead0686bc92e9e3db9cf Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 11 Dec 2023 16:38:16 +0000 Subject: [PATCH 32/37] add tests back in --- tests/test_array_based_blocking.py | 181 +++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 tests/test_array_based_blocking.py diff --git a/tests/test_array_based_blocking.py b/tests/test_array_based_blocking.py new file mode 100644 index 0000000000..62d7176852 --- /dev/null +++ b/tests/test_array_based_blocking.py @@ -0,0 +1,181 @@ +import random + +import pandas as pd + +from tests.decorator import mark_with_dialects_including + + +@mark_with_dialects_including("duckdb", "spark", pass_dialect=True) +def test_simple_example_link_only(test_helpers, dialect): + data_l = pd.DataFrame.from_dict( + [ + {"unique_id": 1, "gender": "m", "postcode": ["2612", "2000"]}, + {"unique_id": 2, "gender": "m", "postcode": ["2612", "2617"]}, + {"unique_id": 3, "gender": "f", "postcode": ["2617"]}, + ] + ) + data_r = pd.DataFrame.from_dict( + [ + {"unique_id": 4, "gender": "m", "postcode": ["2617", "2600"]}, + {"unique_id": 5, "gender": "f", "postcode": ["2000"]}, + {"unique_id": 6, "gender": "m", "postcode": ["2617", "2612", "2000"]}, + ] + ) + helper = test_helpers[dialect] + settings = { + "link_type": "link_only", + "blocking_rules_to_generate_predictions": [ + { + "blocking_rule": "l.gender = r.gender and l.postcode = r.postcode", + "arrays_to_explode": ["postcode"], + }, + "l.gender = r.gender", + ], + "comparisons": [helper.cl.array_intersect_at_sizes("postcode", [1])], + } + ## the pairs returned by the first blocking rule are (1,6),(2,4),(2,6) + ## the additional pairs returned by the second blocking rule are (1,4),(3,5) + linker = helper.Linker([data_l, data_r], settings, **helper.extra_linker_args()) + linker.debug_mode = False + returned_triples = linker.predict().as_pandas_dataframe()[ + ["unique_id_l", "unique_id_r", "match_key"] + ] + returned_triples = { + (unique_id_l, unique_id_r, match_key) + for unique_id_l, unique_id_r, match_key in zip( + returned_triples.unique_id_l, + returned_triples.unique_id_r, + returned_triples.match_key, + ) + } + expected_triples = {(1, 6, "0"), (2, 4, "0"), (2, 6, "0"), (1, 4, "1"), (3, 5, "1")} + assert expected_triples == returned_triples + + +def generate_array_based_datasets_helper( + n_rows=1000, n_array_based_columns=3, n_distinct_values=1000, array_size=3, seed=1 +): + random.seed(seed) + datasets = [] + for _k in range(2): + results_dict = {} + results_dict["cluster"] = list(range(n_rows)) + for i in range(n_array_based_columns): + col = [] + for j in range(n_rows): + col.append(random.sample(range(n_distinct_values), array_size)) + if random.random() < 0.8 or i == n_array_based_columns - 1: + col[-1].append(j) + random.shuffle(col[-1]) + results_dict[f"array_column_{i}"] = col + datasets.append(pd.DataFrame.from_dict(results_dict)) + return datasets + + +@mark_with_dialects_including("duckdb", "spark", pass_dialect=True) +def test_array_based_blocking_with_random_data_dedupe(test_helpers, dialect): + helper = test_helpers[dialect] + input_data_l, input_data_r = generate_array_based_datasets_helper() + input_data_l = 
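The simple link-only test re-added above depends on the semantics of arrays_to_explode: each record is duplicated once per array element, the blocking rule runs on the unnested rows, and the resulting ID pairs are de-duplicated. A rough pandas illustration of that expansion (not the SQL Splink actually generates), operating on frames shaped like data_l and data_r in the test:

def candidate_pairs_by_exploding(df_l, df_r, array_col, scalar_col):
    # Unnest the array column so each element occupies its own row, then block
    # on the scalar column plus the unnested element.
    exploded_l = df_l.explode(array_col)
    exploded_r = df_r.explode(array_col)
    pairs = exploded_l.merge(
        exploded_r, on=[scalar_col, array_col], suffixes=("_l", "_r")
    )
    # Records sharing several array elements would otherwise appear once per
    # shared element, so keep each ID pair only once.
    return pairs[["unique_id_l", "unique_id_r"]].drop_duplicates()

Applied to the two three-row frames in the test with array_col="postcode" and scalar_col="gender", this yields exactly the pairs (1, 6), (2, 4) and (2, 6) noted in the test's comment.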
input_data_l.assign( + unique_id=[str(cluster_id) + "-0" for cluster_id in input_data_l.cluster] + ) + input_data_r = input_data_r.assign( + unique_id=[str(cluster_id) + "-1" for cluster_id in input_data_r.cluster] + ) + input_data = pd.concat([input_data_l, input_data_r]) + blocking_rules = [ + { + "blocking_rule": """l.array_column_0 = r.array_column_0 + and l.array_column_1 = r.array_column_1""", + "arrays_to_explode": ["array_column_0", "array_column_1"], + }, + { + "blocking_rule": """l.array_column_0 = r.array_column_0 + and l.array_column_1 = r.array_column_1 + and l.array_column_2 = r.array_column_2""", + "arrays_to_explode": ["array_column_0", "array_column_1"], + }, + { + "blocking_rule": "l.array_column_2 = r.array_column_2", + "arrays_to_explode": ["array_column_2"], + }, + ] + settings = { + "link_type": "dedupe_only", + "blocking_rules_to_generate_predictions": blocking_rules, + "unique_id_column_name": "unique_id", + "additional_columns_to_retain": ["cluster"], + "comparisons": [helper.cl.array_intersect_at_sizes("array_column_1", [1])], + } + linker = helper.Linker(input_data, settings, **helper.extra_linker_args()) + linker.debug_mode = False + df_predict = linker.predict().as_pandas_dataframe() + ## check that there are no duplicates in the output + assert ( + df_predict.drop_duplicates(["unique_id_l", "unique_id_r"]).shape[0] + == df_predict.shape[0] + ) + + ## check that the output contains no links with match_key=1, + ## since all pairs returned by the second rule should also be + ## returned by the first rule and so should be filtered out + assert df_predict[df_predict.match_key == 1].shape[0] == 0 + + ## check that all 1000 true matches are in the output + ## (this is guaranteed by how the data was generated) + assert sum(df_predict.cluster_l == df_predict.cluster_r) == 1000 + + +@mark_with_dialects_including("duckdb", "spark", pass_dialect=True) +def test_array_based_blocking_with_random_data_link_only(test_helpers, dialect): + helper = test_helpers[dialect] + input_data_l, input_data_r = generate_array_based_datasets_helper() + blocking_rules = [ + { + "blocking_rule": """l.array_column_0 = r.array_column_0 + and l.array_column_1 = r.array_column_1""", + "arrays_to_explode": ["array_column_0", "array_column_1"], + }, + { + "blocking_rule": """l.array_column_0 = r.array_column_0 + and l.array_column_1 = r.array_column_1 + and l.array_column_2=r.array_column_2""", + "arrays_to_explode": ["array_column_0", "array_column_1", "array_column_2"], + }, + { + "blocking_rule": "l.array_column_2 = r.array_column_2", + "arrays_to_explode": ["array_column_2"], + }, + ] + settings = { + "link_type": "link_only", + "blocking_rules_to_generate_predictions": blocking_rules, + "unique_id_column_name": "cluster", + "additional_columns_to_retain": ["cluster"], + "comparisons": [helper.cl.array_intersect_at_sizes("array_column_1", [1])], + } + linker = helper.Linker( + [input_data_l, input_data_r], settings, **helper.extra_linker_args() + ) + linker.debug_mode = False + df_predict = linker.predict().as_pandas_dataframe() + + ## check that we get no within-dataset links + within_dataset_links = df_predict[ + df_predict.source_dataset_l == df_predict.source_dataset_r + ].shape[0] + assert within_dataset_links == 0 + + ## check that no pair of ids appears twice in the output + assert ( + df_predict.drop_duplicates(["cluster_l", "cluster_r"]).shape[0] + == df_predict.shape[0] + ) + + ## check that the second blocking rule returns no matches, + ## since every pair matching the second rule 
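The assertion that all 1000 true matches must appear rests on a property of the generator defined earlier in this file: for the last array column (index n_array_based_columns - 1) the cluster id j is always appended, so both halves of every cluster share at least that element and the rule on array_column_2 is guaranteed to block them together. A small check that makes the property explicit, assuming the helper above is in scope:

left, right = generate_array_based_datasets_helper()
# With the default of three array columns, array_column_2 is the last one, and
# the generator always appends the cluster id to it (the `or` branch of the
# condition), so every cluster id appears in its own row's final array in both
# datasets.
assert all(j in arr for j, arr in zip(left.cluster, left.array_column_2))
assert all(j in arr for j, arr in zip(right.cluster, right.array_column_2))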
will also match the first, + ## and so should be filtered out + assert df_predict[df_predict.match_key == 1].shape[0] == 0 + + ## check that all 1000 true matches are returned + assert sum(df_predict.cluster_l == df_predict.cluster_r) == 1000 From c7307ebf689b26e57e49390e97b0f11c9e9e5ee3 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Tue, 12 Dec 2023 11:35:05 +0000 Subject: [PATCH 33/37] fix link type issue and unique ids --- splink/blocking.py | 35 +++++++++++--- tests/test_array_based_blocking.py | 74 ++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 6 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index c02d10317e..00de9d3c0e 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -287,6 +287,13 @@ def marginal_exploded_id_pairs_table_sql(self, linker: Linker, br: BlockingRule) link_type, settings_obj._unique_id_input_columns ) + id_expr_l = _composite_unique_id_from_nodes_sql( + settings_obj._unique_id_input_columns, "l" + ) + id_expr_r = _composite_unique_id_from_nodes_sql( + settings_obj._unique_id_input_columns, "r" + ) + if link_type == "two_dataset_link_only": where_condition = ( where_condition + " and l.source_dataset < r.source_dataset" @@ -294,8 +301,8 @@ def marginal_exploded_id_pairs_table_sql(self, linker: Linker, br: BlockingRule) sql = f""" select distinct - l.{unique_id_col} as {unique_id_col}_l, - r.{unique_id_col} as {unique_id_col}_r + {id_expr_l} as {unique_id_col}_l, + {id_expr_r} as {unique_id_col}_r from __splink__df_concat_with_tf_unnested as l inner join __splink__df_concat_with_tf_unnested as r on ({br.blocking_rule_sql}) @@ -319,11 +326,19 @@ def exclude_pairs_generated_by_this_rule_sql(self, linker: Linker): splink_df = self.exploded_id_pair_table ids_to_compare_sql = f"select * from {splink_df.physical_name}" + settings_obj = linker._settings_obj + id_expr_l = _composite_unique_id_from_nodes_sql( + settings_obj._unique_id_input_columns, "l" + ) + id_expr_r = _composite_unique_id_from_nodes_sql( + settings_obj._unique_id_input_columns, "r" + ) + return f"""EXISTS ( select 1 from ({ids_to_compare_sql}) as ids_to_compare where ( - l.{unique_id_column} = ids_to_compare.{unique_id_column}_l and - r.{unique_id_column} = ids_to_compare.{unique_id_column}_r + {id_expr_l} = ids_to_compare.{unique_id_column}_l and + {id_expr_r} = ids_to_compare.{unique_id_column}_r ) ) """ @@ -337,6 +352,14 @@ def create_blocked_pairs_sql(self, linker: Linker, where_condition, probability) "Exploding blocking rules are not supported for the function you have" " called." 
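The switch to _composite_unique_id_from_nodes_sql above is what makes the exploded ID-pair tables safe when several input tables are linked: the join keys are built from every unique-id input column (typically source_dataset plus unique_id) rather than from unique_id alone. A simplified sketch of the kind of expression involved; the real helper lives in Splink, and the separator used here is only illustrative:

def composite_unique_id_sql(unique_id_cols, table_alias):
    # e.g. composite_unique_id_sql(["source_dataset", "unique_id"], "l")
    #   -> "l.source_dataset || '-__-' || l.unique_id"
    # so records from different input tables can never collide on id alone.
    parts = [f"{table_alias}.{col}" for col in unique_id_cols]
    return " || '-__-' || ".join(parts)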
) + settings_obj = linker._settings_obj + id_expr_l = _composite_unique_id_from_nodes_sql( + settings_obj._unique_id_input_columns, "l" + ) + id_expr_r = _composite_unique_id_from_nodes_sql( + settings_obj._unique_id_input_columns, "r" + ) + exploded_id_pair_table = self.exploded_id_pair_table unique_id_col = linker._settings_obj._unique_id_column_name sql = f""" @@ -346,9 +369,9 @@ def create_blocked_pairs_sql(self, linker: Linker, where_condition, probability) {probability} from {exploded_id_pair_table.physical_name} as pairs left join {linker._input_tablename_l} as l - on pairs.{unique_id_col}_l=l.{unique_id_col} + on pairs.{unique_id_col}_l={id_expr_l} left join {linker._input_tablename_r} as r - on pairs.{unique_id_col}_r=r.{unique_id_col} + on pairs.{unique_id_col}_r={id_expr_r} """ return sql diff --git a/tests/test_array_based_blocking.py b/tests/test_array_based_blocking.py index 62d7176852..778f039090 100644 --- a/tests/test_array_based_blocking.py +++ b/tests/test_array_based_blocking.py @@ -179,3 +179,77 @@ def test_array_based_blocking_with_random_data_link_only(test_helpers, dialect): ## check that all 1000 true matches are returned assert sum(df_predict.cluster_l == df_predict.cluster_r) == 1000 + + +@mark_with_dialects_including("duckdb", pass_dialect=True) +def test_link_only_unique_id_ambiguity(test_helpers, dialect): + helper = test_helpers[dialect] + data_1 = [ + { + "unique_id": 1, + "first_name": "John", + "surname": "Doe", + "postcode": ["A", "B"], + }, + {"unique_id": 3, "first_name": "John", "surname": "Doe", "postcode": ["B"]}, + ] + + data_2 = [ + {"unique_id": 3, "first_name": "John", "surname": "Smith", "postcode": ["A"]}, + ] + + data_3 = [ + {"unique_id": 3, "first_name": "John", "surname": "Smith", "postcode": ["A"]}, + {"unique_id": 4, "first_name": "John", "surname": "Doe", "postcode": ["C"]}, + ] + + df_1 = pd.DataFrame(data_1) + df_2 = pd.DataFrame(data_2) + df_3 = pd.DataFrame(data_3) + + settings = { + "link_type": "link_only", + "blocking_rules_to_generate_predictions": [ + { + "blocking_rule": "l.postcode = r.postcode and l.first_name = r.first_name", + "arrays_to_explode": ["postcode"], + }, + "l.surname = r.surname", + ], + "comparisons": [ + helper.cl.exact_match("first_name"), + helper.cl.exact_match("surname"), + helper.cl.exact_match("postcode"), + ], + "retain_intermediate_calculation_columns": True, + } + + linker = helper.Linker( + [df_1, df_2, df_3], settings, input_table_aliases=["a_", "b_", "c_"] + ) + returned_triples = linker.predict().as_pandas_dataframe()[ + [ + "source_dataset_l", + "unique_id_l", + "source_dataset_r", + "unique_id_r", + "match_key", + ] + ] + + triples = returned_triples.to_dict(orient="split")["data"] + + actual_triples = { + tuple(t) for t in returned_triples.to_dict(orient="split")["data"] + } + assert len(returned_triples) == 5 + + rule1_tuples = { + ("a_", 1, "b_", 3, "0"), + ("a_", 1, "c_", 3, "0"), + ("b_", 3, "c_", 3, "0"), + } + rule2_tuples = {("a_", 1, "c_", 4, "1"), ("a_", 3, "c_", 4, "1")} + + all_tuples = rule1_tuples.union(rule2_tuples) + assert actual_triples == all_tuples From 8aecb270022a31e141dbb10e5e3eb28679d5a9b3 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Tue, 12 Dec 2023 11:39:04 +0000 Subject: [PATCH 34/37] lint --- tests/test_array_based_blocking.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_array_based_blocking.py b/tests/test_array_based_blocking.py index 778f039090..848b4637e6 100644 --- a/tests/test_array_based_blocking.py +++ 
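In create_blocked_pairs_sql the pre-computed pairs are now joined back to the input rows on those composite expressions rather than on the bare unique_id, which is exactly what the test_link_only_unique_id_ambiguity test added above exercises (three input frames that reuse unique_id 3). A hedged sketch of the corrected join; the table names are placeholders, and the id expressions are assumed to come from a helper like the one sketched earlier:

def blocked_pairs_from_exploded_table_sql(
    pair_table, table_l, table_r, id_expr_l, id_expr_r, unique_id_col="unique_id"
):
    # The pair table stores composite ids, so joining on the composite
    # expression (not on l.unique_id / r.unique_id) keeps records such as
    # b_'s id 3 and c_'s id 3 distinct.
    return f"""
    select pairs.*
    from {pair_table} as pairs
    left join {table_l} as l
      on pairs.{unique_id_col}_l = {id_expr_l}
    left join {table_r} as r
      on pairs.{unique_id_col}_r = {id_expr_r}
    """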
b/tests/test_array_based_blocking.py @@ -237,8 +237,6 @@ def test_link_only_unique_id_ambiguity(test_helpers, dialect): ] ] - triples = returned_triples.to_dict(orient="split")["data"] - actual_triples = { tuple(t) for t in returned_triples.to_dict(orient="split")["data"] } From 6798cb1d4bd2b8bdefd9f8f83e02a683585af32e Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Tue, 12 Dec 2023 11:54:07 +0000 Subject: [PATCH 35/37] fix line length --- tests/test_array_based_blocking.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_array_based_blocking.py b/tests/test_array_based_blocking.py index 848b4637e6..a8ca333d7b 100644 --- a/tests/test_array_based_blocking.py +++ b/tests/test_array_based_blocking.py @@ -211,7 +211,8 @@ def test_link_only_unique_id_ambiguity(test_helpers, dialect): "link_type": "link_only", "blocking_rules_to_generate_predictions": [ { - "blocking_rule": "l.postcode = r.postcode and l.first_name = r.first_name", + "blocking_rule": """l.postcode = r.postcode + and l.first_name = r.first_name""", "arrays_to_explode": ["postcode"], }, "l.surname = r.surname", From 9ef61f190472949f0f325e6e311eda37ad731ed2 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Tue, 12 Dec 2023 13:12:47 +0000 Subject: [PATCH 36/37] rename method for clarity --- splink/blocking.py | 2 +- splink/duckdb/linker.py | 6 ++++-- splink/linker.py | 4 +++- splink/spark/linker.py | 6 ++++-- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 00de9d3c0e..56dcb4f75a 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -398,7 +398,7 @@ def materialise_exploded_id_tables(linker: Linker): InputColumn(colname, sql_dialect=linker._sql_dialect).quote().name for colname in br.array_columns_to_explode ] - expl_sql = linker._gen_explode_sql( + expl_sql = linker._explode_arrays_sql( "__splink__df_concat_with_tf", br.array_columns_to_explode, list(input_colnames.difference(arrays_to_explode_quoted)), diff --git a/splink/duckdb/linker.py b/splink/duckdb/linker.py index 2e6a2fe889..1685bb136f 100644 --- a/splink/duckdb/linker.py +++ b/splink/duckdb/linker.py @@ -320,7 +320,9 @@ def export_to_duckdb_file(self, output_path, delete_intermediate_tables=False): new_con.execute(f"IMPORT DATABASE '{tmpdir}';") new_con.close() - def _gen_explode_sql(self, tbl_name, columns_to_explode, other_columns_to_retain): + def _explode_arrays_sql( + self, tbl_name, columns_to_explode, other_columns_to_retain + ): """Generated sql that explodes one or more columns in a table""" columns_to_explode = columns_to_explode.copy() other_columns_to_retain = other_columns_to_retain.copy() @@ -336,4 +338,4 @@ def _gen_explode_sql(self, tbl_name, columns_to_explode, other_columns_to_retain ) other_columns_to_retain.append(column_to_explode) return f"""select {','.join(cols_to_select)} - from ({self._gen_explode_sql(tbl_name,columns_to_explode,other_columns_to_retain)})""" # noqa: E501 + from ({self._explode_arrays_sql(tbl_name,columns_to_explode,other_columns_to_retain)})""" # noqa: E501 diff --git a/splink/linker.py b/splink/linker.py index 012588113d..9eef7c2961 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -3959,7 +3959,9 @@ def _detect_blocking_rules_for_em_training( ].iloc[0] return suggestion - def _gen_explode_sql(self, tbl_name, columns_to_explode, other_columns_to_retain): + def _explode_arrays_sql( + self, tbl_name, columns_to_explode, other_columns_to_retain + ): raise NotImplementedError( f"Unnesting blocking rules are not supported for 
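The renamed _explode_arrays_sql builds the unnesting query recursively: each call peels off one array column, unnests it, and wraps a recursive call that handles the rest, with not-yet-exploded columns carried through unchanged. A self-contained sketch of the same idea, written as a standalone function rather than a Linker method and using DuckDB-style unnest (the Spark variant would use explode instead):

def explode_arrays_sql(tbl_name, columns_to_explode, other_columns_to_retain):
    # Recursive sketch: one array column is unnested per query level.
    columns_to_explode = columns_to_explode.copy()
    other_columns_to_retain = other_columns_to_retain.copy()
    if not columns_to_explode:
        return f"select {', '.join(other_columns_to_retain)} from {tbl_name}"
    column_to_explode = columns_to_explode.pop()
    cols_to_select = (
        other_columns_to_retain
        + columns_to_explode  # still arrays; their own level will unnest them
        + [f"unnest({column_to_explode}) as {column_to_explode}"]
    )
    other_columns_to_retain.append(column_to_explode)
    return (
        f"select {', '.join(cols_to_select)} from "
        f"({explode_arrays_sql(tbl_name, columns_to_explode, other_columns_to_retain)})"
    )

For example, explode_arrays_sql("df", ["postcode", "phone"], ["unique_id"]) produces a two-level query in which phone is unnested by the outer select and postcode by the inner one.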
{type(self)}" ) diff --git a/splink/spark/linker.py b/splink/spark/linker.py index df74e6801e..1514e63b0c 100644 --- a/splink/spark/linker.py +++ b/splink/spark/linker.py @@ -541,7 +541,9 @@ def _check_ansi_enabled_if_converting_dates(self): are cleaned to remove bad dates \n""" ) - def _gen_explode_sql(self, tbl_name, columns_to_explode, other_columns_to_retain): + def _explode_arrays_sql( + self, tbl_name, columns_to_explode, other_columns_to_retain + ): """Generated sql that explodes one or more columns in a table""" columns_to_explode = columns_to_explode.copy() other_columns_to_retain = other_columns_to_retain.copy() @@ -555,4 +557,4 @@ def _gen_explode_sql(self, tbl_name, columns_to_explode, other_columns_to_retain + columns_to_explode ) return f"""select {','.join(cols_to_select)} - from ({self._gen_explode_sql(tbl_name,columns_to_explode,other_columns_to_retain+[column_to_explode])})""" # noqa: E501 + from ({self._explode_arrays_sql(tbl_name,columns_to_explode,other_columns_to_retain+[column_to_explode])})""" # noqa: E501 From ca0e202a42495cbe3cebdf2277c9fde8a3b831d0 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 17 Jan 2024 12:05:05 +0000 Subject: [PATCH 37/37] Move out of loop --- splink/blocking.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 56dcb4f75a..b1468067c8 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -389,11 +389,11 @@ def materialise_exploded_id_tables(linker: Linker): br for br in blocking_rules if isinstance(br, ExplodingBlockingRule) ] exploded_tables = [] - for br in exploding_blocking_rules: - input_dataframe = linker._initialise_df_concat_with_tf() - input_colnames = {col.name for col in input_dataframe.columns} + input_dataframe = linker._initialise_df_concat_with_tf() + input_colnames = {col.name for col in input_dataframe.columns} + for br in exploding_blocking_rules: arrays_to_explode_quoted = [ InputColumn(colname, sql_dialect=linker._sql_dialect).quote().name for colname in br.array_columns_to_explode
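The final commit hoists the _initialise_df_concat_with_tf call (and the derived set of column names) out of the per-rule loop, so the concatenated input is materialised once regardless of how many exploding rules are configured. Putting the pieces of this series together, a rough outline of how a predict-style caller drives the exploded-ID workflow; the names follow the diffs above, and a configured `linker` with exploding blocking rules is assumed to be in scope:

# Materialise one ID-pair table per exploding blocking rule (the concatenated
# input with term frequencies is computed a single time, outside the loop),
# run the blocking SQL, which excludes pairs already covered by earlier rules,
# and finally drop each rule's materialised ID-pair table.
concat_with_tf = linker._initialise_df_concat_with_tf()
exploding_br_with_id_tables = materialise_exploded_id_tables(linker)

for sql in block_using_rules_sqls(linker):
    linker._enqueue_sql(sql["sql"], sql["output_table_name"])

blocked_pairs = linker._execute_sql_pipeline([concat_with_tf])

for br in exploding_br_with_id_tables:
    br.drop_materialised_id_pairs_dataframe()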