From aca606e5efb4e6b2c4b17e74efdf488604239b3e Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 13 Nov 2023 15:37:06 +0000 Subject: [PATCH 1/5] name_l --- splink/cluster_metrics.py | 2 +- splink/comparison_level.py | 8 ++------ splink/comparison_level_library.py | 18 +++++++++--------- splink/input_column.py | 23 +++++++++++++++++++---- splink/settings.py | 6 +++--- splink/splink_comparison_viewer.py | 2 +- splink/unique_id_concat.py | 2 +- splink/waterfall_chart.py | 4 ++-- tests/test_input_column.py | 4 ++-- tests/test_sql_transform.py | 6 +++--- 10 files changed, 43 insertions(+), 32 deletions(-) diff --git a/splink/cluster_metrics.py b/splink/cluster_metrics.py index d15977245a..b68308b7a6 100644 --- a/splink/cluster_metrics.py +++ b/splink/cluster_metrics.py @@ -24,7 +24,7 @@ def _size_density_sql( clusters_table = df_clustered.physical_name input_col = InputColumn(_unique_id_col) - unique_id_col_l = input_col.name_l() + unique_id_col_l = input_col.name_l sqls = [] sql = f""" diff --git a/splink/comparison_level.py b/splink/comparison_level.py index 9011be8d9e..38d4adaa76 100644 --- a/splink/comparison_level.py +++ b/splink/comparison_level.py @@ -577,12 +577,8 @@ def _tf_adjustment_sql(self): else: tf_adj_col = self._tf_adjustment_input_column - coalesce_l_r = ( - f"coalesce({tf_adj_col.tf_name_l()}, {tf_adj_col.tf_name_r()})" - ) - coalesce_r_l = ( - f"coalesce({tf_adj_col.tf_name_r()}, {tf_adj_col.tf_name_l()})" - ) + coalesce_l_r = f"coalesce({tf_adj_col.tf_name_l}, {tf_adj_col.tf_name_r()})" + coalesce_r_l = f"coalesce({tf_adj_col.tf_name_r()}, {tf_adj_col.tf_name_l})" tf_adjustment_exists = f"{coalesce_l_r} is not null" u_prob_exact_match = self._u_probability_corresponding_to_exact_match diff --git a/splink/comparison_level_library.py b/splink/comparison_level_library.py index d7807b99c4..9ce2e73bb5 100644 --- a/splink/comparison_level_library.py +++ b/splink/comparison_level_library.py @@ -98,7 +98,7 @@ def __init__( valid_string_pattern = valid_string_regex col = InputColumn(col_name, sql_dialect=self._sql_dialect) - col_name_l, col_name_r = col.name_l(), col.name_r() + col_name_l, col_name_r = col.name_l, col.name_r() if invalid_dates_as_null: col_name_l = self._valid_date_function(col_name_l, valid_string_pattern) @@ -231,7 +231,7 @@ def __init__( else: label_suffix = "" - col_name_l, col_name_r = col.name_l(), col.name_r() + col_name_l, col_name_r = col.name_l, col.name_r() if set_to_lowercase: col_name_l = f"lower({col_name_l})" @@ -395,7 +395,7 @@ def __init__( else: operator = "<=" - col_name_l, col_name_r = col.name_l(), col.name_r() + col_name_l, col_name_r = col.name_l, col.name_r() if set_to_lowercase: col_name_l = f"lower({col_name_l})" @@ -938,8 +938,8 @@ def __init__( col_1 = InputColumn(col_name_1, sql_dialect=self._sql_dialect) col_2 = InputColumn(col_name_2, sql_dialect=self._sql_dialect) - col_1_l, col_1_r = col_1.name_l(), col_1.name_r() - col_2_l, col_2_r = col_2.name_l(), col_2.name_r() + col_1_l, col_1_r = col_1.name_l, col_1.name_r() + col_2_l, col_2_r = col_2.name_l, col_2.name_r() if set_to_lowercase: col_1_l = f"lower({col_1_l})" @@ -1108,11 +1108,11 @@ def __init__( """ col = InputColumn(col_name, sql_dialect=self._sql_dialect) - s = f"""(abs({col.name_l()} - {col.name_r()})/ + s = f"""(abs({col.name_l} - {col.name_r()})/ (case - when {col.name_r()} > {col.name_l()} + when {col.name_r()} > {col.name_l} then {col.name_r()} - else {col.name_l()} + else {col.name_l} end)) < {percentage_distance_threshold}""" @@ -1178,7 +1178,7 @@ def __init__( col = InputColumn(col_name, sql_dialect=self._sql_dialect) size_array_intersection = ( - f"{self._size_array_intersect_function(col.name_l(), col.name_r())}" + f"{self._size_array_intersect_function(col.name_l, col.name_r())}" ) sql = f"{size_array_intersection} >= {min_intersection}" diff --git a/splink/input_column.py b/splink/input_column.py index 9a5f8423f5..ce0469c362 100644 --- a/splink/input_column.py +++ b/splink/input_column.py @@ -168,65 +168,79 @@ def tf_prefix(self) -> str: "_tf_prefix", "term_frequency_adjustment_column_prefix" ) + @property def name(self) -> str: return self.input_name_as_tree.sql(dialect=self._sql_dialect) + @property def name_l(self) -> str: return add_suffix(self.input_name_as_tree, suffix="_l").sql( dialect=self._sql_dialect ) + @property def name_r(self) -> str: return add_suffix(self.input_name_as_tree, suffix="_r").sql( dialect=self._sql_dialect ) + @property def names_l_r(self) -> list[str]: - return [self.name_l(), self.name_r()] + return [self.name_l, self.name_r()] + @property def l_name_as_l(self) -> str: name_with_l_table = add_table(self.input_name_as_tree, "l").sql( dialect=self._sql_dialect ) - return f"{name_with_l_table} as {self.name_l()}" + return f"{name_with_l_table} as {self.name_l}" + @property def r_name_as_r(self) -> str: name_with_r_table = add_table(self.input_name_as_tree, "r").sql( dialect=self._sql_dialect ) return f"{name_with_r_table} as {self.name_r()}" + @property def l_r_names_as_l_r(self) -> list[str]: return [self.l_name_as_l(), self.r_name_as_r()] + @property def bf_name(self) -> str: return add_prefix(self.input_name_as_tree, prefix=self.bf_prefix).sql( dialect=self._sql_dialect ) + @property def tf_name(self) -> str: return add_prefix(self.input_name_as_tree, prefix=self.tf_prefix).sql( dialect=self._sql_dialect ) + @property def tf_name_l(self) -> str: tree = add_prefix(self.input_name_as_tree, prefix=self.tf_prefix) return add_suffix(tree, suffix="_l").sql(dialect=self._sql_dialect) + @property def tf_name_r(self) -> str: tree = add_prefix(self.input_name_as_tree, prefix=self.tf_prefix) return add_suffix(tree, suffix="_r").sql(dialect=self._sql_dialect) + @property def tf_name_l_r(self) -> list[str]: - return [self.tf_name_l(), self.tf_name_r()] + return [self.tf_name_l, self.tf_name_r()] + @property def l_tf_name_as_l(self) -> str: tree = add_prefix(self.input_name_as_tree, prefix=self.tf_prefix) tf_name_with_l_table = add_table(tree, tablename="l").sql( dialect=self._sql_dialect ) - return f"{tf_name_with_l_table} as {self.tf_name_l()}" + return f"{tf_name_with_l_table} as {self.tf_name_l}" + @property def r_tf_name_as_r(self) -> str: tree = add_prefix(self.input_name_as_tree, prefix=self.tf_prefix) tf_name_with_r_table = add_table(tree, tablename="r").sql( @@ -234,6 +248,7 @@ def r_tf_name_as_r(self) -> str: ) return f"{tf_name_with_r_table} as {self.tf_name_r()}" + @property def l_r_tf_names_as_l_r(self) -> list[str]: return [self.l_tf_name_as_l(), self.r_tf_name_as_r()] diff --git a/splink/settings.py b/splink/settings.py index f14b2d79c1..ef7b6516bc 100644 --- a/splink/settings.py +++ b/splink/settings.py @@ -240,7 +240,7 @@ def _columns_to_select_for_comparison_vector_values(self): cols = [] for uid_col in self._unique_id_input_columns: - cols.append(uid_col.name_l()) + cols.append(uid_col.name_l) cols.append(uid_col.name_r()) for cc in self.comparisons: @@ -260,7 +260,7 @@ def _columns_to_select_for_bayes_factor_parts(self): cols = [] for uid_col in self._unique_id_input_columns: - cols.append(uid_col.name_l()) + cols.append(uid_col.name_l) cols.append(uid_col.name_r()) for cc in self.comparisons: @@ -280,7 +280,7 @@ def _columns_to_select_for_predict(self): cols = [] for uid_col in self._unique_id_input_columns: - cols.append(uid_col.name_l()) + cols.append(uid_col.name_l) cols.append(uid_col.name_r()) for cc in self.comparisons: diff --git a/splink/splink_comparison_viewer.py b/splink/splink_comparison_viewer.py index f46435611e..8cf4cfde11 100644 --- a/splink/splink_comparison_viewer.py +++ b/splink/splink_comparison_viewer.py @@ -18,7 +18,7 @@ def row_examples(linker: Linker, example_rows_per_category=2): sqls = [] uid_cols = linker._settings_obj._unique_id_input_columns - uid_cols_l = [uid_col.name_l() for uid_col in uid_cols] + uid_cols_l = [uid_col.name_l for uid_col in uid_cols] uid_cols_r = [uid_col.name_r() for uid_col in uid_cols] uid_cols = uid_cols_l + uid_cols_r uid_expr = " || '-' ||".join(uid_cols) diff --git a/splink/unique_id_concat.py b/splink/unique_id_concat.py index 6b74c9299b..a11d20d5aa 100644 --- a/splink/unique_id_concat.py +++ b/splink/unique_id_concat.py @@ -28,7 +28,7 @@ def _composite_unique_id_from_edges_sql(unique_id_cols, l_or_r, table_prefix=Non table_prefix = "" if l_or_r == "l": - cols = [f"{table_prefix}{c.name_l()}" for c in unique_id_cols] + cols = [f"{table_prefix}{c.name_l}" for c in unique_id_cols] if l_or_r == "r": cols = [f"{table_prefix}{c.name_r()}" for c in unique_id_cols] if l_or_r is None: diff --git a/splink/waterfall_chart.py b/splink/waterfall_chart.py index e52822e214..e35ae48b2b 100644 --- a/splink/waterfall_chart.py +++ b/splink/waterfall_chart.py @@ -60,7 +60,7 @@ def _comparison_records(record_as_dict, comparison: Comparison): waterfall_record["u_probability"] = cl.u_probability waterfall_record["bayes_factor_description"] = cl._bayes_factor_description input_cols_used = c._input_columns_used_by_case_statement - input_cols_l = [ic.unquote().name_l() for ic in input_cols_used] + input_cols_l = [ic.unquote().name_l for ic in input_cols_used] input_cols_r = [ic.unquote().name_r() for ic in input_cols_used] waterfall_record["value_l"] = ", ".join( [str(record_as_dict[n]) for n in input_cols_l] @@ -78,7 +78,7 @@ def _comparison_records(record_as_dict, comparison: Comparison): if cl._tf_adjustment_input_column is not None: waterfall_record_2["value_l"] = str( - record_as_dict[cl._tf_adjustment_input_column.unquote().name_l()] + record_as_dict[cl._tf_adjustment_input_column.unquote().name_l] ) waterfall_record_2["value_r"] = str( record_as_dict[cl._tf_adjustment_input_column.unquote().name_r()] diff --git a/tests/test_input_column.py b/tests/test_input_column.py index aed6f0f2f8..3b85907085 100644 --- a/tests/test_input_column.py +++ b/tests/test_input_column.py @@ -6,8 +6,8 @@ def test_input_column(): assert c.name() == '"my_col"' assert c.unquote().name() == "my_col" - assert c.name_l() == '"my_col_l"' - assert c.tf_name_l() == '"tf_my_col_l"' + assert c.name_l == '"my_col_l"' + assert c.tf_name_l == '"tf_my_col_l"' assert c.unquote().quote().l_tf_name_as_l() == '"l"."tf_my_col" as "tf_my_col_l"' assert c.unquote().l_tf_name_as_l() == '"l".tf_my_col as tf_my_col_l' diff --git a/tests/test_sql_transform.py b/tests/test_sql_transform.py index fb3f80e885..7afe60682c 100644 --- a/tests/test_sql_transform.py +++ b/tests/test_sql_transform.py @@ -95,12 +95,12 @@ def test_add_pref_and_suffix(): assert dull.l_r_names_as_l_r() == dull_l_r assert dull.bf_name() == '"bf_dull"' - assert dull.tf_name_l() == '"tf_dull_l"' + assert dull.tf_name_l == '"tf_dull_l"' tf_dull_l_r = ['"l"."tf_dull" as "tf_dull_l"', '"r"."tf_dull" as "tf_dull_r"'] assert dull.l_r_tf_names_as_l_r() == tf_dull_l_r ll = InputColumn("lat['long']") - assert ll.name_l() == "\"lat_l\"['long']" + assert ll.name_l == "\"lat_l\"['long']" ll_tf_l_r = [ '"l"."tf_lat"[\'long\'] as "tf_lat_l"[\'long\']', @@ -110,7 +110,7 @@ def test_add_pref_and_suffix(): assert ll.l_r_tf_names_as_l_r() == ll_tf_l_r group = InputColumn("cluster") - assert group.name_l() == '"cluster_l"' + assert group.name_l == '"cluster_l"' assert group.bf_name() == '"bf_cluster"' group_l_r_names = ['"l"."cluster" as "cluster_l"', '"r"."cluster" as "cluster_r"'] assert group.l_r_names_as_l_r() == group_l_r_names From 0f9eed5fb97343232ed0316348d978501c4358ef Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 13 Nov 2023 15:42:14 +0000 Subject: [PATCH 2/5] name_r --- splink/comparison_level.py | 4 ++-- splink/comparison_level_library.py | 18 +++++++++--------- splink/input_column.py | 8 ++++---- splink/settings.py | 6 +++--- splink/splink_comparison_viewer.py | 2 +- splink/unique_id_concat.py | 2 +- splink/waterfall_chart.py | 4 ++-- tests/test_input_column.py | 2 +- 8 files changed, 23 insertions(+), 23 deletions(-) diff --git a/splink/comparison_level.py b/splink/comparison_level.py index 38d4adaa76..531148ddbf 100644 --- a/splink/comparison_level.py +++ b/splink/comparison_level.py @@ -577,8 +577,8 @@ def _tf_adjustment_sql(self): else: tf_adj_col = self._tf_adjustment_input_column - coalesce_l_r = f"coalesce({tf_adj_col.tf_name_l}, {tf_adj_col.tf_name_r()})" - coalesce_r_l = f"coalesce({tf_adj_col.tf_name_r()}, {tf_adj_col.tf_name_l})" + coalesce_l_r = f"coalesce({tf_adj_col.tf_name_l}, {tf_adj_col.tf_name_r})" + coalesce_r_l = f"coalesce({tf_adj_col.tf_name_r}, {tf_adj_col.tf_name_l})" tf_adjustment_exists = f"{coalesce_l_r} is not null" u_prob_exact_match = self._u_probability_corresponding_to_exact_match diff --git a/splink/comparison_level_library.py b/splink/comparison_level_library.py index 9ce2e73bb5..90ac54139d 100644 --- a/splink/comparison_level_library.py +++ b/splink/comparison_level_library.py @@ -98,7 +98,7 @@ def __init__( valid_string_pattern = valid_string_regex col = InputColumn(col_name, sql_dialect=self._sql_dialect) - col_name_l, col_name_r = col.name_l, col.name_r() + col_name_l, col_name_r = col.name_l, col.name_r if invalid_dates_as_null: col_name_l = self._valid_date_function(col_name_l, valid_string_pattern) @@ -231,7 +231,7 @@ def __init__( else: label_suffix = "" - col_name_l, col_name_r = col.name_l, col.name_r() + col_name_l, col_name_r = col.name_l, col.name_r if set_to_lowercase: col_name_l = f"lower({col_name_l})" @@ -395,7 +395,7 @@ def __init__( else: operator = "<=" - col_name_l, col_name_r = col.name_l, col.name_r() + col_name_l, col_name_r = col.name_l, col.name_r if set_to_lowercase: col_name_l = f"lower({col_name_l})" @@ -938,8 +938,8 @@ def __init__( col_1 = InputColumn(col_name_1, sql_dialect=self._sql_dialect) col_2 = InputColumn(col_name_2, sql_dialect=self._sql_dialect) - col_1_l, col_1_r = col_1.name_l, col_1.name_r() - col_2_l, col_2_r = col_2.name_l, col_2.name_r() + col_1_l, col_1_r = col_1.name_l, col_1.name_r + col_2_l, col_2_r = col_2.name_l, col_2.name_r if set_to_lowercase: col_1_l = f"lower({col_1_l})" @@ -1108,10 +1108,10 @@ def __init__( """ col = InputColumn(col_name, sql_dialect=self._sql_dialect) - s = f"""(abs({col.name_l} - {col.name_r()})/ + s = f"""(abs({col.name_l} - {col.name_r})/ (case - when {col.name_r()} > {col.name_l} - then {col.name_r()} + when {col.name_r} > {col.name_l} + then {col.name_r} else {col.name_l} end)) < {percentage_distance_threshold}""" @@ -1178,7 +1178,7 @@ def __init__( col = InputColumn(col_name, sql_dialect=self._sql_dialect) size_array_intersection = ( - f"{self._size_array_intersect_function(col.name_l, col.name_r())}" + f"{self._size_array_intersect_function(col.name_l, col.name_r)}" ) sql = f"{size_array_intersection} >= {min_intersection}" diff --git a/splink/input_column.py b/splink/input_column.py index ce0469c362..de0ef0d4e3 100644 --- a/splink/input_column.py +++ b/splink/input_column.py @@ -186,7 +186,7 @@ def name_r(self) -> str: @property def names_l_r(self) -> list[str]: - return [self.name_l, self.name_r()] + return [self.name_l, self.name_r] @property def l_name_as_l(self) -> str: @@ -200,7 +200,7 @@ def r_name_as_r(self) -> str: name_with_r_table = add_table(self.input_name_as_tree, "r").sql( dialect=self._sql_dialect ) - return f"{name_with_r_table} as {self.name_r()}" + return f"{name_with_r_table} as {self.name_r}" @property def l_r_names_as_l_r(self) -> list[str]: @@ -230,7 +230,7 @@ def tf_name_r(self) -> str: @property def tf_name_l_r(self) -> list[str]: - return [self.tf_name_l, self.tf_name_r()] + return [self.tf_name_l, self.tf_name_r] @property def l_tf_name_as_l(self) -> str: @@ -246,7 +246,7 @@ def r_tf_name_as_r(self) -> str: tf_name_with_r_table = add_table(tree, tablename="r").sql( dialect=self._sql_dialect ) - return f"{tf_name_with_r_table} as {self.tf_name_r()}" + return f"{tf_name_with_r_table} as {self.tf_name_r}" @property def l_r_tf_names_as_l_r(self) -> list[str]: diff --git a/splink/settings.py b/splink/settings.py index ef7b6516bc..59fd9a052e 100644 --- a/splink/settings.py +++ b/splink/settings.py @@ -241,7 +241,7 @@ def _columns_to_select_for_comparison_vector_values(self): for uid_col in self._unique_id_input_columns: cols.append(uid_col.name_l) - cols.append(uid_col.name_r()) + cols.append(uid_col.name_r) for cc in self.comparisons: cols.extend(cc._columns_to_select_for_comparison_vector_values) @@ -261,7 +261,7 @@ def _columns_to_select_for_bayes_factor_parts(self): for uid_col in self._unique_id_input_columns: cols.append(uid_col.name_l) - cols.append(uid_col.name_r()) + cols.append(uid_col.name_r) for cc in self.comparisons: cols.extend(cc._columns_to_select_for_bayes_factor_parts) @@ -281,7 +281,7 @@ def _columns_to_select_for_predict(self): for uid_col in self._unique_id_input_columns: cols.append(uid_col.name_l) - cols.append(uid_col.name_r()) + cols.append(uid_col.name_r) for cc in self.comparisons: cols.extend(cc._columns_to_select_for_predict) diff --git a/splink/splink_comparison_viewer.py b/splink/splink_comparison_viewer.py index 8cf4cfde11..d6ec3ef496 100644 --- a/splink/splink_comparison_viewer.py +++ b/splink/splink_comparison_viewer.py @@ -19,7 +19,7 @@ def row_examples(linker: Linker, example_rows_per_category=2): uid_cols = linker._settings_obj._unique_id_input_columns uid_cols_l = [uid_col.name_l for uid_col in uid_cols] - uid_cols_r = [uid_col.name_r() for uid_col in uid_cols] + uid_cols_r = [uid_col.name_r for uid_col in uid_cols] uid_cols = uid_cols_l + uid_cols_r uid_expr = " || '-' ||".join(uid_cols) diff --git a/splink/unique_id_concat.py b/splink/unique_id_concat.py index a11d20d5aa..a22a4ae775 100644 --- a/splink/unique_id_concat.py +++ b/splink/unique_id_concat.py @@ -30,7 +30,7 @@ def _composite_unique_id_from_edges_sql(unique_id_cols, l_or_r, table_prefix=Non if l_or_r == "l": cols = [f"{table_prefix}{c.name_l}" for c in unique_id_cols] if l_or_r == "r": - cols = [f"{table_prefix}{c.name_r()}" for c in unique_id_cols] + cols = [f"{table_prefix}{c.name_r}" for c in unique_id_cols] if l_or_r is None: cols = [f"{table_prefix}{c.name()}" for c in unique_id_cols] diff --git a/splink/waterfall_chart.py b/splink/waterfall_chart.py index e35ae48b2b..03f1325d9b 100644 --- a/splink/waterfall_chart.py +++ b/splink/waterfall_chart.py @@ -61,7 +61,7 @@ def _comparison_records(record_as_dict, comparison: Comparison): waterfall_record["bayes_factor_description"] = cl._bayes_factor_description input_cols_used = c._input_columns_used_by_case_statement input_cols_l = [ic.unquote().name_l for ic in input_cols_used] - input_cols_r = [ic.unquote().name_r() for ic in input_cols_used] + input_cols_r = [ic.unquote().name_r for ic in input_cols_used] waterfall_record["value_l"] = ", ".join( [str(record_as_dict[n]) for n in input_cols_l] ) @@ -81,7 +81,7 @@ def _comparison_records(record_as_dict, comparison: Comparison): record_as_dict[cl._tf_adjustment_input_column.unquote().name_l] ) waterfall_record_2["value_r"] = str( - record_as_dict[cl._tf_adjustment_input_column.unquote().name_r()] + record_as_dict[cl._tf_adjustment_input_column.unquote().name_r] ) else: waterfall_record_2["value_l"] = "" diff --git a/tests/test_input_column.py b/tests/test_input_column.py index 3b85907085..042430f8b3 100644 --- a/tests/test_input_column.py +++ b/tests/test_input_column.py @@ -13,7 +13,7 @@ def test_input_column(): c = InputColumn("SUR name") assert c.name() == '"SUR name"' - assert c.name_r() == '"SUR name_r"' + assert c.name_r == '"SUR name_r"' assert c.r_name_as_r() == '"r"."SUR name" as "SUR name_r"' c = InputColumn("col['lat']") From 2d506f237f763bf68373f5ed20f5a745b80c8649 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 13 Nov 2023 15:44:55 +0000 Subject: [PATCH 3/5] name --- splink/blocking.py | 2 +- splink/comparison.py | 4 ++-- splink/comparison_level.py | 4 ++-- splink/find_matches_to_new_records.py | 6 +++--- splink/linker.py | 2 +- splink/lower_id_on_lhs.py | 2 +- splink/missingness.py | 4 ++-- splink/profile_data.py | 3 +-- splink/settings.py | 8 ++++---- splink/splink_dataframe.py | 2 +- splink/term_frequencies.py | 8 ++++---- splink/unique_id_concat.py | 4 ++-- tests/test_input_column.py | 14 +++++++------- tests/test_sql_transform.py | 2 +- 14 files changed, 32 insertions(+), 33 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 303a4c6622..aa52dd1b77 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -200,7 +200,7 @@ def _sql_gen_where_condition(link_type, unique_id_cols): source_dataset_col = unique_id_cols[0] where_condition = ( f"where {id_expr_l} < {id_expr_r} " - f"and l.{source_dataset_col.name()} != r.{source_dataset_col.name()}" + f"and l.{source_dataset_col.name} != r.{source_dataset_col.name}" ) return where_condition diff --git a/splink/comparison.py b/splink/comparison.py index 3e3d0f3986..aa6a3fb205 100644 --- a/splink/comparison.py +++ b/splink/comparison.py @@ -445,7 +445,7 @@ def _comparison_level_description_list(self): @property def _human_readable_description_succinct(self): input_cols = join_list_with_commas_final_and( - [c.name() for c in self._input_columns_used_by_case_statement] + [c.name for c in self._input_columns_used_by_case_statement] ) comp_levels = self._comparison_level_description_list @@ -463,7 +463,7 @@ def _human_readable_description_succinct(self): @property def human_readable_description(self): input_cols = join_list_with_commas_final_and( - [c.name() for c in self._input_columns_used_by_case_statement] + [c.name for c in self._input_columns_used_by_case_statement] ) comp_levels = self._comparison_level_description_list diff --git a/splink/comparison_level.py b/splink/comparison_level.py index 531148ddbf..ecbe5d69f1 100644 --- a/splink/comparison_level.py +++ b/splink/comparison_level.py @@ -202,7 +202,7 @@ def _tf_adjustment_input_column(self): def _tf_adjustment_input_column_name(self): input_column = self._tf_adjustment_input_column if input_column: - return input_column.unquote().name() + return input_column.unquote().name @property def _has_comparison(self): @@ -726,7 +726,7 @@ def _human_readable_succinct(self): @property def human_readable_description(self): input_cols = join_list_with_commas_final_and( - [c.name() for c in self._input_columns_used_by_sql_condition] + [c.name for c in self._input_columns_used_by_sql_condition] ) desc = ( f"Comparison level: {self.label_for_charts} of {input_cols}\n" diff --git a/splink/find_matches_to_new_records.py b/splink/find_matches_to_new_records.py index 23bcd72820..ad6b452c08 100644 --- a/splink/find_matches_to_new_records.py +++ b/splink/find_matches_to_new_records.py @@ -11,7 +11,7 @@ def add_unique_id_and_source_dataset_cols_if_needed( linker: "Linker", new_records_df: "SplinkDataFrame" ): cols = new_records_df.columns - cols = [c.unquote().name() for c in cols] + cols = [c.unquote().name for c in cols] # Add source dataset column to new records if required and not exists sds_sel_sql = "" @@ -21,7 +21,7 @@ def add_unique_id_and_source_dataset_cols_if_needed( # TODO: Shouldn't be necessary but the source dataset properties on settings # are currently broken sds_col = InputColumn(sds_col, linker._settings_obj) - sds_col = sds_col.unquote().name() + sds_col = sds_col.unquote().name if sds_col not in cols: sds_sel_sql = f", 'new_record' as {sds_col}" @@ -29,7 +29,7 @@ def add_unique_id_and_source_dataset_cols_if_needed( uid_sel_sql = "" uid_col = linker._settings_obj._unique_id_column_name uid_col = InputColumn(uid_col, linker._settings_obj) - uid_col = uid_col.unquote().name() + uid_col = uid_col.unquote().name if uid_col not in cols: uid_sel_sql = f", 'no_id_provided' as {uid_col}" diff --git a/splink/linker.py b/splink/linker.py index f173e86a4f..c0da5d0336 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -260,7 +260,7 @@ def _input_columns( # sort it for consistent ordering, and give each frame's # columns as a tuple so we can hash it column_names_by_input_df = [ - tuple(sorted([col.name() for col in input_df.columns])) + tuple(sorted([col.name for col in input_df.columns])) for input_df in input_dfs ] # check that the set of input columns is the same for each frame, diff --git a/splink/lower_id_on_lhs.py b/splink/lower_id_on_lhs.py index 6f4ad48c67..2e3d2d0e70 100644 --- a/splink/lower_id_on_lhs.py +++ b/splink/lower_id_on_lhs.py @@ -66,7 +66,7 @@ def lower_id_to_left_hand_side( """ # noqa cols = df.columns - cols = [c.unquote().name() for c in cols] + cols = [c.unquote().name for c in cols] l_cols = [c for c in cols if c.endswith("_l")] r_cols = [c for c in cols if c.endswith("_r")] diff --git a/splink/missingness.py b/splink/missingness.py index bd5711bd6c..c936c34046 100644 --- a/splink/missingness.py +++ b/splink/missingness.py @@ -8,8 +8,8 @@ def missingness_sqls(columns, input_tablename): selects = [ col_template.format( - col_name_escaped=col.name(), - col_name=col.unquote().name(), + col_name_escaped=col.name, + col_name=col.unquote().name, input_tablename=input_tablename, ) for col in columns diff --git a/splink/profile_data.py b/splink/profile_data.py index f09d6340f6..ea035a54c1 100644 --- a/splink/profile_data.py +++ b/splink/profile_data.py @@ -232,7 +232,7 @@ def profile_columns(linker, column_expressions=None, top_n=10, bottom_n=10): """ if not column_expressions: - column_expressions = [col.name() for col in linker._input_columns] + column_expressions = [col.name for col in linker._input_columns] df_concat = linker._initialise_df_concat() @@ -297,7 +297,6 @@ def profile_columns(linker, column_expressions=None, top_n=10, bottom_n=10): inner_charts.append(inner_chart) if inner_charts != []: - outer_spec = deepcopy(_outer_chart_spec_freq) outer_spec["vconcat"] = inner_charts diff --git a/splink/settings.py b/splink/settings.py index 59fd9a052e..04b2eaf7b8 100644 --- a/splink/settings.py +++ b/splink/settings.py @@ -131,10 +131,10 @@ def _get_additional_columns_to_retain(self): used_by_brs = [InputColumn(c) for c in used_by_brs] - used_by_brs = [c.unquote().name() for c in used_by_brs] + used_by_brs = [c.unquote().name for c in used_by_brs] already_used = self._columns_used_by_comparisons already_used = [InputColumn(c) for c in already_used] - already_used = [c.unquote().name() for c in already_used] + already_used = [c.unquote().name for c in already_used] new_cols = list(set(used_by_brs) - set(already_used)) a_cols.extend(new_cols) @@ -170,7 +170,7 @@ def _source_dataset_input_column(self): @property def _source_dataset_col(self): input_column = self._source_dataset_input_column - return (input_column, InputColumn(input_column, self).name()) + return (input_column, InputColumn(input_column, self).name) @property def _unique_id_input_columns(self) -> list[InputColumn]: @@ -214,7 +214,7 @@ def _columns_used_by_comparisons(self): cols_used.append(self._unique_id_column_name) for cc in self.comparisons: cols = cc._input_columns_used_by_case_statement - cols = [c.name() for c in cols] + cols = [c.name for c in cols] cols_used.extend(cols) return dedupe_preserving_order(cols_used) diff --git a/splink/splink_dataframe.py b/splink/splink_dataframe.py index a561cd01be..5721d5f8e2 100644 --- a/splink/splink_dataframe.py +++ b/splink/splink_dataframe.py @@ -33,7 +33,7 @@ def columns(self): @property def columns_escaped(self): cols = self.columns - return [c.name() for c in cols] + return [c.name for c in cols] def validate(): pass diff --git a/splink/term_frequencies.py b/splink/term_frequencies.py index 38e0807ff2..c8034e4a37 100644 --- a/splink/term_frequencies.py +++ b/splink/term_frequencies.py @@ -31,7 +31,7 @@ def colname_to_tf_tablename(input_column: InputColumn): def term_frequencies_for_single_column_sql( input_column: InputColumn, table_name="__splink__df_concat" ): - col_name = input_column.name() + col_name = input_column.name sql = f""" select @@ -69,11 +69,11 @@ def _join_tf_to_input_df_sql(linker: Linker): tbl = colname_to_tf_tablename(col) if tbl in linker._intermediate_table_cache: tbl = linker._intermediate_table_cache[tbl].physical_name - sql = templ.format(tbl=tbl, col=col.name()) + sql = templ.format(tbl=tbl, col=col.name) left_joins.append(sql) # left_joins = [ - # templ.format(tbl=colname_to_tf_tablename(col), col=col.name()) + # templ.format(tbl=colname_to_tf_tablename(col), col=col.name) # for col in tf_cols # ] left_joins = " ".join(left_joins) @@ -90,7 +90,7 @@ def _join_tf_to_input_df_sql(linker: Linker): def term_frequencies_from_concat_with_tf(input_column): sql = f""" select - distinct {input_column.name()}, + distinct {input_column.name}, {input_column.tf_name()} from __splink__df_concat_with_tf """ diff --git a/splink/unique_id_concat.py b/splink/unique_id_concat.py index a22a4ae775..f5b7cd9bc8 100644 --- a/splink/unique_id_concat.py +++ b/splink/unique_id_concat.py @@ -11,7 +11,7 @@ def _composite_unique_id_from_nodes_sql(unique_id_cols, table_prefix=None): else: table_prefix = "" - cols = [f"{table_prefix}{c.name()}" for c in unique_id_cols] + cols = [f"{table_prefix}{c.name}" for c in unique_id_cols] return f" || '{CONCAT_SEPARATOR}' || ".join(cols) @@ -32,6 +32,6 @@ def _composite_unique_id_from_edges_sql(unique_id_cols, l_or_r, table_prefix=Non if l_or_r == "r": cols = [f"{table_prefix}{c.name_r}" for c in unique_id_cols] if l_or_r is None: - cols = [f"{table_prefix}{c.name()}" for c in unique_id_cols] + cols = [f"{table_prefix}{c.name}" for c in unique_id_cols] return f" || '{CONCAT_SEPARATOR}' || ".join(cols) diff --git a/tests/test_input_column.py b/tests/test_input_column.py index 042430f8b3..4ec780783e 100644 --- a/tests/test_input_column.py +++ b/tests/test_input_column.py @@ -3,8 +3,8 @@ def test_input_column(): c = InputColumn("my_col") - assert c.name() == '"my_col"' - assert c.unquote().name() == "my_col" + assert c.name == '"my_col"' + assert c.unquote().name == "my_col" assert c.name_l == '"my_col_l"' assert c.tf_name_l == '"tf_my_col_l"' @@ -12,7 +12,7 @@ def test_input_column(): assert c.unquote().l_tf_name_as_l() == '"l".tf_my_col as tf_my_col_l' c = InputColumn("SUR name") - assert c.name() == '"SUR name"' + assert c.name == '"SUR name"' assert c.name_r == '"SUR name_r"' assert c.r_name_as_r() == '"r"."SUR name" as "SUR name_r"' @@ -21,15 +21,15 @@ def test_input_column(): name = """ "col"['lat'] """.strip() - assert c.name() == name + assert c.name == name l_tf_name_as_l = """ "l"."tf_col"['lat'] as "tf_col_l"['lat'] """.strip() assert c.l_tf_name_as_l() == l_tf_name_as_l - assert c.unquote().name() == "col['lat']" - assert c.unquote().quote().name() == name + assert c.unquote().name == "col['lat']" + assert c.unquote().quote().name == name c = InputColumn("first name", sql_dialect="spark") - assert c.name() == "`first name`" + assert c.name == "`first name`" diff --git a/tests/test_sql_transform.py b/tests/test_sql_transform.py index 7afe60682c..de1956c24e 100644 --- a/tests/test_sql_transform.py +++ b/tests/test_sql_transform.py @@ -124,4 +124,4 @@ def test_add_pref_and_suffix(): cols = ["unique_id", "SUR name", "cluster"] out_cols = ['"unique_id"', '"SUR name"', '"cluster"'] cols_class = [InputColumn(c) for c in cols] - assert [c.name() for c in cols_class] == out_cols + assert [c.name for c in cols_class] == out_cols From 0402fabe737583c073582d31418c991fcd0d48e4 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 13 Nov 2023 15:49:37 +0000 Subject: [PATCH 4/5] remaining methods --- splink/comparison.py | 6 +++--- splink/comparison_level.py | 6 ++---- splink/input_column.py | 4 ++-- splink/settings.py | 6 +++--- splink/term_frequencies.py | 6 +++--- tests/test_input_column.py | 8 ++++---- tests/test_sql_transform.py | 14 +++++++------- 7 files changed, 24 insertions(+), 26 deletions(-) diff --git a/splink/comparison.py b/splink/comparison.py index aa6a3fb205..05c12087ba 100644 --- a/splink/comparison.py +++ b/splink/comparison.py @@ -217,7 +217,7 @@ def _columns_to_select_for_comparison_vector_values(self): for cl in self.comparison_levels: if cl._has_tf_adjustments: col = cl._tf_adjustment_input_column - output_cols.extend(col.tf_name_l_r()) + output_cols.extend(col.tf_name_l_r) return dedupe_preserving_order(output_cols) @@ -240,7 +240,7 @@ def _columns_to_select_for_bayes_factor_parts(self): and self._settings_obj._retain_intermediate_calculation_columns ): col = cl._tf_adjustment_input_column - output_cols.extend(col.tf_name_l_r()) + output_cols.extend(col.tf_name_l_r) # Bayes factor case when statement sqls = [cl._bayes_factor_sql for cl in self.comparison_levels] @@ -282,7 +282,7 @@ def _columns_to_select_for_predict(self): and self._settings_obj._retain_intermediate_calculation_columns ): col = cl._tf_adjustment_input_column - output_cols.extend(col.tf_name_l_r()) + output_cols.extend(col.tf_name_l_r) for _col in input_cols: if self._settings_obj._retain_intermediate_calculation_columns: diff --git a/splink/comparison_level.py b/splink/comparison_level.py index ecbe5d69f1..8b44d9036f 100644 --- a/splink/comparison_level.py +++ b/splink/comparison_level.py @@ -465,11 +465,9 @@ def _columns_to_select_for_blocking(self): cols = self._input_columns_used_by_sql_condition for c in cols: - output_cols.extend(c.l_r_names_as_l_r()) + output_cols.extend(c.l_r_names_as_l_r) if self._tf_adjustment_input_column: - output_cols.extend( - self._tf_adjustment_input_column.l_r_tf_names_as_l_r() - ) + output_cols.extend(self._tf_adjustment_input_column.l_r_tf_names_as_l_r) return dedupe_preserving_order(output_cols) diff --git a/splink/input_column.py b/splink/input_column.py index de0ef0d4e3..8d94c4e245 100644 --- a/splink/input_column.py +++ b/splink/input_column.py @@ -204,7 +204,7 @@ def r_name_as_r(self) -> str: @property def l_r_names_as_l_r(self) -> list[str]: - return [self.l_name_as_l(), self.r_name_as_r()] + return [self.l_name_as_l, self.r_name_as_r] @property def bf_name(self) -> str: @@ -250,7 +250,7 @@ def r_tf_name_as_r(self) -> str: @property def l_r_tf_names_as_l_r(self) -> list[str]: - return [self.l_tf_name_as_l(), self.r_tf_name_as_r()] + return [self.l_tf_name_as_l, self.r_tf_name_as_r] def _quote_if_sql_keyword(self, name: str) -> str: if name not in {"group", "index"}: diff --git a/splink/settings.py b/splink/settings.py index 04b2eaf7b8..3c6a2363a0 100644 --- a/splink/settings.py +++ b/splink/settings.py @@ -224,14 +224,14 @@ def _columns_to_select_for_blocking(self): cols = [] for uid_col in self._unique_id_input_columns: - cols.append(uid_col.l_name_as_l()) - cols.append(uid_col.r_name_as_r()) + cols.append(uid_col.l_name_as_l) + cols.append(uid_col.r_name_as_r) for cc in self.comparisons: cols.extend(cc._columns_to_select_for_blocking) for add_col in self._additional_columns_to_retain: - cols.extend(add_col.l_r_names_as_l_r()) + cols.extend(add_col.l_r_names_as_l_r) return dedupe_preserving_order(cols) diff --git a/splink/term_frequencies.py b/splink/term_frequencies.py index c8034e4a37..dc0dd84d3d 100644 --- a/splink/term_frequencies.py +++ b/splink/term_frequencies.py @@ -37,7 +37,7 @@ def term_frequencies_for_single_column_sql( select {col_name}, cast(count(*) as float8) / (select count({col_name}) as total from {table_name}) - as {input_column.tf_name()} + as {input_column.tf_name} from {table_name} where {col_name} is not null group by {col_name} @@ -56,7 +56,7 @@ def _join_tf_to_input_df_sql(linker: Linker): tbl = colname_to_tf_tablename(col) if tbl in linker._intermediate_table_cache: tbl = linker._intermediate_table_cache[tbl].physical_name - tf_col = col.tf_name() + tf_col = col.tf_name select_cols.append(f"{tbl}.{tf_col}") select_cols.insert(0, "__splink__df_concat.*") @@ -91,7 +91,7 @@ def term_frequencies_from_concat_with_tf(input_column): sql = f""" select distinct {input_column.name}, - {input_column.tf_name()} + {input_column.tf_name} from __splink__df_concat_with_tf """ diff --git a/tests/test_input_column.py b/tests/test_input_column.py index 4ec780783e..8a1dd794bd 100644 --- a/tests/test_input_column.py +++ b/tests/test_input_column.py @@ -8,13 +8,13 @@ def test_input_column(): assert c.name_l == '"my_col_l"' assert c.tf_name_l == '"tf_my_col_l"' - assert c.unquote().quote().l_tf_name_as_l() == '"l"."tf_my_col" as "tf_my_col_l"' - assert c.unquote().l_tf_name_as_l() == '"l".tf_my_col as tf_my_col_l' + assert c.unquote().quote().l_tf_name_as_l == '"l"."tf_my_col" as "tf_my_col_l"' + assert c.unquote().l_tf_name_as_l == '"l".tf_my_col as tf_my_col_l' c = InputColumn("SUR name") assert c.name == '"SUR name"' assert c.name_r == '"SUR name_r"' - assert c.r_name_as_r() == '"r"."SUR name" as "SUR name_r"' + assert c.r_name_as_r == '"r"."SUR name" as "SUR name_r"' c = InputColumn("col['lat']") @@ -26,7 +26,7 @@ def test_input_column(): l_tf_name_as_l = """ "l"."tf_col"['lat'] as "tf_col_l"['lat'] """.strip() - assert c.l_tf_name_as_l() == l_tf_name_as_l + assert c.l_tf_name_as_l == l_tf_name_as_l assert c.unquote().name == "col['lat']" assert c.unquote().quote().name == name diff --git a/tests/test_sql_transform.py b/tests/test_sql_transform.py index de1956c24e..1fc67ceb74 100644 --- a/tests/test_sql_transform.py +++ b/tests/test_sql_transform.py @@ -92,12 +92,12 @@ def test_set_numeric_as_double(): def test_add_pref_and_suffix(): dull = InputColumn("dull") dull_l_r = ['"l"."dull" as "dull_l"', '"r"."dull" as "dull_r"'] - assert dull.l_r_names_as_l_r() == dull_l_r + assert dull.l_r_names_as_l_r == dull_l_r - assert dull.bf_name() == '"bf_dull"' + assert dull.bf_name == '"bf_dull"' assert dull.tf_name_l == '"tf_dull_l"' tf_dull_l_r = ['"l"."tf_dull" as "tf_dull_l"', '"r"."tf_dull" as "tf_dull_r"'] - assert dull.l_r_tf_names_as_l_r() == tf_dull_l_r + assert dull.l_r_tf_names_as_l_r == tf_dull_l_r ll = InputColumn("lat['long']") assert ll.name_l == "\"lat_l\"['long']" @@ -107,19 +107,19 @@ def test_add_pref_and_suffix(): '"r"."tf_lat"[\'long\'] as "tf_lat_r"[\'long\']', ] - assert ll.l_r_tf_names_as_l_r() == ll_tf_l_r + assert ll.l_r_tf_names_as_l_r == ll_tf_l_r group = InputColumn("cluster") assert group.name_l == '"cluster_l"' - assert group.bf_name() == '"bf_cluster"' + assert group.bf_name == '"bf_cluster"' group_l_r_names = ['"l"."cluster" as "cluster_l"', '"r"."cluster" as "cluster_r"'] - assert group.l_r_names_as_l_r() == group_l_r_names + assert group.l_r_names_as_l_r == group_l_r_names group_tf_l_r = [ '"l"."tf_cluster" as "tf_cluster_l"', '"r"."tf_cluster" as "tf_cluster_r"', ] - assert group.l_r_tf_names_as_l_r() == group_tf_l_r + assert group.l_r_tf_names_as_l_r == group_tf_l_r cols = ["unique_id", "SUR name", "cluster"] out_cols = ['"unique_id"', '"SUR name"', '"cluster"'] From 5040ee123f2a1785015538b7bc10ba5cbbdddd44 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 13 Nov 2023 15:52:57 +0000 Subject: [PATCH 5/5] missed some --- splink/comparison.py | 6 +++--- splink/comparison_level_library.py | 6 +++--- splink/settings.py | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/splink/comparison.py b/splink/comparison.py index 05c12087ba..b0254fcafd 100644 --- a/splink/comparison.py +++ b/splink/comparison.py @@ -210,7 +210,7 @@ def _columns_to_select_for_comparison_vector_values(self): output_cols = [] for col in input_cols: if self._settings_obj._retain_matching_columns: - output_cols.extend(col.names_l_r()) + output_cols.extend(col.names_l_r) output_cols.append(self._case_statement) @@ -230,7 +230,7 @@ def _columns_to_select_for_bayes_factor_parts(self): output_cols = [] for col in input_cols: if self._settings_obj._retain_matching_columns: - output_cols.extend(col.names_l_r()) + output_cols.extend(col.names_l_r) output_cols.append(self._gamma_column_name) @@ -268,7 +268,7 @@ def _columns_to_select_for_predict(self): output_cols = [] for col in input_cols: if self._settings_obj._retain_matching_columns: - output_cols.extend(col.names_l_r()) + output_cols.extend(col.names_l_r) if ( self._settings_obj._training_mode diff --git a/splink/comparison_level_library.py b/splink/comparison_level_library.py index 90ac54139d..6f1744fe09 100644 --- a/splink/comparison_level_library.py +++ b/splink/comparison_level_library.py @@ -1030,8 +1030,8 @@ def __init__( lat = InputColumn(lat_col, sql_dialect=self._sql_dialect) long = InputColumn(long_col, sql_dialect=self._sql_dialect) - lat_l, lat_r = lat.names_l_r() - long_l, long_r = long.names_l_r() + lat_l, lat_r = lat.names_l_r + long_l, long_r = long.names_l_r distance_km_sql = f""" {great_circle_distance_km_sql(lat_l, lat_r, long_l, long_r)} <= {km_threshold} @@ -1359,7 +1359,7 @@ def __init__( """ date = InputColumn(date_col, sql_dialect=self._sql_dialect) - date_l, date_r = date.names_l_r() + date_l, date_r = date.names_l_r datediff_sql = self._datediff_function( date_l, diff --git a/splink/settings.py b/splink/settings.py index 3c6a2363a0..51d29476a5 100644 --- a/splink/settings.py +++ b/splink/settings.py @@ -247,7 +247,7 @@ def _columns_to_select_for_comparison_vector_values(self): cols.extend(cc._columns_to_select_for_comparison_vector_values) for add_col in self._additional_columns_to_retain: - cols.extend(add_col.names_l_r()) + cols.extend(add_col.names_l_r) if self._needs_matchkey_column: cols.append("match_key") @@ -267,7 +267,7 @@ def _columns_to_select_for_bayes_factor_parts(self): cols.extend(cc._columns_to_select_for_bayes_factor_parts) for add_col in self._additional_columns_to_retain: - cols.extend(add_col.names_l_r()) + cols.extend(add_col.names_l_r) if self._needs_matchkey_column: cols.append("match_key") @@ -287,7 +287,7 @@ def _columns_to_select_for_predict(self): cols.extend(cc._columns_to_select_for_predict) for add_col in self._additional_columns_to_retain: - cols.extend(add_col.names_l_r()) + cols.extend(add_col.names_l_r) if self._needs_matchkey_column: cols.append("match_key")