Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pairwise string comparison #51

Merged
merged 9 commits into from
Dec 16, 2024
Merged
Prev Previous commit
Next Next commit
move test to cl_ch module, now that we are implementing our own version
ADBond committed Dec 16, 2024
commit 72457373ee21181f3af0a822068798695148d5fb
39 changes: 0 additions & 39 deletions tests/test_cl.py

This file was deleted.

36 changes: 35 additions & 1 deletion tests/test_cl_ch.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import splink.comparison_library as cl
from pytest import raises
from pytest import mark, raises
from splink import DuckDBAPI, Linker, SettingsCreator

import splinkclickhouse.comparison_library as cl_ch
@@ -105,3 +105,37 @@ def test_clickhouse_date_of_birth_comparison(api_info, fake_1000):

linker = Linker(fake_1000, settings, db_api)
linker.inference.predict()


# TODO: as far as I know there is currently no straightforward way to get an array
# column into chdb, so for the time being we only test against ClickHouse server.
@mark.clickhouse
@mark.clickhouse_no_core
def test_pairwise_string_distance(clickhouse_api_factory, input_nodes_with_name_arrays):
    """Smoke-test the pairwise string-distance comparison for each metric.

    Builds a dedupe-only model with one
    ``PairwiseStringDistanceFunctionAtThresholds`` comparison per distance
    function (levenshtein, damerau_levenshtein, jaro, jaro_winkler) and runs
    ``predict``. There are no assertions on the output; the test passes if
    prediction completes without raising.
    """
    db_api = clickhouse_api_factory()

    pairwise = cl_ch.PairwiseStringDistanceFunctionAtThresholds
    comparisons = [
        cl.ExactMatch("username"),
        # One comparison per distance function, each on its own column name;
        # we can pretend these columns are distinct.
        pairwise("aliases", "levenshtein", [1, 2]),
        pairwise("aliases_2", "damerau_levenshtein", [1, 2, 3]),
        pairwise("aliases_3", "jaro", [0.88, 0.7]),
        pairwise("aliases_4", "jaro_winkler", [0.88, 0.7]),
    ]
    settings = SettingsCreator(link_type="dedupe_only", comparisons=comparisons)

    # Expose the single "aliases" column under the extra names used above.
    # NOTE(review): this adds entries to the fixture object itself - confirm the
    # fixture is not shared between tests.
    nodes = input_nodes_with_name_arrays
    for extra_column in ("aliases_2", "aliases_3", "aliases_4"):
        nodes[extra_column] = nodes["aliases"]

    linker = Linker(nodes, settings, db_api)
    linker.inference.predict()