Merge pull request #2283 from moj-analytical-services/update-docs-load-settings

Update Splink 4 docs
ADBond authored Jul 23, 2024
2 parents 572d29d + 77c0d5f commit 8e10539
Showing 10 changed files with 46 additions and 40 deletions.
6 changes: 4 additions & 2 deletions docs/api_docs/datasets.md
@@ -17,16 +17,18 @@ df = splink_datasets.fake_1000
which you can then use to set up a linker:
```py
from splink import splink_datasets, Linker, DuckDBAPI, SettingsCreator

df = splink_datasets.fake_1000
linker = DuckDBLinker(
linker = Linker(
df,
SettingsCreator(
link_type="dedupe_only",
comparisons=[
cl.exact_match("first_name"),
cl.exact_match("surname"),
],
)
),
db_api=DuckDBAPI()
)
```

4 changes: 2 additions & 2 deletions docs/dev_guides/debug_modes.md
@@ -44,7 +44,7 @@ Note that by default Splink sets the [logging level to `INFO` on initialisation]

```python
import logging
linker = DuckDBLinker(df, settings, set_up_basic_logging=False)
linker = Linker(df, settings, db_api, set_up_basic_logging=False)

# This must come AFTER the linker is initialised, because the logging level
# will be set to INFO
@@ -61,5 +61,5 @@ logging.basicConfig(format="%(message)s")
splink_logger = logging.getLogger("splink")
splink_logger.setLevel(logging.INFO)

linker = DuckDBLinker(df, settings, set_up_basic_logging=False)
linker = Linker(df, settings, db_api, set_up_basic_logging=False)
```
2 changes: 1 addition & 1 deletion docs/topic_guides/data_preparation/feature_engineering.md
@@ -194,7 +194,7 @@ For a more detailed explanation on phonetic transformation algorithms, see the [

### Example

There are a number of python packages which support phonetic transformations that can be applied to a pandas dataframe, which can then be loaded into the `DuckDBLinker`. For example, creating a [Double Metaphone](../comparisons/phonetic.md#double-metaphone) column with the [phonetics](https://pypi.org/project/phonetics/) python library:
There are a number of python packages which support phonetic transformations that can be applied to a pandas dataframe, which can then be loaded into the `Linker`. For example, creating a [Double Metaphone](../comparisons/phonetic.md#double-metaphone) column with the [phonetics](https://pypi.org/project/phonetics/) python library:

```python
import pandas as pd
# ...
```
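The docs' full example is collapsed in this diff. Below is a minimal sketch of the idea, assuming the `phonetics` package exposes `dmetaphone` (returning a primary/secondary code pair); the dataframe and column names are illustrative only:

```py
import pandas as pd
import phonetics  # pip install phonetics

df = pd.DataFrame({"first_name": ["Stephen", "Steven", "Smyth"]})

# dmetaphone returns a (primary, secondary) tuple; keep the primary code
# as a derived column that can be used in comparisons or blocking rules
df["first_name_dm"] = df["first_name"].apply(lambda x: phonetics.dmetaphone(x)[0])

print(df)
```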
12 changes: 6 additions & 6 deletions docs/topic_guides/performance/optimising_duckdb.md
@@ -99,17 +99,17 @@ Use the special `:temporary:` connection built into Splink that creates a tempor

```python

linker = DuckDBLinker(
df, settings, connection=":temporary:"
linker = Linker(
df, settings, DuckDBAPI(connection=":temporary:")
)
```

Use an on-disk database:

```python
con = duckdb.connect(database='my-db.duckdb')
linker = DuckDBLinker(
df, settings, connection=con
linker = Linker(
df, settings, DuckDBAPI(connection=con)
)
```

@@ -119,8 +119,8 @@ Use an in-memory database, but ensure it can spill to disk:
```python
con = duckdb.connect(":memory:")

con.execute("SET temp_directory='/path/to/temp';")
linker = DuckDBLinker(
df, settings, connection=con
linker = Linker(
df, settings, DuckDBAPI(connection=con)
)
```

8 changes: 5 additions & 3 deletions docs/topic_guides/performance/optimising_spark.md
@@ -25,9 +25,10 @@ For a cluster with 10 CPUs, that outputs about 8GB of data in parquet format, th
spark.conf.set("spark.default.parallelism", "50")
spark.conf.set("spark.sql.shuffle.partitions", "50")

linker = SparkLinker(
linker = Linker(
person_standardised_nodes,
settings,
db_api=spark_api,
break_lineage_method="parquet",
num_partitions_on_repartition=80,
)
@@ -45,9 +45,10 @@ Splink will automatically break lineage in sensible places. We have found in pra
You can do this using the `break_lineage_method` parameter as follows:

```
linker = SparkLinker(
linker = Linker(
person_standardised_nodes,
settings,
db_api=db_api,
break_lineage_method="parquet"
)
```
@@ -78,7 +80,7 @@ In general, increasing parallelism will make Spark 'chunk' your job into a large

## Repartition after blocking

For some jobs, setting `repartition_after_blocking=True` when you initialise the `SparkLinker` may improve performance.
For some jobs, setting `repartition_after_blocking=True` when you initialise the `SparkAPI` may improve performance.
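For example — a sketch only, assuming `repartition_after_blocking` is accepted as a `SparkAPI` constructor argument in your Splink version, and reusing `spark`, `settings` and `person_standardised_nodes` from the earlier snippets:

```py
from splink import Linker, SparkAPI

spark_api = SparkAPI(
    spark_session=spark,
    repartition_after_blocking=True,  # assumed keyword argument; check your Splink version
)

linker = Linker(person_standardised_nodes, settings, db_api=spark_api)
```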

## Salting

34 changes: 16 additions & 18 deletions docs/topic_guides/performance/salting.md
@@ -17,22 +17,22 @@ Further information about the motivation for salting can be found [here](https:/

## How to use salting

To enable salting using the `SparkLinker`, you provide some of your blocking rules as a dictionary rather than a string.
To enable salting using the `Linker` with Spark, you provide some of your blocking rules as a dictionary rather than a string.

This enables you to choose the number of salts for each blocking rule.

Blocking rules provided as plain strings default to no salting (`salting_partitions = 1`).

The following code snippet illustrates:

```
```py
import logging

from pyspark.context import SparkContext, SparkConf
from pyspark.context import SparkConf, SparkContext
from pyspark.sql import SparkSession

from splink.spark.linker import SparkLinker
from splink.spark.comparison_library import levenshtein_at_thresholds, exact_match
import splink.comparison_library as cl
from splink import Linker, SparkAPI, splink_datasets

conf = SparkConf()
conf.set("spark.driver.memory", "12g")
@@ -41,7 +41,7 @@ conf.set("spark.default.parallelism", "8")

sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)
spark.sparkContext.setCheckpointDir("./tmp_checkpoints")

settings = {
"probability_two_random_records_match": 0.01,
@@ -51,29 +51,27 @@ settings = {
{"blocking_rule": "l.first_name = r.first_name", "salting_partitions": 4},
],
"comparisons": [
levenshtein_at_thresholds("first_name", 2),
exact_match("surname"),
exact_match("dob"),
exact_match("city", term_frequency_adjustments=True),
exact_match("email"),
cl.LevenshteinAtThresholds("first_name", 2),
cl.ExactMatch("surname"),
cl.ExactMatch("dob"),
cl.ExactMatch("city").configure(term_frequency_adjustments=True),
cl.ExactMatch("email"),
],
"retain_matching_columns": True,
"retain_intermediate_calculation_columns": True,
"additional_columns_to_retain": ["group"],
"additional_columns_to_retain": ["cluster"],
"max_iterations": 1,
"em_convergence": 0.01,
}


df = spark.read.csv("./tests/datasets/fake_1000_from_splink_demos.csv", header=True)
df = splink_datasets.fake_1000

linker = SparkLinker(df, settings)
spark_api = SparkAPI(spark_session=spark)
linker = Linker(df, settings, db_api=spark_api)
logging.getLogger("splink").setLevel(5)

linker.load_settings(settings)
linker.deterministic_link()
linker.inference.deterministic_link()
```

And we can see that salting has been applied by looking at the SQL generated in the log:
3 changes: 2 additions & 1 deletion docs/topic_guides/splink_fundamentals/settings.md
@@ -416,7 +416,8 @@ where the `m_probability` and `u_probability` values here are then used to gener
When using a pre-trained model, you can read in the model from a json and recreate the linker object to make new pairwise predictions. For example:

```py
linker = DuckDBLinker(new_df,
linker = Linker(
new_df,
settings="./path/to/model.json",
db_api=db_api
)
```
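The reloaded linker can then be used for scoring straight away — for instance (a sketch, using the Splink 4 `inference` namespace that appears elsewhere in this PR):

```py
# Score pairwise comparisons using the pre-trained model parameters
df_predictions = linker.inference.predict(threshold_match_probability=0.9)
df_predictions.as_pandas_dataframe().head()
```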
10 changes: 6 additions & 4 deletions splink/internals/linker.py
@@ -112,10 +112,12 @@ def __init__(
database) for link_only or link_and_dedupe. For some linkers, such as
the DuckDBLinker and the SparkLinker, it's also possible to pass in
dataframes (Pandas and Spark respectively) rather than strings.
settings_dict (dict | Path, optional): A Splink settings dictionary, or a
path to a json defining a settingss dictionary or pre-trained model.
If not provided when the object is created, can later be added using
`linker.load_settings()` or `linker.load_model()` Defaults to None.
settings_dict (dict | Path | str): A Splink settings dictionary,
or a path (either as a pathlib.Path object, or a string) to a json file
defining a settings dictionary or pre-trained model.
db_api (DatabaseAPI): A `DatabaseAPI` object, which manages interactions
with the database. You can import these for use from
`splink.backends.{your_backend}`
set_up_basic_logging (bool, optional): If true, sets ups up basic logging
so that Splink sends messages at INFO level to stdout. Defaults to True.
input_table_aliases (Union[str, list], optional): Labels assigned to
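Taken together, the parameters described above map onto a call like the following — a sketch using the built-in demo dataset, with `input_table_aliases` and the logging flag shown purely for illustration:

```py
from splink import DuckDBAPI, Linker, SettingsCreator, splink_datasets
import splink.comparison_library as cl

df = splink_datasets.fake_1000
settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=[cl.ExactMatch("first_name"), cl.ExactMatch("surname")],
)

linker = Linker(
    df,                                  # input table(s): dataframe(s) or table name(s)
    settings,                            # settings creator/dict, or a path to a model json
    db_api=DuckDBAPI(),                  # DatabaseAPI managing all database interaction
    set_up_basic_logging=True,           # INFO-level Splink messages to stdout
    input_table_aliases="df_customers",  # label used in Splink's outputs
)
```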
4 changes: 3 additions & 1 deletion splink/internals/linker_components/misc.py
@@ -23,7 +23,9 @@ def save_model_to_json(
) -> dict[str, Any]:
"""Save the configuration and parameters of the linkage model to a `.json` file.
The model can later be loaded back in using `linker.load_model()`.
The model can later be loaded into a new linker using
`Linker(df, settings="path/to/model.json", db_api=db_api)`.
The settings dict is also returned in case you want to save it a different way.
Examples:
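A brief usage sketch (assuming a trained `linker`, `df` and `db_api` from earlier examples, with an illustrative output path):

```py
from splink import Linker

# Persist the trained model; the settings dict is also returned
settings_dict = linker.misc.save_model_to_json("./model.json", overwrite=True)

# Later, recreate a linker from the saved model using the same data and backend
new_linker = Linker(df, settings="./model.json", db_api=db_api)
```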
3 changes: 1 addition & 2 deletions splink/internals/linker_components/table_management.py
@@ -43,8 +43,7 @@ def compute_tf_table(self, column_name: str) -> SplinkDataFrame:
Real time linkage
```py
linker = Linker(df, db_api)
linker.load_settings("saved_settings.json")
linker = Linker(df, settings="saved_settings.json", db_api=db_api)
linker.table_management.compute_tf_table("surname")
linker.compare_two_records(record_left, record_right)
```
