diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 0f166c665a..68fb5d8526 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -35,6 +35,17 @@ jobs: with: python-version: 3.9.10 + #---------------------------------------------- + # Install GLIBC - error occured 2024-06-01 on ubuntu-latest with python3.9 + # Temporary as ubuntu-latest could change or a different python version could + # change this current error. + #---------------------------------------------- + - name: Install GLIBC + + run: | + sudo apt-get update + sudo apt-get install -y libc6 + #---------------------------------------------- # -- save a few section by caching poetry -- #---------------------------------------------- diff --git a/.gitignore b/.gitignore index cc66da9e8c..834ecea95e 100644 --- a/.gitignore +++ b/.gitignore @@ -179,3 +179,5 @@ cython_debug/ splink_db splink_db_log spark-warehouse + +scripts/pyspelling/dictionary.dic diff --git a/README.md b/README.md index 70ee44ca7a..2620ffa475 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ [![Documentation](https://img.shields.io/badge/API-documentation-blue)](https://moj-analytical-services.github.io/splink/) > [!IMPORTANT] -> Development has begun on Splink 4 on the `splink4_dev` branch. Splink 3 is in maintenance mode and we are no longer accepting new features. We welcome contributions to Splink 4. Read more on our latest [blog](https://moj-analytical-services.github.io/splink/blog/2024/03/19/splink4.html). +> Development has begun on Splink 4 on the `splink4_dev` branch. Splink 3 is in maintenance mode and we are no longer accepting new features. We welcome contributions to Splink 4. Read more on our latest [blog](https://moj-analytical-services.github.io/splink/blog/2024/04/02/splink-3-updates-and-splink-4-development-announcement---april-2024.html). # Fast, accurate and scalable probabilistic data linkage diff --git a/docs/dev_guides/changing_splink/contributing_to_docs.md b/docs/dev_guides/changing_splink/contributing_to_docs.md index 45dc9530ab..062f53bf0e 100644 --- a/docs/dev_guides/changing_splink/contributing_to_docs.md +++ b/docs/dev_guides/changing_splink/contributing_to_docs.md @@ -16,21 +16,27 @@ Once you've finished updating Splink documentation we ask that you run our spell ## Spellchecking docs -When updating Splink documentation, we ask that you run our spellchecker before submitting a pull request. This is to help ensure quality and consistency across the documentation. Please note, the spellchecker _only works on markdown files_ and currently only works on systems which support `Homebrew` package manager. Instructions for other operating systems will be released later. +When updating Splink documentation, we ask that you run our spellchecker before submitting a pull request. This is to help ensure quality and consistency across the documentation. If for whatever reason you can't run the spellchecker on your system, please don't let this prevent you from contributing to the documentation. Please note, the spellchecker _only works on markdown files_. -To run the spellchecker on either a single markdown file or folder of markdown files, you can use the following script: +If you are a Mac user with the `Homebrew` package manager installed, the script below will automatically install +the required system dependency, `aspell`. 
+If you've created your development environment [using conda](./development_quickstart.md), `aspell` will have been installed as part of that +process. +Instructions for installing `aspell` through other means may be added here in the future. + +To run the spellchecker on either a single markdown file or folder of markdown files, you can run the following bash script: ```sh -source scripts/pyspelling/spellchecker.sh +./scripts/pyspelling/spellchecker.sh ``` Omitting the file/folder path will run the spellchecker on all markdown files contained in the `docs` folder. We recommend running the spellchecker only on files that you have created or edited. The spellchecker uses the Python package [PySpelling](https://facelessuser.github.io/pyspelling/) and its underlying spellchecking tool, Aspell. Running the above script will automatically install these packages along with any other necessary dependencies. -The spellchecker compares words to a [standard British English dictionary](https://github.com/LibreOffice/dictionaries/blob/master/en/en_GB.aff) and a custom dictionary (`scripts/pyspelling/custom_dictionary.txt`) of words. If no spelling mistakes are found, you will see the following terminal printout: +The spellchecker compares words to a standard British English dictionary and a custom dictionary (`scripts/pyspelling/custom_dictionary.txt`) of words. If no spelling mistakes are found, you will see the following terminal printout: -```sh +``` Spelling check passed :) @@ -38,7 +44,7 @@ Spelling check passed :) otherwise, PySpelling will printout the spelling mistakes found in each file. -Correct spellings of words not found in a standard dictionary (e.g. Splink) can be recorded as such by adding them to `scripts/pyspelling/custom_dictionary.txt`. (Don't worry about adding them in alphabetical order or accidental duplication as this will be handled automatically by a GitHub Action future.) +Correct spellings of words not found in a standard dictionary (e.g. "Splink") can be recorded as such by adding them to `scripts/pyspelling/custom_dictionary.txt`. Please correct any mistakes found or update the custom dictionary to ensure the spellchecker passes before putting in a pull request containing updates to the documentation. diff --git a/docs/dev_guides/changing_splink/development_quickstart.md b/docs/dev_guides/changing_splink/development_quickstart.md index d6ea5be7f2..2654e79e22 100644 --- a/docs/dev_guides/changing_splink/development_quickstart.md +++ b/docs/dev_guides/changing_splink/development_quickstart.md @@ -114,7 +114,7 @@ and the teardown script each time you want to stop it: ``` Included in the docker-compose file is a [pgAdmin](https://www.pgadmin.org/) container to allow easy exploration of the database as you work, which can be accessed in-browser on the default port. -The default username is `a@b.com` with password `b`. +The default url: http://localhost:80/ username is `a@b.com` with password `b`. ## Step 3, Conda install option: Install system dependencies diff --git a/docs/dev_guides/settings_validation/extending_settings_validator.md b/docs/dev_guides/settings_validation/extending_settings_validator.md index 61530a26ff..784dd112ba 100644 --- a/docs/dev_guides/settings_validation/extending_settings_validator.md +++ b/docs/dev_guides/settings_validation/extending_settings_validator.md @@ -1,71 +1,38 @@ -## Expanding the Settings Validator +# Enhancing the Settings Validator -If a validation check is currently missing, you might want to expand the existing validation codebase. 
+## Overview of Current Validation Checks -Before adding any code, it's essential to determine whether the checks you want to include fit into any of the general validation categories already in place. +Below is a summary of the key validation checks currently implemented by our settings validator. For detailed information, please refer to the source code: -In summary, the following validation checks are currently carried out: +- **Blocking Rules and Comparison Levels Validation**: Ensures that the user’s blocking rules and comparison levels are correctly [imported from the designated library](https://github.com/moj-analytical-services/splink/pull/1579), and that they contain the necessary details for effective use within the Splink. +- **Column Existence Verification**: [Verifies the presence of columns](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py) specified in the user’s settings across all input dataframes, preventing errors due to missing data fields. +- **Miscellaneous Checks**: Conducts a range of additional checks aimed at providing clear and informative error messages, facilitating smoother user experiences when deviations from typical Splink usage are detected. -* Verifying that the user's blocking rules and comparison levels have been [imported from the correct library](https://github.com/moj-analytical-services/splink/pull/1579) and contain sufficient information for Splink model usage. -* [Performing column lookups](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py) to ensure that columns specified in the user's settings dictionary exist within **all** of the user's input dataframes. -* Various miscellaneous checks designed to generate more informative error messages for the user if they happen to employ Splink in an unintended manner. +### Extending Validation Logic -If you plan to introduce checks that differ from those currently in place, it's advisable to create a new script within `splink/settings_validation`. +If you are introducing new validation checks that deviate from the existing ones, please incorporate them as functions within a new script located in the [`splink/settings_validation` directory](https://github.com/moj-analytical-services/splink/tree/master/splink/settings_validation). This ensures that all validation logic is centrally managed and easily maintainable.
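+As a rough illustration only (the module and function names below are hypothetical, not part of Splink), a new standalone check might take the settings dictionary and return a list of human-readable problems, which the caller can then log or raise:

```py
# Hypothetical sketch of a new check living in, say,
# splink/settings_validation/my_new_checks.py -- names are illustrative only.


def check_retain_columns_are_booleans(settings_dict: dict) -> list[str]:
    """Return a list of problems found, or an empty list if none."""
    problems = []
    for key in (
        "retain_matching_columns",
        "retain_intermediate_calculation_columns",
    ):
        value = settings_dict.get(key)
        if value is not None and not isinstance(value, bool):
            problems.append(
                f"`{key}` should be a boolean, got {type(value).__name__}"
            )
    return problems
```

How (and whether) such a function is wired into the linker's startup checks is covered in the sections below.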
-## Splink Exceptions and Warnings +## Error handling and logging -While working on extending the settings validation tools suite, it's important to consider how we notify users when they've included invalid settings or features. +Error handling and logging in the settings validator takes the following forms: -Exception handling and warnings should be integrated into your validation functions to either halt the program or inform the user when errors occur, raising informative error messages as needed. +- **Raising INFO level logs** - These are raised when the settings validator detects an issue with the user's settings dictionary. These logs are intended to provide the user with information on how to rectify the issue, but should not halt the program. +- **Raising single exceptions** - Raise a built-in Python or Splink exception in response to finding an error. +- **Concurrently raising multiple exceptions** - In some instances, it makes sense to raise multiple errors simultaneously, so as not to disrupt the program. This is achieved using the `ErrorLogger` class. -### Warnings in Splink +The first two use standard Python logging and exception handling. The third is a custom class, covered in more detail below. -Warnings should be employed when you want to alert the user that an included setting might lead to unintended consequences, allowing the user to decide if it warrants further action. +You should look to use whichever makes the most sense given your requirements. -This could be applicable in scenarios such as: +### Raising multiple exceptions concurrently -* Parsing SQL where the potential for failure or incorrect column parsing exists. -* Situations where the user is better positioned to determine whether the issue should be treated as an error, like when dealing with exceptionally high values for [probability_two_random_records_match](https://github.com/moj-analytical-services/splink/blob/master/splink/files/settings_jsonschema.json#L29). +Raising multiple exceptions simultaneously provides users with faster and more manageable feedback, avoiding the tedious back-and-forth that typically occurs when errors are reported and addressed one at a time. -Implementing warnings is straightforward and involves creating a logger instance within your script, followed by a warning call. +To enable the logging of multiple errors in a single check, the [`ErrorLogger`](https://github.com/moj-analytical-services/splink/blob/master/splink/exceptions.py) class can be utilised. This is designed to operate similarly to a list, allowing the storing of errors using the `append` method. -??? note "Warnings in practice:" - ```py - import logging - logger = logging.getLogger(__name__) - - logger.warning("My warning message") - ``` - - Which will print: - - > `My warning message` - - to both the console and your log file. - -### Splink Exceptions - -Exceptions should be raised when you want the program to halt due to an unequivocal error. - -In addition to the built-in exception types, such as [`SyntaxError`](https://docs.python.org/3/library/exceptions.html#SyntaxError), we have several Splink-specific exceptions available for use. - -These exceptions serve to raise issues specific to Splink or to customise exception behaviour. For instance, you can specify a message prefix by modifying the constructor of an exception, as exemplified in the [`ComparisonSettingsException`](https://github.com/moj-analytical-services/splink/blob/f7c155c27ccf3c906c92180411b527a4cfd1111b/splink/exceptions.py#L14). 
- -It's crucial to also consider how to inform the user that such behaviour is not permitted. For guidelines on crafting effective error messages, refer to [How to Write Good Error Messages](https://uxplanet.org/how-to-write-good-error-messages-858e4551cd4). - -For a comprehensive list of exceptions native to Splink, visit [the exceptions.py script](https://github.com/moj-analytical-services/splink/blob/master/splink/exceptions.py). - -#### Raising Multiple Exceptions - -Raising multiple errors sequentially without disrupting the program, is a feature we commonly wish to implement across the validation steps. - -In numerous instances, it makes sense to wait until all checks have been performed before raising exceptions captured to the user in one go. - -To enable the logging of multiple errors in a singular check, or across multiple checks, an [`ErrorLogger`](https://github.com/moj-analytical-services/splink/blob/settings_validation_refactor_and_improved_logging/splink/exceptions.py#L34) class is available for use. - -The `ErrorLogger` operates in a similar way to working with a list, allowing you to add additional errors using the `append` method. Once you've logged all of your errors, you can raise them with the `raise_and_log_all_errors` method. +Once all errors have been logged, you can raise them with the `raise_and_log_all_errors` method. This will raise an exception of your choice and report all stored errors to the user. ??? note "`ErrorLogger` in practice" ```py @@ -86,102 +53,82 @@ The `ErrorLogger` operates in a similar way to working with a list, allowing you
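# A minimal sketch of the pattern described above, assuming the ErrorLogger
# class in splink/exceptions.py exposes `append` and
# `raise_and_log_all_errors` as documented here (check the source for the
# exact signatures before relying on this):

from splink.exceptions import ErrorLogger

error_logger = ErrorLogger()

# Collect every problem found, rather than stopping at the first one
error_logger.append(ValueError("`unique_id_column_name` is missing"))
error_logger.append(
    TypeError("`blocking_rules_to_generate_predictions` must be a list")
)

# Raise all of the stored errors in one go
error_logger.raise_and_log_all_errors()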
-## Expanding our Miscellaneous Checks +## Expanding miscellaneous checks -Miscellaneous checks should typically be added as standalone functions. These functions can then be integrated into the linker's startup process for validation. +Miscellaneous checks should be added as standalone functions within an appropriate check inside `splink/settings_validation`. These functions can then be integrated into the linker's startup process for validation. -In most cases, you have more flexibility in how you structure your solutions. You can place the checks in a script that corresponds to the specific checks being performed, or, if one doesn't already exist, create a new script with a descriptive name. +An example of a miscellaneous check is the [`validate_dialect`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/valid_types.py#L26) function. This assesses whether the settings dialect aligns with the linker's dialect. -A prime example of a miscellaneous check is [`validate_dialect`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/valid_types.py#L31), which assesses whether the settings dialect aligns with the linker's dialect. +This is then injected into the `_validate_settings` method within our linker, as seen [here](https://github.com/moj-analytical-services/splink/blob/master/splink/linker.py#L500).
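For illustration, a miscellaneous check of this shape might look something like the sketch below. This is not the real `validate_dialect` implementation, just a minimal standalone function in the same spirit, with a hypothetical name:

```py
# Illustrative only -- a standalone check that halts with a clear message
# when two dialects disagree. The real validate_dialect lives in
# splink/settings_validation/valid_types.py and differs in detail.


def check_dialects_match(settings_dialect: str, linker_dialect: str) -> None:
    if settings_dialect != linker_dialect:
        raise ValueError(
            f"Settings dialect '{settings_dialect}' does not match "
            f"the linker dialect '{linker_dialect}'."
        )
```

A function like this would then be called from the linker's startup checks (for example from `_validate_settings`), alongside the existing miscellaneous checks.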
-## Additional Comparison and Blocking Rule Checks +## Additional comparison and blocking rule checks -If your checks pertain to comparisons or blocking rules, most of these checks are currently implemented within the [valid_types.py](https://github.com/moj-analytical-services/splink/blob/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation/valid_types.py) script. +Comparison and Blocking Rule checks can be found within the [`valid_types.py`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/valid_types.py#L26) script. -Currently, comparison and blocking rule checks are organised in a modular format. +These checks currently interface with the `ErrorLogger` class which is used to store and raise multiple errors simultaneously (see above). -To expand the current suite of tests, you should: +If you wish to expand the current set of tests, it is advised that you incorporate any new checks into either [`log_comparison_errors`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/valid_types.py#L64) or `_validate_settings` (mentioned above). -1. Create a function to inspect the presence of the error you're evaluating. -2. Define an error message that you intend to add to the `ErrorLogger` class. -3. Integrate these elements into either the [`validate_comparison_levels`](https://github.com/moj-analytical-services/splink/blob/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation/valid_types.py#L43) function (or something similar), which appends any detected errors to an `ErrorLogger`. -4. Finally, work out where this function should live in the setup process of the linker object. Typically, you should look to add these checks before any processing of the settings dictionary is performed. +
-The above steps are set to change as we are looking to refactor our settings object. +## Checking for the existence of user specified columns -
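As a sketch of the shape such an addition might take (names and wiring are illustrative; check `log_comparison_errors` for how errors are actually collected and raised):

```py
# Hypothetical check intended to be called from log_comparison_errors.
# It appends to an ErrorLogger-style collector rather than raising
# immediately, so that several problems can be reported together.


def check_comparisons_are_not_empty(comparisons: list, error_logger) -> None:
    if not comparisons:
        error_logger.append(
            ValueError(
                "The `comparisons` setting must contain at least one comparison."
            )
        )
```

The collected errors can then be raised in one go via `raise_and_log_all_errors`, as described in the error handling section above.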
+Column and SQL validation is performed within [`log_invalid_columns.py`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/log_invalid_columns.py). -## Checking that columns exist +The aim of this script is to check that the columns specified by the user exist within the input dataframe(s). If any invalid columns are found, the script will log this with the user. Should you need to include extra checks to assess the validity of columns supplied by a user, your primary focus should be on the [column_lookups.py](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py) script. -There are currently three classes employed to construct the current log strings. These can be extended to perform additional column checks. +There are two main classes within this script that can be used or extended to perform additional column checks: ??? note "`InvalidCols`" - `InvalidCols` is a `NamedTuple`, used to construct the bulk of our log strings. This accepts a list of columns and the type of error, producing a complete log string when requested. + [`InvalidCols`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/settings_validation_log_strings.py) is a `NamedTuple`, used to construct the bulk of our log strings. This accepts a list of columns and the type of error, producing a complete log string when requested. + + For simplicity, there are three partial implementations to cover the most common cases: + - `MissingColumnsLogGenerator` - missing column identified. + - `InvalidTableNamesLogGenerator` - table name entered by the user is missing or invalid. + - `InvalidColumnSuffixesLogGenerator` - `_l` and `_r` suffixes are missing or invalid. - In practice, this is used as follows: + In practice, this can be used as follows: ```py - # Store the invalid columns and why they're invalid - my_invalid_cols = InvalidCols("invalid_cols", ["first_col", "second_col"]) + # Store our invalid columns + my_invalid_cols = MissingColumnsLogGenerator(["first_col", "second_col"]) # Construct the corresponding log string my_invalid_cols.construct_log_string() ``` -??? note "`InvalidColValidator`" - `InvalidColValidator` houses a series of validation checks to evaluate whether the column(s) contained within either a SQL string or a user's raw input string, are present within the underlying dataframes. - - To achieve this, it employs a range of cleaning functions to standardise our column inputs and conducts a series of checks on these cleaned columns. It utilises `InvalidCols` tuples to log any identified invalid columns. - - It inherits from our the `SettingsValidator` class. - ??? note "`InvalidColumnsLogger`" - The principal logging class for our invalid column checks. - - This class primarily calls our builder functions outlined in `InvalidColValidator`, constructing a series of log strings for output to both the console and the user's log file (if it exists). - - -To extend the column checks, you simply need to add an additional validation method to the [`InvalidColValidator`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L15) class, followed by an extension of the [`InvalidColumnsLogger`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L164). 
- -### A Practical Example of a Column Check - -For an example of column checks in practice, see [`validate_uid`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L195). - -Here, we call `validate_settings_column`, checking whether the unique ID column submitted by the user is valid. The output of this call yields either an `InvalidCols` tuple, or `None`. - -From there, we can use the built-in log constructor [`construct_generic_settings_log_string`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L329C27-L329C27) to construct and print the required logs. Where the output above was `None`, nothing is logged. - -If your checks aren't part of the initial settings check (say you want to assess additional columns found in blocking rules supplied at a later stage by the user), you should add a new method to `InvalidColumnsLogger`, similar in functionality to [`construct_output_logs`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L319). - -However, it is worth noting that not all checks are performed on a simple string columns. Where you require checks to be performed on SQL strings, there's an additional step required, outlined below. - -### Single Column Checks - -To review single columns, [`validate_settings_column`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L144) should be used. This takes in a `setting_id` (analogous to the title you want to give your log string) and a list of columns to be checked. + `InvalidColumnsLogger` takes in a series of cleansed columns from your settings object (see [`SettingsColumnCleaner`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/settings_column_cleaner.py#L101)) and runs a series of validation checks to assess whether the column(s) are present within the underlying dataframes. -A working example of this in practice can be found in the section above. + Any invalid columns are stored in an `InvalidCols` instance (see above), which is then used to construct a log string. -### Checking Columns in SQL statements + Logs are output to the user at the `INFO` level. -For raw SQL statements, you should make use of the [`validate_columns_in_sql_strings`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L102) method. +To extend the column checks, you simply need to add an additional validation method to the [`InvalidColumnsLogger`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/log_invalid_columns.py#L171C7-L171C27) class. Checks must be added as a new method and then called within `construct_output_logs`. -This takes in a list of SQL strings and spits out a list of `InvalidCols` tuples, depending on the checks you ask it to perform. +## Single column, multi-column and SQL checks -Should you need more control, the process is similar to that of the single column case, just with an additional parsing step. +### Single and multi-column -Parsing is handled by [`parse_columns_in_sql`](https://github.com/moj-analytical-services/splink/blob/master/splink/parse_sql.py#L45). This will spit out a list of column names that were identified by SQLGlot. +Single and multi-column checks are relatively straightforward. 
Assuming you have a clean set of columns, you can leverage the [`check_for_missing_settings_column`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/log_invalid_columns.py#L56) function. -> Note that as this is handled by SQLGlot, it's not always 100% accurate. For our purposes though, its flexibility is unparalleled and allows us to more easily and efficiently extract column names. +This expects the following arguments: +* **settings_id**: the name of the settings ID. This is only used for logging and does not necessarily need to match the true ID. +* **settings_column_to_check**: the column(s) you wish to validate. +* **valid_input_dataframe_columns**: the cleaned columns from your **all** input dataframes. -Once your columns have been parsed, you can again run a series of lookups against your input dataframe(s). This is identical to the steps outlined in the **Single Column Checks** section. +### Checking columns in SQL statements -You may also wish to perform additional checks on the columns, to assess whether they contain valid prefixes, suffixes or some other quality of the column. +Checking SQL statements is a little more complex, given the need to parse SQL in order to extract your column names. -Additional checks can be passed to [`validate_columns_in_sql_strings`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L102) and should be specified as methods in the `InvalidColValidator` class. +To do this, you can leverage the [`check_for_missing_or_invalid_columns_in_sql_strings`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/log_invalid_columns.py#L73) function. -See [validate_blocking_rules](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L209) for a practical example where we loop through each blocking rule, parse it and then assess whether it: +This expects the following arguments: +* **sql_dialect**: The SQL dialect used by the linker. +* **sql_strings**: A list of SQL strings. +* **valid_input_dataframe_columns**: The list of columns identified in your input dataframe(s). +* additional_validation_checks: Functions used to check for other issues with the parsed SQL string, namely, table name and column suffix validation. -1. Contains a valid list of columns -2. Each column contains a valid table prefix. +_NB: for nested SQL statements, you'll need to add an additional loop. See [`check_comparison_for_missing_or_invalid_sql_strings`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/log_invalid_columns.py#L140) for more details._ diff --git a/docs/dev_guides/settings_validation/settings_validation_overview.md b/docs/dev_guides/settings_validation/settings_validation_overview.md index 68cb5c9fd8..e8f9013561 100644 --- a/docs/dev_guides/settings_validation/settings_validation_overview.md +++ b/docs/dev_guides/settings_validation/settings_validation_overview.md @@ -1,6 +1,6 @@ ## Settings Validation -A common issue within Splink is users providing invalid settings dictionaries. To prevent this, the settings validator scans through a settings dictionary and provides user-friendly feedback on what needs to be fixed. +A common problem within Splink comes from users providing invalid settings dictionaries. 
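(As an aside on the column-check helpers described earlier: the snippet below sketches how `check_for_missing_settings_column` might be called, based on the argument names documented above. The exact signature and return value should be confirmed against `log_invalid_columns.py`.)

```py
# Illustrative call only -- keyword argument names are taken from the
# documentation above; the return value (an InvalidCols-style object or
# None) should be checked against the source before relying on it.
from splink.settings_validation.log_invalid_columns import (
    check_for_missing_settings_column,
)

maybe_invalid = check_for_missing_settings_column(
    settings_id="unique_id_column_name",
    settings_column_to_check=["unique_id"],
    valid_input_dataframe_columns=["unique_id", "first_name", "surname", "dob"],
)
```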
To prevent this, we've built a settings validator to scan through a given settings dictionary and provide user-friendly feedback on what needs to be fixed. At a high level, this includes: @@ -21,9 +21,9 @@ You can modify the schema by manually editing the [json schema](https://github.c Modifications can be used to (amongst other uses): -* Set or remove default values for schema keys. -* Set the required data type for a given key. -* Expand or refine previous titles and descriptions to help with clarity. +- Set or remove default values for schema keys. +- Set the required data type for a given key. +- Expand or refine previous titles and descriptions to help with clarity. Any updates you wish to make to the schema should be discussed with the wider team, to ensure it won't break backwards compatibility and makes sense as a design decision. @@ -33,23 +33,24 @@ Detailed information on the arguments that can be supplied to the json schema ca ## Settings Validator -The settings validation code currently resides in the [settings validation](https://github.com/moj-analytical-services/splink/tree/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation) directory of Splink. This code is responsible for executing a secondary series of tests to determine whether all values within the settings dictionary will generate valid SQL. +As long as an input is of the correct data type, it will pass our initial schema checks. This can then mean that user inputs that would generate invalid SQL can slip through and are then often caught by the database engine, [commonly resulting in uninformative errors](https://github.com/moj-analytical-services/splink/issues/1362). This can result in uninformative and confusing errors that the user is unsure of how to resolve. -Numerous inputs pass our initial schema checks before breaking other parts of the codebase. These breaks are typically due to the construction of invalid SQL, that is then passed to the database engine, [commonly resulting in uninformative errors](https://github.com/moj-analytical-services/splink/issues/1362). +The settings validation code (found within the [settings validation](https://github.com/moj-analytical-services/splink/tree/master/splink/settings_validation) directory of Splink) is another layer of validation, executing a series of checks to determine whether values in the user's settings dictionary will generate invalid SQL. Frequently encountered problems include: -* Usage of invalid column names. For example, specifying a [`unique_id_column_name`](https://github.com/moj-analytical-services/splink/blob/settings_validation_docs/splink/files/settings_jsonschema.json#L61) that doesn't exist in the underlying dataframe(s). Such names satisfy the schema requirements as long as they are strings. -* Users not updating default values in the settings schema, even when these values are inappropriate for their provided input dataframes. -* Importing comparisons and blocking rules from incorrect sections of the codebase, or using an inappropriate data type (comparison level vs. comparison). -* Using Splink for an invalid form of linkage. See the [following discussion](https://github.com/moj-analytical-services/splink/issues/1362). +- **Invalid column names**. For example, specifying a [`unique_id_column_name`](https://github.com/moj-analytical-services/splink/blob/settings_validation_docs/splink/files/settings_jsonschema.json#L61) that doesn't exist in the underlying dataframe(s). 
Such names satisfy the schema requirements as long as they are strings. +- **Using the settings dictionary's default values** +- **Importing comparisons and blocking rules for the wrong dialect**. +- **Using an inappropriate custom data types** - (comparison level vs. comparison within our comparisons). +- **Using Splink for an invalid form of linkage** - See the [following discussion](https://github.com/moj-analytical-services/splink/issues/1362). -Currently, the [settings validation](https://github.com/moj-analytical-services/splink/tree/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation) scripts are setup in a modular fashion, to allow each to inherit the checks it needs. -The folder is comprised of three scripts, each of which inspects the settings dictionary at different stages of its journey: +All code relating to [settings validation](https://github.com/moj-analytical-services/splink/tree/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation) can be found within one of the following scripts: -* [valid_types.py](https://github.com/moj-analytical-services/splink/blob/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation/valid_types.py) - This script includes various miscellaneous checks for comparison levels, blocking rules, and linker objects. These checks are primarily performed within settings.py. -* [settings_validator.py](https://github.com/moj-analytical-services/splink/blob/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation/settings_validator.py) - This script includes the core `SettingsValidator` class and contains a series of methods that retrieve information on fields within the user's settings dictionary that contain information on columns to be used in training and prediction. Additionally, it provides supplementary cleaning functions to assist in the removal of quotes, prefixes, and suffixes that may be present in a given column name. -* [column_lookups.py](https://github.com/moj-analytical-services/splink/blob/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation/column_lookups.py) - This script contains helper functions that generate a series of log strings outlining invalid columns identified within your settings dictionary. It primarily consists of methods that run validation checks on either raw SQL or input columns and assesses their presence in **all** dataframes supplied by the user. +- [valid_types.py](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/valid_types.py) - This script includes various miscellaneous checks for comparison levels, blocking rules, and linker objects. These checks are primarily performed within settings.py. +- [settings_column_cleaner.py](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/settings_column_cleaner.py) - Includes a set of functions for cleaning and extracting data, designed to sanitise user inputs in the settings dictionary and retrieve necessary SQL or column identifiers. +- [log_invalid_columns.py](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/log_invalid_columns.py) - Pulls the information extracted in [settings_column_cleaner.py](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/settings_column_cleaner.py) and generates any log strings outlining invalid columns or SQL identified within the settings dictionary. 
Any generated error logs are reported to the user when initialising a linker object at the `INFO` level. +- [settings_validation_log_strings.py](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/settings_validation_log_strings.py) - a home for any error messages or logs generated by the settings validator. For information on expanding the range of checks available to the validator, see [Extending the Settings Validator](./extending_settings_validator.md). diff --git a/docs/hooks/__init__.py b/docs/hooks/__init__.py index 7ac5dd7a84..ee84b6c6e2 100644 --- a/docs/hooks/__init__.py +++ b/docs/hooks/__init__.py @@ -77,6 +77,7 @@ def re_route_links(markdown: str, page_title: str) -> str | None: # hooks for use by mkdocs + # priority last - run this after any other such hooks # this ensures we are overwriting mknotebooks config, # not the other way round diff --git a/docs/topic_guides/splink_fundamentals/backends/postgres.md b/docs/topic_guides/splink_fundamentals/backends/postgres.md index bfcdaa24eb..25b56bd972 100644 --- a/docs/topic_guides/splink_fundamentals/backends/postgres.md +++ b/docs/topic_guides/splink_fundamentals/backends/postgres.md @@ -91,13 +91,18 @@ The tests will are run using a temporary database and user that are created at t If you are trying to [run tests with Splink](../../../dev_guides/changing_splink/testing.md) on Postgres, or simply develop using Postgres, you may prefer to not actually [install Postgres on you system](https://www.postgresql.org/download/), but to run it instead using [Docker](https://www.docker.com/). In this case you can simply run the setup script (a thin wrapper around `docker-compose`): ```bash -./scripts/postgres/setup.sh +./scripts/postgres_docker/setup.sh ``` Included in the docker-compose file is a [pgAdmin](https://www.pgadmin.org/) container to allow easy exploration of the database as you work, which can be accessed in-browser on the default port. +pgadmin port uri should be: +```bash +http://localhost:5050 +``` + When you are finished you can remove these resources: ```bash -./scripts/postgres/teardown.sh +./scripts/postgres_docker/teardown.sh ``` ### Running with a pre-existing database diff --git a/scripts/conda/development_environment.yaml b/scripts/conda/development_environment.yaml index a8813c35c3..7c92719dcf 100644 --- a/scripts/conda/development_environment.yaml +++ b/scripts/conda/development_environment.yaml @@ -6,6 +6,7 @@ dependencies: - pip - openjdk<18 # Required for Spark; Spark 3.4 only supports older Java - postgresql + - aspell - pip: - poetry==1.4.2 # TODO: Unpin. Current poetry.lock requires older poetry. 
# NOTE: These are the python packages to install alongside poetry, not within the diff --git a/scripts/conda/development_environment_lock_Linux-x86_64.txt b/scripts/conda/development_environment_lock_Linux-x86_64.txt index 0cb54a1f1f..5ef7c7d942 100644 --- a/scripts/conda/development_environment_lock_Linux-x86_64.txt +++ b/scripts/conda/development_environment_lock_Linux-x86_64.txt @@ -18,6 +18,7 @@ https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-13.2.0-h807b86a_5.conda https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.11-hd590300_1.conda https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hd590300_5.conda +https://conda.anaconda.org/conda-forge/linux-64/gettext-0.21.1-h27087fc_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/giflib-5.2.1-h0b41bf4_3.conda https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.13-h59595ed_1003.conda https://conda.anaconda.org/conda-forge/linux-64/icu-73.2-h59595ed_0.conda @@ -55,12 +56,14 @@ https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.45.2-h2797004_0.cond https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.15-h0b41bf4_0.conda https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.12.6-h232c23b_1.conda https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.43-hcad00b1_0.conda +https://conda.anaconda.org/conda-forge/linux-64/perl-5.32.1-7_hd590300_perl5.conda https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda https://conda.anaconda.org/conda-forge/linux-64/xorg-fixesproto-5.0-h7f98852_1002.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/xorg-libsm-1.2.4-h7391055_0.conda https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.13-hd590300_5.conda https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.5-hfc55251_0.conda +https://conda.anaconda.org/conda-forge/linux-64/aspell-0.60.8-pl5321hcb278e6_1.conda https://conda.anaconda.org/conda-forge/linux-64/freetype-2.12.1-h267a509_2.conda https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.2-h659d440_0.conda https://conda.anaconda.org/conda-forge/linux-64/libglib-2.80.0-hf2295e7_1.conda diff --git a/scripts/lint_and_format.sh b/scripts/lint_and_format.sh old mode 100644 new mode 100755 diff --git a/scripts/postgres_docker/docker-compose.yaml b/scripts/postgres_docker/docker-compose.yaml index 457673a434..14c3a63ae6 100644 --- a/scripts/postgres_docker/docker-compose.yaml +++ b/scripts/postgres_docker/docker-compose.yaml @@ -1,5 +1,3 @@ -version: '3.1' - services: postgres-splink: diff --git a/scripts/postgres_docker/setup.sh b/scripts/postgres_docker/setup.sh index cc7dd64a26..7273fafa50 100755 --- a/scripts/postgres_docker/setup.sh +++ b/scripts/postgres_docker/setup.sh @@ -2,4 +2,4 @@ # run from root # add -d for detached mode (run in background) -docker-compose -f scripts/postgres/docker-compose.yaml up +docker-compose -f scripts/postgres_docker/docker-compose.yaml up diff --git a/scripts/postgres_docker/teardown.sh b/scripts/postgres_docker/teardown.sh index ef98e9b3f1..666a41952d 100755 --- a/scripts/postgres_docker/teardown.sh +++ b/scripts/postgres_docker/teardown.sh @@ -2,4 +2,4 @@ # run from root # remove -v (removing volume) if you want to keep data -docker-compose -f scripts/postgres/docker-compose.yaml down -v +docker-compose -f scripts/postgres_docker/docker-compose.yaml down -v diff --git a/scripts/pyspelling/spellchecker.sh 
b/scripts/pyspelling/spellchecker.sh old mode 100644 new mode 100755 index afd032c240..5ceaf96261 --- a/scripts/pyspelling/spellchecker.sh +++ b/scripts/pyspelling/spellchecker.sh @@ -1,44 +1,46 @@ #!/bin/bash -line_block="==============" +cd "$(dirname "$0")" + +set -e -package_name="aspell" -pyspelling_yaml="scripts/pyspelling/pyspelling.yml" -default_path_to_spellcheck="docs" +line_block="==============" # Use either the first command line arg or the default path to spellcheck -path_to_spellcheck="${1:-$default_path_to_spellcheck}" +path_to_spellcheck="${1:-docs}" echo "Path to spellcheck: $path_to_spellcheck" -# Function to check if necessary packages are installed -source scripts/utils/ensure_packages_installed.sh -ensure_homebrew_packages_installed aspell yq +source ../utils/ensure_packages_installed.sh +if ! command -v aspell &> /dev/null +then + ensure_homebrew_packages_installed aspell +fi -cwd=$(pwd) +# Go up to the root of the repo +cd ../.. # Set up venv, install pyspelling and download dictionary files -if [[ "$VIRTUAL_ENV" != "$cwd/spellcheck-venv" ]]; then +if [[ "$VIRTUAL_ENV" != "$(pwd)/spellcheck-venv" ]]; then # If already in a venv then deactivate if [ -n "$VIRTUAL_ENV" ]; then deactivate fi - # Set up venv - python3 -m venv spellcheck-venv - source spellcheck-venv/bin/activate - # Install pyspelling - echo "$line_block Installing pyspelling $line_block" - python -m pip install pyspelling - # Download dictionary files into correct directory - echo "$line_block Downloading dictionay files to Library/Spelling $line_block" - curl -LJ https://github.com/LibreOffice/dictionaries/raw/master/en/en_GB.dic -o ~/Library/Spelling/en_GB.dic - curl -LJ https://github.com/LibreOffice/dictionaries/blob/master/en/en_GB.aff -o ~/Library/Spelling/en_GB.aff -fi + if ! 
[ -d spellcheck-venv ]; then + echo "$line_block Creating spellchecking venv $line_block" + python3 -m venv spellcheck-venv + source spellcheck-venv/bin/activate + echo "$line_block Installing pyspelling $line_block" + python -m pip install pyspelling + else + source spellcheck-venv/bin/activate + fi +fi # Finally, validate the path or file that the user has entered to be spellchecked if [ -d "$path_to_spellcheck" ]; then # Checks if a directory has been entered and adds a recursive search for markdown files - source_to_spellcheck="$path_to_spellcheck"/**/*.md + source_to_spellcheck="$path_to_spellcheck"'/**/*.md' elif [ -f "$path_to_spellcheck" ]; then # Checks that the file extension is .md if [[ $path_to_spellcheck == *.md ]]; then @@ -54,11 +56,10 @@ else return 0 2>/dev/null fi -pyspelling_run="Running pyspelling spellchecker on docs" -echo "$line_block $pyspelling_run $line_block" - -# Update pyspelling.yml with a new source path -yq e ".matrix[0].sources = [\"$source_to_spellcheck|!docs/includes/**/*.md\"]" -i "$pyspelling_yaml" +echo "$line_block Running pyspelling spellchecker on docs $line_block" -echo $source_to_spellcheck -pyspelling -c ./$pyspelling_yaml +echo "$source_to_spellcheck" +pyspelling \ + -c ./scripts/pyspelling/pyspelling.yml \ + -n "Markdown docs" \ + -S "$source_to_spellcheck"'|!docs/includes/**/*.md' diff --git a/splink/blocking.py b/splink/blocking.py index b1468067c8..70363712ad 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -461,9 +461,9 @@ def block_using_rules_sqls(linker: Linker): and not linker._find_new_matches_mode and not linker._compare_two_records_mode ): - source_dataset_col = ( - source_dataset_col - ) = linker._settings_obj._source_dataset_column_name + source_dataset_col = source_dataset_col = ( + linker._settings_obj._source_dataset_column_name + ) # Need df_l to be the one with the lowest id to preeserve the property # that the left dataset is the one with the lowest concatenated id diff --git a/splink/comparison_level_library.py b/splink/comparison_level_library.py index d61b8ffea8..17a68de69d 100644 --- a/splink/comparison_level_library.py +++ b/splink/comparison_level_library.py @@ -684,6 +684,12 @@ def __init__( import splink.sqlite.comparison_level_library as cll cll.jaro_level("name", 0.9) ``` + === ":simple-sqlite: Postgres" + Comparison level with jaro score greater than 0.9 + ``` python + import splink.postgres.comparison_level_library as cll + cll.jaro_level("name", 0.9) + ``` Returns: ComparisonLevel: A comparison level that evaluates the @@ -760,6 +766,12 @@ def __init__( import splink.sqlite.comparison_level_library as cll cll.jaro_winkler_level("name", 0.9) ``` + === ":simple-sqlite: Postgres" + Comparison level with jaro-winkler score greater than 0.9 + ``` python + import splink.postgres.comparison_level_library as cll + cll.jaro_winkler_level("name", 0.9) + ``` Returns: ComparisonLevel: A comparison level that evaluates the diff --git a/splink/databricks/enable_splink.py b/splink/databricks/enable_splink.py index 586fb28abe..2a8d87d964 100644 --- a/splink/databricks/enable_splink.py +++ b/splink/databricks/enable_splink.py @@ -1,4 +1,5 @@ import logging +import os from splink.spark.jar_location import similarity_jar_location @@ -32,37 +33,34 @@ def enable_splink(spark): optionClass = getattr(sc._jvm.scala, "Option$") optionModule = getattr(optionClass, "MODULE$") - # Note(bobby): So dirty + dbr_version = float(os.environ.get("DATABRICKS_RUNTIME_VERSION")) + try: - # This will fix the exception when running on 
Databricks Runtime 14.x+ - lib = JavaJarId( - JarURI, - ManagedLibraryId.defaultOrganization(), - NoVersionModule.simpleString(), - optionModule.apply(None), - optionModule.apply(None), - optionModule.apply(None), - ) - except Exception as e: - logger.warn("failed to initialize for 14.x+", e) - try: - # This will fix the exception when running on Databricks Runtime 13.x + if dbr_version >= 14: lib = JavaJarId( JarURI, ManagedLibraryId.defaultOrganization(), NoVersionModule.simpleString(), optionModule.apply(None), optionModule.apply(None), + optionModule.apply(None), ) - except Exception as ex: - logger.warn("failed to initialize for 13.x", ex) - - # This will work for < 13.x + elif dbr_version >= 13: + lib = JavaJarId( + JarURI, + ManagedLibraryId.defaultOrganization(), + NoVersionModule.simpleString(), + optionModule.apply(None), + optionModule.apply(None), + ) + else: lib = JavaJarId( JarURI, ManagedLibraryId.defaultOrganization(), NoVersionModule.simpleString(), ) + except Exception as e: + logger.warn("failed to enable similarity jar functions for Databricks", e) libSeq = converters.asScalaBufferConverter((lib,)).asScala().toSeq() diff --git a/splink/em_training_session.py b/splink/em_training_session.py index 2780894a0c..7a5dfa6e48 100644 --- a/splink/em_training_session.py +++ b/splink/em_training_session.py @@ -98,9 +98,9 @@ def __init__( cc_names_to_deactivate = [ cc._output_column_name for cc in comparisons_to_deactivate ] - self._comparisons_that_cannot_be_estimated: list[ - Comparison - ] = comparisons_to_deactivate + self._comparisons_that_cannot_be_estimated: list[Comparison] = ( + comparisons_to_deactivate + ) filtered_ccs = [ cc @@ -286,9 +286,9 @@ def _iteration_history_records(self): for r in records: r["iteration"] = iteration - r[ - "probability_two_random_records_match" - ] = self._settings_obj._probability_two_random_records_match + r["probability_two_random_records_match"] = ( + self._settings_obj._probability_two_random_records_match + ) output_records.extend(records) return output_records @@ -391,12 +391,12 @@ def _max_change_in_parameters_comparison_levels(self): max_change = abs(change_probability_two_random_records_match) max_change_levels["prev_comparison_level"] = None max_change_levels["current_comparison_level"] = None - max_change_levels[ - "max_change_type" - ] = "probability_two_random_records_match" - max_change_levels[ - "max_change_value" - ] = change_probability_two_random_records_match + max_change_levels["max_change_type"] = ( + "probability_two_random_records_match" + ) + max_change_levels["max_change_value"] = ( + change_probability_two_random_records_match + ) max_change_levels["max_abs_change_value"] = abs( change_probability_two_random_records_match ) diff --git a/splink/linker.py b/splink/linker.py index b4163b67f8..0f8cd98e59 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -2153,9 +2153,9 @@ def _compute_metrics_nodes( df_node_metrics = self._execute_sql_pipeline() - df_node_metrics.metadata[ - "threshold_match_probability" - ] = threshold_match_probability + df_node_metrics.metadata["threshold_match_probability"] = ( + threshold_match_probability + ) return df_node_metrics def _compute_metrics_edges( @@ -2190,9 +2190,9 @@ def _compute_metrics_edges( df_edge_metrics = compute_edge_metrics( self, df_node_metrics, df_predict, df_clustered, threshold_match_probability ) - df_edge_metrics.metadata[ - "threshold_match_probability" - ] = threshold_match_probability + df_edge_metrics.metadata["threshold_match_probability"] = ( + 
threshold_match_probability + ) return df_edge_metrics def _compute_metrics_clusters( @@ -2232,9 +2232,9 @@ def _compute_metrics_clusters( self._enqueue_sql(sql["sql"], sql["output_table_name"]) df_cluster_metrics = self._execute_sql_pipeline() - df_cluster_metrics.metadata[ - "threshold_match_probability" - ] = df_node_metrics.metadata["threshold_match_probability"] + df_cluster_metrics.metadata["threshold_match_probability"] = ( + df_node_metrics.metadata["threshold_match_probability"] + ) return df_cluster_metrics def compute_graph_metrics( diff --git a/splink/postgres/comparison_level_library.py b/splink/postgres/comparison_level_library.py index cbc5423bc4..3ae799e7b6 100644 --- a/splink/postgres/comparison_level_library.py +++ b/splink/postgres/comparison_level_library.py @@ -11,6 +11,8 @@ distance_in_km_level, else_level, exact_match_level, + jaro_level, + jaro_winkler_level, levenshtein_level, null_level, percentage_difference_level, diff --git a/splink/postgres/comparison_library.py b/splink/postgres/comparison_library.py index 9eb966bc6c..9fcfaff916 100644 --- a/splink/postgres/comparison_library.py +++ b/splink/postgres/comparison_library.py @@ -1,8 +1,11 @@ from .postgres_helpers.postgres_comparison_imports import ( # noqa: F401 array_intersect_at_sizes, + damerau_levenshtein_at_thresholds, datediff_at_thresholds, distance_function_at_thresholds, distance_in_km_at_thresholds, + else_level, exact_match, levenshtein_at_thresholds, + null_level, ) diff --git a/splink/postgres/comparison_template_library.py b/splink/postgres/comparison_template_library.py index d4bc23ddac..74299c07f6 100644 --- a/splink/postgres/comparison_template_library.py +++ b/splink/postgres/comparison_template_library.py @@ -1,13 +1,7 @@ -# The Comparison Template Library is not currently implemented -# for Postgres due to limited string matching capability in -# cll.comparison_level_library - -import logging - -logger = logging.getLogger(__name__) - -logger.warn( - "The Comparison Template Library is not currently implemented " - "for Postgres due to limited string matching capability in " - "`cll.comparison_level_library`" +from .postgres_helpers.postgres_comparison_imports import ( # noqa: F401 + date_comparison, + email_comparison, + forename_surname_comparison, + name_comparison, + postcode_comparison, ) diff --git a/splink/postgres/linker.py b/splink/postgres/linker.py index 38eb17ca94..99cba4e9c8 100644 --- a/splink/postgres/linker.py +++ b/splink/postgres/linker.py @@ -288,6 +288,155 @@ def _create_array_intersect_function(self): """ self._run_sql_execution(sql) + def _create_damerau_levenshtein_function(self): + sql = """ + CREATE OR REPLACE FUNCTION damerau_levenshtein(s1 TEXT, s2 TEXT) + RETURNS INT AS $$ + DECLARE + s1_len INT := LENGTH(s1); + s2_len INT := LENGTH(s2); + d INT[][]; + i INT; + j INT; + cost INT; + BEGIN + -- Initialize matrix + d := ARRAY(SELECT ARRAY(SELECT generate_series(0, s2_len)) FROM generate_series(0, s1_len)); + + -- Initialize the first column and the first row of the matrix + FOR i IN 0..s1_len LOOP + d[i + 1][1] := i; + END LOOP; + FOR j IN 0..s2_len LOOP + d[1][j + 1] := j; + END LOOP; + + -- Fill the matrix + FOR i IN 1..s1_len LOOP + FOR j IN 1..s2_len LOOP + IF SUBSTRING(s1 FROM i FOR 1) = SUBSTRING(s2 FROM j FOR 1) THEN + cost := 0; + ELSE + cost := 1; + END IF; + + d[i + 1][j + 1] := LEAST( + d[i][j + 1] + 1, -- deletion + d[i + 1][j] + 1, -- insertion + d[i][j] + cost -- substitution + ); + + IF (i > 1 AND j > 1 AND SUBSTRING(s1 FROM i FOR 1) = SUBSTRING(s2 FROM 
j - 1 FOR 1) AND SUBSTRING(s1 FROM i - 1 FOR 1) = SUBSTRING(s2 FROM j FOR 1)) THEN + d[i + 1][j + 1] := LEAST( + d[i + 1][j + 1], + d[i - 1][j - 1] + cost -- transposition + ); + END IF; + END LOOP; + END LOOP; + + RETURN d[s1_len + 1][s2_len + 1]; + END; + $$ LANGUAGE plpgsql IMMUTABLE; + """ # noqa: E501 + self._run_sql_execution(sql) + + def _create_jaro_similarity_function(self): + sql = """ + CREATE OR REPLACE FUNCTION jaro_similarity(s1 TEXT, s2 TEXT) + RETURNS FLOAT AS $$ + DECLARE + s1_len INT := LENGTH(s1); + s2_len INT := LENGTH(s2); + match_distance INT := GREATEST(s1_len, s2_len) / 2 - 1; + matches INT := 0; + transpositions INT := 0; + i INT; + j INT; + s1_matches BOOLEAN[]; + s2_matches BOOLEAN[]; + BEGIN + IF s1_len = 0 OR s2_len = 0 THEN + RETURN 0.0; + END IF; + + s1_matches := ARRAY(SELECT FALSE FROM generate_series(1, s1_len)); + s2_matches := ARRAY(SELECT FALSE FROM generate_series(1, s2_len)); + + FOR i IN 1..s1_len LOOP + FOR j IN GREATEST(1, i - match_distance)..LEAST(s2_len, i + match_distance) LOOP + IF (SUBSTRING(s1 FROM i FOR 1) = SUBSTRING(s2 FROM j FOR 1)) AND s2_matches[j] = FALSE THEN + s1_matches[i] := TRUE; + s2_matches[j] := TRUE; + matches := matches + 1; + EXIT; + END IF; + END LOOP; + END LOOP; + + IF matches = 0 THEN + RETURN 0.0; + END IF; + + j := 1; + FOR i IN 1..s1_len LOOP + IF s1_matches[i] = TRUE THEN + WHILE s2_matches[j] = FALSE LOOP + j := j + 1; + END LOOP; + IF SUBSTRING(s1 FROM i FOR 1) <> SUBSTRING(s2 FROM j FOR 1) THEN + transpositions := transpositions + 1; + END IF; + j := j + 1; + END IF; + END LOOP; + + transpositions := transpositions / 2; + + RETURN (matches::FLOAT / s1_len + matches::FLOAT / s2_len + (matches - transpositions)::FLOAT / matches) / 3.0; + END; + $$ LANGUAGE plpgsql IMMUTABLE; + """ # noqa: E501 + self._run_sql_execution(sql) + + def _create_jaro_winkler_similarity_function(self): + sql = """ + CREATE OR REPLACE FUNCTION jaro_winkler_similarity(s1 TEXT, s2 TEXT) + RETURNS FLOAT AS $$ + DECLARE + jaro FLOAT; + prefix_len INT := 0; + max_prefix_len INT := 4; + p FLOAT := 0.1; -- scaling factor + i INT; + BEGIN + jaro := jaro_similarity(s1, s2); + + FOR i IN 1..LEAST(LEAST(LENGTH(s1), LENGTH(s2)), max_prefix_len) LOOP + IF SUBSTRING(s1 FROM i FOR 1) = SUBSTRING(s2 FROM i FOR 1) THEN + prefix_len := prefix_len + 1; + ELSE + EXIT; + END IF; + END LOOP; + + RETURN jaro + (prefix_len * p * (1 - jaro)); + END; + $$ LANGUAGE plpgsql IMMUTABLE; + """ # noqa: E501 + self._run_sql_execution(sql) + + def _create_jaro_winkler_distance_function(self): + sql = """ + CREATE OR REPLACE FUNCTION jaro_winkler_distance(s1 TEXT, s2 TEXT) + RETURNS FLOAT AS $$ + BEGIN + RETURN 1 - jaro_winkler_similarity(s1, s2); + END; + $$ LANGUAGE plpgsql IMMUTABLE; + """ # noqa: E501 + self._run_sql_execution(sql) + def _register_custom_functions(self): # if people have issues with permissions we can allow these to be optional # need for predict_from_comparison_vectors_sql (could adjust) @@ -300,7 +449,14 @@ def _register_custom_functions(self): # extension of round to handle doubles - used in unlinkables self._extend_round_function() + self._create_damerau_levenshtein_function() + self._create_jaro_similarity_function() + self._create_jaro_winkler_similarity_function() + self._create_jaro_winkler_distance_function() + def _register_extensions(self): + # TODO: Lots of string similarity functionality could be enabled: + # CREATE EXTENSION IF NOT EXISTS pg_similarity; sql = """ CREATE EXTENSION IF NOT EXISTS fuzzystrmatch; """ diff --git 
a/splink/postgres/postgres_helpers/postgres_base.py b/splink/postgres/postgres_helpers/postgres_base.py index 767fc712a9..4d51e5abab 100644 --- a/splink/postgres/postgres_helpers/postgres_base.py +++ b/splink/postgres/postgres_helpers/postgres_base.py @@ -62,3 +62,15 @@ def _datediff_function(self): @property def _size_array_intersect_function(self): return size_array_intersect_sql + + @property + def _damerau_levenshtein_name(self): + return "damerau_levenshtein" + + @property + def _jaro_name(self): + return "jaro_similarity" + + @property + def _jaro_winkler_name(self): + return "jaro_winkler_similarity" diff --git a/splink/postgres/postgres_helpers/postgres_comparison_imports.py b/splink/postgres/postgres_helpers/postgres_comparison_imports.py index 667e7c5e5e..a660028e0e 100644 --- a/splink/postgres/postgres_helpers/postgres_comparison_imports.py +++ b/splink/postgres/postgres_helpers/postgres_comparison_imports.py @@ -1,23 +1,36 @@ from ...comparison_level_library import ( ArrayIntersectLevelBase, ColumnsReversedLevelBase, + DamerauLevenshteinLevelBase, DatediffLevelBase, DistanceFunctionLevelBase, DistanceInKMLevelBase, ElseLevelBase, ExactMatchLevelBase, + JaroLevelBase, + JaroWinklerLevelBase, LevenshteinLevelBase, NullLevelBase, PercentageDifferenceLevelBase, ) from ...comparison_library import ( ArrayIntersectAtSizesBase, + DamerauLevenshteinAtThresholdsBase, DatediffAtThresholdsBase, DistanceFunctionAtThresholdsBase, DistanceInKMAtThresholdsBase, ExactMatchBase, + JaroAtThresholdsBase, + JaroWinklerAtThresholdsBase, LevenshteinAtThresholdsBase, ) +from ...comparison_template_library import ( + DateComparisonBase, + EmailComparisonBase, + ForenameSurnameComparisonBase, + NameComparisonBase, + PostcodeComparisonBase, +) from .postgres_base import ( PostgresBase, ) @@ -53,6 +66,14 @@ def _distance_in_km_level(self): def _levenshtein_level(self): return levenshtein_level + @property + def _jaro_level(self): + return jaro_level + + @property + def _jaro_winkler_level(self): + return jaro_winkler_level + ######################### ### COMPARISON LEVELS ### @@ -81,6 +102,18 @@ class levenshtein_level(PostgresBase, LevenshteinLevelBase): pass +class damerau_levenshtein_level(PostgresBase, DamerauLevenshteinLevelBase): + pass + + +class jaro_level(PostgresBase, JaroLevelBase): + pass + + +class jaro_winkler_level(PostgresBase, JaroWinklerLevelBase): + pass + + class array_intersect_level(PostgresBase, ArrayIntersectLevelBase): pass @@ -104,6 +137,14 @@ class exact_match(PostgresComparisonProperties, ExactMatchBase): pass +class damerau_levenshtein_at_thresholds( + PostgresComparisonProperties, DamerauLevenshteinAtThresholdsBase +): + @property + def _distance_level(self): + return self._damerau_levenshtein_level + + class distance_function_at_thresholds( PostgresComparisonProperties, DistanceFunctionAtThresholdsBase ): @@ -120,6 +161,20 @@ def _distance_level(self): return levenshtein_level +class jaro_at_thresholds(PostgresComparisonProperties, JaroAtThresholdsBase): + @property + def _distance_level(self): + return self._jaro_level + + +class jaro_winkler_at_thresholds( + PostgresComparisonProperties, JaroWinklerAtThresholdsBase +): + @property + def _distance_level(self): + return self._jaro_winkler_level + + class array_intersect_at_sizes(PostgresComparisonProperties, ArrayIntersectAtSizesBase): pass @@ -140,3 +195,31 @@ class distance_in_km_at_thresholds( # Not yet implemented # Currently does not support the necessary comparison levels # required for existing comparison templates 
+class date_comparison(PostgresComparisonProperties, DateComparisonBase): + @property + def _distance_level(self): + return distance_function_level + + +class name_comparison(PostgresComparisonProperties, NameComparisonBase): + @property + def _distance_level(self): + return distance_function_level + + +class forename_surname_comparison( + PostgresComparisonProperties, ForenameSurnameComparisonBase +): + @property + def _distance_level(self): + return distance_function_level + + +class postcode_comparison(PostgresComparisonProperties, PostcodeComparisonBase): + pass + + +class email_comparison(PostgresComparisonProperties, EmailComparisonBase): + @property + def _distance_level(self): + return distance_function_level diff --git a/splink/predict.py b/splink/predict.py index 3a7f7555a1..efc9e18def 100644 --- a/splink/predict.py +++ b/splink/predict.py @@ -55,7 +55,7 @@ def predict_from_comparison_vectors_sqls( thres_prob_as_weight = prob_to_match_weight(threshold_match_probability) else: thres_prob_as_weight = None - if threshold_match_probability or threshold_match_weight: + if threshold_match_probability is not None or threshold_match_weight is not None: thresholds = [ thres_prob_as_weight, threshold_match_weight, diff --git a/splink/settings.py b/splink/settings.py index df13bd4fae..1bf469ea6c 100644 --- a/splink/settings.py +++ b/splink/settings.py @@ -360,9 +360,9 @@ def _parameters_as_detailed_records(self): for i, cc in enumerate(self.comparisons): records = cc._as_detailed_records for r in records: - r[ - "probability_two_random_records_match" - ] = self._probability_two_random_records_match + r["probability_two_random_records_match"] = ( + self._probability_two_random_records_match + ) r["comparison_sort_order"] = i output.extend(records) diff --git a/tests/test_full_example_athena.py b/tests/test_full_example_athena.py index 75dbe3a473..2578bc9966 100644 --- a/tests/test_full_example_athena.py +++ b/tests/test_full_example_athena.py @@ -27,8 +27,7 @@ except: # noqa # If InvalidTable cannot be imported, we need to create a temp value # to prevent an ImportError - class InvalidTable(Exception): - ... + class InvalidTable(Exception): ... # An import error is equivalent to a missing AWS connection aws_connection_valid = False diff --git a/tests/test_input_column.py b/tests/test_input_column.py index 149d1496b5..a9b554925a 100644 --- a/tests/test_input_column.py +++ b/tests/test_input_column.py @@ -183,7 +183,7 @@ def test_illegal_names_error(): "my test column", ) for name in odd_but_legal_names: - InputColumn(name).name_l + InputColumn(name).name_l # noqa: B018 # Check some illegal names we want to raise ParserErrors illegal_names = ('sur "name"', '"sur" name', '"sur" name[0]', "sur \"name\"['lat']")