diff --git a/docs/charts/missingness_chart.ipynb b/docs/charts/missingness_chart.ipynb index 75a8e91148..eb9487bfc7 100644 --- a/docs/charts/missingness_chart.ipynb +++ b/docs/charts/missingness_chart.ipynb @@ -28,86 +28,6 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "from splink.datasets import splink_datasets\n", "from splink.duckdb.linker import DuckDBLinker\n", @@ -115,7 +35,8 @@ "df = splink_datasets.historical_50k\n", "linker = DuckDBLinker(df)\n", "linker.missingness_chart()" - ] + ], + "outputs": [] }, { "attachments": {}, diff --git a/splink/em_training_session.py b/splink/em_training_session.py index 3edc8b03dd..84a0c61072 100644 --- a/splink/em_training_session.py +++ b/splink/em_training_session.py @@ -12,8 +12,8 @@ from splink.internals.comparison import Comparison from splink.internals.comparison_level import ComparisonLevel -from .comparison_vector_values import compute_comparison_vector_values_sql -from .constants import LEVEL_NOT_OBSERVED_TEXT +from splink.internals.comparison_vector_values import compute_comparison_vector_values_sql +from splink.internals.constants import LEVEL_NOT_OBSERVED_TEXT from .database_api import DatabaseAPISubClass from .exceptions import EMTrainingException from .expectation_maximisation import expectation_maximisation diff --git a/splink/estimate_u.py b/splink/estimate_u.py index 5c0362361a..bdc574f2d7 100644 --- a/splink/estimate_u.py +++ b/splink/estimate_u.py @@ -7,7 +7,7 @@ from splink.internals.blocking import block_using_rules_sqls, blocking_rule_to_obj -from .comparison_vector_values import compute_comparison_vector_values_sql +from splink.internals.comparison_vector_values import compute_comparison_vector_values_sql from .expectation_maximisation import ( compute_new_parameters_sql, compute_proportions_for_new_parameters, diff --git a/splink/expectation_maximisation.py b/splink/expectation_maximisation.py index a5d1b678f9..7cf3efdb7e 100644 --- a/splink/expectation_maximisation.py +++ b/splink/expectation_maximisation.py @@ -8,7 +8,7 @@ from splink.internals.comparison import Comparison from splink.internals.comparison_level import ComparisonLevel -from .constants import LEVEL_NOT_OBSERVED_TEXT +from splink.internals.constants import LEVEL_NOT_OBSERVED_TEXT from .database_api import DatabaseAPISubClass from .input_column import InputColumn from .m_u_records_to_parameters import m_u_records_to_lookup_dict diff --git a/splink/internals/accuracy.py b/splink/internals/accuracy.py index cf76d243b3..cbdaf4155c 100644 --- a/splink/internals/accuracy.py +++ b/splink/internals/accuracy.py @@ -3,7 +3,7 @@ from copy import deepcopy from typing import TYPE_CHECKING -from splink.comparison_vector_values import compute_comparison_vector_values_sql +from splink.internals.comparison_vector_values import compute_comparison_vector_values_sql from splink.internals.block_from_labels import block_from_labels from splink.internals.blocking import BlockingRule from splink.misc import calculate_cartesian diff --git a/splink/internals/comparison_level.py b/splink/internals/comparison_level.py index 63cbd1890f..452dada22f 100644 --- a/splink/internals/comparison_level.py +++ b/splink/internals/comparison_level.py @@ -13,7 +13,7 @@ from sqlglot.optimizer.normalize import normalize from sqlglot.optimizer.simplify import simplify -from splink.constants import LEVEL_NOT_OBSERVED_TEXT +from splink.internals.constants import LEVEL_NOT_OBSERVED_TEXT from splink.input_column import InputColumn from splink.misc import ( dedupe_preserving_order, diff --git a/splink/internals/comparison_level_library.py b/splink/internals/comparison_level_library.py index ad0aa2a2b7..2cb7470e26 100644 --- a/splink/internals/comparison_level_library.py +++ b/splink/internals/comparison_level_library.py @@ -11,7 +11,7 @@ # import composition functions for export from .comparison_level_composition import And, Not, Or # NOQA: F401 from .comparison_level_creator import ComparisonLevelCreator -from splink.comparison_level_sql import great_circle_distance_km_sql +from splink.internals.comparison_level_sql import great_circle_distance_km_sql from splink.dialects import SplinkDialect # type aliases: diff --git a/splink/comparison_level_sql.py b/splink/internals/comparison_level_sql.py similarity index 100% rename from splink/comparison_level_sql.py rename to splink/internals/comparison_level_sql.py diff --git a/splink/comparison_library.py b/splink/internals/comparison_library.py similarity index 99% rename from splink/comparison_library.py rename to splink/internals/comparison_library.py index 8be9600e12..accc4c5ac0 100644 --- a/splink/comparison_library.py +++ b/splink/internals/comparison_library.py @@ -2,11 +2,11 @@ from typing import Any, Iterable, List, Optional, Union -from .internals import comparison_level_library as cll +from splink.internals import comparison_level_library as cll from splink.internals.comparison_creator import ComparisonCreator from splink.internals.comparison_level_creator import ComparisonLevelCreator from splink.internals.comparison_level_library import CustomLevel, DateMetricType -from .misc import ensure_is_iterable +from splink.misc import ensure_is_iterable class CustomComparison(ComparisonCreator): diff --git a/splink/comparison_template_library.py b/splink/internals/comparison_template_library.py similarity index 99% rename from splink/comparison_template_library.py rename to splink/internals/comparison_template_library.py index ba7d74dd00..5bd632fe44 100644 --- a/splink/comparison_template_library.py +++ b/splink/internals/comparison_template_library.py @@ -2,12 +2,12 @@ from typing import List, Type, Union -from .internals import comparison_level_library as cll +from splink.internals import comparison_level_library as cll from splink.internals.column_expression import ColumnExpression from splink.internals.comparison_creator import ComparisonCreator from splink.internals.comparison_level_creator import ComparisonLevelCreator from splink.internals.comparison_level_library import DateMetricType -from .misc import ensure_is_iterable +from splink.misc import ensure_is_iterable # alternatively we could stick an inheritance layer in these, just for typing: _fuzzy_cll_type = Union[ diff --git a/splink/comparison_vector_distribution.py b/splink/internals/comparison_vector_distribution.py similarity index 100% rename from splink/comparison_vector_distribution.py rename to splink/internals/comparison_vector_distribution.py diff --git a/splink/comparison_vector_values.py b/splink/internals/comparison_vector_values.py similarity index 100% rename from splink/comparison_vector_values.py rename to splink/internals/comparison_vector_values.py diff --git a/splink/connected_components.py b/splink/internals/connected_components.py similarity index 98% rename from splink/connected_components.py rename to splink/internals/connected_components.py index a771aff7f7..7e0c48e10b 100644 --- a/splink/connected_components.py +++ b/splink/internals/connected_components.py @@ -13,16 +13,16 @@ import time from typing import TYPE_CHECKING, Optional -from .input_column import InputColumn -from .pipeline import CTEPipeline -from .splink_dataframe import SplinkDataFrame -from .unique_id_concat import ( +from splink.input_column import InputColumn +from splink.pipeline import CTEPipeline +from splink.splink_dataframe import SplinkDataFrame +from splink.unique_id_concat import ( _composite_unique_id_from_edges_sql, _composite_unique_id_from_nodes_sql, ) if TYPE_CHECKING: - from .linker import Linker + from splink.linker import Linker logger = logging.getLogger(__name__) diff --git a/splink/constants.py b/splink/internals/constants.py similarity index 100% rename from splink/constants.py rename to splink/internals/constants.py diff --git a/splink/linker.py b/splink/linker.py index 54713758ac..8ac349afd0 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -42,11 +42,11 @@ from splink.internals.comparison import Comparison from splink.internals.comparison_level import ComparisonLevel -from .comparison_vector_distribution import ( +from splink.internals.comparison_vector_distribution import ( comparison_vector_distribution_sql, ) -from .comparison_vector_values import compute_comparison_vector_values_sql -from .connected_components import ( +from splink.internals.comparison_vector_values import compute_comparison_vector_values_sql +from splink.internals.connected_components import ( _cc_create_unique_id_cols, solve_connected_components, ) diff --git a/splink/m_from_labels.py b/splink/m_from_labels.py index 5ec10c443f..1dd8840dd3 100644 --- a/splink/m_from_labels.py +++ b/splink/m_from_labels.py @@ -2,7 +2,7 @@ from splink.internals.block_from_labels import block_from_labels -from .comparison_vector_values import compute_comparison_vector_values_sql +from splink.internals.comparison_vector_values import compute_comparison_vector_values_sql from .expectation_maximisation import ( compute_new_parameters_sql, compute_proportions_for_new_parameters, diff --git a/splink/m_training.py b/splink/m_training.py index f1c6ce68ca..b56e9617c9 100644 --- a/splink/m_training.py +++ b/splink/m_training.py @@ -3,7 +3,7 @@ from splink.internals.blocking import BlockingRule, block_using_rules_sqls -from .comparison_vector_values import compute_comparison_vector_values_sql +from splink.internals.comparison_vector_values import compute_comparison_vector_values_sql from .expectation_maximisation import ( compute_new_parameters_sql, compute_proportions_for_new_parameters, diff --git a/splink/m_u_records_to_parameters.py b/splink/m_u_records_to_parameters.py index 91376ebc2d..815527c449 100644 --- a/splink/m_u_records_to_parameters.py +++ b/splink/m_u_records_to_parameters.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List from splink.internals.comparison_level import ComparisonLevel -from .constants import LEVEL_NOT_OBSERVED_TEXT +from splink.internals.constants import LEVEL_NOT_OBSERVED_TEXT logger = logging.getLogger(__name__) diff --git a/splink/settings_creator.py b/splink/settings_creator.py index 1c9c9559d1..6fc70926dd 100644 --- a/splink/settings_creator.py +++ b/splink/settings_creator.py @@ -10,7 +10,7 @@ from splink.internals.blocking_rule_creator_utils import to_blocking_rule_creator from splink.internals.comparison_creator import ComparisonCreator -from .comparison_library import CustomComparison +from splink.internals.comparison_library import CustomComparison from .settings import Settings diff --git a/tests/cc_testing_utils.py b/tests/cc_testing_utils.py index 94ef600f81..c3ebbdab7d 100644 --- a/tests/cc_testing_utils.py +++ b/tests/cc_testing_utils.py @@ -4,7 +4,7 @@ import pandas as pd from networkx.algorithms import connected_components as cc_nx -from splink.connected_components import solve_connected_components +from splink.internals.connected_components import solve_connected_components from splink.duckdb.database_api import DuckDBAPI from splink.duckdb.dataframe import DuckDBDataFrame from splink.linker import Linker diff --git a/tests/test_accuracy.py b/tests/test_accuracy.py index 58b2ec825a..3c1d229931 100644 --- a/tests/test_accuracy.py +++ b/tests/test_accuracy.py @@ -3,7 +3,7 @@ import pytest from splink import SettingsCreator -from splink.comparison_library import ExactMatch +from splink.internals.comparison_library import ExactMatch from splink.duckdb.database_api import DuckDBAPI from splink.internals.accuracy import ( predictions_from_sample_of_pairwise_labels_sql, diff --git a/tests/test_array_based_blocking.py b/tests/test_array_based_blocking.py index f431514de2..3d0c728b03 100644 --- a/tests/test_array_based_blocking.py +++ b/tests/test_array_based_blocking.py @@ -2,7 +2,7 @@ import pandas as pd -import splink.comparison_library as cl +import splink.internals.comparison_library as cl from tests.decorator import mark_with_dialects_including diff --git a/tests/test_array_columns.py b/tests/test_array_columns.py index 7e379c58ca..13ef56bb7e 100644 --- a/tests/test_array_columns.py +++ b/tests/test_array_columns.py @@ -1,6 +1,6 @@ import pytest -import splink.comparison_library as cl +import splink.internals.comparison_library as cl from tests.decorator import mark_with_dialects_excluding from tests.literal_utils import ( ComparisonTestSpec, diff --git a/tests/test_caching_tables.py b/tests/test_caching_tables.py index 5d96459156..0f205b9ffc 100644 --- a/tests/test_caching_tables.py +++ b/tests/test_caching_tables.py @@ -1,7 +1,7 @@ import duckdb import pandas as pd -from splink.comparison_library import ExactMatch, LevenshteinAtThresholds +from splink.internals.comparison_library import ExactMatch, LevenshteinAtThresholds from splink.duckdb.database_api import DuckDBAPI from splink.linker import Linker diff --git a/tests/test_charts.py b/tests/test_charts.py index 5c200c3f46..24bcc6b1f9 100644 --- a/tests/test_charts.py +++ b/tests/test_charts.py @@ -1,7 +1,7 @@ import pandas as pd import pytest -import splink.comparison_library as cl +import splink.internals.comparison_library as cl from splink.duckdb.database_api import DuckDBAPI from splink.linker import Linker diff --git a/tests/test_comparison_lib.py b/tests/test_comparison_lib.py index d975d10952..bf24eb7fb8 100644 --- a/tests/test_comparison_lib.py +++ b/tests/test_comparison_lib.py @@ -1,6 +1,6 @@ import pandas as pd -import splink.comparison_library as cl +import splink.internals.comparison_library as cl from splink.internals.column_expression import ColumnExpression from splink.duckdb.database_api import DuckDBAPI from splink.linker import Linker diff --git a/tests/test_comparison_template_lib.py b/tests/test_comparison_template_lib.py index 52909db109..69e297b78b 100644 --- a/tests/test_comparison_template_lib.py +++ b/tests/test_comparison_template_lib.py @@ -1,6 +1,6 @@ import pandas as pd -import splink.comparison_template_library as ctl +import splink.internals.comparison_template_library as ctl from .decorator import mark_with_dialects_excluding diff --git a/tests/test_compound_comparison_levels.py b/tests/test_compound_comparison_levels.py index ee802575e4..f605b4570e 100644 --- a/tests/test_compound_comparison_levels.py +++ b/tests/test_compound_comparison_levels.py @@ -1,7 +1,7 @@ import pandas as pd import splink.internals.comparison_level_library as cll -import splink.comparison_library as cl +import splink.internals.comparison_library as cl from splink.duckdb.database_api import DuckDBAPI from splink.linker import Linker diff --git a/tests/test_correctness_of_convergence.py b/tests/test_correctness_of_convergence.py index 58c9d789ba..161121817d 100644 --- a/tests/test_correctness_of_convergence.py +++ b/tests/test_correctness_of_convergence.py @@ -36,7 +36,7 @@ import pandas as pd import pytest -import splink.comparison_library as cl +import splink.internals.comparison_library as cl from splink.duckdb.database_api import DuckDBAPI from splink.duckdb.dataframe import DuckDBDataFrame from splink.em_training_session import EMTrainingSession diff --git a/tests/test_date_levels_and_comparisons.py b/tests/test_date_levels_and_comparisons.py index 199bd2b1c0..a9ba3d3eef 100644 --- a/tests/test_date_levels_and_comparisons.py +++ b/tests/test_date_levels_and_comparisons.py @@ -3,8 +3,8 @@ import pytest import splink.internals.comparison_level_library as cll -import splink.comparison_library as cl -import splink.comparison_template_library as ctl +import splink.internals.comparison_library as cl +import splink.internals.comparison_template_library as ctl from splink.internals.column_expression import ColumnExpression from tests.decorator import mark_with_dialects_excluding, mark_with_dialects_including from tests.literal_utils import ( diff --git a/tests/test_expectation_maximisation.py b/tests/test_expectation_maximisation.py index 2b9f6df71c..c68fb2569d 100644 --- a/tests/test_expectation_maximisation.py +++ b/tests/test_expectation_maximisation.py @@ -1,7 +1,7 @@ import pandas as pd import pytest -import splink.comparison_library as cl +import splink.internals.comparison_library as cl from splink.duckdb.database_api import DuckDBAPI from splink.exceptions import EMTrainingException from splink.linker import Linker diff --git a/tests/test_find_new_matches.py b/tests/test_find_new_matches.py index e1db050b23..2fb1847bd7 100644 --- a/tests/test_find_new_matches.py +++ b/tests/test_find_new_matches.py @@ -2,7 +2,7 @@ import pandas as pd -import splink.comparison_library as cl +import splink.internals.comparison_library as cl from splink.internals.blocking_rule_library import block_on from splink.pipeline import CTEPipeline from splink.vertically_concatenate import compute_df_concat_with_tf diff --git a/tests/test_full_example_duckdb.py b/tests/test_full_example_duckdb.py index afe2175a58..e4f212e4ba 100644 --- a/tests/test_full_example_duckdb.py +++ b/tests/test_full_example_duckdb.py @@ -7,7 +7,7 @@ import pytest import splink.internals.comparison_level_library as cll -import splink.comparison_library as cl +import splink.internals.comparison_library as cl from splink.blocking_analysis import count_comparisons_from_blocking_rule from splink.duckdb.database_api import DuckDBAPI from splink.exploratory import completeness_chart, profile_columns diff --git a/tests/test_full_example_spark.py b/tests/test_full_example_spark.py index 9f52b481c7..d55561ccd2 100644 --- a/tests/test_full_example_spark.py +++ b/tests/test_full_example_spark.py @@ -6,7 +6,7 @@ from pyspark.sql.types import StringType, StructField, StructType import splink.internals.comparison_level_library as cll -import splink.comparison_library as cl +import splink.internals.comparison_library as cl from splink.exploratory import completeness_chart, profile_columns from splink.linker import Linker from splink.spark.database_api import SparkAPI diff --git a/tests/test_graph_metrics.py b/tests/test_graph_metrics.py index e4b16e724d..5af7b31d64 100644 --- a/tests/test_graph_metrics.py +++ b/tests/test_graph_metrics.py @@ -4,7 +4,7 @@ from pandas.testing import assert_frame_equal from pytest import approx, raises -from splink.comparison_library import ExactMatch +from splink.internals.comparison_library import ExactMatch from splink.duckdb.database_api import DuckDBAPI from splink.linker import Linker diff --git a/tests/test_join_type_for_estimate_u_and_predict_are_efficient.py b/tests/test_join_type_for_estimate_u_and_predict_are_efficient.py index 5532c201f7..03a14051c7 100644 --- a/tests/test_join_type_for_estimate_u_and_predict_are_efficient.py +++ b/tests/test_join_type_for_estimate_u_and_predict_are_efficient.py @@ -3,7 +3,7 @@ import pandas as pd -import splink.comparison_library as cl +import splink.internals.comparison_library as cl from splink.duckdb.database_api import DuckDBAPI from splink.linker import Linker diff --git a/tests/test_km_distance_level.py b/tests/test_km_distance_level.py index 862b70b234..64f455d4e9 100644 --- a/tests/test_km_distance_level.py +++ b/tests/test_km_distance_level.py @@ -1,7 +1,7 @@ import pandas as pd import splink.internals.comparison_level_library as cll -import splink.comparison_library as cl +import splink.internals.comparison_library as cl from splink.duckdb.database_api import DuckDBAPI from splink.linker import Linker diff --git a/tests/test_linker_variants.py b/tests/test_linker_variants.py index 44440e42de..8e973b0ce2 100644 --- a/tests/test_linker_variants.py +++ b/tests/test_linker_variants.py @@ -2,7 +2,7 @@ import pandas as pd -from splink.comparison_library import ExactMatch +from splink.internals.comparison_library import ExactMatch from splink.duckdb.database_api import DuckDBAPI from splink.linker import Linker diff --git a/tests/test_m_train.py b/tests/test_m_train.py index 1d2c8589d1..4a80d91c4b 100644 --- a/tests/test_m_train.py +++ b/tests/test_m_train.py @@ -1,6 +1,6 @@ import pandas as pd -from splink.comparison_library import LevenshteinAtThresholds +from splink.internals.comparison_library import LevenshteinAtThresholds from splink.duckdb.database_api import DuckDBAPI from splink.linker import Linker diff --git a/tests/test_new_comparison_levels.py b/tests/test_new_comparison_levels.py index 4153256733..4059d19e4a 100644 --- a/tests/test_new_comparison_levels.py +++ b/tests/test_new_comparison_levels.py @@ -4,8 +4,8 @@ import pytest import splink.internals.comparison_level_library as cll -import splink.comparison_library as cl -import splink.comparison_template_library as ctl +import splink.internals.comparison_library as cl +import splink.internals.comparison_template_library as ctl from splink.internals.column_expression import ColumnExpression from .decorator import mark_with_dialects_excluding diff --git a/tests/test_new_db_api.py b/tests/test_new_db_api.py index 9c15eddb54..ad913540a1 100644 --- a/tests/test_new_db_api.py +++ b/tests/test_new_db_api.py @@ -1,7 +1,7 @@ import os import splink.internals.comparison_level_library as cll -import splink.comparison_library as cl +import splink.internals.comparison_library as cl from splink import block_on from splink.blocking_analysis import ( cumulative_comparisons_to_be_scored_from_blocking_rules_chart, diff --git a/tests/test_settings_options.py b/tests/test_settings_options.py index f04ab9754c..d6f35f5ac9 100644 --- a/tests/test_settings_options.py +++ b/tests/test_settings_options.py @@ -2,7 +2,7 @@ import pandas as pd -import splink.comparison_library as cl +import splink.internals.comparison_library as cl from splink import block_on from .decorator import mark_with_dialects_excluding diff --git a/tests/test_settings_validation.py b/tests/test_settings_validation.py index 744638c5f4..7433f564e1 100644 --- a/tests/test_settings_validation.py +++ b/tests/test_settings_validation.py @@ -3,7 +3,7 @@ import pandas as pd import pytest -from splink.comparison_library import CustomComparison, LevenshteinAtThresholds +from splink.internals.comparison_library import CustomComparison, LevenshteinAtThresholds from splink.duckdb.database_api import DuckDBAPI from splink.internals.blocking_rule_library import block_on from splink.linker import Linker diff --git a/tests/test_splink_datasets.py b/tests/test_splink_datasets.py index c57a480cbe..26fe6c49ef 100644 --- a/tests/test_splink_datasets.py +++ b/tests/test_splink_datasets.py @@ -1,4 +1,4 @@ -import splink.comparison_library as cl +import splink.internals.comparison_library as cl from splink.datasets import splink_datasets diff --git a/tests/test_u_train.py b/tests/test_u_train.py index ac3310c177..3dd1496ced 100644 --- a/tests/test_u_train.py +++ b/tests/test_u_train.py @@ -3,7 +3,7 @@ import pandas as pd import pytest -import splink.comparison_library as cl +import splink.internals.comparison_library as cl from splink.estimate_u import _proportion_sample_size_link_only from splink.pipeline import CTEPipeline from tests.decorator import mark_with_dialects_excluding