Skip to content

Commit

Permalink
move further modules to internals
Browse files Browse the repository at this point in the history
  • Loading branch information
RobinL committed May 20, 2024
1 parent 1c02a03 commit 14d528e
Show file tree
Hide file tree
Showing 45 changed files with 53 additions and 132 deletions.
83 changes: 2 additions & 81 deletions docs/charts/missingness_chart.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -28,94 +28,15 @@
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"<style>\n",
" #altair-viz-529a742e56c04aa69ce439a4a1eda480.vega-embed {\n",
" width: 100%;\n",
" display: flex;\n",
" }\n",
"\n",
" #altair-viz-529a742e56c04aa69ce439a4a1eda480.vega-embed details,\n",
" #altair-viz-529a742e56c04aa69ce439a4a1eda480.vega-embed details summary {\n",
" position: relative;\n",
" }\n",
"</style>\n",
"<div id=\"altair-viz-529a742e56c04aa69ce439a4a1eda480\"></div>\n",
"<script type=\"text/javascript\">\n",
" var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
" (function(spec, embedOpt){\n",
" let outputDiv = document.currentScript.previousElementSibling;\n",
" if (outputDiv.id !== \"altair-viz-529a742e56c04aa69ce439a4a1eda480\") {\n",
" outputDiv = document.getElementById(\"altair-viz-529a742e56c04aa69ce439a4a1eda480\");\n",
" }\n",
" const paths = {\n",
" \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
" \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
" \"vega-lite\": \"https://cdn.jsdelivr.net/npm/[email protected]?noext\",\n",
" \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
" };\n",
"\n",
" function maybeLoadScript(lib, version) {\n",
" var key = `${lib.replace(\"-\", \"\")}_version`;\n",
" return (VEGA_DEBUG[key] == version) ?\n",
" Promise.resolve(paths[lib]) :\n",
" new Promise(function(resolve, reject) {\n",
" var s = document.createElement('script');\n",
" document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
" s.async = true;\n",
" s.onload = () => {\n",
" VEGA_DEBUG[key] = version;\n",
" return resolve(paths[lib]);\n",
" };\n",
" s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
" s.src = paths[lib];\n",
" });\n",
" }\n",
"\n",
" function showError(err) {\n",
" outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
" throw err;\n",
" }\n",
"\n",
" function displayChart(vegaEmbed) {\n",
" vegaEmbed(outputDiv, spec, embedOpt)\n",
" .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
" }\n",
"\n",
" if(typeof define === \"function\" && define.amd) {\n",
" requirejs.config({paths});\n",
" require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
" } else {\n",
" maybeLoadScript(\"vega\", \"5\")\n",
" .then(() => maybeLoadScript(\"vega-lite\", \"5.8.0\"))\n",
" .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
" .catch(showError)\n",
" .then(() => displayChart(vegaEmbed));\n",
" }\n",
" })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}, \"axis\": {\"labelFontSize\": 11}}, \"layer\": [{\"mark\": \"bar\", \"encoding\": {\"color\": {\"field\": \"null_proportion\", \"legend\": {\"format\": \".0%\", \"offset\": 30}, \"scale\": {\"domain\": [0, 1], \"range\": \"heatmap\"}, \"title\": \"Missingness\", \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"column_name\", \"title\": \"Column\", \"type\": \"nominal\"}, {\"field\": \"null_count\", \"format\": \",.0f\", \"title\": \"Count of nulls\", \"type\": \"quantitative\"}, {\"field\": \"null_proportion\", \"format\": \".2%\", \"title\": \"Percentage of nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_record_count\", \"format\": \",.0f\", \"title\": \"Total record count\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"format\": \"%\", \"labelAlign\": \"center\", \"title\": \"Percentage of nulls\"}, \"field\": \"null_proportion\", \"scale\": {\"domain\": [0, 1]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": \"\"}, \"field\": \"column_name\", \"sort\": \"-x\", \"type\": \"nominal\"}}, \"title\": \"Missingness per column out of 50,578 records\"}], \"data\": {\"values\": [{\"null_proportion\": 0.0, \"null_count\": 0, \"total_record_count\": 50578, \"column_name\": \"None\"}, {\"null_proportion\": 0.0, \"null_count\": 0, \"total_record_count\": 50578, \"column_name\": \"unique_id\"}, {\"null_proportion\": 0.0, \"null_count\": 0, \"total_record_count\": 50578, \"column_name\": \"cluster\"}, {\"null_proportion\": 0.0013247132301330566, \"null_count\": 67, \"total_record_count\": 50578, \"column_name\": \"full_name\"}, {\"null_proportion\": 0.0013247132301330566, \"null_count\": 67, \"total_record_count\": 50578, \"column_name\": \"first_and_surname\"}, {\"null_proportion\": 0.0013247132301330566, \"null_count\": 67, \"total_record_count\": 50578, \"column_name\": \"first_name\"}, {\"null_proportion\": 0.08926808834075928, \"null_count\": 4515, \"total_record_count\": 50578, \"column_name\": \"surname\"}, {\"null_proportion\": 0.22545373439788818, \"null_count\": 11403, \"total_record_count\": 50578, \"column_name\": \"dob\"}, {\"null_proportion\": 0.13695675134658813, \"null_count\": 6927, \"total_record_count\": 50578, \"column_name\": \"birth_place\"}, {\"null_proportion\": 0.22590851783752441, \"null_count\": 11426, \"total_record_count\": 50578, \"column_name\": \"postcode_fake\"}, {\"null_proportion\": 0.2220333218574524, \"null_count\": 11230, \"total_record_count\": 50578, \"column_name\": \"gender\"}, {\"null_proportion\": 0.5, \"null_count\": 25289, \"total_record_count\": 50578, \"column_name\": \"occupation\"}], \"name\": \"data-0e7bce5a1d2f132e282789d6ef7780fe\"}, \"title\": \"\", \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
"</script>"
],
"text/plain": [
"alt.LayerChart(...)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from splink.datasets import splink_datasets\n",
"from splink.duckdb.linker import DuckDBLinker\n",
"\n",
"df = splink_datasets.historical_50k\n",
"linker = DuckDBLinker(df)\n",
"linker.missingness_chart()"
]
],
"outputs": []
},
{
"attachments": {},
Expand Down
4 changes: 2 additions & 2 deletions splink/em_training_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@

from splink.internals.comparison import Comparison
from splink.internals.comparison_level import ComparisonLevel
from .comparison_vector_values import compute_comparison_vector_values_sql
from .constants import LEVEL_NOT_OBSERVED_TEXT
from splink.internals.comparison_vector_values import compute_comparison_vector_values_sql
from splink.internals.constants import LEVEL_NOT_OBSERVED_TEXT
from .database_api import DatabaseAPISubClass
from .exceptions import EMTrainingException
from .expectation_maximisation import expectation_maximisation
Expand Down
2 changes: 1 addition & 1 deletion splink/estimate_u.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from splink.internals.blocking import block_using_rules_sqls, blocking_rule_to_obj

from .comparison_vector_values import compute_comparison_vector_values_sql
from splink.internals.comparison_vector_values import compute_comparison_vector_values_sql
from .expectation_maximisation import (
compute_new_parameters_sql,
compute_proportions_for_new_parameters,
Expand Down
2 changes: 1 addition & 1 deletion splink/expectation_maximisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from splink.internals.comparison import Comparison
from splink.internals.comparison_level import ComparisonLevel
from .constants import LEVEL_NOT_OBSERVED_TEXT
from splink.internals.constants import LEVEL_NOT_OBSERVED_TEXT
from .database_api import DatabaseAPISubClass
from .input_column import InputColumn
from .m_u_records_to_parameters import m_u_records_to_lookup_dict
Expand Down
2 changes: 1 addition & 1 deletion splink/internals/accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from copy import deepcopy
from typing import TYPE_CHECKING

from splink.comparison_vector_values import compute_comparison_vector_values_sql
from splink.internals.comparison_vector_values import compute_comparison_vector_values_sql
from splink.internals.block_from_labels import block_from_labels
from splink.internals.blocking import BlockingRule
from splink.misc import calculate_cartesian
Expand Down
2 changes: 1 addition & 1 deletion splink/internals/comparison_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from sqlglot.optimizer.normalize import normalize
from sqlglot.optimizer.simplify import simplify

from splink.constants import LEVEL_NOT_OBSERVED_TEXT
from splink.internals.constants import LEVEL_NOT_OBSERVED_TEXT
from splink.input_column import InputColumn
from splink.misc import (
dedupe_preserving_order,
Expand Down
2 changes: 1 addition & 1 deletion splink/internals/comparison_level_library.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# import composition functions for export
from .comparison_level_composition import And, Not, Or # NOQA: F401
from .comparison_level_creator import ComparisonLevelCreator
from splink.comparison_level_sql import great_circle_distance_km_sql
from splink.internals.comparison_level_sql import great_circle_distance_km_sql
from splink.dialects import SplinkDialect

# type aliases:
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@

from typing import Any, Iterable, List, Optional, Union

from .internals import comparison_level_library as cll
from splink.internals import comparison_level_library as cll
from splink.internals.comparison_creator import ComparisonCreator
from splink.internals.comparison_level_creator import ComparisonLevelCreator
from splink.internals.comparison_level_library import CustomLevel, DateMetricType
from .misc import ensure_is_iterable
from splink.misc import ensure_is_iterable


class CustomComparison(ComparisonCreator):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@

from typing import List, Type, Union

from .internals import comparison_level_library as cll
from splink.internals import comparison_level_library as cll
from splink.internals.column_expression import ColumnExpression
from splink.internals.comparison_creator import ComparisonCreator
from splink.internals.comparison_level_creator import ComparisonLevelCreator
from splink.internals.comparison_level_library import DateMetricType
from .misc import ensure_is_iterable
from splink.misc import ensure_is_iterable

# alternatively we could stick an inheritance layer in these, just for typing:
_fuzzy_cll_type = Union[
Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,16 @@
import time
from typing import TYPE_CHECKING, Optional

from .input_column import InputColumn
from .pipeline import CTEPipeline
from .splink_dataframe import SplinkDataFrame
from .unique_id_concat import (
from splink.input_column import InputColumn
from splink.pipeline import CTEPipeline
from splink.splink_dataframe import SplinkDataFrame
from splink.unique_id_concat import (
_composite_unique_id_from_edges_sql,
_composite_unique_id_from_nodes_sql,
)

if TYPE_CHECKING:
from .linker import Linker
from splink.linker import Linker

logger = logging.getLogger(__name__)

Expand Down
File renamed without changes.
6 changes: 3 additions & 3 deletions splink/linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@

from splink.internals.comparison import Comparison
from splink.internals.comparison_level import ComparisonLevel
from .comparison_vector_distribution import (
from splink.internals.comparison_vector_distribution import (
comparison_vector_distribution_sql,
)
from .comparison_vector_values import compute_comparison_vector_values_sql
from .connected_components import (
from splink.internals.comparison_vector_values import compute_comparison_vector_values_sql
from splink.internals.connected_components import (
_cc_create_unique_id_cols,
solve_connected_components,
)
Expand Down
2 changes: 1 addition & 1 deletion splink/m_from_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from splink.internals.block_from_labels import block_from_labels

from .comparison_vector_values import compute_comparison_vector_values_sql
from splink.internals.comparison_vector_values import compute_comparison_vector_values_sql
from .expectation_maximisation import (
compute_new_parameters_sql,
compute_proportions_for_new_parameters,
Expand Down
2 changes: 1 addition & 1 deletion splink/m_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from splink.internals.blocking import BlockingRule, block_using_rules_sqls

from .comparison_vector_values import compute_comparison_vector_values_sql
from splink.internals.comparison_vector_values import compute_comparison_vector_values_sql
from .expectation_maximisation import (
compute_new_parameters_sql,
compute_proportions_for_new_parameters,
Expand Down
2 changes: 1 addition & 1 deletion splink/m_u_records_to_parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import Any, Dict, List

from splink.internals.comparison_level import ComparisonLevel
from .constants import LEVEL_NOT_OBSERVED_TEXT
from splink.internals.constants import LEVEL_NOT_OBSERVED_TEXT

logger = logging.getLogger(__name__)

Expand Down
2 changes: 1 addition & 1 deletion splink/settings_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from splink.internals.blocking_rule_creator_utils import to_blocking_rule_creator

from splink.internals.comparison_creator import ComparisonCreator
from .comparison_library import CustomComparison
from splink.internals.comparison_library import CustomComparison
from .settings import Settings


Expand Down
2 changes: 1 addition & 1 deletion tests/cc_testing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas as pd
from networkx.algorithms import connected_components as cc_nx

from splink.connected_components import solve_connected_components
from splink.internals.connected_components import solve_connected_components
from splink.duckdb.database_api import DuckDBAPI
from splink.duckdb.dataframe import DuckDBDataFrame
from splink.linker import Linker
Expand Down
2 changes: 1 addition & 1 deletion tests/test_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest

from splink import SettingsCreator
from splink.comparison_library import ExactMatch
from splink.internals.comparison_library import ExactMatch
from splink.duckdb.database_api import DuckDBAPI
from splink.internals.accuracy import (
predictions_from_sample_of_pairwise_labels_sql,
Expand Down
2 changes: 1 addition & 1 deletion tests/test_array_based_blocking.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pandas as pd

import splink.comparison_library as cl
import splink.internals.comparison_library as cl
from tests.decorator import mark_with_dialects_including


Expand Down
2 changes: 1 addition & 1 deletion tests/test_array_columns.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

import splink.comparison_library as cl
import splink.internals.comparison_library as cl
from tests.decorator import mark_with_dialects_excluding
from tests.literal_utils import (
ComparisonTestSpec,
Expand Down
2 changes: 1 addition & 1 deletion tests/test_caching_tables.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import duckdb
import pandas as pd

from splink.comparison_library import ExactMatch, LevenshteinAtThresholds
from splink.internals.comparison_library import ExactMatch, LevenshteinAtThresholds
from splink.duckdb.database_api import DuckDBAPI
from splink.linker import Linker

Expand Down
2 changes: 1 addition & 1 deletion tests/test_charts.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
import pytest

import splink.comparison_library as cl
import splink.internals.comparison_library as cl
from splink.duckdb.database_api import DuckDBAPI
from splink.linker import Linker

Expand Down
2 changes: 1 addition & 1 deletion tests/test_comparison_lib.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pandas as pd

import splink.comparison_library as cl
import splink.internals.comparison_library as cl
from splink.internals.column_expression import ColumnExpression
from splink.duckdb.database_api import DuckDBAPI
from splink.linker import Linker
Expand Down
2 changes: 1 addition & 1 deletion tests/test_comparison_template_lib.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pandas as pd

import splink.comparison_template_library as ctl
import splink.internals.comparison_template_library as ctl

from .decorator import mark_with_dialects_excluding

Expand Down
2 changes: 1 addition & 1 deletion tests/test_compound_comparison_levels.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd

import splink.internals.comparison_level_library as cll
import splink.comparison_library as cl
import splink.internals.comparison_library as cl
from splink.duckdb.database_api import DuckDBAPI
from splink.linker import Linker

Expand Down
2 changes: 1 addition & 1 deletion tests/test_correctness_of_convergence.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
import pandas as pd
import pytest

import splink.comparison_library as cl
import splink.internals.comparison_library as cl
from splink.duckdb.database_api import DuckDBAPI
from splink.duckdb.dataframe import DuckDBDataFrame
from splink.em_training_session import EMTrainingSession
Expand Down
4 changes: 2 additions & 2 deletions tests/test_date_levels_and_comparisons.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import pytest

import splink.internals.comparison_level_library as cll
import splink.comparison_library as cl
import splink.comparison_template_library as ctl
import splink.internals.comparison_library as cl
import splink.internals.comparison_template_library as ctl
from splink.internals.column_expression import ColumnExpression
from tests.decorator import mark_with_dialects_excluding, mark_with_dialects_including
from tests.literal_utils import (
Expand Down
2 changes: 1 addition & 1 deletion tests/test_expectation_maximisation.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
import pytest

import splink.comparison_library as cl
import splink.internals.comparison_library as cl
from splink.duckdb.database_api import DuckDBAPI
from splink.exceptions import EMTrainingException
from splink.linker import Linker
Expand Down
2 changes: 1 addition & 1 deletion tests/test_find_new_matches.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pandas as pd

import splink.comparison_library as cl
import splink.internals.comparison_library as cl
from splink.internals.blocking_rule_library import block_on
from splink.pipeline import CTEPipeline
from splink.vertically_concatenate import compute_df_concat_with_tf
Expand Down
2 changes: 1 addition & 1 deletion tests/test_full_example_duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pytest

import splink.internals.comparison_level_library as cll
import splink.comparison_library as cl
import splink.internals.comparison_library as cl
from splink.blocking_analysis import count_comparisons_from_blocking_rule
from splink.duckdb.database_api import DuckDBAPI
from splink.exploratory import completeness_chart, profile_columns
Expand Down
2 changes: 1 addition & 1 deletion tests/test_full_example_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pyspark.sql.types import StringType, StructField, StructType

import splink.internals.comparison_level_library as cll
import splink.comparison_library as cl
import splink.internals.comparison_library as cl
from splink.exploratory import completeness_chart, profile_columns
from splink.linker import Linker
from splink.spark.database_api import SparkAPI
Expand Down
2 changes: 1 addition & 1 deletion tests/test_graph_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pandas.testing import assert_frame_equal
from pytest import approx, raises

from splink.comparison_library import ExactMatch
from splink.internals.comparison_library import ExactMatch
from splink.duckdb.database_api import DuckDBAPI
from splink.linker import Linker

Expand Down
Loading

0 comments on commit 14d528e

Please sign in to comment.