Skip to content

Commit

Permalink
move accuracy to internals
Browse files Browse the repository at this point in the history
  • Loading branch information
RobinL committed May 20, 2024
1 parent 520ced7 commit c06e812
Show file tree
Hide file tree
Showing 34 changed files with 3,151 additions and 15,146 deletions.
83 changes: 2 additions & 81 deletions docs/charts/accuracy_chart_from_labels_table.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions docs/charts/cluster_studio_dashboard.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from splink.duckdb.linker import DuckDBLinker\n",
"import splink.duckdb.comparison_library as cl\n",
Expand Down Expand Up @@ -79,7 +78,8 @@
"IFrame(\n",
" src=\"./img/cluster_studio.html\", width=\"100%\", height=1200\n",
")"
]
],
"outputs": []
},
{
"attachments": {},
Expand Down
41 changes: 2 additions & 39 deletions docs/charts/comparison_viewer_dashboard.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -30,44 +30,6 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4007ece5fbbb449f92d734eb3e7e7bba",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"100%\"\n",
" height=\"1200\"\n",
" src=\"./img/scv.html\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" \n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x7fba792fd8b0>"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from splink.duckdb.linker import DuckDBLinker\n",
"import splink.duckdb.comparison_library as cl\n",
Expand Down Expand Up @@ -115,7 +77,8 @@
"IFrame(\n",
" src=\"./img/scv.html\", width=\"100%\", height=1200\n",
") \n"
]
],
"outputs": []
},
{
"attachments": {},
Expand Down
83 changes: 2 additions & 81 deletions docs/charts/completeness_chart.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -27,86 +27,6 @@
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"<style>\n",
" #altair-viz-794f04df820342ad8cb562508ffae4fd.vega-embed {\n",
" width: 100%;\n",
" display: flex;\n",
" }\n",
"\n",
" #altair-viz-794f04df820342ad8cb562508ffae4fd.vega-embed details,\n",
" #altair-viz-794f04df820342ad8cb562508ffae4fd.vega-embed details summary {\n",
" position: relative;\n",
" }\n",
"</style>\n",
"<div id=\"altair-viz-794f04df820342ad8cb562508ffae4fd\"></div>\n",
"<script type=\"text/javascript\">\n",
" var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
" (function(spec, embedOpt){\n",
" let outputDiv = document.currentScript.previousElementSibling;\n",
" if (outputDiv.id !== \"altair-viz-794f04df820342ad8cb562508ffae4fd\") {\n",
" outputDiv = document.getElementById(\"altair-viz-794f04df820342ad8cb562508ffae4fd\");\n",
" }\n",
" const paths = {\n",
" \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
" \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
" \"vega-lite\": \"https://cdn.jsdelivr.net/npm/[email protected]?noext\",\n",
" \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
" };\n",
"\n",
" function maybeLoadScript(lib, version) {\n",
" var key = `${lib.replace(\"-\", \"\")}_version`;\n",
" return (VEGA_DEBUG[key] == version) ?\n",
" Promise.resolve(paths[lib]) :\n",
" new Promise(function(resolve, reject) {\n",
" var s = document.createElement('script');\n",
" document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
" s.async = true;\n",
" s.onload = () => {\n",
" VEGA_DEBUG[key] = version;\n",
" return resolve(paths[lib]);\n",
" };\n",
" s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
" s.src = paths[lib];\n",
" });\n",
" }\n",
"\n",
" function showError(err) {\n",
" outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
" throw err;\n",
" }\n",
"\n",
" function displayChart(vegaEmbed) {\n",
" vegaEmbed(outputDiv, spec, embedOpt)\n",
" .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
" }\n",
"\n",
" if(typeof define === \"function\" && define.amd) {\n",
" requirejs.config({paths});\n",
" require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
" } else {\n",
" maybeLoadScript(\"vega\", \"5\")\n",
" .then(() => maybeLoadScript(\"vega-lite\", \"5.8.0\"))\n",
" .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
" .catch(showError)\n",
" .then(() => displayChart(vegaEmbed));\n",
" }\n",
" })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"mark\": \"rect\", \"encoding\": {\"color\": {\"field\": \"completeness\", \"legend\": null, \"scale\": {\"scheme\": \"darkred\", \"zero\": true}, \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}, {\"field\": \"total_rows_inc_nulls\", \"format\": \",\", \"title\": \"# of records\", \"type\": \"quantitative\"}, {\"field\": \"column_name\", \"title\": \"Column name\", \"type\": \"nominal\"}, {\"field\": \"total_null_rows\", \"format\": \",\", \"title\": \"# of nulls\", \"type\": \"quantitative\"}, {\"field\": \"completeness\", \"format\": \".1%\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"labelAngle\": 20}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"title\": \"Column name\", \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}}, \"title\": \"Column completeness by source dataset\", \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}, {\"mark\": {\"type\": \"text\"}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"datum['completeness'] < 0.5\", \"value\": \"white\"}, \"value\": \"black\"}, \"text\": {\"field\": \"completeness\", \"format\": \".0%\", \"type\": \"quantitative\"}, \"x\": {\"axis\": {\"labelAngle\": 0}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"type\": \"nominal\"}}, \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}], \"data\": {\"name\": \"data-11261f6de8ec59fe3e6da9e6afeaddcc\"}, \"height\": {\"step\": 40}, \"width\": {\"step\": 40}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-11261f6de8ec59fe3e6da9e6afeaddcc\": [{\"source_dataset\": \"df_left\", \"column_name\": \"first_name\", \"total_null_rows\": 79, \"total_rows_inc_nulls\": 500, \"completeness\": 0.8420000076293945}, {\"source_dataset\": \"df_right\", \"column_name\": \"first_name\", \"total_null_rows\": 90, \"total_rows_inc_nulls\": 500, \"completeness\": 0.8199999928474426}, {\"source_dataset\": \"df_left\", \"column_name\": \"surname\", \"total_null_rows\": 93, \"total_rows_inc_nulls\": 500, \"completeness\": 0.8140000104904175}, {\"source_dataset\": \"df_right\", \"column_name\": \"surname\", \"total_null_rows\": 88, \"total_rows_inc_nulls\": 500, \"completeness\": 0.8240000009536743}, {\"source_dataset\": \"df_left\", \"column_name\": \"dob\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 500, \"completeness\": 1.0}, {\"source_dataset\": \"df_right\", \"column_name\": \"dob\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 500, \"completeness\": 1.0}, {\"source_dataset\": \"df_left\", \"column_name\": \"city\", \"total_null_rows\": 93, \"total_rows_inc_nulls\": 500, \"completeness\": 0.8140000104904175}, {\"source_dataset\": \"df_right\", \"column_name\": \"city\", \"total_null_rows\": 94, \"total_rows_inc_nulls\": 500, \"completeness\": 0.8119999766349792}, {\"source_dataset\": \"df_left\", \"column_name\": \"email\", \"total_null_rows\": 109, \"total_rows_inc_nulls\": 500, \"completeness\": 0.7820000052452087}, {\"source_dataset\": \"df_right\", \"column_name\": \"email\", \"total_null_rows\": 102, \"total_rows_inc_nulls\": 500, \"completeness\": 0.7960000038146973}]}}, {\"mode\": \"vega-lite\"});\n",
"</script>"
],
"text/plain": [
"alt.LayerChart(...)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from splink.duckdb.linker import DuckDBLinker\n",
"import splink.duckdb.comparison_library as cl\n",
Expand Down Expand Up @@ -138,7 +58,8 @@
"linker = DuckDBLinker([df_l, df_r], settings, input_table_aliases=[\"df_left\", \"df_right\"])\n",
"\n",
"linker.completeness_chart(cols=[\"first_name\", \"surname\", \"dob\", \"city\", \"email\"])"
]
],
"outputs": []
},
{
"attachments": {},
Expand Down
Loading

0 comments on commit c06e812

Please sign in to comment.