From 4f6b8dc2d7c7af070d78d3821e09b7651ed7c4e2 Mon Sep 17 00:00:00 2001 From: Dazhong Xia Date: Tue, 12 Dec 2023 16:12:09 -0500 Subject: [PATCH] Table diff tools (#3128) * Diff tables' individual values instead of by row * Add a simple notebook that actually runs some diffs. * Add empty-case test + some docs in notebook. --------- Co-authored-by: Zane Selvans --- devtools/sqlite-table-diff.ipynb | 102 +++++++++++++++++++++++++++++++ src/pudl/helpers.py | 51 +++++++++++++++- test/unit/helpers_test.py | 66 ++++++++++++++++++++ 3 files changed, 218 insertions(+), 1 deletion(-) create mode 100644 devtools/sqlite-table-diff.ipynb diff --git a/devtools/sqlite-table-diff.ipynb b/devtools/sqlite-table-diff.ipynb new file mode 100644 index 0000000000..a5fef66f71 --- /dev/null +++ b/devtools/sqlite-table-diff.ipynb @@ -0,0 +1,102 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Example of diffing tables across multiple different SQLite DBs.\n", + "\n", + "The tables must have the same name/schema. This is intended for use in\n", + "investigating validation test errors.\n", + "\"\"\"\n", + "import sqlite3\n", + "from pathlib import Path\n", + "from typing import Iterable\n", + "\n", + "import pandas as pd\n", + "\n", + "from pudl.helpers import diff_wide_tables, TableDiff\n", + "from pudl.metadata.classes import Resource\n", + "from pudl.metadata.fields import apply_pudl_dtypes\n", + "\n", + "\n", + "def table_diff(\n", + " table_name: str,\n", + " old_conn,\n", + " new_conn,\n", + " ignore_cols: Iterable[str] = (\"plant_id_ferc1\",),\n", + " addl_key_cols: Iterable[str] = (),\n", + " ) -> TableDiff:\n", + " \"\"\"Diff two versions of the same table that live in SQL databases.\n", + "\n", + " The table has to have the same name + columns in both DBs.\n", + "\n", + " Args:\n", + " table_name: the name, in the SQL database, of the table you want to compare.\n", + " old_conn: SQLite connection to the old version of the database.\n", + " new_conn: SQLite connection to the new version of the database.\n", + " ignore_cols: a list of columns that you would like to ignore diffs in.\n", + " addl_key_cols: \n", + " columns that aren't necessarily in the primary key, but that you'd\n", + " like to use as key columns for the diff - for example, if your\n", + " table only uses `record_id` as primary_key, but you want to group\n", + " the rows by `record_year` and `utility_id` as well, you would pass\n", + " those in.\n", + " \"\"\"\n", + " query = f\"SELECT * FROM {table_name}\" # noqa: S608\n", + " old_table = apply_pudl_dtypes(pd.read_sql(query, old_conn))\n", + " new_table = apply_pudl_dtypes(pd.read_sql(query, new_conn))\n", + "\n", + " cols = list(set(old_table.columns) - set(ignore_cols))\n", + "\n", + " primary_key = list(set(Resource.from_id(table_name).schema.primary_key).union(set(addl_key_cols)))\n", + " return diff_wide_tables(primary_key, old_table[cols], new_table[cols])\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_db = sqlite3.connect(Path(\"~/Downloads/pudl.sqlite\").expanduser())\n", + "old_db = sqlite3.connect(Path(\"~/Downloads/pudl (2).sqlite\").expanduser())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "table_name = \"denorm_plants_steam_ferc1\"\n", + "diff = table_diff(table_name, old_db, new_db, ignore_cols=(\"plant_id_ferc1\", \"plant_id_pudl\"), addl_key_cols=(\"report_year\", \"utility_id_pudl\"))\n", + "diff.changed" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pudl-dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/pudl/helpers.py b/src/pudl/helpers.py index 4e1fb73f5d..abf5288935 100644 --- a/src/pudl/helpers.py +++ b/src/pudl/helpers.py @@ -15,7 +15,7 @@ from collections.abc import Generator, Iterable from functools import partial from io import BytesIO -from typing import Any, Literal +from typing import Any, Literal, NamedTuple import addfips import numpy as np @@ -1857,3 +1857,52 @@ def assert_cols_areclose( f"{message} Mismatch ratio {mismatch_ratio:.01%} > " f"threshold {mismatch_threshold:.01%}." ) + + +class TableDiff(NamedTuple): + """Represent a diff between two versions of the same table.""" + + deleted: pd.DataFrame + added: pd.DataFrame + changed: pd.DataFrame + old_df: pd.DataFrame + new_df: pd.DataFrame + + +def diff_wide_tables( + primary_key: Iterable[str], old: pd.DataFrame, new: pd.DataFrame +) -> TableDiff: + """Diff values across multiple iterations of the same wide table. + + We often have tables with many value columns; a straightforward comparison of two + versions of the same table will show you that two rows are different, but + won't show which of the many values changed. + + So we melt the table based on some sort of primary key columns then diff + the old and new values. + """ + old_melted = old.melt(id_vars=primary_key, var_name="field").set_index( + primary_key + ["field"] + ) + new_melted = new.melt(id_vars=primary_key, var_name="field").set_index( + primary_key + ["field"] + ) + old_aligned, new_aligned = old_melted.align(new_melted) + comparison = old_aligned.compare(new_aligned, result_names=("old", "new")) + if comparison.empty: + return TableDiff( + deleted=pd.DataFrame(), + added=pd.DataFrame(), + changed=pd.DataFrame(), + old_df=old, + new_df=new, + ) + + old_values = comparison[("value", "old")] + new_values = comparison[("value", "new")] + added = comparison[old_values.isna() & new_values.notna()] + deleted = comparison[old_values.notna() & new_values.isna()] + changed = comparison[old_values.notna() & new_values.notna()] + return TableDiff( + deleted=deleted, added=added, changed=changed, old_df=old, new_df=new + ) diff --git a/test/unit/helpers_test.py b/test/unit/helpers_test.py index 3ca5671b41..ddf6d32029 100644 --- a/test/unit/helpers_test.py +++ b/test/unit/helpers_test.py @@ -14,6 +14,7 @@ convert_df_to_excel_file, convert_to_date, date_merge, + diff_wide_tables, expand_timeseries, fix_eia_na, flatten_list, @@ -666,3 +667,68 @@ def test_convert_col_to_bool(df): .isin([False, np.nan]) .all() ) + + +def test_diff_wide_tables(): + # has 2020-2021 data for utils 1 and 2; fact 2 for utility 1 just never reported + old = pd.DataFrame.from_records( + [ + {"u_id": 1, "year": 2020, "fact1": "u1f1y20"}, + {"u_id": 1, "year": 2021, "fact1": "u1f1y21"}, + {"u_id": 2, "year": 2020, "fact1": "u2f1y20", "fact2": "u2f2y20"}, + {"u_id": 2, "year": 2021, "fact1": "u2f1y21", "fact2": "u2f2y21"}, + ] + ) + + # has 2020-2022 data for utils 1 and 2, but: + # - utility 1 is missing 2020 data for fact 1 and fact 2; otherwise, just missing fact 2 as usual + # - utility 2 has an updated value for 2021 fact 1 + new = pd.DataFrame.from_records( + [ + {"u_id": 1, "year": 2020}, + {"u_id": 1, "year": 2021, "fact1": "u1f1y21"}, + {"u_id": 1, "year": 2022, "fact1": "u1f1y22"}, + {"u_id": 2, "year": 2020, "fact1": "u2f1y20", "fact2": "u2f2y20"}, + {"u_id": 2, "year": 2021, "fact1": "u2f1y21_updated", "fact2": "u2f2y21"}, + {"u_id": 2, "year": 2022, "fact1": "u2f1y22", "fact2": "u2f2y22"}, + ] + ) + + empty_diff = diff_wide_tables(primary_key=["u_id", "year"], old=old, new=old) + assert empty_diff.added.empty + assert empty_diff.deleted.empty + assert empty_diff.changed.empty + + def assert_diff_equal(observed, expected): + observed_reshaped = observed.droplevel(level=0, axis="columns") + expected_reshaped = expected.set_index(observed_reshaped.index.names) + assert_frame_equal(observed_reshaped, expected_reshaped) + + diff_output = diff_wide_tables(primary_key=["u_id", "year"], old=old, new=new) + + expected_deleted = pd.DataFrame.from_records( + [{"u_id": 1, "year": 2020, "field": "fact1", "old": "u1f1y20", "new": None}] + ) + assert_diff_equal(diff_output.deleted, expected_deleted) + + expected_added = pd.DataFrame.from_records( + [ + {"u_id": 1, "year": 2022, "field": "fact1", "old": None, "new": "u1f1y22"}, + {"u_id": 2, "year": 2022, "field": "fact1", "old": None, "new": "u2f1y22"}, + {"u_id": 2, "year": 2022, "field": "fact2", "old": None, "new": "u2f2y22"}, + ] + ) + assert_diff_equal(diff_output.added, expected_added) + + expected_changed = pd.DataFrame.from_records( + [ + { + "u_id": 2, + "year": 2021, + "field": "fact1", + "old": "u2f1y21", + "new": "u2f1y21_updated", + } + ] + ) + assert_diff_equal(diff_output.changed, expected_changed)