-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This change introduces some cleanups, including: - creating a separate model for object paths that encapsulates their features and behaviors - adding support for running the diff tool via a poetry command - moving the task queue into the `task_queue.py` module and reworking its internals to use runners on n threads. This way, we can manage task order and choose priorities, or enforce constraints - for now, permitting only one task per sqlite database, to avoid collisions when accessing the underlying file - adding some extra tests
- Loading branch information
Showing
10 changed files
with
453 additions
and
224 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,17 @@ | ||
FROM python:3.10-buster | ||
RUN pip install poetry==1.6 | ||
|
||
# TODO(rousik): Presumably, when we run this tool, we will want to | ||
# either mount remote paths as "local" directories, or cache the remote | ||
# files locally using plentiful storage. | ||
|
||
# --cache-dir argument can be used to control where things are cached. | ||
|
||
|
||
WORKDIR /app | ||
COPY pyproject.toml poetry.lock /app | ||
COPY README.md /app | ||
COPY ./src /app/src | ||
RUN poetry config virtualenvs.create false | ||
RUN poetry install --only main | ||
ENTRYPOINT ["poetry", "run", "python", "-m", "pudl_output_differ.main"] | ||
ENTRYPOINT ["poetry", "run", "diff"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,9 @@ authors = ["Jan Rous <[email protected]>"] | |
readme = "README.md" | ||
packages = [{include = "pudl_output_differ", from = "src"}] | ||
|
||
[tool.poetry.scripts] | ||
diff = "pudl_output_differ.cli:main" | ||
|
||
[tool.poetry.dependencies] | ||
python = "^3.10" | ||
pydantic = "^2.3.0" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,65 +1,65 @@ | ||
"""Module for comparing contents of parquet files.""" | ||
# """Module for comparing contents of parquet files.""" | ||
|
||
import logging | ||
from pudl_output_differ.sqlite import RowCountDiff | ||
from pudl_output_differ.types import DiffEvaluatorBase, DiffTreeNode, KeySetDiff, TaskQueue | ||
import pyarrow.parquet as pq | ||
# import logging | ||
# from pudl_output_differ.sqlite import RowCountDiff | ||
# from pudl_output_differ.types import DiffEvaluatorBase, DiffTreeNode, KeySetDiff, TaskQueue | ||
# import pyarrow.parquet as pq | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
# logger = logging.getLogger(__name__) | ||
|
||
|
||
class ParquetEvaluator(DiffEvaluatorBase): | ||
left_path: str | ||
right_path: str | ||
# class ParquetEvaluator(DiffEvaluatorBase): | ||
# left_path: str | ||
# right_path: str | ||
|
||
|
||
def get_columns(self, schema: pq.ParquetSchema)-> list[str]: | ||
"""Return list containing column_name::column_type.""" | ||
ret = [] | ||
for i in len(schema.names): | ||
ret.append( | ||
schema.column(i).name + "::" + schema.column(i).logical_type.type | ||
) | ||
return ret | ||
# def get_columns(self, schema: pq.ParquetSchema)-> list[str]: | ||
# """Return list containing column_name::column_type.""" | ||
# ret = [] | ||
# for i in len(schema.names): | ||
# ret.append( | ||
# schema.column(i).name + "::" + schema.column(i).logical_type.type | ||
# ) | ||
# return ret | ||
|
||
def execute(self, task_queue: TaskQueue) -> list[DiffTreeNode]: | ||
"""Compare two parquet files.""" | ||
diffs = [] | ||
# lfs, lpath = fsspec.core.url_to_fs(self.left_path) | ||
# rfs, rpath = fsspec.open(self.right_path) | ||
# def execute(self, task_queue: TaskQueue) -> list[DiffTreeNode]: | ||
# """Compare two parquet files.""" | ||
# diffs = [] | ||
# # lfs, lpath = fsspec.core.url_to_fs(self.left_path) | ||
# # rfs, rpath = fsspec.open(self.right_path) | ||
|
||
lmeta = pq.read_metadata(self.left_path) | ||
rmeta = pq.read_metadata(self.right_path) | ||
if not lmeta.schema.equals(rmeta.schema): | ||
logger.info("Parquet schemas are different.") | ||
diffs.append( | ||
self.parent_node.add_child( | ||
DiffTreeNode( | ||
name="ParquetSchema", | ||
diff=KeySetDiff.from_sets( | ||
set(self.get_columns(lmeta.schema)), | ||
set(self.get_columns(rmeta.schema)), | ||
entity="columns", | ||
) | ||
) | ||
) | ||
) | ||
# Now, go on to compare the metadata more broadly. | ||
if not lmeta.equals(rmeta): | ||
logger.info("Parquet metadata are different.") | ||
logger.info(f"Left metadata: {lmeta}") | ||
logger.info(f"Right metadata: {rmeta}") | ||
if lmeta.num_rows != rmeta.num_rows: | ||
logger.info("Number of rows are different.") | ||
diffs.append( | ||
self.parent_node.add_child( | ||
DiffTreeNode( | ||
name="ParquetNumRows", | ||
diff=RowCountDiff( | ||
left_rows=lmeta.num_rows, | ||
right_rows=rmeta.num_rows) | ||
) | ||
) | ||
) | ||
return diffs | ||
# lmeta = pq.read_metadata(self.left_path) | ||
# rmeta = pq.read_metadata(self.right_path) | ||
# if not lmeta.schema.equals(rmeta.schema): | ||
# logger.info("Parquet schemas are different.") | ||
# diffs.append( | ||
# self.parent_node.add_child( | ||
# DiffTreeNode( | ||
# name="ParquetSchema", | ||
# diff=KeySetDiff.from_sets( | ||
# set(self.get_columns(lmeta.schema)), | ||
# set(self.get_columns(rmeta.schema)), | ||
# entity="columns", | ||
# ) | ||
# ) | ||
# ) | ||
# ) | ||
# # Now, go on to compare the metadata more broadly. | ||
# if not lmeta.equals(rmeta): | ||
# logger.info("Parquet metadata are different.") | ||
# logger.info(f"Left metadata: {lmeta}") | ||
# logger.info(f"Right metadata: {rmeta}") | ||
# if lmeta.num_rows != rmeta.num_rows: | ||
# logger.info("Number of rows are different.") | ||
# diffs.append( | ||
# self.parent_node.add_child( | ||
# DiffTreeNode( | ||
# name="ParquetNumRows", | ||
# diff=RowCountDiff( | ||
# left_rows=lmeta.num_rows, | ||
# right_rows=rmeta.num_rows) | ||
# ) | ||
# ) | ||
# ) | ||
# return diffs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.