Skip to content

Commit

Permalink
Add additional logging to read_from_parquet (#11)
Browse files Browse the repository at this point in the history
  • Loading branch information
simw authored Nov 14, 2023
1 parent ac36067 commit dde1b41
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "pipedata"
version = "0.2.1"
version = "0.2.2"
description = "Framework for building pipelines for data processing"
authors = ["Simon Wicks <[email protected]>"]
readme = "README.md"
Expand Down
2 changes: 1 addition & 1 deletion src/pipedata/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.2.1"
__version__ = "0.2.2"

__all__ = [
"__version__",
Expand Down
7 changes: 6 additions & 1 deletion src/pipedata/ops/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,12 @@ def parquet_batch_reader(
for file_ref in file_refs:
logger.info(f"Reading parquet file {file_ref}")
ds = pa_dataset.dataset(file_ref, format="parquet")
for batch in ds.to_batches(columns=columns, batch_size=batch_size):
for i, batch in enumerate(
ds.to_batches(columns=columns, batch_size=batch_size)
):
logger.info(
f"Processing batch {i} (length {len(batch)}) from {file_ref}"
)
if return_as == "recordbatch":
yield batch
elif return_as == "record":
Expand Down

0 comments on commit dde1b41

Please sign in to comment.