Skip to content

Commit

Permalink
feat!(excelreader): only load dtypes for columns specified via `use_columns` (#329)
Browse files Browse the repository at this point in the history

* feat!(excelsheet,exceltable): make available_columns a method rather than a property

Signed-off-by: Luka Peschke <[email protected]>

* refactor: impl From<FastExcelError> for PyErr

This allows returning FastExcelResult<T> from pymethods

Signed-off-by: Luka Peschke <[email protected]>

* feat!(excelreader): only load dtypes for columns specified via `use_columns`

Signed-off-by: Luka Peschke <[email protected]>

* refactor: rename ColumnInfoBuilder to ColumnInfoNoDtype

Signed-off-by: Luka Peschke <[email protected]>

---------

Signed-off-by: Luka Peschke <[email protected]>
Signed-off-by: Luka Peschke <[email protected]>
  • Loading branch information
lukapeschke authored Feb 19, 2025
1 parent 22a3096 commit bf4a229
Show file tree
Hide file tree
Showing 13 changed files with 338 additions and 172 deletions.
43 changes: 33 additions & 10 deletions python/fastexcel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
CalamineError,
CannotRetrieveCellDataError,
ColumnInfo,
ColumnInfoNoDtype,
ColumnNotFoundError,
FastExcelError,
InvalidParametersError,
Expand Down Expand Up @@ -82,10 +83,9 @@ def selected_columns(self) -> list[ColumnInfo]:
"""The sheet's selected columns"""
return self._sheet.selected_columns

@property
def available_columns(self) -> list[ColumnInfo]:
"""The columns available for the given sheet"""
return self._sheet.available_columns
return self._sheet.available_columns()

@property
def specified_dtypes(self) -> DTypeMap | None:
Expand Down Expand Up @@ -161,10 +161,9 @@ def selected_columns(self) -> list[ColumnInfo]:
"""The table's selected columns"""
return self._table.selected_columns

@property
def available_columns(self) -> list[ColumnInfo]:
"""The columns available for the given table"""
return self._table.available_columns
return self._table.available_columns()

@property
def specified_dtypes(self) -> DTypeMap | None:
Expand Down Expand Up @@ -212,7 +211,11 @@ def load_sheet(
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
use_columns: list[str]
| list[int]
| str
| Callable[[ColumnInfoNoDtype], bool]
| None = None,
dtypes: DType | DTypeMap | None = None,
) -> ExcelSheet:
"""Loads a sheet lazily by index or name.
Expand Down Expand Up @@ -289,7 +292,11 @@ def load_table(
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
use_columns: list[str]
| list[int]
| str
| Callable[[ColumnInfoNoDtype], bool]
| None = None,
dtypes: DType | DTypeMap | None = None,
eager: Literal[False] = ...,
) -> ExcelTable: ...
Expand All @@ -304,7 +311,11 @@ def load_table(
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
use_columns: list[str]
| list[int]
| str
| Callable[[ColumnInfoNoDtype], bool]
| None = None,
dtypes: DType | DTypeMap | None = None,
eager: Literal[True] = ...,
) -> pa.RecordBatch: ...
Expand All @@ -318,7 +329,11 @@ def load_table(
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
use_columns: list[str]
| list[int]
| str
| Callable[[ColumnInfoNoDtype], bool]
| None = None,
dtypes: DType | DTypeMap | None = None,
eager: bool = False,
) -> ExcelTable | pa.RecordBatch:
Expand Down Expand Up @@ -416,7 +431,11 @@ def load_sheet_by_name(
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
use_columns: list[str]
| list[int]
| str
| Callable[[ColumnInfoNoDtype], bool]
| None = None,
dtypes: DType | DTypeMap | None = None,
) -> ExcelSheet:
"""Loads a sheet by name.
Expand Down Expand Up @@ -445,7 +464,11 @@ def load_sheet_by_idx(
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
use_columns: list[str]
| list[int]
| str
| Callable[[ColumnInfoNoDtype], bool]
| None = None,
dtypes: DType | DTypeMap | None = None,
) -> ExcelSheet:
"""Loads a sheet by index.
Expand Down
35 changes: 29 additions & 6 deletions python/fastexcel/_fastexcel.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,15 @@ ColumnNameFrom = Literal["provided", "looked_up", "generated"]
DTypeFrom = Literal["provided_for_all", "provided_by_index", "provided_by_name", "guessed"]
SheetVisible = Literal["visible", "hidden", "veryhidden"]

class ColumnInfoNoDtype:
def __init__(self, *, name: str, index: int, column_name_from: ColumnNameFrom) -> None: ...
@property
def name(self) -> str: ...
@property
def index(self) -> int: ...
@property
def column_name_from(self) -> ColumnNameFrom: ...

class ColumnInfo:
def __init__(
self,
Expand Down Expand Up @@ -51,7 +60,6 @@ class _ExcelSheet:
@property
def selected_columns(self) -> list[ColumnInfo]:
"""The sheet's selected columns"""
@property
def available_columns(self) -> list[ColumnInfo]:
"""The columns available for the given sheet"""
@property
Expand Down Expand Up @@ -85,7 +93,6 @@ class _ExcelTable:
@property
def selected_columns(self) -> list[ColumnInfo]:
"""The table's selected columns"""
@property
def available_columns(self) -> list[ColumnInfo]:
"""The columns available for the given table"""
@property
Expand All @@ -108,7 +115,11 @@ class _ExcelReader:
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
use_columns: list[str]
| list[int]
| str
| Callable[[ColumnInfoNoDtype], bool]
| None = None,
dtypes: DType | DTypeMap | None = None,
eager: Literal[False] = ...,
) -> _ExcelSheet: ...
Expand All @@ -123,7 +134,11 @@ class _ExcelReader:
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | None = None,
use_columns: list[str]
| list[int]
| str
| Callable[[ColumnInfoNoDtype], bool]
| None = None,
dtypes: DType | DTypeMap | None = None,
eager: Literal[True] = ...,
) -> pa.RecordBatch: ...
Expand All @@ -138,7 +153,11 @@ class _ExcelReader:
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
use_columns: list[str]
| list[int]
| str
| Callable[[ColumnInfoNoDtype], bool]
| None = None,
dtypes: DType | DTypeMap | None = None,
eager: Literal[False] = ...,
) -> _ExcelTable: ...
Expand All @@ -153,7 +172,11 @@ class _ExcelReader:
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
use_columns: list[str]
| list[int]
| str
| Callable[[ColumnInfoNoDtype], bool]
| None = None,
dtypes: DType | DTypeMap | None = None,
eager: Literal[True] = ...,
) -> pa.RecordBatch: ...
Expand Down
2 changes: 1 addition & 1 deletion python/tests/test_alias_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def test_alias_generation_with_use_columns(use_columns: list[str] | list[int] |
)

sheet = excel_reader.load_sheet(0, use_columns=use_columns)
assert [col.name for col in sheet.available_columns] == ["col", "col_1", "col_2"]
assert [col.name for col in sheet.available_columns()] == ["col", "col_1", "col_2"]

pd_assert_frame_equal(
sheet.to_pandas(),
Expand Down
37 changes: 14 additions & 23 deletions python/tests/test_column_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def test_single_sheet_all_columns(

sheet_explicit_arg = excel_reader_single_sheet.load_sheet(0, use_columns=None)
assert sheet.selected_columns == expected_column_info
assert sheet.available_columns == expected_column_info
assert sheet.available_columns() == expected_column_info

expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}
expected_pd_df = pd.DataFrame(expected)
Expand Down Expand Up @@ -68,7 +68,7 @@ def test_single_sheet_subset_by_str(
for idx, col in enumerate(["Month", "Year"]):
sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[col])
assert sheet.selected_columns == [expected_column_info[idx]]
assert sheet.available_columns == expected_column_info
assert sheet.available_columns() == expected_column_info

pd_df = sheet.to_pandas()
pd_assert_frame_equal(pd_df, pd.DataFrame({col: expected[col]}))
Expand All @@ -88,7 +88,7 @@ def test_single_sheet_subset_by_index(
for idx, col_name in enumerate(["Month", "Year"]):
sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[idx])
assert sheet.selected_columns == [expected_column_info[idx]]
assert sheet.available_columns == expected_column_info
assert sheet.available_columns() == expected_column_info

pd_df = sheet.to_pandas()
pd_assert_frame_equal(pd_df, pd.DataFrame({col_name: expected[col_name]}))
Expand Down Expand Up @@ -161,7 +161,7 @@ def test_single_sheet_with_unnamed_columns(
sheet_with_unnamed_columns_expected_column_info[2],
sheet_with_unnamed_columns_expected_column_info[3],
]
assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info

pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
Expand All @@ -174,7 +174,7 @@ def test_single_sheet_with_unnamed_columns(
sheet_with_unnamed_columns_expected_column_info[2],
sheet_with_unnamed_columns_expected_column_info[3],
]
assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info

pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
Expand All @@ -198,15 +198,15 @@ def test_single_sheet_with_unnamed_columns_and_pagination(
sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
"With unnamed columns", use_columns=use_columns_str, n_rows=1
)
assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info

pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))

sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
"With unnamed columns", use_columns=use_columns_idx, n_rows=1
)
assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info

pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
Expand All @@ -221,15 +221,15 @@ def test_single_sheet_with_unnamed_columns_and_pagination(
sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
"With unnamed columns", use_columns=use_columns_str, skip_rows=1
)
assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info

pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))

sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
"With unnamed columns", use_columns=use_columns_idx, skip_rows=1
)
assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info

pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
Expand Down Expand Up @@ -263,7 +263,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names(
sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
"With unnamed columns", use_columns=use_columns_idx, skip_rows=1, column_names=column_names
)
assert [col.name for col in sheet.available_columns] == expected_columns_names
assert [col.name for col in sheet.available_columns()] == expected_columns_names

pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
Expand All @@ -274,7 +274,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names(
sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
"With unnamed columns", use_columns=use_columns_idx, skip_rows=2, column_names=column_names
)
assert [col.name for col in sheet.available_columns] == expected_columns_names
assert [col.name for col in sheet.available_columns()] == expected_columns_names

pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_first_row_skipped))
pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped))
Expand All @@ -298,7 +298,7 @@ def test_single_sheet_with_unnamed_columns_and_str_range(
sheet_with_unnamed_columns_expected_column_info[:1]
+ sheet_with_unnamed_columns_expected_column_info[2:]
)
assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info
pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))

Expand Down Expand Up @@ -360,7 +360,7 @@ def test_use_columns_with_column_names() -> None:
column_names=["bools_renamed", "dates_renamed"],
)

assert sheet.available_columns == [
assert sheet.available_columns() == [
fastexcel.ColumnInfo(
name="__UNNAMED__0",
column_name_from="generated",
Expand Down Expand Up @@ -420,7 +420,7 @@ def test_use_columns_with_callable() -> None:

sheet = excel_reader.load_sheet(2)
assert (
[(c.name, c.dtype) for c in sheet.available_columns]
[(c.name, c.dtype) for c in sheet.available_columns()]
== [(c.name, c.dtype) for c in sheet.selected_columns]
== [
("col1", "float"),
Expand Down Expand Up @@ -450,15 +450,6 @@ def test_use_columns_with_callable() -> None:
("__UNNAMED__3", "float"),
]

sheet = excel_reader.load_sheet(
2,
use_columns=lambda col: col.dtype == "string",
)
assert [(c.name, c.dtype) for c in sheet.selected_columns] == [
("col3", "string"),
("col5", "string"),
]


def test_use_columns_with_bad_callable() -> None:
excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))
Expand Down
7 changes: 4 additions & 3 deletions python/tests/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pytest
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal
from pytest_mock import MockerFixture

from utils import path_for_fixture

Expand Down Expand Up @@ -262,7 +263,7 @@ def test_dtype_coercion_behavior__strict_sampling_limit(eager: bool) -> None:
def test_one_dtype_for_all() -> None:
excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
sheet = excel_reader.load_sheet(0, dtypes="string")
assert sheet.available_columns == [
assert sheet.available_columns() == [
fastexcel.ColumnInfo(
name="Employee ID",
index=0,
Expand Down Expand Up @@ -316,7 +317,7 @@ def test_one_dtype_for_all() -> None:
assert sheet.to_polars().dtypes == [pl.String] * 7


def test_fallback_infer_dtypes(mocker) -> None:
def test_fallback_infer_dtypes(mocker: MockerFixture) -> None:
"""it should fallback to string if it can't infer the dtype"""
import logging

Expand All @@ -336,7 +337,7 @@ def test_fallback_infer_dtypes(mocker) -> None:
mocker.ANY,
)

assert sheet.available_columns == [
assert sheet.available_columns() == [
fastexcel.ColumnInfo(
name="id",
index=0,
Expand Down
2 changes: 1 addition & 1 deletion python/tests/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test_load_table(path: str) -> None:
assert users_tbl.name == "users"
assert users_tbl.sheet_name == "sheet1"
assert users_tbl.specified_dtypes is None
assert users_tbl.available_columns == [
assert users_tbl.available_columns() == [
fastexcel.ColumnInfo(
name="User Id",
index=0,
Expand Down
1 change: 1 addition & 0 deletions src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use crate::{
},
};

#[derive(Debug)]
pub(crate) enum ExcelSheetData<'r> {
Owned(Range<CalData>),
Ref(Range<CalDataRef<'r>>),
Expand Down
Loading

0 comments on commit bf4a229

Please sign in to comment.