feat!(excelreader): only load dtypes for columns specified via `use_columns` (#329)

* feat!(excelsheet,exceltable): make available_columns a method rather than a property

  Signed-off-by: Luka Peschke <[email protected]>

* refactor: impl From<FastExcelError> for PyErr

  This allows returning FastExcelResult<T> in pymethods

  Signed-off-by: Luka Peschke <[email protected]>

* feat!(excelreader): only load dtypes for columns specified via `use_columns`

  Signed-off-by: Luka Peschke <[email protected]>

* refactor: rename ColumnInfoBuilder to ColumnInfoNoDtype

  Signed-off-by: Luka Peschke <[email protected]>

---------

Signed-off-by: Luka Peschke <[email protected]>
Signed-off-by: Luka Peschke <[email protected]>
ToucanToco · Feb 19, 2025 · bf4a229 · bf4a229
1 parent 22a3096
commit bf4a229
Show file tree

Hide file tree

Showing 13 changed files with 338 additions and 172 deletions.
diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py
@@ -24,6 +24,7 @@
     CalamineError,
     CannotRetrieveCellDataError,
     ColumnInfo,
+    ColumnInfoNoDtype,
     ColumnNotFoundError,
     FastExcelError,
     InvalidParametersError,
@@ -82,10 +83,9 @@ def selected_columns(self) -> list[ColumnInfo]:
         """The sheet's selected columns"""
         return self._sheet.selected_columns
 
-    @property
     def available_columns(self) -> list[ColumnInfo]:
         """The columns available for the given sheet"""
-        return self._sheet.available_columns
+        return self._sheet.available_columns()
 
     @property
     def specified_dtypes(self) -> DTypeMap | None:
@@ -161,10 +161,9 @@ def selected_columns(self) -> list[ColumnInfo]:
         """The table's selected columns"""
         return self._table.selected_columns
 
-    @property
     def available_columns(self) -> list[ColumnInfo]:
         """The columns available for the given table"""
-        return self._table.available_columns
+        return self._table.available_columns()
 
     @property
     def specified_dtypes(self) -> DTypeMap | None:
@@ -212,7 +211,11 @@ def load_sheet(
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
-        use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
+        use_columns: list[str]
+        | list[int]
+        | str
+        | Callable[[ColumnInfoNoDtype], bool]
+        | None = None,
         dtypes: DType | DTypeMap | None = None,
     ) -> ExcelSheet:
         """Loads a sheet lazily by index or name.
@@ -289,7 +292,11 @@ def load_table(
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
-        use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
+        use_columns: list[str]
+        | list[int]
+        | str
+        | Callable[[ColumnInfoNoDtype], bool]
+        | None = None,
         dtypes: DType | DTypeMap | None = None,
         eager: Literal[False] = ...,
     ) -> ExcelTable: ...
@@ -304,7 +311,11 @@ def load_table(
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
-        use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
+        use_columns: list[str]
+        | list[int]
+        | str
+        | Callable[[ColumnInfoNoDtype], bool]
+        | None = None,
         dtypes: DType | DTypeMap | None = None,
         eager: Literal[True] = ...,
     ) -> pa.RecordBatch: ...
@@ -318,7 +329,11 @@ def load_table(
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
-        use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
+        use_columns: list[str]
+        | list[int]
+        | str
+        | Callable[[ColumnInfoNoDtype], bool]
+        | None = None,
         dtypes: DType | DTypeMap | None = None,
         eager: bool = False,
     ) -> ExcelTable | pa.RecordBatch:
@@ -416,7 +431,11 @@ def load_sheet_by_name(
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
-        use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
+        use_columns: list[str]
+        | list[int]
+        | str
+        | Callable[[ColumnInfoNoDtype], bool]
+        | None = None,
         dtypes: DType | DTypeMap | None = None,
     ) -> ExcelSheet:
         """Loads a sheet by name.
@@ -445,7 +464,11 @@ def load_sheet_by_idx(
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
-        use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
+        use_columns: list[str]
+        | list[int]
+        | str
+        | Callable[[ColumnInfoNoDtype], bool]
+        | None = None,
         dtypes: DType | DTypeMap | None = None,
     ) -> ExcelSheet:
         """Loads a sheet by index.

diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi
@@ -11,6 +11,15 @@ ColumnNameFrom = Literal["provided", "looked_up", "generated"]
 DTypeFrom = Literal["provided_for_all", "provided_by_index", "provided_by_name", "guessed"]
 SheetVisible = Literal["visible", "hidden", "veryhidden"]
 
+class ColumnInfoNoDtype:
+    def __init__(self, *, name: str, index: int, column_name_from: ColumnNameFrom) -> None: ...
+    @property
+    def name(self) -> str: ...
+    @property
+    def index(self) -> int: ...
+    @property
+    def column_name_from(self) -> ColumnNameFrom: ...
+
 class ColumnInfo:
     def __init__(
         self,
@@ -51,7 +60,6 @@ class _ExcelSheet:
     @property
     def selected_columns(self) -> list[ColumnInfo]:
         """The sheet's selected columns"""
-    @property
     def available_columns(self) -> list[ColumnInfo]:
         """The columns available for the given sheet"""
     @property
@@ -85,7 +93,6 @@ class _ExcelTable:
     @property
     def selected_columns(self) -> list[ColumnInfo]:
         """The table's selected columns"""
-    @property
     def available_columns(self) -> list[ColumnInfo]:
         """The columns available for the given table"""
     @property
@@ -108,7 +115,11 @@ class _ExcelReader:
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
-        use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
+        use_columns: list[str]
+        | list[int]
+        | str
+        | Callable[[ColumnInfoNoDtype], bool]
+        | None = None,
         dtypes: DType | DTypeMap | None = None,
         eager: Literal[False] = ...,
     ) -> _ExcelSheet: ...
@@ -123,7 +134,11 @@ class _ExcelReader:
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
-        use_columns: list[str] | list[int] | str | None = None,
+        use_columns: list[str]
+        | list[int]
+        | str
+        | Callable[[ColumnInfoNoDtype], bool]
+        | None = None,
         dtypes: DType | DTypeMap | None = None,
         eager: Literal[True] = ...,
     ) -> pa.RecordBatch: ...
@@ -138,7 +153,11 @@ class _ExcelReader:
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
-        use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
+        use_columns: list[str]
+        | list[int]
+        | str
+        | Callable[[ColumnInfoNoDtype], bool]
+        | None = None,
         dtypes: DType | DTypeMap | None = None,
         eager: Literal[False] = ...,
     ) -> _ExcelTable: ...
@@ -153,7 +172,11 @@ class _ExcelReader:
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
-        use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
+        use_columns: list[str]
+        | list[int]
+        | str
+        | Callable[[ColumnInfoNoDtype], bool]
+        | None = None,
         dtypes: DType | DTypeMap | None = None,
         eager: Literal[True] = ...,
     ) -> pa.RecordBatch: ...

diff --git a/python/tests/test_alias_generation.py b/python/tests/test_alias_generation.py
@@ -19,7 +19,7 @@ def test_alias_generation_with_use_columns(use_columns: list[str] | list[int] |
     )
 
     sheet = excel_reader.load_sheet(0, use_columns=use_columns)
-    assert [col.name for col in sheet.available_columns] == ["col", "col_1", "col_2"]
+    assert [col.name for col in sheet.available_columns()] == ["col", "col_1", "col_2"]
 
     pd_assert_frame_equal(
         sheet.to_pandas(),

diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py
@@ -39,7 +39,7 @@ def test_single_sheet_all_columns(
 
     sheet_explicit_arg = excel_reader_single_sheet.load_sheet(0, use_columns=None)
     assert sheet.selected_columns == expected_column_info
-    assert sheet.available_columns == expected_column_info
+    assert sheet.available_columns() == expected_column_info
 
     expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}
     expected_pd_df = pd.DataFrame(expected)
@@ -68,7 +68,7 @@ def test_single_sheet_subset_by_str(
         for idx, col in enumerate(["Month", "Year"]):
             sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[col])
             assert sheet.selected_columns == [expected_column_info[idx]]
-            assert sheet.available_columns == expected_column_info
+            assert sheet.available_columns() == expected_column_info
 
             pd_df = sheet.to_pandas()
             pd_assert_frame_equal(pd_df, pd.DataFrame({col: expected[col]}))
@@ -88,7 +88,7 @@ def test_single_sheet_subset_by_index(
         for idx, col_name in enumerate(["Month", "Year"]):
             sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[idx])
             assert sheet.selected_columns == [expected_column_info[idx]]
-            assert sheet.available_columns == expected_column_info
+            assert sheet.available_columns() == expected_column_info
 
             pd_df = sheet.to_pandas()
             pd_assert_frame_equal(pd_df, pd.DataFrame({col_name: expected[col_name]}))
@@ -161,7 +161,7 @@ def test_single_sheet_with_unnamed_columns(
         sheet_with_unnamed_columns_expected_column_info[2],
         sheet_with_unnamed_columns_expected_column_info[3],
     ]
-    assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
+    assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
@@ -174,7 +174,7 @@ def test_single_sheet_with_unnamed_columns(
         sheet_with_unnamed_columns_expected_column_info[2],
         sheet_with_unnamed_columns_expected_column_info[3],
     ]
-    assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
+    assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
@@ -198,15 +198,15 @@ def test_single_sheet_with_unnamed_columns_and_pagination(
     sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
         "With unnamed columns", use_columns=use_columns_str, n_rows=1
     )
-    assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
+    assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
 
     sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
         "With unnamed columns", use_columns=use_columns_idx, n_rows=1
     )
-    assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
+    assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
@@ -221,15 +221,15 @@ def test_single_sheet_with_unnamed_columns_and_pagination(
     sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
         "With unnamed columns", use_columns=use_columns_str, skip_rows=1
     )
-    assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
+    assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
 
     sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
         "With unnamed columns", use_columns=use_columns_idx, skip_rows=1
     )
-    assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
+    assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
@@ -263,7 +263,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names(
     sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
         "With unnamed columns", use_columns=use_columns_idx, skip_rows=1, column_names=column_names
     )
-    assert [col.name for col in sheet.available_columns] == expected_columns_names
+    assert [col.name for col in sheet.available_columns()] == expected_columns_names
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
@@ -274,7 +274,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names(
     sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
         "With unnamed columns", use_columns=use_columns_idx, skip_rows=2, column_names=column_names
     )
-    assert [col.name for col in sheet.available_columns] == expected_columns_names
+    assert [col.name for col in sheet.available_columns()] == expected_columns_names
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_first_row_skipped))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped))
@@ -298,7 +298,7 @@ def test_single_sheet_with_unnamed_columns_and_str_range(
         sheet_with_unnamed_columns_expected_column_info[:1]
         + sheet_with_unnamed_columns_expected_column_info[2:]
     )
-    assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
+    assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
 
@@ -360,7 +360,7 @@ def test_use_columns_with_column_names() -> None:
         column_names=["bools_renamed", "dates_renamed"],
     )
 
-    assert sheet.available_columns == [
+    assert sheet.available_columns() == [
         fastexcel.ColumnInfo(
             name="__UNNAMED__0",
             column_name_from="generated",
@@ -420,7 +420,7 @@ def test_use_columns_with_callable() -> None:
 
     sheet = excel_reader.load_sheet(2)
     assert (
-        [(c.name, c.dtype) for c in sheet.available_columns]
+        [(c.name, c.dtype) for c in sheet.available_columns()]
         == [(c.name, c.dtype) for c in sheet.selected_columns]
         == [
             ("col1", "float"),
@@ -450,15 +450,6 @@ def test_use_columns_with_callable() -> None:
         ("__UNNAMED__3", "float"),
     ]
 
-    sheet = excel_reader.load_sheet(
-        2,
-        use_columns=lambda col: col.dtype == "string",
-    )
-    assert [(c.name, c.dtype) for c in sheet.selected_columns] == [
-        ("col3", "string"),
-        ("col5", "string"),
-    ]
-
 
 def test_use_columns_with_bad_callable() -> None:
     excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))

diff --git a/python/tests/test_dtypes.py b/python/tests/test_dtypes.py
@@ -10,6 +10,7 @@
 import pytest
 from pandas.testing import assert_frame_equal as pd_assert_frame_equal
 from polars.testing import assert_frame_equal as pl_assert_frame_equal
+from pytest_mock import MockerFixture
 
 from utils import path_for_fixture
 
@@ -262,7 +263,7 @@ def test_dtype_coercion_behavior__strict_sampling_limit(eager: bool) -> None:
 def test_one_dtype_for_all() -> None:
     excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
     sheet = excel_reader.load_sheet(0, dtypes="string")
-    assert sheet.available_columns == [
+    assert sheet.available_columns() == [
         fastexcel.ColumnInfo(
             name="Employee ID",
             index=0,
@@ -316,7 +317,7 @@ def test_one_dtype_for_all() -> None:
     assert sheet.to_polars().dtypes == [pl.String] * 7
 
 
-def test_fallback_infer_dtypes(mocker) -> None:
+def test_fallback_infer_dtypes(mocker: MockerFixture) -> None:
     """it should fallback to string if it can't infer the dtype"""
     import logging
 
@@ -336,7 +337,7 @@ def test_fallback_infer_dtypes(mocker) -> None:
         mocker.ANY,
     )
 
-    assert sheet.available_columns == [
+    assert sheet.available_columns() == [
         fastexcel.ColumnInfo(
             name="id",
             index=0,

diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py
@@ -38,7 +38,7 @@ def test_load_table(path: str) -> None:
     assert users_tbl.name == "users"
     assert users_tbl.sheet_name == "sheet1"
     assert users_tbl.specified_dtypes is None
-    assert users_tbl.available_columns == [
+    assert users_tbl.available_columns() == [
         fastexcel.ColumnInfo(
             name="User Id",
             index=0,

diff --git a/src/data.rs b/src/data.rs
@@ -14,6 +14,7 @@ use crate::{
     },
 };
 
+#[derive(Debug)]
 pub(crate) enum ExcelSheetData<'r> {
     Owned(Range<CalData>),
     Ref(Range<CalDataRef<'r>>),