Merge branch 'main' into feat/relativedelta

FBruzzesi · Nov 12, 2024 · 2d774a6 · 2d774a6
2 parents 3467f3d + 0627a92
commit 2d774a6
Show file tree

Hide file tree

Showing 7 changed files with 109 additions and 51 deletions.
diff --git a/README.md b/README.md
@@ -42,6 +42,11 @@ We introduce two main classes:
 
 - Considering the above choices, we also provide a scikit-learn compatible splitter: [`TimeBasedCVSplitter`](https://fbruzzesi.github.io/timebasedcv/api/sklearn/#timebasedcv.sklearn.TimeBasedCVSplitter). Considering the signature that `.split(...)` requires and the fact that CV Splitters need to know a priori the number of splits, `TimeBasedCVSplitter` is initialized with the time series containing the time information used to generate the train and test indices of each split.
 
+### Dataframe and array agnostic
+
+- Thanks to [Narwhals](https://narwhals-dev.github.io/narwhals/), `TimeBasedSplit` works out of the box with `pandas`, `polars`, `pyarrow` and any other dataframe library supported by Narwhals.
+- Thanks to the array API, `TimeBasedSplit` works out of the box with `numpy`, `cupy`, `dask.array` and any other array library that supports slicing à la numpy.
+
 ## Installation 💻
 
 TL;DR:

diff --git a/docs/index.md b/docs/index.md
@@ -48,6 +48,11 @@ We introduce two main classes:
 
 - Considering the above choices, we also provide a scikit-learn compatible splitter: [`TimeBasedCVSplitter`](api/sklearn.md#timebasedcv.sklearn.TimeBasedCVSplitter){:target="_blank"}. Considering the signature that `.split(...)` requires and the fact that CV Splitters need to know a priori the number of splits, `TimeBasedCVSplitter` is initialized with the time series containing the time information used to generate the train and test indices of each split.
 
+### Dataframe and array agnostic
+
+- Thanks to [Narwhals](https://narwhals-dev.github.io/narwhals/){:target="_blank"}, `TimeBasedSplit` works out of the box with `pandas`, `polars`, `pyarrow` and any other dataframe library supported by Narwhals.
+- Thanks to the array API, `TimeBasedSplit` works out of the box with `numpy`, `cupy`, `dask.array` and any other array library that supports slicing à la numpy.
+
 ## Installation 💻
 
 TL;DR:

diff --git a/docs/installation.md b/docs/installation.md
@@ -27,7 +27,7 @@
 !!! info
     The minimum Python version supported is 3.8.
 
-- Since **v0.1.0**, the only two dependencies are [`numpy`](https://numpy.org/doc/stable/index.html){:target="_blank"} and [`narwhals>=0.7.15`](https://narwhals-dev.github.io/narwhals/){:target="_blank"}.
+- Since **v0.1.0**, the only two dependencies are [`numpy`](https://numpy.org/doc/stable/index.html){:target="_blank"} and [`narwhals>=1.0.0`](https://narwhals-dev.github.io/narwhals/){:target="_blank"}.
 
     **Narwhals** provides a compatibility layer between polars, pandas and other dataframe libraries. Therefore, as long as narwhals supports a given dataframe object, we will as well.
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -38,8 +38,6 @@ repository = "https://github.com/fbruzzesi/timebasedcv"
 issue-tracker = "https://github.com/fbruzzesi/timebasedcv/issues"
 
 [project.optional-dependencies]
-polars = ["polars>=0.20.3"]
-pandas = ["pandas>=1.2.0"]
 scikit-learn = ["scikit-learn>=0.19"]
 
 dev = [
@@ -63,9 +61,13 @@ test = [
     "pytest==7.2.0",
     "pytest-xdist==3.2.1",
     "coverage==7.2.1",
+    "pandas>=1.2.0",
+    "polars>=0.20.3",
+    "pyarrow>=11.0.0",
+    "dask>=2023.1.0"
 ]
 
-all = ["timebasedcv[pandas,polars,scikit-learn]"]
+all = ["timebasedcv[scikit-learn]"]
 all-dev = ["timebasedcv[all,dev,docs,lint,test]"]
 
 [tool.hatch.build.targets.sdist]

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,16 +1,28 @@
 from __future__ import annotations
 
+from datetime import datetime
+from datetime import timedelta
+from typing import TYPE_CHECKING
 from typing import Any
+from typing import Callable
 from typing import Dict
 from typing import List
 from typing import Literal
 from typing import Tuple
 from typing import Union
 
+import dask.array as da
 import numpy as np
 import pandas as pd
+import polars as pl
+import pyarrow as pa
 import pytest
 
+if TYPE_CHECKING:
+    from narwhals.typing import IntoDataFrame
+
+    from timebasedcv.utils._types import TensorLike
+
 
 @pytest.fixture()
 def sample_list() -> List[int]:
@@ -96,25 +108,30 @@ def valid_kwargs(
 
 
 @pytest.fixture()
-def generate_test_data():
+def generate_test_data() -> Tuple[datetime, datetime, np.ndarray, np.ndarray, np.ndarray]:
     """Generate start and end time, time series, X, and y for testing purposes.
 
     Returns:
-        tuple: A tuple containing the start datetime, end datetime, time series,
-                X (dataframe with columns "a" and "b"), and y (series).
+        tuple: A tuple containing the start datetime, end datetime, time series, X and y.
     """
     RNG = np.random.default_rng()
 
-    start_dt = pd.Timestamp(2023, 1, 1)
-    end_dt = pd.Timestamp(2023, 1, 31)
-
-    time_series = pd.Series(pd.date_range(start_dt, end_dt, freq="D"))
+    start_dt, end_dt = datetime(2023, 1, 1), datetime(2023, 1, 31)
+    time_series = np.arange(start_dt, end_dt, timedelta(days=1))
     size = len(time_series)
 
-    df = pd.DataFrame(data=RNG.normal(size=(size, 2)), columns=["a", "b"]).assign(
-        y=lambda t: t[["a", "b"]].sum(axis=1),
-    )
+    X = RNG.normal(size=(size, 2))
+    y = X.sum(axis=1) + RNG.normal(size=size) / 100
+    return start_dt, end_dt, time_series, X, y
 
-    X, y = df[["a", "b"]], df["y"]
 
-    return start_dt, end_dt, time_series, X, y
+@pytest.fixture(params=[pd.DataFrame, pl.DataFrame, pa.table])
+def frame_constructor(request) -> Callable[[Dict[str, Any]], IntoDataFrame]:
+    """Fixture to return an eager dataframe constructor."""
+    return request.param
+
+
+@pytest.fixture(params=[np.asarray, da.from_array])
+def array_constructor(request) -> Callable[[np.ndarray], TensorLike]:
+    """Fixture to return an array constructor."""
+    return request.param
diff --git a/tests/test_timebasedsplit.py b/tests/test_timebasedsplit.py
@@ -4,6 +4,7 @@
 from datetime import date
 from datetime import datetime
 
+import narwhals as nw
 import numpy as np
 import pandas as pd
 import pytest
@@ -173,38 +174,66 @@ def test_timebasedcv_split_invalid(valid_kwargs, kwargs):
         next(cv.split(*arrays_, time_series=time_series_, start_dt=start_dt_, end_dt=end_dt_))
 
 
-@pytest.mark.parametrize(
-    "kwargs",
-    [
-        {"arrays": (X,)},
-        {"arrays": (X, y, X.to_numpy())},  # multi-type arrays
-        # arrays shape different from time_series shape
-        {"start_dt": pd.Timestamp(2023, 1, 1), "end_dt": pd.Timestamp(2023, 1, 31)},
-        {"return_splitstate": True},
-    ],
-)
-def test_timebasedcv_split(valid_kwargs, kwargs):
-    """Tests the TimeBasedSplit.split method."""
+@pytest.mark.parametrize("return_splitstate", [True, False])
+def test_timebasedcv_split_dataframes(valid_kwargs, frame_constructor, generate_test_data, return_splitstate):
+    """Tests the TimeBasedSplit.split method on different dataframe constructors."""
     cv = TimeBasedSplit(**valid_kwargs)
 
-    arrays_ = kwargs.get("arrays", (X, y))
-    time_series_ = kwargs.get("time_series", time_series)
-    start_dt_ = kwargs.get("start_dt")
-    end_dt_ = kwargs.get("end_dt")
-    return_splitstate_ = kwargs.get("return_splitstate", False)
+    start_dt, end_dt, time_series, X, y = generate_test_data
+
+    data = {
+        "x0": X[:, 0],
+        "x1": X[:, 1],
+        "y": y,
+        "ts": time_series,
+    }
+
+    df = nw.from_native(frame_constructor(data), eager_only=True)
+
+    arrays_ = (df.select("x0", "x1").to_native(), df["y"].to_native())
+    time_series_ = df["ts"].to_native()
+
+    n_arrays = len(arrays_)
+    split_results = next(
+        cv.split(
+            *arrays_,
+            time_series=time_series_,
+            start_dt=start_dt,
+            end_dt=end_dt,
+            return_splitstate=return_splitstate,
+        ),
+    )
+
+    if return_splitstate:
+        train_forecast, _ = split_results
+    else:
+        train_forecast = split_results
+
+    assert len(train_forecast) == n_arrays * 2
+
+
+@pytest.mark.parametrize("return_splitstate", [True, False])
+def test_timebasedcv_split_arrays(valid_kwargs, array_constructor, generate_test_data, return_splitstate):
+    """Tests the TimeBasedSplit.split method on different array constructors."""
+    cv = TimeBasedSplit(**valid_kwargs)
+
+    start_dt, end_dt, time_series, X, y = generate_test_data
+
+    arrays_ = (array_constructor(X), array_constructor(y))
+    time_series_ = array_constructor(time_series)
 
     n_arrays = len(arrays_)
     split_results = next(
         cv.split(
             *arrays_,
             time_series=time_series_,
-            start_dt=start_dt_,
-            end_dt=end_dt_,
-            return_splitstate=return_splitstate_,
+            start_dt=start_dt,
+            end_dt=end_dt,
+            return_splitstate=return_splitstate,
         ),
     )
 
-    if return_splitstate_:
+    if return_splitstate:
         train_forecast, _ = split_results
     else:
         train_forecast = split_results

diff --git a/timebasedcv/core.py b/timebasedcv/core.py
@@ -37,7 +37,7 @@
 _window_values = get_args(WindowType)
 _mode_values = get_args(ModeType)
 
-TL = TypeVar("TL", bound=TensorLike)
+TensorLikeT = TypeVar("TensorLikeT", bound=TensorLike)
 
 
 class _CoreTimeBasedSplit:
@@ -431,45 +431,45 @@ class TimeBasedSplit(_CoreTimeBasedSplit):
     @overload
     def split(
         self: Self,
-        *arrays: TL,
+        *arrays: TensorLikeT,
         time_series: SeriesLike[DateTimeLike],
         start_dt: NullableDatetime = None,
         end_dt: NullableDatetime = None,
         return_splitstate: Literal[False],
-    ) -> Generator[Tuple[TL, ...], None, None]: ...  # pragma: no cover
+    ) -> Generator[Tuple[TensorLikeT, ...], None, None]: ...  # pragma: no cover
 
     @overload
     def split(
         self: Self,
-        *arrays: TL,
+        *arrays: TensorLikeT,
         time_series: SeriesLike[DateTimeLike],
         start_dt: NullableDatetime = None,
         end_dt: NullableDatetime = None,
         return_splitstate: Literal[True],
-    ) -> Generator[Tuple[Tuple[TL, ...], SplitState], None, None]: ...  # pragma: no cover
+    ) -> Generator[Tuple[Tuple[TensorLikeT, ...], SplitState], None, None]: ...  # pragma: no cover
 
     @overload
     def split(
         self: Self,
-        *arrays: TL,
+        *arrays: TensorLikeT,
         time_series: SeriesLike[DateTimeLike],
         start_dt: NullableDatetime = None,
         end_dt: NullableDatetime = None,
         return_splitstate: bool = False,
     ) -> Generator[
-        Union[Tuple[TL, ...], Tuple[Tuple[TL, ...], SplitState]],
+        Union[Tuple[TensorLikeT, ...], Tuple[Tuple[TensorLikeT, ...], SplitState]],
         None,
         None,
     ]: ...  # pragma: no cover
 
     def split(
         self: Self,
-        *arrays: TL,
+        *arrays: TensorLikeT,
         time_series: SeriesLike[DateTimeLike],
         start_dt: NullableDatetime = None,
         end_dt: NullableDatetime = None,
         return_splitstate: bool = False,
-    ) -> Generator[Union[Tuple[TL, ...], Tuple[Tuple[TL, ...], SplitState]], None, None]:
+    ) -> Generator[Union[Tuple[TensorLikeT, ...], Tuple[Tuple[TensorLikeT, ...], SplitState]], None, None]:
         """Returns a generator of split arrays based on the `time_series`.
 
         The `time_series` argument is split on split state values to create boolean masks for training - from train_
@@ -523,16 +523,16 @@ def split(
             msg = "At least one array required as input"
             raise ValueError(msg)
 
-        ts_shape = time_series.shape
-        if len(ts_shape) != 1:
-            msg = f"Time series must be 1-dimensional. Got {len(ts_shape)} dimensions."
-            raise ValueError(msg)
-
         arrays_: Tuple[Union[nw.DataFrame, nw.Series, np.ndarray], ...] = tuple(
             nw.from_native(array, eager_only=True, allow_series=True, strict=False) for array in arrays
         )
         time_series_: Union[nw.Series, np.ndarray] = nw.from_native(time_series, series_only=True, strict=False)
 
+        ts_shape = time_series_.shape
+        if len(ts_shape) != 1:
+            msg = f"Time series must be 1-dimensional. Got {len(ts_shape)} dimensions."
+            raise ValueError(msg)
+
         a0 = arrays[0]
         arr_len = a0.shape[0]