Skip to content

Commit

Permalink
Merge branch 'main' into feat/relativedelta
Browse files Browse the repository at this point in the history
  • Loading branch information
FBruzzesi authored Nov 12, 2024
2 parents 3467f3d + 0627a92 commit 2d774a6
Show file tree
Hide file tree
Showing 7 changed files with 109 additions and 51 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ We introduce two main classes:

- Considering the above choices, we also provide a scikit-learn compatible splitter: [`TimeBasedCVSplitter`](https://fbruzzesi.github.io/timebasedcv/api/sklearn/#timebasedcv.sklearn.TimeBasedCVSplitter). Considering the signature that `.split(...)` requires and the fact that CV Splitters need to know a priori the number of splits, `TimeBasedCVSplitter` is initialized with the time series containing the time information used to generate the train and test indices of each split.

### Dataframe and array agnostic

- Thanks to [Narwhals](https://narwhals-dev.github.io/narwhals/), `TimeBasedSplit` works out of the box with `pandas`, `polars`, `pyarrow` and any other dataframe library supported by Narwhals.
- Thanks to the array API, `TimeBasedSplit` works out of the box with `numpy`, `cupy`, `dask.array` and any other array library that supports slicing à la numpy.

## Installation 💻

TL;DR:
Expand Down
5 changes: 5 additions & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ We introduce two main classes:

- Considering the above choices, we also provide a scikit-learn compatible splitter: [`TimeBasedCVSplitter`](api/sklearn.md#timebasedcv.sklearn.TimeBasedCVSplitter){:target="_blank"}. Considering the signature that `.split(...)` requires and the fact that CV Splitters need to know a priori the number of splits, `TimeBasedCVSplitter` is initialized with the time series containing the time information used to generate the train and test indices of each split.

### Dataframe and array agnostic

- Thanks to [Narwhals](https://narwhals-dev.github.io/narwhals/){:target="_blank"}, `TimeBasedSplit` works out of the box with `pandas`, `polars`, `pyarrow` and any other dataframe library supported by Narwhals.
- Thanks to the array API, `TimeBasedSplit` works out of the box with `numpy`, `cupy`, `dask.array` and any other array library that supports slicing à la numpy.

## Installation 💻

TL;DR:
Expand Down
2 changes: 1 addition & 1 deletion docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
!!! info
The minimum Python version supported is 3.8.

- Since **v0.1.0**, the only two dependencies are [`numpy`](https://numpy.org/doc/stable/index.html){:target="_blank"} and [`narwhals>=0.7.15`](https://narwhals-dev.github.io/narwhals/){:target="_blank"}.
- Since **v0.1.0**, the only two dependencies are [`numpy`](https://numpy.org/doc/stable/index.html){:target="_blank"} and [`narwhals>=1.0.0`](https://narwhals-dev.github.io/narwhals/){:target="_blank"}.

**Narwhals** provides a compatibility layer between polars, pandas and other dataframe libraries. Therefore, as long as narwhals supports a given dataframe object, we will as well.

Expand Down
8 changes: 5 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@ repository = "https://github.com/fbruzzesi/timebasedcv"
issue-tracker = "https://github.com/fbruzzesi/timebasedcv/issues"

[project.optional-dependencies]
polars = ["polars>=0.20.3"]
pandas = ["pandas>=1.2.0"]
scikit-learn = ["scikit-learn>=0.19"]

dev = [
Expand All @@ -63,9 +61,13 @@ test = [
"pytest==7.2.0",
"pytest-xdist==3.2.1",
"coverage==7.2.1",
"pandas>=1.2.0",
"polars>=0.20.3",
"pyarrow>=11.0.0",
"dask>=2023.1.0"
]

all = ["timebasedcv[pandas,polars,scikit-learn]"]
all = ["timebasedcv[scikit-learn]"]
all-dev = ["timebasedcv[all,dev,docs,lint,test]"]

[tool.hatch.build.targets.sdist]
Expand Down
41 changes: 29 additions & 12 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,28 @@
from __future__ import annotations

from datetime import datetime
from datetime import timedelta
from typing import TYPE_CHECKING
from typing import Any
from typing import Callable
from typing import Dict
from typing import List
from typing import Literal
from typing import Tuple
from typing import Union

import dask.array as da
import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa
import pytest

if TYPE_CHECKING:
from narwhals.typing import IntoDataFrame

from timebasedcv.utils._types import TensorLike


@pytest.fixture()
def sample_list() -> List[int]:
Expand Down Expand Up @@ -96,25 +108,30 @@ def valid_kwargs(


@pytest.fixture()
def generate_test_data():
def generate_test_data() -> Tuple[datetime, datetime, np.ndarray, np.ndarray, np.ndarray]:
"""Generate start and end time, time series, X, and y for testing purposes.
Returns:
tuple: A tuple containing the start datetime, end datetime, time series,
X (dataframe with columns "a" and "b"), and y (series).
tuple: A tuple containing the start datetime, end datetime, time series, X and y.
"""
RNG = np.random.default_rng()

start_dt = pd.Timestamp(2023, 1, 1)
end_dt = pd.Timestamp(2023, 1, 31)

time_series = pd.Series(pd.date_range(start_dt, end_dt, freq="D"))
start_dt, end_dt = datetime(2023, 1, 1), datetime(2023, 1, 31)
time_series = np.arange(start_dt, end_dt, timedelta(days=1))
size = len(time_series)

df = pd.DataFrame(data=RNG.normal(size=(size, 2)), columns=["a", "b"]).assign(
y=lambda t: t[["a", "b"]].sum(axis=1),
)
X = RNG.normal(size=(size, 2))
y = X.sum(axis=1) + RNG.normal(size=size) / 100
return start_dt, end_dt, time_series, X, y

X, y = df[["a", "b"]], df["y"]

return start_dt, end_dt, time_series, X, y
@pytest.fixture(params=[pd.DataFrame, pl.DataFrame, pa.table])
def frame_constructor(request) -> Callable[[Dict[str, Any]], IntoDataFrame]:
    """Provide an eager dataframe constructor, one per parametrized backend.

    The fixture is parametrized over `pd.DataFrame`, `pl.DataFrame` and `pa.table`,
    so any test requesting it runs once for each of these constructors.

    Returns:
        The constructor selected by pytest for the current parametrization.
    """
    constructor = request.param
    return constructor


@pytest.fixture(params=[np.asarray, da.from_array])
def array_constructor(request) -> Callable[[np.ndarray], TensorLike]:
    """Provide an array constructor, one per parametrized backend.

    The fixture is parametrized over `np.asarray` and `da.from_array`, so any test
    requesting it runs once for each of these constructors.

    Returns:
        The constructor selected by pytest for the current parametrization.
    """
    constructor = request.param
    return constructor
71 changes: 50 additions & 21 deletions tests/test_timebasedsplit.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from datetime import date
from datetime import datetime

import narwhals as nw
import numpy as np
import pandas as pd
import pytest
Expand Down Expand Up @@ -173,38 +174,66 @@ def test_timebasedcv_split_invalid(valid_kwargs, kwargs):
next(cv.split(*arrays_, time_series=time_series_, start_dt=start_dt_, end_dt=end_dt_))


@pytest.mark.parametrize(
"kwargs",
[
{"arrays": (X,)},
{"arrays": (X, y, X.to_numpy())}, # multi-type arrays
# arrays shape different from time_series shape
{"start_dt": pd.Timestamp(2023, 1, 1), "end_dt": pd.Timestamp(2023, 1, 31)},
{"return_splitstate": True},
],
)
def test_timebasedcv_split(valid_kwargs, kwargs):
"""Tests the TimeBasedSplit.split method."""
@pytest.mark.parametrize("return_splitstate", [True, False])
def test_timebasedcv_split_dataframes(valid_kwargs, frame_constructor, generate_test_data, return_splitstate):
"""Tests the TimeBasedSplit.split method on different dataframe constructors."""
cv = TimeBasedSplit(**valid_kwargs)

arrays_ = kwargs.get("arrays", (X, y))
time_series_ = kwargs.get("time_series", time_series)
start_dt_ = kwargs.get("start_dt")
end_dt_ = kwargs.get("end_dt")
return_splitstate_ = kwargs.get("return_splitstate", False)
start_dt, end_dt, time_series, X, y = generate_test_data

data = {
"x0": X[:, 0],
"x1": X[:, 1],
"y": y,
"ts": time_series,
}

df = nw.from_native(frame_constructor(data), eager_only=True)

arrays_ = (df.select("x0", "x1").to_native(), df["y"].to_native())
time_series_ = df["ts"].to_native()

n_arrays = len(arrays_)
split_results = next(
cv.split(
*arrays_,
time_series=time_series_,
start_dt=start_dt,
end_dt=end_dt,
return_splitstate=return_splitstate,
),
)

if return_splitstate:
train_forecast, _ = split_results
else:
train_forecast = split_results

assert len(train_forecast) == n_arrays * 2


@pytest.mark.parametrize("return_splitstate", [True, False])
def test_timebasedcv_split_arrays(valid_kwargs, array_constructor, generate_test_data, return_splitstate):
"""Tests the TimeBasedSplit.split method on different array constructors."""
cv = TimeBasedSplit(**valid_kwargs)

start_dt, end_dt, time_series, X, y = generate_test_data

arrays_ = (array_constructor(X), array_constructor(y))
time_series_ = array_constructor(time_series)

n_arrays = len(arrays_)
split_results = next(
cv.split(
*arrays_,
time_series=time_series_,
start_dt=start_dt_,
end_dt=end_dt_,
return_splitstate=return_splitstate_,
start_dt=start_dt,
end_dt=end_dt,
return_splitstate=return_splitstate,
),
)

if return_splitstate_:
if return_splitstate:
train_forecast, _ = split_results
else:
train_forecast = split_results
Expand Down
28 changes: 14 additions & 14 deletions timebasedcv/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
_window_values = get_args(WindowType)
_mode_values = get_args(ModeType)

TL = TypeVar("TL", bound=TensorLike)
TensorLikeT = TypeVar("TensorLikeT", bound=TensorLike)


class _CoreTimeBasedSplit:
Expand Down Expand Up @@ -431,45 +431,45 @@ class TimeBasedSplit(_CoreTimeBasedSplit):
@overload
def split(
self: Self,
*arrays: TL,
*arrays: TensorLikeT,
time_series: SeriesLike[DateTimeLike],
start_dt: NullableDatetime = None,
end_dt: NullableDatetime = None,
return_splitstate: Literal[False],
) -> Generator[Tuple[TL, ...], None, None]: ... # pragma: no cover
) -> Generator[Tuple[TensorLikeT, ...], None, None]: ... # pragma: no cover

@overload
def split(
self: Self,
*arrays: TL,
*arrays: TensorLikeT,
time_series: SeriesLike[DateTimeLike],
start_dt: NullableDatetime = None,
end_dt: NullableDatetime = None,
return_splitstate: Literal[True],
) -> Generator[Tuple[Tuple[TL, ...], SplitState], None, None]: ... # pragma: no cover
) -> Generator[Tuple[Tuple[TensorLikeT, ...], SplitState], None, None]: ... # pragma: no cover

@overload
def split(
self: Self,
*arrays: TL,
*arrays: TensorLikeT,
time_series: SeriesLike[DateTimeLike],
start_dt: NullableDatetime = None,
end_dt: NullableDatetime = None,
return_splitstate: bool = False,
) -> Generator[
Union[Tuple[TL, ...], Tuple[Tuple[TL, ...], SplitState]],
Union[Tuple[TensorLikeT, ...], Tuple[Tuple[TensorLikeT, ...], SplitState]],
None,
None,
]: ... # pragma: no cover

def split(
self: Self,
*arrays: TL,
*arrays: TensorLikeT,
time_series: SeriesLike[DateTimeLike],
start_dt: NullableDatetime = None,
end_dt: NullableDatetime = None,
return_splitstate: bool = False,
) -> Generator[Union[Tuple[TL, ...], Tuple[Tuple[TL, ...], SplitState]], None, None]:
) -> Generator[Union[Tuple[TensorLikeT, ...], Tuple[Tuple[TensorLikeT, ...], SplitState]], None, None]:
"""Returns a generator of split arrays based on the `time_series`.
The `time_series` argument is split on split state values to create boolean masks for training - from train_
Expand Down Expand Up @@ -523,16 +523,16 @@ def split(
msg = "At least one array required as input"
raise ValueError(msg)

ts_shape = time_series.shape
if len(ts_shape) != 1:
msg = f"Time series must be 1-dimensional. Got {len(ts_shape)} dimensions."
raise ValueError(msg)

arrays_: Tuple[Union[nw.DataFrame, nw.Series, np.ndarray], ...] = tuple(
nw.from_native(array, eager_only=True, allow_series=True, strict=False) for array in arrays
)
time_series_: Union[nw.Series, np.ndarray] = nw.from_native(time_series, series_only=True, strict=False)

ts_shape = time_series_.shape
if len(ts_shape) != 1:
msg = f"Time series must be 1-dimensional. Got {len(ts_shape)} dimensions."
raise ValueError(msg)

a0 = arrays[0]
arr_len = a0.shape[0]

Expand Down

0 comments on commit 2d774a6

Please sign in to comment.