diff --git a/.github/workflows/check_docs.yml b/.github/workflows/check_docs.yml new file mode 100644 index 0000000000..182181335d --- /dev/null +++ b/.github/workflows/check_docs.yml @@ -0,0 +1,28 @@ +name: Check Documentation + +on: + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + check-docs: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Install dependencies + run: | + pip install -r requirements.txt + + - name: Check for documentation updates + run: ./ci/check_docs.sh diff --git a/ci/check_docs.sh b/ci/check_docs.sh new file mode 100755 index 0000000000..14cc75b100 --- /dev/null +++ b/ci/check_docs.sh @@ -0,0 +1,8 @@ +set -e + +if git diff --quiet HEAD^ HEAD -- docs/; then + echo "Documentation has not been updated." + exit 1 +else + echo "Documentation is up-to-date." +fi diff --git a/daft/dataframe/dataframe.py b/daft/dataframe/dataframe.py index 6f8b769181..0833b07b77 100644 --- a/daft/dataframe/dataframe.py +++ b/daft/dataframe/dataframe.py @@ -744,15 +744,49 @@ def write_lance( -------- + >>> import daft >>> df = daft.from_pydict({"a": [1, 2, 3, 4]}) >>> df.write_lance("/tmp/lance/my_table.lance") - >>> daft.read_lance("/tmp/lance/my_table.lance") + ╭───────────────┬──────────────────┬─────────────────┬─────────╮ + │ num_fragments ┆ num_deleted_rows ┆ num_small_files ┆ version │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ Int64 ┆ Int64 ┆ Int64 ┆ Int64 │ + ╞═══════════════╪══════════════════╪═════════════════╪═════════╡ + │ 1 ┆ 0 ┆ 1 ┆ 1 │ + ╰───────────────┴──────────────────┴─────────────────┴─────────╯ + + (Showing first 1 of 1 rows) + + >>> daft.read_lance("/tmp/lance/my_table.lance").collect() + ╭───────╮ + │ a │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 1 │ + ├╌╌╌╌╌╌╌┤ + │ 2 │ + ├╌╌╌╌╌╌╌┤ + │ 3 │ + ├╌╌╌╌╌╌╌┤ + │ 4 │ + ╰───────╯ + + (Showing first 4 of 4 rows) # Pass additional keyword arguments to the Lance writer # All additional keyword arguments are passed to `lance.write_fragments` >>> df.write_lance("/tmp/lance/my_table.lance", mode="overwrite", max_bytes_per_file=1024) - + ╭───────────────┬──────────────────┬─────────────────┬─────────╮ + │ num_fragments ┆ num_deleted_rows ┆ num_small_files ┆ version │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ Int64 ┆ Int64 ┆ Int64 ┆ Int64 │ + ╞═══════════════╪══════════════════╪═════════════════╪═════════╡ + │ 1 ┆ 0 ┆ 1 ┆ 2 │ + ╰───────────────┴──────────────────┴─────────────────┴─────────╯ + + (Showing first 1 of 1 rows) """ import sys @@ -897,6 +931,7 @@ def _add_monotonically_increasing_id(self, column_name: Optional[str] = None) -> in the lower 36 bits. This allows for 2^28 ≈ 268 million partitions and 2^36 ≈ 68 billion rows per partition.
Example: + >>> import daft >>> df = daft.from_pydict({"a": [1, 2, 3, 4]}).into_partitions(2) >>> df = df._add_monotonically_increasing_id() >>> df.show() @@ -913,6 +948,7 @@ def _add_monotonically_increasing_id(self, column_name: Optional[str] = None) -> ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ │ 68719476737 ┆ 4 │ ╰─────────────┴───────╯ + (Showing first 4 of 4 rows) Args: @@ -929,19 +965,24 @@ def _add_monotonically_increasing_id(self, column_name: Optional[str] = None) -> def select(self, *columns: ColumnInputType) -> "DataFrame": """Creates a new DataFrame from the provided expressions, similar to a SQL ``SELECT`` - Example: - - >>> # names of columns as strings - >>> df = df.select('x', 'y') - >>> - >>> # names of columns as expressions - >>> df = df.select(col('x'), col('y')) - >>> - >>> # call expressions - >>> df = df.select(col('x') * col('y')) - >>> - >>> # any mix of the above - >>> df = df.select('x', col('y'), col('z') + 1) + Examples: + >>> import daft + >>> df = daft.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]}) + >>> df = df.select('x', daft.col('y'), daft.col('z') + 1) + >>> df.show() + ╭───────┬───────┬───────╮ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ Int64 ┆ Int64 ┆ Int64 │ + ╞═══════╪═══════╪═══════╡ + │ 1 ┆ 4 ┆ 8 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 2 ┆ 5 ┆ 9 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 3 ┆ 6 ┆ 10 │ + ╰───────┴───────┴───────╯ + + (Showing first 3 of 3 rows) Args: *columns (Union[str, Expression]): columns to select from the current DataFrame @@ -958,7 +999,21 @@ def distinct(self) -> "DataFrame": """Computes unique rows, dropping duplicates Example: + >>> import daft + >>> df = daft.from_pydict({"x": [1, 2, 2], "y": [4, 5, 5], "z": [7, 8, 8]}) >>> unique_df = df.distinct() + >>> unique_df.show() + ╭───────┬───────┬───────╮ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ Int64 ┆ Int64 ┆ Int64 │ + ╞═══════╪═══════╪═══════╡ + │ 2 ┆ 5 ┆ 8 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 1 ┆ 4 ┆ 7 │ + ╰───────┴───────┴───────╯ + + (Showing first 2 of 2 rows) Returns: DataFrame: DataFrame that has only unique rows. @@ -977,7 +1032,20 @@ def sample( """Samples a fraction of rows from the DataFrame Example: + >>> import daft + >>> df = daft.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]}) >>> sampled_df = df.sample(0.5) + >>> # Samples will vary from output to output + >>> # here is a sample output + >>> # ╭───────┬───────┬───────╮ + >>> # │ x ┆ y ┆ z │ + >>> # │ --- ┆ --- ┆ --- │ + >>> # │ Int64 ┆ Int64 ┆ Int64 │ + >>> # |═══════╪═══════╪═══════╡ + >>> # │ 2 ┆ 5 ┆ 8 │ + >>> # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + >>> # │ 3 ┆ 6 ┆ 9 │ + >>> # ╰───────┴───────┴───────╯ Args: fraction (float): fraction of rows to sample. @@ -1000,7 +1068,23 @@ def exclude(self, *names: str) -> "DataFrame": This is equivalent of performing a select with all the columns but the ones excluded. Example: + >>> import daft + >>> df = daft.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]}) >>> df_without_x = df.exclude('x') + >>> df_without_x.show() + ╭───────┬───────╮ + │ y ┆ z │ + │ --- ┆ --- │ + │ Int64 ┆ Int64 │ + ╞═══════╪═══════╡ + │ 4 ┆ 7 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 5 ┆ 8 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 6 ┆ 9 │ + ╰───────┴───────╯ + + (Showing first 3 of 3 rows) Args: *names (str): names to exclude @@ -1016,7 +1100,21 @@ def where(self, predicate: Expression) -> "DataFrame": """Filters rows via a predicate expression, similar to SQL ``WHERE``. 
Example: - >>> filtered_df = df.where((col('x') < 10) & (col('y') == 10)) + + >>> import daft + >>> df = daft.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]}) + >>> df.where((col('x') > 1) & (col('y') > 1)).collect() + ╭───────┬───────┬───────╮ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ Int64 ┆ Int64 ┆ Int64 │ + ╞═══════╪═══════╪═══════╡ + │ 2 ┆ 5 ┆ 8 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 3 ┆ 6 ┆ 9 │ + ╰───────┴───────┴───────╯ + + (Showing first 2 of 2 rows) Args: predicate (Expression): expression that keeps row if evaluates to True. @@ -1038,7 +1136,23 @@ def with_column( with all current columns and the new one Example: + >>> import daft + >>> df = daft.from_pydict({"x": [1, 2, 3]}) >>> new_df = df.with_column('x+1', col('x') + 1) + >>> new_df.show() + ╭───────┬───────╮ + │ x ┆ x+1 │ + │ --- ┆ --- │ + │ Int64 ┆ Int64 │ + ╞═══════╪═══════╡ + │ 1 ┆ 2 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 2 ┆ 3 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 3 ┆ 4 │ + ╰───────┴───────╯ + + (Showing first 3 of 3 rows) Args: column_name (str): name of new column @@ -1060,13 +1174,9 @@ def with_columns( with all current columns and the new ones Example: + >>> import daft >>> df = daft.from_pydict({'x': [1, 2, 3], 'y': [4, 5, 6]}) - >>> - >>> new_df = df.with_columns({ - 'foo': df['x'] + 1, - 'bar': df['y'] - df['x'] - }) - + >>> new_df = df.with_columns({'foo': df['x'] + 1,'bar': df['y'] - df['x']}) >>> new_df.show() ╭───────┬───────┬───────┬───────╮ │ x ┆ y ┆ foo ┆ bar │ @@ -1079,6 +1189,7 @@ def with_columns( ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ │ 3 ┆ 6 ┆ 4 ┆ 3 │ ╰───────┴───────┴───────┴───────╯ + (Showing first 3 of 3 rows) Args: @@ -1104,14 +1215,50 @@ def sort( ) -> "DataFrame": """Sorts DataFrame globally - Example: - >>> sorted_df = df.sort(col('x') + col('y')) - >>> sorted_df = df.sort([col('x'), col('y')], desc=[False, True]) - >>> sorted_df = df.sort(['z', col('x'), col('y')], desc=[True, False, True]) - Note: * Since this a global sort, this requires an expensive repartition which can be quite slow. * Supports multicolumn sorts and can have unique `descending` flag per column. + + Example: + >>> import daft + >>> df = daft.from_pydict({"x": [3, 2, 1], "y": [6, 4, 5]}) + >>> sorted_df = df.sort(col('x') + col('y')) + >>> sorted_df.show() + ╭───────┬───────╮ + │ x ┆ y │ + │ --- ┆ --- │ + │ Int64 ┆ Int64 │ + ╞═══════╪═══════╡ + │ 2 ┆ 4 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 1 ┆ 5 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 3 ┆ 6 │ + ╰───────┴───────╯ + + (Showing first 3 of 3 rows) + + You can also sort by multiple columns, and specify the 'descending' flag for each column: + + >>> df = daft.from_pydict({"x": [1, 2, 1, 2], "y": [9, 8, 7, 6]}) + >>> sorted_df = df.sort(["x", "y"], [True, False]) + >>> sorted_df.show() + ╭───────┬───────╮ + │ x ┆ y │ + │ --- ┆ --- │ + │ Int64 ┆ Int64 │ + ╞═══════╪═══════╡ + │ 2 ┆ 6 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 2 ┆ 8 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 1 ┆ 7 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 1 ┆ 9 │ + ╰───────┴───────╯ + + (Showing first 4 of 4 rows) + Args: column (Union[ColumnInputType, List[ColumnInputType]]): column to sort by. Can be `str` or expression as well as a list of either. desc (Union[bool, List[bool]), optional): Sort by descending order. Defaults to False. 
@@ -1132,7 +1279,27 @@ def limit(self, num: int) -> "DataFrame": """Limits the rows in the DataFrame to the first ``N`` rows, similar to a SQL ``LIMIT`` Example: - >>> df_limited = df.limit(10) # returns 10 rows + >>> import daft + >>> df = daft.from_pydict({"x": [1, 2, 3, 4, 5, 6, 7]}) + >>> df_limited = df.limit(5) # returns 5 rows + >>> df_limited.show() + ╭───────╮ + │ x │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 1 │ + ├╌╌╌╌╌╌╌┤ + │ 2 │ + ├╌╌╌╌╌╌╌┤ + │ 3 │ + ├╌╌╌╌╌╌╌┤ + │ 4 │ + ├╌╌╌╌╌╌╌┤ + │ 5 │ + ╰───────╯ + + (Showing first 5 of 5 rows) Args: num (int): maximum rows to allow. @@ -1174,8 +1341,11 @@ def repartition(self, num: Optional[int], *partition_by: ColumnInputType) -> "Da which avoids shuffling of data in favor of splitting/coalescing adjacent partitions where appropriate. Example: - >>> random_repart_df = df.repartition(4) - >>> part_by_df = df.repartition(4, 'x', col('y') + 1) + >>> import daft + >>> df = daft.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]}) + >>> repartitioned_df = df.repartition(3) + >>> repartitioned_df.num_partitions() + 3 Args: num (Optional[int]): Number of target partitions; if None, the number of partitions will not be changed. @@ -1203,7 +1373,11 @@ def into_partitions(self, num: int) -> "DataFrame": (i.e. if there are 2 partitions, and change it into 3, this function will just split the bigger one) Example: + >>> import daft + >>> df = daft.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]}) >>> df_with_5_partitions = df.into_partitions(5) + >>> df_with_5_partitions.num_partitions() + 5 Args: num (int): number of target partitions. @@ -1304,10 +1478,38 @@ def drop_nan(self, *cols: ColumnInputType): If column names are supplied, it will drop only those rows that contains NaNs in one of these columns. Example: + >>> import daft >>> df = daft.from_pydict({"a": [1.0, 2.2, 3.5, float("nan")]}) - >>> df.drop_na() # drops rows where any column contains NaN values + >>> df.drop_nan().collect() # drops rows where any column contains NaN values + ╭─────────╮ + │ a │ + │ --- │ + │ Float64 │ + ╞═════════╡ + │ 1 │ + ├╌╌╌╌╌╌╌╌╌┤ + │ 2.2 │ + ├╌╌╌╌╌╌╌╌╌┤ + │ 3.5 │ + ╰─────────╯ + + (Showing first 3 of 3 rows) + >>> df = daft.from_pydict({"a": [1.6, 2.5, 3.3, float("nan")]}) - >>> df.drop_na("a") # drops rows where column a contains NaN values + >>> df.drop_nan("a").collect() # drops rows where column a contains NaN values + ╭─────────╮ + │ a │ + │ --- │ + │ Float64 │ + ╞═════════╡ + │ 1.6 │ + ├╌╌╌╌╌╌╌╌╌┤ + │ 2.5 │ + ├╌╌╌╌╌╌╌╌╌┤ + │ 3.3 │ + ╰─────────╯ + + (Showing first 3 of 3 rows) Args: *cols (str): column names by which rows containing nans/NULLs should be filtered @@ -1342,10 +1544,22 @@ def drop_null(self, *cols: ColumnInputType): If column names are supplied, it will drop only those rows that contains NULLs in one of these columns.
Example: - >>> df = daft.from_pydict({"a": [1.0, 2.2, 3.5, float("NaN")]}) - >>> df.drop_null() # drops rows where any column contains Null/NaN values + >>> import daft >>> df = daft.from_pydict({"a": [1.6, 2.5, None, float("NaN")]}) - >>> df.drop_null("a") # drops rows where column a contains Null/NaN values + >>> df.drop_null("a").collect() + ╭─────────╮ + │ a │ + │ --- │ + │ Float64 │ + ╞═════════╡ + │ 1.6 │ + ├╌╌╌╌╌╌╌╌╌┤ + │ 2.5 │ + ├╌╌╌╌╌╌╌╌╌┤ + │ NaN │ + ╰─────────╯ + + (Showing first 3 of 3 rows) Args: *cols (str): column names by which rows containing nans should be filtered @@ -1370,27 +1584,31 @@ def explode(self, *columns: ColumnInputType) -> "DataFrame": Exploding Null values or empty lists will create a single Null entry (see example below). Example: - >>> df = daft.from_pydict({ - >>> "x": [[1], [2, 3]], - >>> "y": [["a"], ["b", "c"]], - >>> "z": [1.0, 2.0], - >>> ]}) - >>> - >>> df.explode(col("x"), col("y")) - >>> - >>> # +------+-----------+-----+ +------+------+-----+ - >>> # | x | y | z | | x | y | z | - >>> # +------+-----------+-----+ +------+------+-----+ - >>> # |[1] | ["a"] | 1.0 | | 1 | "a" | 1.0 | - >>> # +------+-----------+-----+ -> +------+------+-----+ - >>> # |[2, 3]| ["b", "c"]| 2.0 | | 2 | "b" | 2.0 | - >>> # +------+-----------+-----+ +------+------+-----+ - >>> # |[] | [] | 3.0 | | 3 | "c" | 2.0 | - >>> # +------+-----------+-----+ +------+------+-----+ - >>> # |None | None | 4.0 | | None | None | 3.0 | - >>> # +------+-----------+-----+ +------+------+-----+ - >>> # | None | None | 4.0 | - >>> # +------+------+-----+ + >>> import daft + >>> df = daft.from_pydict( + ... { + ... "x": [[1], [2, 3]], + ... "y": [["a"], ["b", "c"]], + ... "z": [ + ... [1.0], + ... [2.0, 2.0], + ... ], + ... } + ... ) + >>> df.explode(col("x"), col("y")).collect() + ╭───────┬──────┬───────────────╮ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ Int64 ┆ Utf8 ┆ List[Float64] │ + ╞═══════╪══════╪═══════════════╡ + │ 1 ┆ a ┆ [1] │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 2 ┆ b ┆ [2, 2] │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 3 ┆ c ┆ [2, 2] │ + ╰───────┴──────┴───────────────╯ + + (Showing first 3 of 3 rows) Args: *columns (ColumnInputType): columns to explode @@ -1413,24 +1631,12 @@ def unpivot( """Unpivots a DataFrame from wide to long format. Example: + >>> import daft >>> df = daft.from_pydict({ ... "year": [2020, 2021, 2022], ... "Jan": [10, 30, 50], ... "Feb": [20, 40, 60], ... 
}) - >>> df - ╭───────┬───────┬───────╮ - │ year ┆ Jan ┆ Feb │ - │ --- ┆ --- ┆ --- │ - │ Int64 ┆ Int64 ┆ Int64 │ - ╞═══════╪═══════╪═══════╡ - │ 2020 ┆ 10 ┆ 20 │ - ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ - │ 2021 ┆ 30 ┆ 40 │ - ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ - │ 2022 ┆ 50 ┆ 60 │ - ╰───────┴───────┴───────╯ - (Showing first 3 of 3 rows) >>> df = df.unpivot("year", ["Jan", "Feb"], variable_name="month", value_name="inventory") >>> df = df.sort("year") >>> df.show() @@ -1451,6 +1657,7 @@ def unpivot( ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤ │ 2022 ┆ Feb ┆ 60 │ ╰───────┴───────┴───────────╯ + (Showing first 6 of 6 rows) Args: @@ -1673,6 +1880,7 @@ def agg(self, *to_agg: Union[Expression, Iterable[Expression]]) -> "DataFrame": ╞═════════╪════════════════════╪════════════════════╪═══════════╡ │ 0.55 ┆ 0.8500000000000001 ┆ 0.6000000000000001 ┆ 0.85 │ ╰─────────┴────────────────────┴────────────────────┴───────────╯ + (Showing first 1 of 1 rows) Args: @@ -1711,6 +1919,7 @@ def groupby(self, *group_by: ManyColumnsInputType) -> "GroupedDataFrame": ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤ │ dog ┆ 2 ┆ 3 ┆ 2 ┆ Jordan │ ╰──────┴─────────┴─────────┴────────┴────────╯ + (Showing first 2 of 2 rows) Args: @@ -1738,12 +1947,13 @@ def pivot( determine the unique values to pivot on. Example: + >>> import daft >>> data = { - "id": [1, 2, 3, 4], - "version": ["3.8", "3.8", "3.9", "3.9"], - "platform": ["macos", "macos", "macos", "windows"], - "downloads": [100, 200, 150, 250], - } + ... "id": [1, 2, 3, 4], + ... "version": ["3.8", "3.8", "3.9", "3.9"], + ... "platform": ["macos", "macos", "macos", "windows"], + ... "downloads": [100, 200, 150, 250], + ... } >>> df = daft.from_pydict(data) >>> df = df.pivot("version", "platform", "downloads", "sum") >>> df.show() @@ -1756,6 +1966,8 @@ def pivot( ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ │ 3.8 ┆ None ┆ 300 │ ╰─────────┴─────────┴───────╯ + + (Showing first 2 of 2 rows) Args: group_by (ManyColumnsInputType): columns to group by @@ -1903,7 +2115,10 @@ def __contains__(self, col_name: str) -> bool: """Returns whether the column exists in the dataframe. Example: - >>> "x" in df + >>> import daft + >>> df = daft.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]}) + >>> 'x' in df + True Args: col_name (str): column name @@ -2312,6 +2527,7 @@ def agg(self, *to_agg: Union[Expression, Iterable[Expression]]) -> "DataFrame": ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤ │ dog ┆ 2 ┆ 3 ┆ 2 ┆ Jordan │ ╰──────┴─────────┴─────────┴────────┴────────╯ + (Showing first 2 of 2 rows) Args: @@ -2326,12 +2542,11 @@ def map_groups(self, udf: Expression) -> "DataFrame": """Apply a user-defined function to each group. The name of the resultant column will default to the name of the first input column. Example: + >>> import daft, statistics >>> df = daft.from_pydict({"group": ["a", "a", "a", "b", "b", "b"], "data": [1, 20, 30, 4, 50, 600]}) - >>> >>> @daft.udf(return_dtype=daft.DataType.float64()) ... def std_dev(data): ... 
return [statistics.stdev(data.to_pylist())] - >>> >>> df = df.groupby("group").map_groups(std_dev(df["data"])) >>> df.show() ╭───────┬────────────────────╮ @@ -2343,6 +2558,7 @@ def map_groups(self, udf: Expression) -> "DataFrame": ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ b ┆ 331.62026476076517 │ ╰───────┴────────────────────╯ + (Showing first 2 of 2 rows) Args: diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py index 05ab6e81e8..6c5ac15697 100644 --- a/daft/expressions/expressions.py +++ b/daft/expressions/expressions.py @@ -64,7 +64,23 @@ def lit(value: object) -> Expression: """Creates an Expression representing a column with every value set to the provided value Example: - >>> col("x") + lit(1) + >>> import daft + >>> df = daft.from_pydict({"x": [1, 2, 3]}) + >>> df = df.with_column("y", daft.lit(1)) + >>> df.show() + ╭───────┬───────╮ + │ x ┆ y │ + │ --- ┆ --- │ + │ Int64 ┆ Int32 │ + ╞═══════╪═══════╡ + │ 1 ┆ 1 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 2 ┆ 1 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 3 ┆ 1 │ + ╰───────┴───────╯ + + (Showing first 3 of 3 rows) Args: val: value of column @@ -121,7 +137,6 @@ def col(name: str) -> Expression: (Showing first 3 of 3 rows) - Args: name: Name of column @@ -329,7 +344,23 @@ def alias(self, name: builtins.str) -> Expression: by which subsequent expressions can refer to the results of this expression. Example: - >>> col("x").alias("y") + >>> import daft + >>> df = daft.from_pydict({"x": [1, 2, 3]}) + >>> df = df.select(col("x").alias("y")) + >>> df.show() + ╭───────╮ + │ y │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 1 │ + ├╌╌╌╌╌╌╌┤ + │ 2 │ + ├╌╌╌╌╌╌╌┤ + │ 3 │ + ╰───────╯ + + (Showing first 3 of 3 rows) Args: name: New name for expression @@ -345,12 +376,23 @@ def cast(self, dtype: DataType) -> Expression: """Casts an expression to the given datatype if possible Example: - - >>> # [1.0, 2.5, None]: float32 -> [1, 2, None]: int64 - >>> col("float").cast(DataType.int64()) - >>> - >>> # [Path("/tmp1"), Path("/tmp2"), Path("/tmp3")]: Python -> ["/tmp1", "/tmp1", "/tmp1"]: utf8 - >>> col("path_obj_col").cast(DataType.string()) + >>> import daft + >>> df = daft.from_pydict({"float": [1.0, 2.5, None]}) + >>> df = df.select(daft.col("float").cast(daft.DataType.int64())) + >>> df.show() + ╭───────╮ + │ float │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 1 │ + ├╌╌╌╌╌╌╌┤ + │ 2 │ + ├╌╌╌╌╌╌╌┤ + │ None │ + ╰───────╯ + + (Showing first 3 of 3 rows) Returns: Expression: Expression with the specified new datatype @@ -513,47 +555,47 @@ def approx_percentiles(self, percentiles: builtins.float | builtins.list[builtin 3. If ``percentiles`` are supplied as a single float, then the resultant column is a ``Float64`` column 4. If ``percentiles`` is supplied as a list, then the resultant column is a ``FixedSizeList[Float64; N]`` column, where ``N`` is the length of the supplied list. 
- Example of a global calculation of approximate percentiles: - - >>> df = daft.from_pydict({"scores": [1, 2, 3, 4, 5, None]}) - >>> df = df.agg( - >>> df["scores"].approx_percentiles(0.5).alias("approx_median_score"), - >>> df["scores"].approx_percentiles([0.25, 0.5, 0.75]).alias("approx_percentiles_scores"), - >>> ) - >>> df.show() - ╭─────────────────────┬────────────────────────────────╮ - │ approx_median_score ┆ approx_percentiles_scores │ - │ --- ┆ --- │ - │ Float64 ┆ FixedSizeList[Float64; 3] │ - ╞═════════════════════╪════════════════════════════════╡ - │ 2.9742334234767167 ┆ [1.993661701417351, 2.9742334… │ - ╰─────────────────────┴────────────────────────────────╯ - (Showing first 1 of 1 rows) - - Example of a grouped calculation of approximate percentiles: - - >>> df = daft.from_pydict({ - >>> "class": ["a", "a", "a", "b", "c"], - >>> "scores": [1, 2, 3, 1, None], - >>> }) - >>> df = df.groupby("class").agg( - >>> df["scores"].approx_percentiles(0.5).alias("approx_median_score"), - >>> df["scores"].approx_percentiles([0.25, 0.5, 0.75]).alias("approx_percentiles_scores"), - >>> ) - >>> df.show() - ╭───────┬─────────────────────┬────────────────────────────────╮ - │ class ┆ approx_median_score ┆ approx_percentiles_scores │ - │ --- ┆ --- ┆ --- │ - │ Utf8 ┆ Float64 ┆ FixedSizeList[Float64; 3] │ - ╞═══════╪═════════════════════╪════════════════════════════════╡ - │ c ┆ None ┆ None │ - ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ a ┆ 1.993661701417351 ┆ [0.9900000000000001, 1.993661… │ - ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ b ┆ 0.9900000000000001 ┆ [0.9900000000000001, 0.990000… │ - ╰───────┴─────────────────────┴────────────────────────────────╯ - (Showing first 3 of 3 rows) + Example: + A global calculation of approximate percentiles: + >>> import daft + >>> df = daft.from_pydict({"scores": [1, 2, 3, 4, 5, None]}) + >>> df = df.agg( + ... df["scores"].approx_percentiles(0.5).alias("approx_median_score"), + ... df["scores"].approx_percentiles([0.25, 0.5, 0.75]).alias("approx_percentiles_scores"), + ... ) + >>> df.show() + ╭─────────────────────┬────────────────────────────────╮ + │ approx_median_score ┆ approx_percentiles_scores │ + │ --- ┆ --- │ + │ Float64 ┆ FixedSizeList[Float64; 3] │ + ╞═════════════════════╪════════════════════════════════╡ + │ 2.9742334234767167 ┆ [1.993661701417351, 2.9742334… │ + ╰─────────────────────┴────────────────────────────────╯ + + (Showing first 1 of 1 rows) + + A grouped calculation of approximate percentiles: + + >>> df = daft.from_pydict({"class": ["a", "a", "a", "b", "c"], "scores": [1, 2, 3, 1, None]}) + >>> df = df.groupby("class").agg( + ... df["scores"].approx_percentiles(0.5).alias("approx_median_score"), + ... df["scores"].approx_percentiles([0.25, 0.5, 0.75]).alias("approx_percentiles_scores"), + ... 
) + >>> df.show() + ╭───────┬─────────────────────┬────────────────────────────────╮ + │ class ┆ approx_median_score ┆ approx_percentiles_scores │ + │ --- ┆ --- ┆ --- │ + │ Utf8 ┆ Float64 ┆ FixedSizeList[Float64; 3] │ + ╞═══════╪═════════════════════╪════════════════════════════════╡ + │ c ┆ None ┆ None │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ a ┆ 1.993661701417351 ┆ [0.9900000000000001, 1.993661… │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ b ┆ 0.9900000000000001 ┆ [0.9900000000000001, 0.990000… │ + ╰───────┴─────────────────────┴────────────────────────────────╯ + + (Showing first 3 of 3 rows) Args: percentiles: the percentile(s) at which to find approximate values at. Can be provided as a single @@ -607,12 +649,23 @@ def if_else(self, if_true: Expression, if_false: Expression) -> Expression: """Conditionally choose values between two expressions using the current boolean expression as a condition Example: - >>> # x = [2, 2, 2] - >>> # y = [1, 2, 3] - >>> # a = ["a", "a", "a"] - >>> # b = ["b", "b", "b"] - >>> # if_else_result = ["a", "b", "b"] - >>> (col("x") > col("y")).if_else(col("a"), col("b")) + >>> import daft + >>> df = daft.from_pydict({"A": [1, 2, 3], "B": [0, 2, 4]}) + >>> df = df.with_column("A_if_bigger_else_B", (df["A"] > df["B"]).if_else(df["A"], df["B"]),) + >>> df.collect() + ╭───────┬───────┬────────────────────╮ + │ A ┆ B ┆ A_if_bigger_else_B │ + │ --- ┆ --- ┆ --- │ + │ Int64 ┆ Int64 ┆ Int64 │ + ╞═══════╪═══════╪════════════════════╡ + │ 1 ┆ 0 ┆ 1 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 2 ┆ 2 ┆ 2 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 3 ┆ 4 ┆ 4 │ + ╰───────┴───────┴────────────────────╯ + + (Showing first 3 of 3 rows) Args: if_true (Expression): Values to choose if condition is true @@ -634,10 +687,27 @@ def apply(self, func: Callable, return_dtype: DataType) -> Expression: use a UDF instead. Example: + >>> import daft + >>> df = daft.from_pydict({"x": ["1", "2", "tim"]}) >>> def f(x_val: str) -> int: - >>> return int(x_val) if x_val.isnumeric() else 0 - >>> - >>> col("x").apply(f, return_dtype=DataType.int64()) + ... if x_val.isnumeric(): + ... return int(x_val) + ... else: + ... 
return 0 + >>> df.with_column("num_x", df['x'].apply(f, return_dtype=daft.DataType.int64())).collect() + ╭──────┬───────╮ + │ x ┆ num_x │ + │ --- ┆ --- │ + │ Utf8 ┆ Int64 │ + ╞══════╪═══════╡ + │ 1 ┆ 1 │ + ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 2 ┆ 2 │ + ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ tim ┆ 0 │ + ╰──────┴───────╯ + + (Showing first 3 of 3 rows) Args: func: Function to run per value of the expression @@ -657,8 +727,23 @@ def is_null(self) -> Expression: """Checks if values in the Expression are Null (a special value indicating missing data) Example: - >>> # [1., None, NaN] -> [False, True, False] - >>> col("x").is_null() + >>> import daft + >>> df = daft.from_pydict({"x": [1., None, float("nan")]}) + >>> df = df.select(df['x'].is_null()) + >>> df.collect() + ╭─────────╮ + │ x │ + │ --- │ + │ Boolean │ + ╞═════════╡ + │ false │ + ├╌╌╌╌╌╌╌╌╌┤ + │ true │ + ├╌╌╌╌╌╌╌╌╌┤ + │ false │ + ╰─────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: Boolean Expression indicating whether values are missing @@ -670,8 +755,23 @@ def not_null(self) -> Expression: """Checks if values in the Expression are not Null (a special value indicating missing data) Example: - >>> # [1., None, NaN] -> [True, False, True] - >>> col("x").not_null() + >>> import daft + >>> df = daft.from_pydict({"x": [1., None, float("nan")]}) + >>> df = df.select(df['x'].not_null()) + >>> df.collect() + ╭─────────╮ + │ x │ + │ --- │ + │ Boolean │ + ╞═════════╡ + │ true │ + ├╌╌╌╌╌╌╌╌╌┤ + │ false │ + ├╌╌╌╌╌╌╌╌╌┤ + │ true │ + ╰─────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: Boolean Expression indicating whether values are not missing @@ -683,6 +783,7 @@ def fill_null(self, fill_value: Expression) -> Expression: """Fills null values in the Expression with the provided fill_value Example: + >>> import daft >>> df = daft.from_pydict({"data": [1, None, 3]}) >>> df = df.select(df["data"].fill_null(2)) >>> df.collect() @@ -697,6 +798,8 @@ def fill_null(self, fill_value: Expression) -> Expression: ├╌╌╌╌╌╌╌┤ │ 3 │ ╰───────╯ + + (Showing first 3 of 3 rows) Returns: Expression: Expression with null values filled with the provided fill_value @@ -710,8 +813,23 @@ def is_in(self, other: Any) -> Expression: """Checks if values in the Expression are in the provided list Example: - >>> # [1, 2, 3] -> [True, False, True] - >>> col("x").is_in([1, 3]) + >>> import daft + >>> df = daft.from_pydict({"data": [1, 2, 3]}) + >>> df = df.select(df["data"].is_in([1, 3])) + >>> df.collect() + ╭─────────╮ + │ data │ + │ --- │ + │ Boolean │ + ╞═════════╡ + │ true │ + ├╌╌╌╌╌╌╌╌╌┤ + │ false │ + ├╌╌╌╌╌╌╌╌╌┤ + │ true │ + ╰─────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: Boolean Expression indicating whether values are in the provided list @@ -728,8 +846,25 @@ def between(self, lower: Any, upper: Any) -> Expression: """Checks if values in the Expression are between lower and upper, inclusive. Example: - >>> # [1, 2, 3, 4] -> [True, True, False, False] - >>> col("x").between(1, 2) + >>> import daft + >>> df = daft.from_pydict({"data": [1, 2, 3, 4]}) + >>> df = df.select(df["data"].between(1, 2)) + >>> df.collect() + ╭─────────╮ + │ data │ + │ --- │ + │ Boolean │ + ╞═════════╡ + │ true │ + ├╌╌╌╌╌╌╌╌╌┤ + │ true │ + ├╌╌╌╌╌╌╌╌╌┤ + │ false │ + ├╌╌╌╌╌╌╌╌╌┤ + │ false │ + ╰─────────╯ + + (Showing first 4 of 4 rows) Returns: Expression: Boolean Expression indicating whether values are between lower and upper, inclusive. @@ -886,8 +1021,23 @@ def is_nan(self) -> Expression: Nulls will be propagated! I.e. this operation will return a null for null values. 
Example: - >>> # [1., None, NaN] -> [False, None, True] - >>> col("x").float.is_nan() + >>> import daft + >>> df = daft.from_pydict({"data": [1., None, float("nan")]}) + >>> df = df.select(df["data"].float.is_nan()) + >>> df.collect() + ╭─────────╮ + │ data │ + │ --- │ + │ Boolean │ + ╞═════════╡ + │ false │ + ├╌╌╌╌╌╌╌╌╌┤ + │ None │ + ├╌╌╌╌╌╌╌╌╌┤ + │ true │ + ╰─────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: Boolean Expression indicating whether values are invalid. @@ -901,8 +1051,25 @@ def is_inf(self) -> Expression: Nulls will be propagated! I.e. this operation will return a null for null values. Example: - >>> # [-float("inf"), 0., float("inf"), None] -> [True, False, True, None] - >>> col("x").float.is_inf() + >>> import daft + >>> df = daft.from_pydict({"data": [-float("inf"), 0., float("inf"), None]}) + >>> df = df.select(df["data"].float.is_inf()) + >>> df.collect() + ╭─────────╮ + │ data │ + │ --- │ + │ Boolean │ + ╞═════════╡ + │ true │ + ├╌╌╌╌╌╌╌╌╌┤ + │ false │ + ├╌╌╌╌╌╌╌╌╌┤ + │ true │ + ├╌╌╌╌╌╌╌╌╌┤ + │ None │ + ╰─────────╯ + + (Showing first 4 of 4 rows) Returns: Expression: Boolean Expression indicating whether values are Infinity. @@ -916,8 +1083,23 @@ def not_nan(self) -> Expression: Nulls will be propagated! I.e. this operation will return a null for null values. Example: - >>> # [1., None, NaN] -> [True, None, False] - >>> col("x").not_nan() + >>> import daft + >>> df = daft.from_pydict({"x": [1.0, None, float("nan")]}) + >>> df = df.select(df["x"].float.not_nan()) + >>> df.collect() + ╭─────────╮ + │ x │ + │ --- │ + │ Boolean │ + ╞═════════╡ + │ true │ + ├╌╌╌╌╌╌╌╌╌┤ + │ None │ + ├╌╌╌╌╌╌╌╌╌┤ + │ false │ + ╰─────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: Boolean Expression indicating whether values are not invalid. @@ -928,6 +1110,7 @@ def fill_nan(self, fill_value: Expression) -> Expression: """Fills NaN values in the Expression with the provided fill_value Example: + >>> import daft >>> df = daft.from_pydict({"data": [1.1, float("nan"), 3.3]}) >>> df = df.with_column("filled", df["data"].float.fill_nan(2.2)) >>> df.show() @@ -942,6 +1125,8 @@ def fill_nan(self, fill_value: Expression) -> Expression: ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ │ 3.3 ┆ 3.3 │ ╰─────────┴─────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: Expression with Nan values filled with the provided fill_value @@ -957,7 +1142,31 @@ def date(self) -> Expression: """Retrieves the date for a datetime column Example: - >>> col("x").dt.date() + >>> import daft, datetime + >>> df = daft.from_pydict( + ... { + ... "x": [ + ... datetime.datetime(2021, 1, 1, 5, 1, 1), + ... datetime.datetime(2021, 1, 2, 6, 1, 59), + ... datetime.datetime(2021, 1, 3, 7, 2, 0), + ... ], + ... } + ... ) + >>> df = df.with_column("date", df["x"].dt.date()) + >>> df.show() + ╭───────────────────────────────┬────────────╮ + │ x ┆ date │ + │ --- ┆ --- │ + │ Timestamp(Microseconds, None) ┆ Date │ + ╞═══════════════════════════════╪════════════╡ + │ 2021-01-01 05:01:01 ┆ 2021-01-01 │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 2021-01-02 06:01:59 ┆ 2021-01-02 │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 2021-01-03 07:02:00 ┆ 2021-01-03 │ + ╰───────────────────────────────┴────────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a Date expression @@ -968,7 +1177,31 @@ def day(self) -> Expression: """Retrieves the day for a datetime column Example: - >>> col("x").dt.day() + >>> import daft, datetime + >>> df = daft.from_pydict( + ... { + ... "x": [ + ... 
datetime.datetime(2021, 1, 1, 5, 1, 1), + ... datetime.datetime(2021, 1, 2, 6, 1, 59), + ... datetime.datetime(2021, 1, 3, 7, 2, 0), + ... ], + ... } + ... ) + >>> df = df.with_column("day", df["x"].dt.day()) + >>> df.show() + ╭───────────────────────────────┬────────╮ + │ x ┆ day │ + │ --- ┆ --- │ + │ Timestamp(Microseconds, None) ┆ UInt32 │ + ╞═══════════════════════════════╪════════╡ + │ 2021-01-01 05:01:01 ┆ 1 │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤ + │ 2021-01-02 06:01:59 ┆ 2 │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤ + │ 2021-01-03 07:02:00 ┆ 3 │ + ╰───────────────────────────────┴────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a UInt32 expression with just the day extracted from a datetime column @@ -979,7 +1212,31 @@ def hour(self) -> Expression: """Retrieves the day for a datetime column Example: - >>> col("x").dt.day() + >>> import daft, datetime + >>> df = daft.from_pydict( + ... { + ... "x": [ + ... datetime.datetime(2021, 1, 1, 5, 1, 1), + ... datetime.datetime(2021, 1, 2, 6, 1, 59), + ... datetime.datetime(2021, 1, 3, 7, 2, 0), + ... ], + ... } + ... ) + >>> df = df.with_column("hour", df["x"].dt.hour()) + >>> df.show() + ╭───────────────────────────────┬────────╮ + │ x ┆ hour │ + │ --- ┆ --- │ + │ Timestamp(Microseconds, None) ┆ UInt32 │ + ╞═══════════════════════════════╪════════╡ + │ 2021-01-01 05:01:01 ┆ 5 │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤ + │ 2021-01-02 06:01:59 ┆ 6 │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤ + │ 2021-01-03 07:02:00 ┆ 7 │ + ╰───────────────────────────────┴────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a UInt32 expression with just the day extracted from a datetime column @@ -990,7 +1247,31 @@ def minute(self) -> Expression: """Retrieves the minute for a datetime column Example: - >>> col("x").dt.minute() + >>> import daft, datetime + >>> df = daft.from_pydict( + ... { + ... "x": [ + ... datetime.datetime(2021, 1, 1, 5, 1, 1), + ... datetime.datetime(2021, 1, 2, 6, 1, 59), + ... datetime.datetime(2021, 1, 3, 7, 2, 0), + ... ], + ... } + ... ) + >>> df = df.with_column("minute", df["x"].dt.minute()) + >>> df.show() + ╭───────────────────────────────┬────────╮ + │ x ┆ minute │ + │ --- ┆ --- │ + │ Timestamp(Microseconds, None) ┆ UInt32 │ + ╞═══════════════════════════════╪════════╡ + │ 2021-01-01 05:01:01 ┆ 1 │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤ + │ 2021-01-02 06:01:59 ┆ 1 │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤ + │ 2021-01-03 07:02:00 ┆ 2 │ + ╰───────────────────────────────┴────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a UInt32 expression with just the minute extracted from a datetime column @@ -1001,7 +1282,31 @@ def second(self) -> Expression: """Retrieves the second for a datetime column Example: - >>> col("x").dt.second() + >>> import daft, datetime + >>> df = daft.from_pydict( + ... { + ... "x": [ + ... datetime.datetime(2021, 1, 1, 0, 1, 1), + ... datetime.datetime(2021, 1, 1, 0, 1, 59), + ... datetime.datetime(2021, 1, 1, 0, 2, 0), + ... ], + ... } + ... 
) + >>> df = df.with_column("second", df["x"].dt.second()) + >>> df.show() + ╭───────────────────────────────┬────────╮ + │ x ┆ second │ + │ --- ┆ --- │ + │ Timestamp(Microseconds, None) ┆ UInt32 │ + ╞═══════════════════════════════╪════════╡ + │ 2021-01-01 00:01:01 ┆ 1 │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤ + │ 2021-01-01 00:01:59 ┆ 59 │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤ + │ 2021-01-01 00:02:00 ┆ 0 │ + ╰───────────────────────────────┴────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a UInt32 expression with just the second extracted from a datetime column @@ -1023,7 +1328,29 @@ def month(self) -> Expression: """Retrieves the month for a datetime column Example: - >>> col("x").dt.month() + >>> import daft, datetime + >>> df = daft.from_pydict({ + ... "datetime": [ + ... datetime.datetime(2024, 7, 3, 0, 0, 0), + ... datetime.datetime(2024, 6, 4, 0, 0, 0), + ... datetime.datetime(2024, 5, 5, 0, 0, 0), + ... ], + ... } + ... ) + >>> df.with_column("month", df["datetime"].dt.month()).collect() + ╭───────────────────────────────┬────────╮ + │ datetime ┆ month │ + │ --- ┆ --- │ + │ Timestamp(Microseconds, None) ┆ UInt32 │ + ╞═══════════════════════════════╪════════╡ + │ 2024-07-03 00:00:00 ┆ 7 │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤ + │ 2024-06-04 00:00:00 ┆ 6 │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤ + │ 2024-05-05 00:00:00 ┆ 5 │ + ╰───────────────────────────────┴────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a UInt32 expression with just the month extracted from a datetime column @@ -1034,7 +1361,30 @@ def year(self) -> Expression: """Retrieves the year for a datetime column Example: - >>> col("x").dt.year() + >>> import daft, datetime + >>> df = daft.from_pydict({ + ... "datetime": [ + ... datetime.datetime(2024, 7, 3, 0, 0, 0), + ... datetime.datetime(2023, 7, 4, 0, 0, 0), + ... datetime.datetime(2022, 7, 5, 0, 0, 0), + ... ], + ... } + ... ) + >>> df.with_column("year", df["datetime"].dt.year()).collect() + ╭───────────────────────────────┬───────╮ + │ datetime ┆ year │ + │ --- ┆ --- │ + │ Timestamp(Microseconds, None) ┆ Int32 │ + ╞═══════════════════════════════╪═══════╡ + │ 2024-07-03 00:00:00 ┆ 2024 │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 2023-07-04 00:00:00 ┆ 2023 │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 2022-07-05 00:00:00 ┆ 2022 │ + ╰───────────────────────────────┴───────╯ + + (Showing first 3 of 3 rows) + Returns: Expression: a UInt32 expression with just the year extracted from a datetime column @@ -1045,7 +1395,29 @@ def day_of_week(self) -> Expression: """Retrieves the day of the week for a datetime column, starting at 0 for Monday and ending at 6 for Sunday Example: - >>> col("x").dt.day_of_week() + >>> import daft, datetime + >>> df = daft.from_pydict({ + ... "datetime": [ + ... datetime.datetime(2024, 7, 3, 0, 0, 0), + ... datetime.datetime(2024, 7, 4, 0, 0, 0), + ... datetime.datetime(2024, 7, 5, 0, 0, 0), + ... ], + ... } + ... 
) + >>> df.with_column("day_of_week", df["datetime"].dt.day_of_week()).collect() + ╭───────────────────────────────┬─────────────╮ + │ datetime ┆ day_of_week │ + │ --- ┆ --- │ + │ Timestamp(Microseconds, None) ┆ UInt32 │ + ╞═══════════════════════════════╪═════════════╡ + │ 2024-07-03 00:00:00 ┆ 2 │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 2024-07-04 00:00:00 ┆ 3 │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 2024-07-05 00:00:00 ┆ 4 │ + ╰───────────────────────────────┴─────────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a UInt32 expression with just the day_of_week extracted from a datetime column @@ -1057,8 +1429,7 @@ def truncate(self, interval: str, relative_to: Expression | None = None) -> Expr Example: >>> import daft, datetime - >>> df = daft.from_pydict( - ... { + >>> df = daft.from_pydict({ ... "datetime": [ ... datetime.datetime(2021, 1, 1, 0, 1, 1), ... datetime.datetime(2021, 1, 1, 0, 1, 59), @@ -1072,12 +1443,14 @@ def truncate(self, interval: str, relative_to: Expression | None = None) -> Expr │ --- ┆ --- │ │ Timestamp(Microseconds, None) ┆ Timestamp(Microseconds, None) │ ╞═══════════════════════════════╪═══════════════════════════════╡ - │ 2021-01-01T00:01:01.000000 ┆ 2021-01-01T00:01:00.000000 │ + │ 2021-01-01 00:01:01 ┆ 2021-01-01 00:01:00 │ ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 2021-01-01T00:01:59.000000 ┆ 2021-01-01T00:01:00.000000 │ + │ 2021-01-01 00:01:59 ┆ 2021-01-01 00:01:00 │ ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 2021-01-01T00:02:00.000000 ┆ 2021-01-01T00:02:00.000000 │ + │ 2021-01-01 00:02:00 ┆ 2021-01-01 00:02:00 │ ╰───────────────────────────────┴───────────────────────────────╯ + + (Showing first 3 of 3 rows) Args: interval: The interval to truncate to. Must be a string representing a valid interval in "{integer} {unit}" format, e.g. "1 day". Valid time units are: 'microsecond', 'millisecond', 'second', 'minute', 'hour', 'day', 'week'. 
@@ -1095,7 +1468,23 @@ def contains(self, substr: str | Expression) -> Expression: """Checks whether each string contains the given pattern in a string column Example: - >>> col("x").str.contains(col("foo")) + >>> import daft + >>> df = daft.from_pydict({"x": ["foo", "bar", "baz"]}) + >>> df = df.select(df["x"].str.contains("o")) + >>> df.show() + ╭─────────╮ + │ x │ + │ --- │ + │ Boolean │ + ╞═════════╡ + │ true │ + ├╌╌╌╌╌╌╌╌╌┤ + │ false │ + ├╌╌╌╌╌╌╌╌╌┤ + │ false │ + ╰─────────╯ + + (Showing first 3 of 3 rows) Args: pattern: pattern to search for as a literal string, or as a column to pick values from @@ -1110,19 +1499,22 @@ def match(self, pattern: str | Expression) -> Expression: """Checks whether each string matches the given regular expression pattern in a string column Example: + >>> import daft >>> df = daft.from_pydict({"x": ["foo", "bar", "baz"]}) >>> df.with_column("match", df["x"].str.match("ba.")).collect() - ╭─────────╮ - │ match │ - │ --- │ - │ Boolean │ - ╞═════════╡ - │ false │ - ├╌╌╌╌╌╌╌╌╌┤ - │ true │ - ├╌╌╌╌╌╌╌╌╌┤ - │ true │ - ╰─────────╯ + ╭──────┬─────────╮ + │ x ┆ match │ + │ --- ┆ --- │ + │ Utf8 ┆ Boolean │ + ╞══════╪═════════╡ + │ foo ┆ false │ + ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + │ bar ┆ true │ + ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + │ baz ┆ true │ + ╰──────┴─────────╯ + + (Showing first 3 of 3 rows) Args: pattern: Regex pattern to search for as string or as a column to pick values from @@ -1137,7 +1529,22 @@ def endswith(self, suffix: str | Expression) -> Expression: """Checks whether each string ends with the given pattern in a string column Example: - >>> col("x").str.endswith(col("foo")) + >>> import daft + >>> df = daft.from_pydict({"x": ["geftdaft", "lazy", "daft.io"]}) + >>> df.with_column("match", df["x"].str.endswith("daft")).collect() + ╭──────────┬─────────╮ + │ x ┆ match │ + │ --- ┆ --- │ + │ Utf8 ┆ Boolean │ + ╞══════════╪═════════╡ + │ geftdaft ┆ true │ + ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + │ lazy ┆ false │ + ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + │ daft.io ┆ false │ + ╰──────────┴─────────╯ + + (Showing first 3 of 3 rows) Args: pattern: pattern to search for as a literal string, or as a column to pick values from @@ -1152,7 +1559,22 @@ def startswith(self, prefix: str | Expression) -> Expression: """Checks whether each string starts with the given pattern in a string column Example: - >>> col("x").str.startswith(col("foo")) + >>> import daft + >>> df = daft.from_pydict({"x": ["geftdaft", "lazy", "daft.io"]}) + >>> df.with_column("match", df["x"].str.startswith("daft")).collect() + ╭──────────┬─────────╮ + │ x ┆ match │ + │ --- ┆ --- │ + │ Utf8 ┆ Boolean │ + ╞══════════╪═════════╡ + │ geftdaft ┆ false │ + ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + │ lazy ┆ false │ + ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + │ daft.io ┆ true │ + ╰──────────┴─────────╯ + + (Showing first 3 of 3 rows) Args: pattern: pattern to search for as a literal string, or as a column to pick values from @@ -1167,35 +1589,41 @@ def split(self, pattern: str | Expression, regex: bool = False) -> Expression: r"""Splits each string on the given literal or regex pattern, into a list of strings. 
Example: - >>> df = daft.from_pydict({"data": ["foo.bar.baz", "a.b.c", "1.2.3"]}) + >>> import daft + >>> df = daft.from_pydict({"data": ["daft.distributed.query", "a.b.c", "1.2.3"]}) >>> df.with_column("split", df["data"].str.split(".")).collect() - ╭─────────────┬─────────────────╮ - │ data ┆ split │ - │ --- ┆ --- │ - │ Utf8 ┆ List[Utf8] │ - ╞═════════════╪═════════════════╡ - │ foo.bar.baz ┆ [foo, bar, baz] │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ a.b.c ┆ [a, b, c] │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 1.2.3 ┆ [1, 2, 3] │ - ╰─────────────┴─────────────────╯ + ╭────────────────────────┬────────────────────────────╮ + │ data ┆ split │ + │ --- ┆ --- │ + │ Utf8 ┆ List[Utf8] │ + ╞════════════════════════╪════════════════════════════╡ + │ daft.distributed.query ┆ [daft, distributed, query] │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ a.b.c ┆ [a, b, c] │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 1.2.3 ┆ [1, 2, 3] │ + ╰────────────────────────┴────────────────────────────╯ + + (Showing first 3 of 3 rows) Split on a regex pattern - >>> df = daft.from_pydict({"data": ["foo.bar...baz", "a.....b.c", "1.2...3.."]}) + >>> import daft + >>> df = daft.from_pydict({"data": ["daft.distributed...query", "a.....b.c", "1.2...3.."]}) >>> df.with_column("split", df["data"].str.split(r"\.+", regex=True)).collect() - ╭───────────────┬─────────────────╮ - │ data ┆ split │ - │ --- ┆ --- │ - │ Utf8 ┆ List[Utf8] │ - ╞═══════════════╪═════════════════╡ - │ foo.bar...baz ┆ [foo, bar, baz] │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ a.....b.c ┆ [a, b, c] │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 1.2...3.. ┆ [1, 2, 3, ] │ - ╰───────────────┴─────────────────╯ + ╭──────────────────────────┬────────────────────────────╮ + │ data ┆ split │ + │ --- ┆ --- │ + │ Utf8 ┆ List[Utf8] │ + ╞══════════════════════════╪════════════════════════════╡ + │ daft.distributed...query ┆ [daft, distributed, query] │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ a.....b.c ┆ [a, b, c] │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 1.2...3.. ┆ [1, 2, 3, ] │ + ╰──────────────────────────┴────────────────────────────╯ + + (Showing first 3 of 3 rows) Args: @@ -1215,8 +1643,23 @@ def concat(self, other: str) -> Expression: Another (easier!) way to invoke this functionality is using the Python `+` operator which is aliased to using `.str.concat`. These are equivalent: - >>> col("x").str.concat(col("y")) - >>> col("x") + col("y") + Example: + >>> import daft + >>> df = daft.from_pydict({"x": ["foo", "bar", "baz"], "y": ["a", "b", "c"]}) + >>> df.select(col("x").str.concat(col("y"))).collect() + ╭──────╮ + │ x │ + │ --- │ + │ Utf8 │ + ╞══════╡ + │ fooa │ + ├╌╌╌╌╌╌┤ + │ barb │ + ├╌╌╌╌╌╌┤ + │ bazc │ + ╰──────╯ + + (Showing first 3 of 3 rows) Args: other (Expression): a string expression to concatenate with @@ -1235,35 +1678,40 @@ def extract(self, pattern: str | Expression, index: int = 0) -> Expression: If the pattern does not match or the group does not exist, a null value is returned. 
Example: + >>> import daft >>> regex = r"(\d)(\d*)" >>> df = daft.from_pydict({"x": ["123-456", "789-012", "345-678"]}) - >>> df.with_column("match", df["x"].str.extract(regex)) - ╭─────────┬─────────╮ - │ x ┆ match │ - │ --- ┆ --- │ - │ Utf8 ┆ Utf8 │ - ╞═════════╪═════════╡ - │ 123-456 ┆ 123 │ - ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ - │ 789-012 ┆ 789 │ - ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ - │ 345-678 ┆ 345 │ - ╰─────────┴─────────╯ + >>> df.with_column("match", df["x"].str.extract(regex)).collect() + ╭─────────┬───────╮ + │ x ┆ match │ + │ --- ┆ --- │ + │ Utf8 ┆ Utf8 │ + ╞═════════╪═══════╡ + │ 123-456 ┆ 123 │ + ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 789-012 ┆ 789 │ + ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 345-678 ┆ 345 │ + ╰─────────┴───────╯ + + (Showing first 3 of 3 rows) Extract the first capture group >>> df.with_column("match", df["x"].str.extract(regex, 1)).collect() - ╭─────────┬─────────╮ - │ x ┆ match │ - │ --- ┆ --- │ - │ Utf8 ┆ Utf8 │ - ╞═════════╪═════════╡ - │ 123-456 ┆ 1 │ - ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ - │ 789-012 ┆ 7 │ - ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ - │ 345-678 ┆ 3 │ - ╰─────────┴─────────╯ + ╭─────────┬───────╮ + │ x ┆ match │ + │ --- ┆ --- │ + │ Utf8 ┆ Utf8 │ + ╞═════════╪═══════╡ + │ 123-456 ┆ 1 │ + ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 789-012 ┆ 7 │ + ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 345-678 ┆ 3 │ + ╰─────────┴───────╯ + + (Showing first 3 of 3 rows) Args: pattern: The regex pattern to extract @@ -1286,11 +1734,12 @@ def extract_all(self, pattern: str | Expression, index: int = 0) -> Expression: If index is 0, the entire match is returned. If the pattern does not match or the group does not exist, an empty list is returned. Example: + >>> import daft >>> regex = r"(\d)(\d*)" >>> df = daft.from_pydict({"x": ["123-456", "789-012", "345-678"]}) - >>> df.with_column("match", df["x"].str.extract_all(regex)) + >>> df.with_column("match", df["x"].str.extract_all(regex)).collect() ╭─────────┬────────────╮ - │ x ┆ matches │ + │ x ┆ match │ │ --- ┆ --- │ │ Utf8 ┆ List[Utf8] │ ╞═════════╪════════════╡ @@ -1300,12 +1749,14 @@ def extract_all(self, pattern: str | Expression, index: int = 0) -> Expression: ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 345-678 ┆ [345, 678] │ ╰─────────┴────────────╯ + + (Showing first 3 of 3 rows) Extract the first capture group >>> df.with_column("match", df["x"].str.extract_all(regex, 1)).collect() ╭─────────┬────────────╮ - │ x ┆ matches │ + │ x ┆ match │ │ --- ┆ --- │ │ Utf8 ┆ List[Utf8] │ ╞═════════╪════════════╡ @@ -1315,6 +1766,8 @@ def extract_all(self, pattern: str | Expression, index: int = 0) -> Expression: ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 345-678 ┆ [3, 6] │ ╰─────────┴────────────╯ + + (Showing first 3 of 3 rows) Args: pattern: The regex pattern to extract @@ -1338,6 +1791,7 @@ def replace( """Replaces all occurrences of a pattern in a string column with a replacement string. The pattern can be a literal string or a regex pattern. 
Example: + >>> import daft >>> df = daft.from_pydict({"data": ["foo", "bar", "baz"]}) >>> df.with_column("replace", df["data"].str.replace("ba", "123")).collect() ╭──────┬─────────╮ @@ -1351,9 +1805,12 @@ def replace( ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ │ baz ┆ 123z │ ╰──────┴─────────╯ + + (Showing first 3 of 3 rows) Replace with a regex pattern + >>> import daft >>> df = daft.from_pydict({"data": ["foo", "fooo", "foooo"]}) >>> df.with_column("replace", df["data"].str.replace(r"o+", "a", regex=True)).collect() ╭───────┬─────────╮ @@ -1367,6 +1824,8 @@ def replace( ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ │ foooo ┆ fa │ ╰───────┴─────────╯ + + (Showing first 3 of 3 rows) Args: pattern: The pattern to replace @@ -1384,7 +1843,23 @@ def length(self) -> Expression: """Retrieves the length for a UTF-8 string column Example: - >>> col("x").str.length() + >>> import daft + >>> df = daft.from_pydict({"x": ["foo", "bar", "baz"]}) + >>> df = df.select(df["x"].str.length()) + >>> df.show() + ╭────────╮ + │ x │ + │ --- │ + │ UInt64 │ + ╞════════╡ + │ 3 │ + ├╌╌╌╌╌╌╌╌┤ + │ 3 │ + ├╌╌╌╌╌╌╌╌┤ + │ 3 │ + ╰────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: an UInt64 expression with the length of each string @@ -1395,7 +1870,23 @@ def lower(self) -> Expression: """Convert UTF-8 string to all lowercase Example: - >>> col("x").str.lower() + >>> import daft + >>> df = daft.from_pydict({"x": ["FOO", "BAR", "BAZ"]}) + >>> df = df.select(df["x"].str.lower()) + >>> df.show() + ╭──────╮ + │ x │ + │ --- │ + │ Utf8 │ + ╞══════╡ + │ foo │ + ├╌╌╌╌╌╌┤ + │ bar │ + ├╌╌╌╌╌╌┤ + │ baz │ + ╰──────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a String expression which is `self` lowercased @@ -1406,7 +1897,23 @@ def upper(self) -> Expression: """Convert UTF-8 string to all upper Example: - >>> col("x").str.upper() + >>> import daft + >>> df = daft.from_pydict({"x": ["foo", "bar", "baz"]}) + >>> df = df.select(df["x"].str.upper()) + >>> df.show() + ╭──────╮ + │ x │ + │ --- │ + │ Utf8 │ + ╞══════╡ + │ FOO │ + ├╌╌╌╌╌╌┤ + │ BAR │ + ├╌╌╌╌╌╌┤ + │ BAZ │ + ╰──────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a String expression which is `self` uppercased @@ -1417,7 +1924,23 @@ def lstrip(self) -> Expression: """Strip whitespace from the left side of a UTF-8 string Example: - >>> col("x").str.lstrip() + >>> import daft + >>> df = daft.from_pydict({"x": ["foo", "bar", " baz"]}) + >>> df = df.select(df["x"].str.lstrip()) + >>> df.show() + ╭──────╮ + │ x │ + │ --- │ + │ Utf8 │ + ╞══════╡ + │ foo │ + ├╌╌╌╌╌╌┤ + │ bar │ + ├╌╌╌╌╌╌┤ + │ baz │ + ╰──────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a String expression which is `self` with leading whitespace stripped @@ -1428,7 +1951,23 @@ def rstrip(self) -> Expression: """Strip whitespace from the right side of a UTF-8 string Example: - >>> col("x").str.rstrip() + >>> import daft + >>> df = daft.from_pydict({"x": ["foo", "bar", "baz "]}) + >>> df = df.select(df["x"].str.rstrip()) + >>> df.show() + ╭──────╮ + │ x │ + │ --- │ + │ Utf8 │ + ╞══════╡ + │ foo │ + ├╌╌╌╌╌╌┤ + │ bar │ + ├╌╌╌╌╌╌┤ + │ baz │ + ╰──────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a String expression which is `self` with trailing whitespace stripped @@ -1439,7 +1978,23 @@ def reverse(self) -> Expression: """Reverse a UTF-8 string Example: - >>> col("x").str.reverse() + >>> import daft + >>> df = daft.from_pydict({"x": ["foo", "bar", "baz"]}) + >>> df = df.select(df["x"].str.reverse()) + >>> df.show() + ╭──────╮ + │ x │ + │ --- │ + │ Utf8 │ + ╞══════╡ + │ oof │ + ├╌╌╌╌╌╌┤ + │ rab │ + ├╌╌╌╌╌╌┤ + │ zab │ + ╰──────╯ + + 
(Showing first 3 of 3 rows) Returns: Expression: a String expression which is `self` reversed @@ -1450,7 +2005,23 @@ def capitalize(self) -> Expression: """Capitalize a UTF-8 string Example: - >>> col("x").str.capitalize() + >>> import daft + >>> df = daft.from_pydict({"x": ["foo", "bar", "baz"]}) + >>> df = df.select(df["x"].str.capitalize()) + >>> df.show() + ╭──────╮ + │ x │ + │ --- │ + │ Utf8 │ + ╞══════╡ + │ Foo │ + ├╌╌╌╌╌╌┤ + │ Bar │ + ├╌╌╌╌╌╌┤ + │ Baz │ + ╰──────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a String expression which is `self` uppercased with the first character and lowercased the rest @@ -1461,7 +2032,23 @@ def left(self, nchars: int | Expression) -> Expression: """Gets the n (from nchars) left-most characters of each string Example: - >>> col("x").str.left(3) + >>> import daft + >>> df = daft.from_pydict({"x": ["daft", "query", "engine"]}) + >>> df = df.select(df["x"].str.left(4)) + >>> df.show() + ╭──────╮ + │ x │ + │ --- │ + │ Utf8 │ + ╞══════╡ + │ daft │ + ├╌╌╌╌╌╌┤ + │ quer │ + ├╌╌╌╌╌╌┤ + │ engi │ + ╰──────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a String expression which is the `n` left-most characters of `self` @@ -1473,7 +2060,23 @@ def right(self, nchars: int | Expression) -> Expression: """Gets the n (from nchars) right-most characters of each string Example: - >>> col("x").str.right(3) + >>> import daft + >>> df = daft.from_pydict({"x": ["daft", "distributed", "engine"]}) + >>> df = df.select(df["x"].str.right(4)) + >>> df.show() + ╭──────╮ + │ x │ + │ --- │ + │ Utf8 │ + ╞══════╡ + │ daft │ + ├╌╌╌╌╌╌┤ + │ uted │ + ├╌╌╌╌╌╌┤ + │ gine │ + ╰──────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a String expression which is the `n` right-most characters of `self` @@ -1489,7 +2092,23 @@ def find(self, substr: str | Expression) -> Expression: If the substring is not found, -1 is returned. Example: - >>> col("x").str.find("foo") + >>> import daft + >>> df = daft.from_pydict({"x": ["daft", "query daft", "df_daft"]}) + >>> df = df.select(df["x"].str.find("daft")) + >>> df.show() + ╭───────╮ + │ x │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 0 │ + ├╌╌╌╌╌╌╌┤ + │ 6 │ + ├╌╌╌╌╌╌╌┤ + │ 3 │ + ╰───────╯ + + (Showing first 3 of 3 rows) Returns: Expression: an Int64 expression with the index of the first occurrence of the substring in each string @@ -1505,7 +2124,23 @@ def rpad(self, length: int | Expression, pad: str | Expression) -> Expression: The pad character must be a single character. Example: - >>> col("x").str.rpad(5, "0") + >>> import daft + >>> df = daft.from_pydict({"x": ["daft", "query", "engine"]}) + >>> df = df.select(df["x"].str.rpad(6, "0")) + >>> df.show() + ╭────────╮ + │ x │ + │ --- │ + │ Utf8 │ + ╞════════╡ + │ daft00 │ + ├╌╌╌╌╌╌╌╌┤ + │ query0 │ + ├╌╌╌╌╌╌╌╌┤ + │ engine │ + ╰────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a String expression which is `self` truncated or right-padded with the pad character @@ -1522,7 +2157,23 @@ def lpad(self, length: int | Expression, pad: str | Expression) -> Expression: The pad character must be a single character. 
Example: - >>> col("x").str.lpad(5, "0") + >>> import daft + >>> df = daft.from_pydict({"x": ["daft", "query", "engine"]}) + >>> df = df.select(df["x"].str.lpad(6, "0")) + >>> df.show() + ╭────────╮ + │ x │ + │ --- │ + │ Utf8 │ + ╞════════╡ + │ 00daft │ + ├╌╌╌╌╌╌╌╌┤ + │ 0query │ + ├╌╌╌╌╌╌╌╌┤ + │ engine │ + ╰────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a String expression which is `self` truncated or left-padded with the pad character @@ -1535,7 +2186,23 @@ def repeat(self, n: int | Expression) -> Expression: """Repeats each string n times Example: - >>> col("x").str.repeat(3) + >>> import daft + >>> df = daft.from_pydict({"x": ["daft", "query", "engine"]}) + >>> df = df.select(df["x"].str.repeat(5)) + >>> df.show() + ╭────────────────────────────────╮ + │ x │ + │ --- │ + │ Utf8 │ + ╞════════════════════════════════╡ + │ daftdaftdaftdaftdaft │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ queryqueryqueryqueryquery │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ engineengineengineengineengin… │ + ╰────────────────────────────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a String expression which is `self` repeated `n` times @@ -1550,7 +2217,23 @@ def like(self, pattern: str | Expression) -> Expression: Use % as a multiple-character wildcard or _ as a single-character wildcard. Example: - >>> col("x").str.like("foo%") + >>> import daft + >>> df = daft.from_pydict({"x": ["daft", "query", "engine"]}) + >>> df = df.select(df["x"].str.like("daf%")) + >>> df.show() + ╭─────────╮ + │ x │ + │ --- │ + │ Boolean │ + ╞═════════╡ + │ true │ + ├╌╌╌╌╌╌╌╌╌┤ + │ false │ + ├╌╌╌╌╌╌╌╌╌┤ + │ false │ + ╰─────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a Boolean expression indicating whether each value matches the provided pattern @@ -1565,7 +2248,23 @@ def ilike(self, pattern: str | Expression) -> Expression: Use % as a multiple-character wildcard or _ as a single-character wildcard. Example: - >>> col("x").str.ilike("foo%") + >>> import daft + >>> df = daft.from_pydict({"x": ["daft", "query", "engine"]}) + >>> df = df.select(df["x"].str.ilike("%ft%")) + >>> df.show() + ╭─────────╮ + │ x │ + │ --- │ + │ Boolean │ + ╞═════════╡ + │ true │ + ├╌╌╌╌╌╌╌╌╌┤ + │ false │ + ├╌╌╌╌╌╌╌╌╌┤ + │ false │ + ╰─────────╯ + + (Showing first 3 of 3 rows) Returns: Expression: a Boolean expression indicating whether each value matches the provided pattern @@ -1580,7 +2279,23 @@ def substr(self, start: int | Expression, length: int | Expression | None = None If `length` is not provided, the substring will include all characters from `start` to the end of the string. Example: - >>> col("x").str.substr(2, 2) + >>> import daft + >>> df = daft.from_pydict({"x": ["daft", "query", "engine"]}) + >>> df = df.select(df["x"].str.substr(2,4)) + >>> df.show() + ╭──────╮ + │ x │ + │ --- │ + │ Utf8 │ + ╞══════╡ + │ ft │ + ├╌╌╌╌╌╌┤ + │ ery │ + ├╌╌╌╌╌╌┤ + │ gine │ + ╰──────╯ + + (Showing first 3 of 3 rows) Returns: Expression: A String expression representing the extracted substring. 
@@ -1597,6 +2312,7 @@ def to_date(self, format: str) -> Expression: See: https://docs.rs/chrono/latest/chrono/format/strftime/index.html Example: + >>> import daft >>> df = daft.from_pydict({"x": ["2021-01-01", "2021-01-02", None]}) >>> df = df.with_column("date", df["x"].str.to_date("%Y-%m-%d")) >>> df.show() @@ -1611,7 +2327,8 @@ def to_date(self, format: str) -> Expression: ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ │ None ┆ None │ ╰────────────┴────────────╯ - + + (Showing first 3 of 3 rows) Returns: Expression: a Date expression which is parsed by given format @@ -1626,6 +2343,7 @@ def to_datetime(self, format: str, timezone: str | None = None) -> Expression: See: https://docs.rs/chrono/latest/chrono/format/strftime/index.html Example: + >>> import daft >>> df = daft.from_pydict({"x": ["2021-01-01 00:00:00.123", "2021-01-02 12:30:00.456", None]}) >>> df = df.with_column("datetime", df["x"].str.to_datetime("%Y-%m-%d %H:%M:%S%.3f")) >>> df.show() @@ -1640,6 +2358,8 @@ def to_datetime(self, format: str, timezone: str | None = None) -> Expression: ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ None ┆ None │ ╰─────────────────────────┴───────────────────────────────╯ + + (Showing first 3 of 3 rows) If a timezone is provided, the datetime will be parsed in that timezone @@ -1657,7 +2377,8 @@ def to_datetime(self, format: str, timezone: str | None = None) -> Expression: ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ None ┆ None │ ╰───────────────────────────────┴────────────────────────────────────────────────╯ - + + (Showing first 3 of 3 rows) Returns: Expression: a DateTime expression which is parsed by given format and timezone @@ -1678,6 +2399,7 @@ def normalize( All processing options are on by default. Example: + >>> import daft >>> df = daft.from_pydict({"x": ["hello world", "Hello, world!", "HELLO, \\nWORLD!!!!"]}) >>> df = df.with_column("normalized", df["x"].str.normalize()) >>> df.show() @@ -1693,6 +2415,8 @@ def normalize( │ HELLO, ┆ hello world │ │ WORLD!!!! ┆ │ ╰───────────────┴─────────────╯ + + (Showing first 3 of 3 rows) Args: remove_punct: Whether to remove all punctuation (ASCII). 
@@ -1805,25 +2529,26 @@ def get(self, key: Expression) -> Expression: Example: >>> import pyarrow as pa >>> import daft - >>> pa_array = pa.array([[("a", 1)],[],[("b",2)]], type=pa.map_(pa.string(), pa.int64())) + >>> pa_array = pa.array([[("a", 1)],[],[("b", 2)]], type=pa.map_(pa.string(), pa.int64())) >>> df = daft.from_arrow(pa.table({"map_col": pa_array})) - >>> df1 = df.with_column("a", df["map_col"].map.get("a")) - >>> df1.show() - ╭───────────┬───────╮ - │ map_col ┆ a │ - │ --- ┆ --- │ - │ Map[Utf8] ┆ Int64 │ - ╞═══════════╪═══════╡ - │ [{key: a, ┆ 1 │ - │ value: 1, ┆ │ - │ }] ┆ │ - ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ - │ [] ┆ None │ - ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ - │ [{key: b, ┆ None │ - │ value: 2, ┆ │ - │ }] ┆ │ - ╰───────────┴───────╯ + >>> df = df.with_column("a", df["map_col"].map.get("a")) + >>> df.show() + ╭──────────────────────────────────────┬───────╮ + │ map_col ┆ a │ + │ --- ┆ --- │ + │ Map[Struct[key: Utf8, value: Int64]] ┆ Int64 │ + ╞══════════════════════════════════════╪═══════╡ + │ [{key: a, ┆ 1 │ + │ value: 1, ┆ │ + │ }] ┆ │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ [] ┆ None │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ [{key: b, ┆ None │ + │ value: 2, ┆ │ + │ }] ┆ │ + ╰──────────────────────────────────────┴───────╯ + (Showing first 3 of 3 rows) Args: @@ -2089,6 +2814,7 @@ def query(self, jq_query: str) -> Expression: This expression uses jaq as the underlying executor, see https://github.com/01mf02/jaq for the full list of supported filters. Example: + >>> import daft >>> df = daft.from_pydict({"col": ['{"a": 1}', '{"a": 2}', '{"a": 3}']}) >>> df.with_column("res", df["col"].json.query(".a")).collect() ╭──────────┬──────╮ @@ -2102,6 +2828,8 @@ def query(self, jq_query: str) -> Expression: ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤ │ {"a": 3} ┆ 3 │ ╰──────────┴──────╯ + + (Showing first 3 of 3 rows) Args: jq_query (str): JQ query string diff --git a/docs/source/user_guide/basic_concepts/expressions.rst b/docs/source/user_guide/basic_concepts/expressions.rst index a9e9e7b894..981b40009f 100644 --- a/docs/source/user_guide/basic_concepts/expressions.rst +++ b/docs/source/user_guide/basic_concepts/expressions.rst @@ -314,3 +314,187 @@ The :meth:`.if_else() ` method is a useful (Showing first 3 of 3 rows) This is a useful expression for cleaning your data! + + + +Temporal Operations +------------------- + +Daft lets you work with various temporal data types such as Time, Timestamp, and Duration. Let's explore how to use these types and their interactions. + +.. code:: python + + df = daft.from_pydict({"x": [ + datetime.datetime(2021, 1, 1, 0, 1, 1), + datetime.datetime(2021, 1, 1, 0, 1, 59), + datetime.datetime(2021, 1, 1, 0, 2, 0), + ] + }) + df.show() + +.. code:: none + + +------------------------+ + | x | + | DateTime | + +========================+ + | 2021-01-01T00:01:01 | + +------------------------+ + | 2021-01-01T00:01:59 | + +------------------------+ + | 2021-01-01T00:02:00 | + +------------------------+ + (Showing first 3 rows) + +Let's add 10 seconds to each timestamp. + +.. code:: python + + df = df.with_column("x_plus_10_seconds", df["x"] + datetime.timedelta(seconds=10)) + df.show() + +.. 
code:: none
+
+    +------------------------+------------------------+
+    | x                      | x_plus_10_seconds      |
+    | DateTime               | DateTime               |
+    +========================+========================+
+    | 2021-01-01T00:01:01    | 2021-01-01T00:01:11    |
+    +------------------------+------------------------+
+    | 2021-01-01T00:01:59    | 2021-01-01T00:02:09    |
+    +------------------------+------------------------+
+    | 2021-01-01T00:02:00    | 2021-01-01T00:02:10    |
+    +------------------------+------------------------+
+    (Showing first 3 rows)
+
+Subtracting Timestamps
+^^^^^^^^^^^^^^^^^^^^^^
+
+You can subtract one Timestamp from another to get a Duration.
+
+.. code:: python
+
+    df = df.with_column("duration_between_x_plus_10_and_x", df["x_plus_10_seconds"] - df["x"])
+    df.show()
+
+.. code:: none
+
+    +------------------------+------------------------+----------------------------------+
+    | x                      | x_plus_10_seconds      | duration_between_x_plus_10_and_x |
+    | DateTime               | DateTime               | Duration                         |
+    +========================+========================+==================================+
+    | 2021-01-01T00:01:01    | 2021-01-01T00:01:11    | 0:00:10                          |
+    +------------------------+------------------------+----------------------------------+
+    | 2021-01-01T00:01:59    | 2021-01-01T00:02:09    | 0:00:10                          |
+    +------------------------+------------------------+----------------------------------+
+    | 2021-01-01T00:02:00    | 2021-01-01T00:02:10    | 0:00:10                          |
+    +------------------------+------------------------+----------------------------------+
+    (Showing first 3 rows)
+
+
+Extracting Timestamp Components
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can extract parts of a Timestamp, such as the year, month, day, hour, minute, and second.
+
+.. code:: python
+
+    df = df.with_column("year", df["x"].dt.year())
+    df = df.with_column("month", df["x"].dt.month())
+    df = df.with_column("day", df["x"].dt.day())
+    df = df.with_column("hour", df["x"].dt.hour())
+    df = df.with_column("minute", df["x"].dt.minute())
+    df = df.with_column("second", df["x"].dt.second())
+    df.show()
+
+.. code:: none
+
+    +------------------------+-------+-------+-------+-------+--------+--------+
+    | x                      | year  | month | day   | hour  | minute | second |
+    | DateTime               | Int32 | Int32 | Int32 | Int32 | Int32  | Int32  |
+    +========================+=======+=======+=======+=======+========+========+
+    | 2021-01-01T00:01:01    | 2021  | 1     | 1     | 0     | 1      | 1      |
+    +------------------------+-------+-------+-------+-------+--------+--------+
+    | 2021-01-01T00:01:59    | 2021  | 1     | 1     | 0     | 1      | 59     |
+    +------------------------+-------+-------+-------+-------+--------+--------+
+    | 2021-01-01T00:02:00    | 2021  | 1     | 1     | 0     | 2      | 0      |
+    +------------------------+-------+-------+-------+-------+--------+--------+
+    (Showing first 3 rows)
+
+
+Converting Between Time Zones
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can convert a Timestamp to a different time zone.
+
+.. code:: python
+
+    df = daft.from_pydict({"x": [
+        "2021-01-01 00:00:00.123 +0800",
+        "2021-01-02 12:30:00.456 +0800"]
+    })
+    df = df.with_column("datetime", df["x"].str.to_datetime("%Y-%m-%d %H:%M:%S%.3f %z", timezone="America/New_York"))
+    df.collect()
+
+.. code:: none
+
+    ╭───────────────────────────────┬─────────────────────────────────────────────────────╮
+    │ x                             ┆ datetime                                            │
+    │ ---                           ┆ ---                                                 │
+    │ Utf8                          ┆ Timestamp(Milliseconds, Some("America/New_York"))   │
+    ╞═══════════════════════════════╪═════════════════════════════════════════════════════╡
+    │ 2021-01-01 00:00:00.123 +0800 ┆ 2020-12-31 11:00:00.123 EST                         │
+    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    │ 2021-01-02 12:30:00.456 +0800 ┆ 2021-01-01 23:30:00.456 EST                         │
+    ╰───────────────────────────────┴─────────────────────────────────────────────────────╯
+
+    (Showing first 2 of 2 rows)
+
+
+Using the Truncate Function
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``truncate`` function truncates timestamps to a specific time unit, such as a day or a week. For example, you can truncate timestamps to the start of their week:
+
+.. code:: python
+
+    import datetime
+
+    import daft
+
+    # Create a DataFrame with a range of timestamps
+    df = daft.from_pydict({
+        "datetime": [
+            datetime.datetime(2021, 1, 7, 0, 1, 1),
+            datetime.datetime(2021, 1, 8, 0, 1, 59),
+            datetime.datetime(2021, 1, 9, 0, 2, 0),
+            datetime.datetime(2021, 1, 10, 0, 2, 0),
+        ],
+    })
+
+    # Truncate each timestamp to the start of its week
+    df.with_column("truncated", df["datetime"].dt.truncate("1 week")).collect()
+
+.. code:: none
+
+    ╭───────────────────────────────┬───────────────────────────────╮
+    │ datetime                      ┆ truncated                     │
+    │ ---                           ┆ ---                           │
+    │ Timestamp(Microseconds, None) ┆ Timestamp(Microseconds, None) │
+    ╞═══════════════════════════════╪═══════════════════════════════╡
+    │ 2021-01-07 00:01:01           ┆ 2021-01-07 00:00:00           │
+    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    │ 2021-01-08 00:01:59           ┆ 2021-01-07 00:00:00           │
+    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    │ 2021-01-09 00:02:00           ┆ 2021-01-07 00:00:00           │
+    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    │ 2021-01-10 00:02:00           ┆ 2021-01-07 00:00:00           │
+    ╰───────────────────────────────┴───────────────────────────────╯
+
+    (Showing first 4 of 4 rows)
+
+Explanation:
+
+- The ``datetime`` column contains timestamps from January 7, 2021 through January 10, 2021.
+- The ``truncated`` column shows each timestamp truncated to the start of its week.
+- All four timestamps fall within the same one-week interval, so they all truncate to ``2021-01-07 00:00:00``.
+
+Truncating timestamps to a coarser unit like this is particularly useful when summarizing or aggregating data by week.
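+
+For example, once each timestamp has been truncated to its week, you can group on the
+truncated column and aggregate per week. The snippet below is a minimal sketch of that
+pattern; the ``value`` column and the expression-style ``agg`` call are illustrative
+assumptions rather than part of the example above.
+
+.. code:: python
+
+    import datetime
+
+    import daft
+
+    df = daft.from_pydict({
+        "datetime": [
+            datetime.datetime(2021, 1, 7, 0, 1, 1),
+            datetime.datetime(2021, 1, 8, 0, 1, 59),
+            datetime.datetime(2021, 1, 14, 0, 2, 0),
+        ],
+        "value": [1, 2, 3],  # hypothetical measurements to aggregate
+    })
+
+    # Bucket each timestamp into its week, then sum the values in each bucket
+    weekly = (
+        df.with_column("week", df["datetime"].dt.truncate("1 week"))
+        .groupby("week")
+        .agg(daft.col("value").sum())
+    )
+    weekly.collect()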