feat(python): add update method to ldf/df (#6787)

pola-rs · Feb 10, 2023 · d1b70fb · d1b70fb
1 parent a9820b6
commit d1b70fb
Show file tree

Hide file tree

Showing 6 changed files with 215 additions and 0 deletions.
diff --git a/py-polars/docs/source/reference/dataframe/modify_select.rst b/py-polars/docs/source/reference/dataframe/modify_select.rst
@@ -59,6 +59,7 @@ Manipulation/selection
     DataFrame.unique
     DataFrame.unnest
     DataFrame.unstack
+    DataFrame.update
     DataFrame.upsample
     DataFrame.vstack
     DataFrame.with_columns

diff --git a/py-polars/docs/source/reference/lazyframe/modify_select.rst b/py-polars/docs/source/reference/lazyframe/modify_select.rst
@@ -38,6 +38,7 @@ Manipulation/selection
     LazyFrame.take_every
     LazyFrame.unique
     LazyFrame.unnest
+    LazyFrame.update
     LazyFrame.with_columns
     LazyFrame.with_context
     LazyFrame.with_row_count
diff --git a/py-polars/polars/internals/__init__.py b/py-polars/polars/internals/__init__.py
@@ -32,6 +32,7 @@
     arange,
     arg_where,
     argsort_by,
+    coalesce,
     col,
     concat_list,
     count,
@@ -56,6 +57,7 @@
     "arg_where",
     "argsort_by",
     "BatchedCsvReader",
+    "coalesce",
     "col",
     "concat",
     "concat_list",

diff --git a/py-polars/polars/internals/dataframe/frame.py b/py-polars/polars/internals/dataframe/frame.py
@@ -7331,6 +7331,85 @@ def merge_sorted(self, other: DataFrame, key: str) -> Self:
             ._df
         )
 
+    def update(
+        self, other: DataFrame, on: None | str | Sequence[str] = None, how: str = "left"
+    ) -> DataFrame:
+        """
+        Overwrite values of this `DataFrame` with the non-null values of `other`.
+
+        Only columns present in both frames are updated; wherever `other` holds a
+        null (or has no matching row), the value of this frame is kept.
+
+        Notes
+        -----
+        This is syntactic sugar for a left/inner join + coalesce
+
+        Warnings
+        --------
+        This functionality is experimental and may change without it being considered a
+        breaking change.
+
+        Parameters
+        ----------
+        other
+            DataFrame supplying the replacement values.
+        on
+            Name(s) of the column(s) to join on.
+            When omitted, rows are matched on their row count.
+        how : {'left', 'inner'}
+            'left' keeps every row of this frame as is.
+            'inner' removes the rows that have no match in `other`.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame(
+        ...     {
+        ...         "A": [1, 2, 3, 4],
+        ...         "B": [400, 500, 600, 700],
+        ...     }
+        ... )
+        >>> df
+        shape: (4, 2)
+        ┌─────┬─────┐
+        │ A   ┆ B   │
+        │ --- ┆ --- │
+        │ i64 ┆ i64 │
+        ╞═════╪═════╡
+        │ 1   ┆ 400 │
+        │ 2   ┆ 500 │
+        │ 3   ┆ 600 │
+        │ 4   ┆ 700 │
+        └─────┴─────┘
+        >>> new_df = pl.DataFrame(
+        ...     {
+        ...         "B": [4, None, 6],
+        ...         "C": [7, 8, 9],
+        ...     }
+        ... )
+        >>> new_df
+        shape: (3, 2)
+        ┌──────┬─────┐
+        │ B    ┆ C   │
+        │ ---  ┆ --- │
+        │ i64  ┆ i64 │
+        ╞══════╪═════╡
+        │ 4    ┆ 7   │
+        │ null ┆ 8   │
+        │ 6    ┆ 9   │
+        └──────┴─────┘
+        >>> df.update(new_df)
+        shape: (4, 2)
+        ┌─────┬─────┐
+        │ A   ┆ B   │
+        │ --- ┆ --- │
+        │ i64 ┆ i64 │
+        ╞═════╪═════╡
+        │ 1   ┆ 4   │
+        │ 2   ┆ 500 │
+        │ 3   ┆ 6   │
+        │ 4   ┆ 700 │
+        └─────┴─────┘
+
+        """
+        # The actual work is done by the lazy `update`; this method only wraps
+        # both frames lazily and materialises the outcome.
+        lazy_result = self.lazy().update(other.lazy(), on=on, how=how)
+        # The plan is collected with query optimizations switched off.
+        return lazy_result.collect(no_optimization=True)
+
 
 def _prepare_other_arg(other: Any, length: int | None = None) -> pli.Series:
     # if not a series create singleton series such that it will broadcast

diff --git a/py-polars/polars/internals/lazyframe/frame.py b/py-polars/polars/internals/lazyframe/frame.py
@@ -3982,3 +3982,120 @@ def merge_sorted(self, other: LazyFrame, key: str) -> Self:
 
         """
         return self._from_pyldf(self._ldf.merge_sorted(other._ldf, key))
+
+    def update(
+        self, other: LazyFrame, on: None | str | Sequence[str] = None, how: str = "left"
+    ) -> LazyFrame:
+        """
+        Update the values in this `LazyFrame` with the non-null values in `other`.
+
+        Only columns present in both frames (other than the join columns) are
+        updated; wherever `other` holds a null, or has no matching row, the value
+        of this frame is kept.
+
+        Notes
+        -----
+        This is syntactic sugar for a left/inner join + coalesce
+
+        If the two frames have no column in common besides the join columns there
+        is nothing to update, and this frame is returned unchanged without joining
+        (regardless of `how`).
+
+        Warnings
+        --------
+        This functionality is experimental and may change without it being considered a
+        breaking change.
+
+        Parameters
+        ----------
+        other
+            LazyFrame that will be used to update the values
+        on
+            Column names that will be joined on.
+            If none given the row count is used.
+        how : {'left', 'inner'}
+            'Left' will keep the left table rows as is.
+            'Inner' will remove rows that are not found in other
+
+        Returns
+        -------
+        LazyFrame
+            A frame with the columns of this frame, holding the updated values.
+
+        Raises
+        ------
+        ValueError
+            If a name given in `on` is not a column of both frames.
+
+        Examples
+        --------
+        >>> lf = pl.DataFrame(
+        ...     {
+        ...         "A": [1, 2, 3, 4],
+        ...         "B": [400, 500, 600, 700],
+        ...     }
+        ... ).lazy()
+        >>> new_lf = pl.DataFrame(
+        ...     {
+        ...         "B": [4, None, 6],
+        ...         "C": [7, 8, 9],
+        ...     }
+        ... ).lazy()
+        >>> lf.update(new_lf).collect()
+        shape: (4, 2)
+        ┌─────┬─────┐
+        │ A   ┆ B   │
+        │ --- ┆ --- │
+        │ i64 ┆ i64 │
+        ╞═════╪═════╡
+        │ 1   ┆ 4   │
+        │ 2   ┆ 500 │
+        │ 3   ┆ 6   │
+        │ 4   ┆ 700 │
+        └─────┴─────┘
+
+        """
+        # Work on local handles so that `self` stays the untouched input frame;
+        # it is what gets returned when there turns out to be nothing to update.
+        left = self
+        right = other
+
+        row_count_name = "__POLARS_ROW_COUNT"
+        row_count_used = on is None
+        if row_count_used:
+            # Without join columns the rows are paired up by their position.
+            left = left.with_row_count(row_count_name)
+            right = right.with_row_count(row_count_name)
+            on = row_count_name
+
+        if isinstance(on, str):
+            on = [on]
+
+        # Columns that exist in both frames (an intersection).
+        shared_names = set(left.columns) & set(right.columns)
+
+        for name in on:
+            if name not in shared_names:
+                raise ValueError(f"Join column {name} not found.")
+
+        # Shared columns that are not join keys: these are the ones to update.
+        right_added_names = shared_names - set(on)
+
+        # no need to join if only join columns are in other
+        if not right_added_names:
+            # Return the original frame, not `left`: `left` may carry the
+            # temporary row count column, which must not leak to the caller.
+            return self
+
+        # Suffix given by the join to the columns coming from `other`.
+        tmp_name = "__POLARS_RIGHT"
+
+        joined = left.join(
+            right.select(list(shared_names)),
+            on=on,
+            how=how,  # type: ignore[arg-type]
+            suffix=tmp_name,
+        )
+        result = joined.with_columns(
+            [
+                # Prefer the value from `other`; fall back to the existing one.
+                pli.coalesce([column_name + tmp_name, pli.col(column_name)]).alias(
+                    column_name
+                )
+                for column_name in right_added_names
+            ]
+        ).drop([name + tmp_name for name in right_added_names])
+
+        if row_count_used:
+            result = result.drop(row_count_name)
+        return result
diff --git a/py-polars/tests/unit/test_joins.py b/py-polars/tests/unit/test_joins.py
@@ -768,3 +768,18 @@ def test_semi_join_projection_pushdown_6455() -> None:
         "id": [1, 2],
         "value": [2, 4],
     }
+
+
+def test_update() -> None:
+    # Case 1: no `on` given. Only the shared column "B" is updated; the null
+    # in the update frame and the missing fourth row leave the old values.
+    base = pl.DataFrame({"A": [1, 2, 3, 4], "B": [400, 500, 600, 700]})
+    patch = pl.DataFrame({"B": [4, None, 6], "C": [7, 8, 9]})
+    expected_without_on = {
+        "A": [1, 2, 3, 4],
+        "B": [4, 500, 6, 700],
+    }
+    assert base.update(patch).to_dict(False) == expected_without_on
+
+    # Case 2: explicit join column. Rows are matched on "a"; the row without
+    # a partner (a == 1) keeps its original "b".
+    keyed_base = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    keyed_patch = pl.DataFrame({"a": [2, 3], "b": [8, 9]})
+    expected_with_on = {"a": [1, 2, 3], "b": [4, 8, 9]}
+    assert keyed_base.update(keyed_patch, on="a").to_dict(False) == expected_with_on