Skip to content

Commit

Permalink
feat(python): add update method to ldf/df (#6787)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored Feb 10, 2023
1 parent a9820b6 commit d1b70fb
Show file tree
Hide file tree
Showing 6 changed files with 215 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ Manipulation/selection
DataFrame.unique
DataFrame.unnest
DataFrame.unstack
DataFrame.update
DataFrame.upsample
DataFrame.vstack
DataFrame.with_columns
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ Manipulation/selection
LazyFrame.take_every
LazyFrame.unique
LazyFrame.unnest
LazyFrame.update
LazyFrame.with_columns
LazyFrame.with_context
LazyFrame.with_row_count
2 changes: 2 additions & 0 deletions py-polars/polars/internals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
arange,
arg_where,
argsort_by,
coalesce,
col,
concat_list,
count,
Expand All @@ -56,6 +57,7 @@
"arg_where",
"argsort_by",
"BatchedCsvReader",
"coalesce",
"col",
"concat",
"concat_list",
Expand Down
79 changes: 79 additions & 0 deletions py-polars/polars/internals/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7331,6 +7331,85 @@ def merge_sorted(self, other: DataFrame, key: str) -> Self:
._df
)

def update(
    self, other: DataFrame, on: None | str | Sequence[str] = None, how: str = "left"
) -> DataFrame:
    """
    Overwrite values in this `DataFrame` with the non-null values of `other`.

    Parameters
    ----------
    other
        DataFrame supplying the replacement values.
    on
        Name(s) of the column(s) to join on.
        When omitted, the row count is used to pair the rows of both frames.
    how : {'left', 'inner'}
        'left' keeps every row of this frame as is.
        'inner' removes the rows that have no match in `other`.

    Warnings
    --------
    This functionality is experimental and may change without it being
    considered a breaking change.

    Notes
    -----
    This is syntactic sugar for a left/inner join followed by a coalesce. The
    work is delegated to `LazyFrame.update` and collected with the query
    optimizer disabled.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "A": [1, 2, 3, 4],
    ...         "B": [400, 500, 600, 700],
    ...     }
    ... )
    >>> df
    shape: (4, 2)
    ┌─────┬─────┐
    │ A   ┆ B   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 400 │
    │ 2   ┆ 500 │
    │ 3   ┆ 600 │
    │ 4   ┆ 700 │
    └─────┴─────┘
    >>> new_df = pl.DataFrame(
    ...     {
    ...         "B": [4, None, 6],
    ...         "C": [7, 8, 9],
    ...     }
    ... )
    >>> new_df
    shape: (3, 2)
    ┌──────┬─────┐
    │ B    ┆ C   │
    │ ---  ┆ --- │
    │ i64  ┆ i64 │
    ╞══════╪═════╡
    │ 4    ┆ 7   │
    │ null ┆ 8   │
    │ 6    ┆ 9   │
    └──────┴─────┘
    >>> df.update(new_df)
    shape: (4, 2)
    ┌─────┬─────┐
    │ A   ┆ B   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 4   │
    │ 2   ┆ 500 │
    │ 3   ┆ 6   │
    │ 4   ┆ 700 │
    └─────┴─────┘

    """
    # Build the lazy query first, then materialise it in a single step.
    lazy_self = self.lazy()
    lazy_other = other.lazy()
    updated = lazy_self.update(lazy_other, on, how)
    return updated.collect(no_optimization=True)


def _prepare_other_arg(other: Any, length: int | None = None) -> pli.Series:
# if not a series create singleton series such that it will broadcast
Expand Down
117 changes: 117 additions & 0 deletions py-polars/polars/internals/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3982,3 +3982,120 @@ def merge_sorted(self, other: LazyFrame, key: str) -> Self:
"""
return self._from_pyldf(self._ldf.merge_sorted(other._ldf, key))

def update(
    self, other: LazyFrame, on: None | str | Sequence[str] = None, how: str = "left"
) -> LazyFrame:
    """
    Update the values in this `LazyFrame` with the non-null values in `other`.

    Parameters
    ----------
    other
        LazyFrame that will be used to update the values.
    on
        Column names that will be joined on.
        If none given the row count is used.
    how : {'left', 'inner'}
        'left' will keep the left table rows as is.
        'inner' will remove rows that are not found in other.

    Raises
    ------
    ValueError
        If a join column is not present in both frames.

    Warnings
    --------
    This functionality is experimental and may change without it being
    considered a breaking change.

    Notes
    -----
    This is syntactic sugar for a left/inner join + coalesce. Only columns
    that exist in both frames are updated; columns that exist only in
    `other` are ignored.

    Examples
    --------
    >>> lf = pl.DataFrame(
    ...     {
    ...         "A": [1, 2, 3, 4],
    ...         "B": [400, 500, 600, 700],
    ...     }
    ... ).lazy()
    >>> lf.collect()
    shape: (4, 2)
    ┌─────┬─────┐
    │ A   ┆ B   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 400 │
    │ 2   ┆ 500 │
    │ 3   ┆ 600 │
    │ 4   ┆ 700 │
    └─────┴─────┘
    >>> new_lf = pl.DataFrame(
    ...     {
    ...         "B": [4, None, 6],
    ...         "C": [7, 8, 9],
    ...     }
    ... ).lazy()
    >>> new_lf.collect()
    shape: (3, 2)
    ┌──────┬─────┐
    │ B    ┆ C   │
    │ ---  ┆ --- │
    │ i64  ┆ i64 │
    ╞══════╪═════╡
    │ 4    ┆ 7   │
    │ null ┆ 8   │
    │ 6    ┆ 9   │
    └──────┴─────┘
    >>> lf.update(new_lf).collect()
    shape: (4, 2)
    ┌─────┬─────┐
    │ A   ┆ B   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 4   │
    │ 2   ┆ 500 │
    │ 3   ┆ 6   │
    │ 4   ┆ 700 │
    └─────┴─────┘

    """
    row_count_name = "__POLARS_ROW_COUNT"
    tmp_name = "__POLARS_RIGHT"

    left = self
    right = other

    # Without explicit keys the frames are aligned by position: add a
    # temporary row-count column to both sides and join on that.
    row_count_used = False
    if on is None:
        row_count_used = True
        left = left.with_row_count(row_count_name)
        right = right.with_row_count(row_count_name)
        on = row_count_name

    if isinstance(on, str):
        on = [on]

    # Columns present in both frames, kept in the order of `other` so the
    # generated query does not depend on set iteration order.
    left_names = set(left.columns)
    shared_names = [name for name in right.columns if name in left_names]

    for name in on:
        if name not in shared_names:
            raise ValueError(f"Join column {name} not found.")

    join_keys = set(on)
    right_added_names = [name for name in shared_names if name not in join_keys]

    if not right_added_names:
        # Only the join columns are shared, so there is nothing to update
        # and the join can be skipped.
        result = left
    else:
        joined = left.join(
            right.select(shared_names),
            on=on,
            how=how,  # type: ignore[arg-type]
            suffix=tmp_name,
        )
        # Prefer the value coming from `other` (suffixed column); fall back
        # to the existing value where `other` is null or has no match.
        result = joined.with_columns(
            [
                pli.coalesce([column_name + tmp_name, pli.col(column_name)]).alias(
                    column_name
                )
                for column_name in right_added_names
            ]
        ).drop([name + tmp_name for name in right_added_names])

    # The helper column must never leak to the caller, including on the
    # "nothing to update" path above.
    if row_count_used:
        result = result.drop(row_count_name)
    return result
15 changes: 15 additions & 0 deletions py-polars/tests/unit/test_joins.py
Original file line number Diff line number Diff line change
Expand Up @@ -768,3 +768,18 @@ def test_semi_join_projection_pushdown_6455() -> None:
"id": [1, 2],
"value": [2, 4],
}


def test_update() -> None:
    """Exercise `DataFrame.update` without join keys and with an explicit key."""
    # No `on`: the frames are paired by row position. The null in the right
    # frame keeps the left value, the unmatched fourth row is untouched and
    # the right-only column "C" does not show up in the result.
    base = pl.DataFrame({"A": [1, 2, 3, 4], "B": [400, 500, 600, 700]})
    patch = pl.DataFrame({"B": [4, None, 6], "C": [7, 8, 9]})
    expected_by_position = {
        "A": [1, 2, 3, 4],
        "B": [4, 500, 6, 700],
    }
    assert base.update(patch).to_dict(False) == expected_by_position

    # Explicit key: only the rows whose "a" value occurs in the right frame
    # receive a new "b".
    keyed_base = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    keyed_patch = pl.DataFrame({"a": [2, 3], "b": [8, 9]})
    expected_by_key = {"a": [1, 2, 3], "b": [4, 8, 9]}
    assert keyed_base.update(keyed_patch, on="a").to_dict(False) == expected_by_key

0 comments on commit d1b70fb

Please sign in to comment.