[WIP] Allow expressions to be shipped to the scheduler #294

Draft · wants to merge 3 commits into main
3 changes: 3 additions & 0 deletions .gitignore
@@ -10,3 +10,6 @@ bench/shakespeare.txt
.idea/
.ipynb_checkpoints/
coverage.xml


test_cluster_dump/*
45 changes: 25 additions & 20 deletions dask_expr/_collection.py
@@ -215,16 +215,20 @@ def _wrap_unary_expr_op(self, op=None):
#
# Collection classes
#
from dask.typing import DaskCollection2


class FrameBase(DaskMethodsMixin):
class FrameBase(DaskMethodsMixin, DaskCollection2):
"""Base class for Expr-backed Collections"""

__dask_scheduler__ = staticmethod(
named_schedulers.get("threads", named_schedulers["sync"])
)
__dask_optimize__ = staticmethod(lambda dsk, keys, **kwargs: dsk)

def __dask_tokenize__(self):
return self.expr._name
Comment on lines +229 to +230
fjetter (Member, Author):
This is just defined as part of the collections protocol. Not sure if it is actually required
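
For context: dask's tokenization falls back to __dask_tokenize__ when an object defines it, so returning the expression's deterministic name keeps the collection's token stable. A minimal standalone sketch (the class below is invented for illustration; only the hook mirrors this PR):

from dask.base import tokenize

class ExprBackedCollection:
    """Invented stand-in for an Expr-backed collection."""

    def __init__(self, expr_name):
        # In this PR the token comes from the expression (self.expr._name).
        self._expr_name = expr_name

    def __dask_tokenize__(self):
        # Delegate the token to the deterministic expression name.
        return self._expr_name

# Two wrappers around the same expression name produce the same token.
assert tokenize(ExprBackedCollection("expr-abc")) == tokenize(ExprBackedCollection("expr-abc"))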


def __init__(self, expr):
self._expr = expr

@@ -311,25 +315,11 @@ def persist(self, fuse=True, **kwargs):
return DaskMethodsMixin.persist(out, **kwargs)

def compute(self, fuse=True, **kwargs):
out = self
if not isinstance(out, Scalar):
out = out.repartition(npartitions=1)
out = out.optimize(fuse=fuse)
out = self.finalize_compute()
return DaskMethodsMixin.compute(out, **kwargs)

@property
def dask(self):
return self.__dask_graph__()

def __dask_graph__(self):
out = self.expr
out = out.lower_completely()
return out.__dask_graph__()

def __dask_keys__(self):
out = self.expr
out = out.lower_completely()
return out.__dask_keys__()
Comment on lines -329 to -332
fjetter (Member, Author):
Having keys defined on the collections level feels like an abstraction leak. Among other things this is what threw me off for a while when implementing this the first time. I had a hard time distinguishing collections from graphs in the existing code. I find this now a bit clearer in the above PRs
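
To make the intended split concrete, here is a rough consumer-side sketch (not part of the diff) of how the new protocol could be driven, e.g. by the scheduler: the collection only hands over its expression, and the graph and output keys are derived from the lowered expression. The method names follow this PR; the surrounding plumbing is illustrative:

def execute(collection, get):
    # The collection hands over an expression, not a materialized dict.
    factory = collection.__dask_graph_factory__()
    # Lowering/optimization happens on the expression, as late as possible.
    lowered = factory.lower_completely()
    graph = lowered.materialize()          # plain dict of tasks
    keys = lowered.__dask_output_keys__()  # output keys live on the expression
    return get(graph, keys)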

def __dask_graph_factory__(self):
return self.expr

def simplify(self):
return new_collection(self.expr.simplify())
@@ -342,7 +332,20 @@ def optimize(self, fuse: bool = True):

@property
def dask(self):
return self.__dask_graph__()
# FIXME: This is highly problematic. Defining this as a property can
# cause very unfortunate materializations. Even a mere hasattr(obj,
# "dask") check already triggers this since it's a property, not even a
# method.
return self.__dask_graph_factory__().lower_completely().materialize()
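
A standalone illustration of the FIXME above (invented class, not dask code): hasattr works by calling getattr, so probing for a property executes its body, which is why a mere feature check can trigger a full materialization.

class SomeCollection:
    @property
    def dask(self):
        print("materializing the whole graph ...")  # stand-in for an expensive build
        return {}

# The property body runs even though we only wanted to check for the attribute.
hasattr(SomeCollection(), "dask")  # prints the message above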

def finalize_compute(self):
return new_collection(Repartition(self.expr, 1))
crusaderky (Collaborator) commented on Dec 22, 2023:
This is very problematic in the fairly common use case where the client has a lot more memory than a single worker. It forces the whole object to be unnecessarily collected onto a single worker and then sent to the client, whereas we could just have the client fetch the separate partitions from separate workers (which may or may not happen all at once if it needs to transit through the scheduler).

This replicates the issue with the current finalizer methods in dask/dask, which are created by dask.compute(df) but are skipped by df.compute().

Memory considerations aside, bouncing through a single worker instead of collecting the result on the client directly also adds latency.

fjetter (Member, Author) replied on Dec 22, 2023:
this is exactly how it is done right now and I don't intend to touch that behavior now
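
For illustration, a rough sketch of the two gathering strategies discussed in this thread; it assumes a distributed client, dask_expr.from_pandas, and that the futures_of helper works for expression-backed collections the way it does for classic ones:

import pandas as pd
from distributed import Client, futures_of
import dask_expr as dx

client = Client()
df = dx.from_pandas(pd.DataFrame({"x": range(1000)}), npartitions=4)

# Current behaviour (what finalize_compute expresses): concatenate everything
# into a single partition on one worker, then send that partition to the client.
result = df.repartition(npartitions=1).compute()

# Alternative raised above: persist, then let the client gather the individual
# partition futures from their workers and concatenate locally.
persisted = df.persist()
parts = client.gather(futures_of(persisted))
result = pd.concat(parts)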


def postpersist(self, futures):
if not isinstance(futures, dict):
raise TypeError("Provided `futures` must be a dictionary")
func, args = self.__dask_postpersist__()
return func(futures, *args)

def __dask_postcompute__(self):
state = new_collection(self.expr.lower_completely())
@@ -3001,7 +3004,9 @@ def __bool__(self):
"a conditional statement."
)

def __dask_postcompute__(self):
def finalize_compute(self):
return self

return first, ()

def to_series(self, index=0) -> Series:
29 changes: 26 additions & 3 deletions dask_expr/_core.py
@@ -10,6 +10,7 @@
import pandas as pd
import toolz
from dask.dataframe.core import is_dataframe_like, is_index_like, is_series_like
from dask.typing import TaskGraphFactory
from dask.utils import funcname, import_required, is_arraylike

from dask_expr._util import _BackendData, _tokenize_deterministic
@@ -426,7 +427,10 @@ def __getattr__(self, key):
f"API function. Current API coverage is documented here: {link}."
)

def __dask_graph__(self):
def get_annotations(self):
return {}

def materialize(self):
"""Traverse expression tree, collect layers"""
stack = [self]
seen = set()
@@ -444,9 +448,12 @@ def __dask_graph__(self):

return toolz.merge(layers)

def __dask_output_keys__(self) -> list:
return [(self._name, i) for i in range(self.npartitions)]

@property
def dask(self):
return self.__dask_graph__()
def dask(self) -> dict:
return self.materialize()

def substitute(self, old, new) -> Expr:
"""Substitute a specific term within the expression
@@ -619,6 +626,22 @@ def _to_graphviz(

return g

@classmethod
def combine_factories(cls, *exprs: Expr) -> Expr:
"""Combine multiple expressions into a single expression

Parameters
----------
exprs:
Expressions to combine

Returns
-------
expr:
Combined expression
"""
raise NotImplementedError()

def visualize(self, filename="dask-expr.svg", format=None, **kwargs):
"""
Visualize the expression graph.
41 changes: 37 additions & 4 deletions dask_expr/_expr.py
@@ -10,6 +10,7 @@
import dask
import numpy as np
import pandas as pd
import toolz
from dask.array import Array
from dask.base import normalize_token
from dask.core import flatten
@@ -27,7 +28,7 @@
safe_head,
total_mem_usage,
)
from dask.dataframe.dispatch import meta_nonempty
from dask.dataframe.dispatch import make_meta_dispatch, meta_nonempty
from dask.dataframe.rolling import CombinedOutput, _head_timedelta, overlap_chunk
from dask.dataframe.shuffle import drop_overlap, get_overlap
from dask.dataframe.utils import (
@@ -67,9 +68,6 @@ def ndim(self):
except AttributeError:
return 0

def __dask_keys__(self):
return [(self._name, i) for i in range(self.npartitions)]

def optimize(self, **kwargs):
return optimize(self, **kwargs)

@@ -105,6 +103,10 @@ def __getattr__(self, key):
f"API function. Current API coverage is documented here: {link}."
)

@classmethod
def combine_factories(cls, *exprs: Expr, **kwargs) -> Expr:
return Tuple(*exprs)
Comment on lines +106 to +108
fjetter (Member, Author):
This is mostly syntactic sugar and I don't know if I want to keep this. I see a way forward to just move HLGs and old-style collections to the new protocol and nuke a lot of compat code. In this case, such a hook here would be useful. For now, you can ignore this
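
As an illustration of what the hook could enable, a rough sketch (paraphrased, not actual dask.compute code) of computing several collections through a single shipped expression; it glosses over finalization, so each operand is assumed to end up with a single output key:

def compute_many(*collections, get):
    exprs = [c.__dask_graph_factory__() for c in collections]
    # Combine all expressions into a single Tuple expression ...
    combined = type(exprs[0]).combine_factories(*exprs)
    lowered = combined.lower_completely()
    # ... so that one graph and one key per operand can be shipped at once.
    graph = lowered.materialize()
    keys = lowered.__dask_output_keys__()
    return get(graph, keys)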


@property
def index(self):
return Index(self)
@@ -387,6 +389,7 @@ def memory_usage_per_partition(self, index=True, deep=False):

@functools.cached_property
def divisions(self):
# Note: This is triggering a divisions calculation on a mere hasattr check!
return tuple(self._divisions())

def _divisions(self):
@@ -3060,3 +3063,33 @@ def _get_meta_map_partitions(args, dfs, func, kwargs, meta, parent_meta):
Var,
)
from dask_expr.io import IO, BlockwiseIO


class Tuple(Expr):
def __getitem__(self, other):
return self.operands[other]

def _layer(self) -> dict:
return toolz.merge(op._layer() for op in self.operands)

def __dask_output_keys__(self) -> list:
all_keys = []
for op in self.operands:
l = op.__dask_output_keys__()
if len(l) > 1:
raise NotImplementedError()
all_keys.append(l[0])
return all_keys

def __len__(self):
return len(self.operands)

def __iter__(self):
return iter(self.operands)


@make_meta_dispatch.register(Expr)
def make_meta_expr(expr, index=None):
# make_meta only accesses the _meta attribute for collections, but Expr is not
# a collection. Still, we sometimes call make_meta on Expr instances.
return expr._meta
80 changes: 19 additions & 61 deletions dask_expr/_merge.py
@@ -70,7 +70,7 @@ class Merge(Expr):
}

def __str__(self):
return f"Merge({self._name[-7:]})"
return f"{type(self).__name__}({self._name[-7:]})"

@property
def kwargs(self):
@@ -114,56 +114,7 @@ def _bcast_right(self):
return self.right

def _divisions(self):
if self.merge_indexed_left and self.merge_indexed_right:
divisions = list(
unique(merge_sorted(self.left.divisions, self.right.divisions))
)
if len(divisions) == 1:
return (divisions[0], divisions[0])
if self.left.npartitions == 1 and self.right.npartitions == 1:
return (min(divisions), max(divisions))
return divisions

if self._is_single_partition_broadcast:
use_left = self.right_index or _contains_index_name(
self.right._meta, self.right_on
)
use_right = self.left_index or _contains_index_name(
self.left._meta, self.left_on
)
if (
use_right
and self.left.npartitions == 1
and self.how in ("right", "inner")
):
return self.right.divisions
elif (
use_left
and self.right.npartitions == 1
and self.how in ("inner", "left")
):
return self.left.divisions
else:
_npartitions = max(self.left.npartitions, self.right.npartitions)

elif self.is_broadcast_join:
meta_index_names = set(self._meta.index.names)
if (
self.broadcast_side == "left"
and set(self.right._meta.index.names) == meta_index_names
):
return self._bcast_right._divisions()
elif (
self.broadcast_side == "right"
and set(self.left._meta.index.names) == meta_index_names
):
return self._bcast_left._divisions()
_npartitions = max(self.left.npartitions, self.right.npartitions)

else:
_npartitions = self._npartitions

return (None,) * (_npartitions + 1)
return self.lower_completely()._divisions()

@functools.cached_property
def broadcast_side(self):
@@ -235,7 +186,6 @@ def _lower(self):
left_index = self.left_index
right_index = self.right_index
shuffle_method = self.shuffle_method

# TODO:
# 1. Add/leverage partition statistics

@@ -437,6 +387,13 @@ class HashJoinP2P(Merge, PartitionsFiltered):
}
is_broadcast_join = False

@property
def npartitions(self):
return self._npartitions or max(self.left.npartitions, self.right.npartitions)

def _divisions(self):
return (None,) * (self.npartitions + 1)

def _lower(self):
return None

@@ -679,16 +636,17 @@ class BlockwiseMerge(Merge, Blockwise):

is_broadcast_join = False

def dependencies(self):
# FIXME: Blockwise._divisions assumes that the left-most dependency is not
# a broadcast dep.
return sorted(super().dependencies(), key=self._broadcast_dep)

def _divisions(self):
if self.left.npartitions == self.right.npartitions:
return super()._divisions()
is_unknown = any(d is None for d in super()._divisions())
frame = (
self.left if self.left.npartitions > self.right.npartitions else self.right
)
if is_unknown:
return (None,) * (frame.npartitions + 1)
return frame.divisions
# Note: If we reversed the MRO so that Blockwise took precedence, we wouldn't
# need this, but then we'd also get Blockwise's _meta implementation even
# though we want Merge's to take precedence. This is probably the lesser evil.
return Blockwise._divisions(self)
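
A toy illustration of the MRO trade-off described in the note above (the classes below are invented stand-ins, not the real ones):

class FakeBlockwise:
    def _divisions(self):
        return "blockwise divisions"

    @property
    def _meta(self):
        return "blockwise meta"

class FakeMerge:
    def _divisions(self):
        return "merge divisions"

    @property
    def _meta(self):
        return "merge meta"

class FakeBlockwiseMerge(FakeMerge, FakeBlockwise):
    # FakeMerge is first in the MRO, so its _meta wins (as desired), but its
    # _divisions would win too -- hence the explicit delegation below.
    def _divisions(self):
        return FakeBlockwise._divisions(self)

bm = FakeBlockwiseMerge()
assert bm._meta == "merge meta"
assert bm._divisions() == "blockwise divisions"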

def _lower(self):
return None
2 changes: 1 addition & 1 deletion dask_expr/_quantiles.py
@@ -43,7 +43,7 @@ def _layer(self):
random_state = self.random_state
state_data = random_state_data(self.frame.npartitions, random_state)

keys = self.frame.__dask_keys__()
keys = self.frame.__dask_output_keys__()
dtype_dsk = {(self._name, 0, 0): (dtype_info, keys[0])}

percentiles_dsk = {
2 changes: 1 addition & 1 deletion dask_expr/_reductions.py
@@ -319,7 +319,7 @@ def _layer(self):
# apply combine to batches of intermediate results
j = 1
d = {}
keys = self.frame.__dask_keys__()
keys = self.frame.__dask_output_keys__()
split_every = self.split_every
while len(keys) > 1:
new_keys = []