Skip to content

Commit 7d729ab

Browse files
Author: Jin Zhou (committed)
Commit message: support dask expressions
1 parent 2a6d2ca commit 7d729ab

7 files changed

Lines changed: 134 additions & 89 deletions

File tree

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
adaptor_test.py
2+
__pycache__/

taskvine/src/graph/dagvine/blueprint_graph/adaptor.py

Lines changed: 48 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
# where the private module is unavailable or type-checkers can't resolve it).
2020
dts = None
2121

22-
from ndcctools.taskvine.dagvine.blueprint_graph.blueprint_graph import TaskOutputRef
22+
from ndcctools.taskvine.dagvine.blueprint_graph.blueprint_graph import TaskOutputRef, BlueprintGraph
2323

2424

2525
def _identity(value):
@@ -32,8 +32,11 @@ class Adaptor:
3232

3333
_LEAF_TYPES = (str, bytes, bytearray, memoryview, int, float, bool, type(None))
3434

35-
def __init__(self, collection_dict):
36-
self.original_collection_dict = collection_dict
35+
def __init__(self, task_dict):
36+
37+
if isinstance(task_dict, BlueprintGraph):
38+
self.converted = task_dict
39+
return
3740

3841
# TaskSpec-only state used to "lift" inline Tasks that cannot be reduced to
3942
# a pure Python value (or would be unsafe/expensive to inline).
@@ -45,23 +48,23 @@ def __init__(self, collection_dict):
4548
# lifted keys remain visible across subsequent conversions/dedup/reference checks.
4649
self._task_keys = set()
4750

48-
normalized = self._normalize_task_dict(collection_dict)
49-
self.task_dict = self._convert_to_blueprint_tasks(normalized)
51+
normalized = self._normalize_task_dict(task_dict)
52+
self.converted = self._convert_to_blueprint_tasks(normalized)
5053

51-
def _normalize_task_dict(self, collection_dict):
54+
def _normalize_task_dict(self, task_dict):
5255
"""Collapse every supported input style into a classic `{key: sexpr or TaskSpec}` mapping."""
5356
from_dask_collection = bool(
54-
is_dask_collection and any(is_dask_collection(v) for v in collection_dict.values())
57+
is_dask_collection and any(is_dask_collection(v) for v in task_dict.values())
5558
)
5659

5760
if from_dask_collection:
58-
task_dict = self._dask_collections_to_task_dict(collection_dict)
61+
task_dict = self._dask_collections_to_task_dict(task_dict)
5962
else:
6063
# IMPORTANT: treat plain user dicts as DAGVine sexprs by default.
6164
# If we unconditionally run `dask._task_spec.convert_legacy_graph(...)` when
6265
# dts is available, Dask will interpret our "final Mapping is kwargs"
6366
# convention as a positional dict argument, breaking sexpr semantics.
64-
task_dict = dict(collection_dict)
67+
task_dict = dict(task_dict)
6568

6669
# Only ask Dask to rewrite legacy graphs when we *know* the input came
6770
# from a Dask collection/HLG. This keeps classic DAGVine sexprs stable
@@ -227,36 +230,38 @@ def _should_wrap(self, obj, task_keys):
227230
"""Decide whether a value should become a `TaskOutputRef`."""
228231
if isinstance(obj, self._LEAF_TYPES):
229232
if isinstance(obj, str):
230-
return obj in task_keys
233+
hit = obj in task_keys
234+
return hit
231235
return False
232236
try:
233-
return obj in task_keys
237+
hit = obj in task_keys
238+
return hit
234239
except TypeError:
235240
return False
236241

237242
# Flatten Dask collections into the dict-of-tasks structure the rest of the
238243
# pipeline expects. DAGVine clients often hand us a dict like
239244
# {"result": dask.delayed(...)}; we merge the underlying HighLevelGraphs so
240245
# `ContextGraph` sees the same dict representation C does.
241-
def _dask_collections_to_task_dict(self, collection_dict):
246+
def _dask_collections_to_task_dict(self, task_dict):
242247
"""Flatten Dask collections into the classic dict-of-task layout."""
243248
assert is_dask_collection is not None
244249
from dask.highlevelgraph import HighLevelGraph, ensure_dict
245250

246-
if not isinstance(collection_dict, dict):
251+
if not isinstance(task_dict, dict):
247252
raise TypeError("Input must be a dict")
248253

249-
for k, v in collection_dict.items():
254+
for k, v in task_dict.items():
250255
if not is_dask_collection(v):
251256
raise TypeError(
252257
f"Input must be a dict of DaskCollection, but found {k} with type {type(v)}"
253258
)
254259

255260
if dts:
256-
sub_hlgs = [v.dask for v in collection_dict.values()]
261+
sub_hlgs = [v.dask for v in task_dict.values()]
257262
hlg = HighLevelGraph.merge(*sub_hlgs).to_dict()
258263
else:
259-
hlg = dask.base.collections_to_dsk(collection_dict.values())
264+
hlg = dask.base.collections_to_dsk(task_dict.values())
260265

261266
return ensure_dict(hlg)
262267

@@ -299,18 +304,21 @@ def _unwrap_dts_operand(self, operand, task_keys, *, parent_key=None):
299304

300305
literal_cls = getattr(dts, "Literal", None)
301306
if literal_cls and isinstance(operand, literal_cls):
302-
return getattr(operand, "value", None)
307+
value = getattr(operand, "value", None)
308+
return value
303309

304310
datanode_cls = getattr(dts, "DataNode", None)
305311
if datanode_cls and isinstance(operand, datanode_cls):
306-
return operand.value
312+
value = operand.value
313+
return value
307314

308315
nested_cls = getattr(dts, "NestedContainer", None)
309316
if nested_cls and isinstance(operand, nested_cls):
310317
payload = getattr(operand, "value", None)
311318
if payload is None:
312319
payload = getattr(operand, "data", None)
313-
return self._unwrap_dts_operand(payload, task_keys, parent_key=parent_key)
320+
value = self._unwrap_dts_operand(payload, task_keys, parent_key=parent_key)
321+
return value
314322

315323
task_cls = getattr(dts, "Task", None)
316324
if task_cls and isinstance(operand, task_cls):
@@ -323,15 +331,31 @@ def _unwrap_dts_operand(self, operand, task_keys, *, parent_key=None):
323331
# Otherwise it is an inline expression. Reduce if safe, else lift.
324332
func = self._extract_callable_from_task(operand)
325333
if func is None:
326-
return self._lift_inline_task(operand, task_keys, parent_key=parent_key)
334+
out = self._lift_inline_task(operand, task_keys, parent_key=parent_key)
335+
return out
327336

328337
# Special-case: Dask internal identity-cast wrappers should not be called
329-
# during adaptation. Reduce structurally by returning the first argument.
338+
# during adaptation. Reduce structurally by unwrapping all args and
339+
# rebuilding the requested container type. This preserves dependency
340+
# edges (critical for WCC) without executing arbitrary code.
330341
if self._is_identity_cast_op(func):
331342
raw_args = getattr(operand, "args", ()) or ()
332-
if not raw_args:
333-
return None
334-
return self._unwrap_dts_operand(raw_args[0], task_keys, parent_key=parent_key)
343+
raw_kwargs = getattr(operand, "kwargs", {}) or {}
344+
typ = raw_kwargs.get("typ", None)
345+
346+
values = [self._unwrap_dts_operand(a, task_keys, parent_key=parent_key) for a in raw_args]
347+
348+
# Only allow safe container constructors here; otherwise lift.
349+
safe_types = (list, tuple, set, frozenset, dict)
350+
if typ in safe_types:
351+
try:
352+
casted = typ(values)
353+
except Exception:
354+
return self._lift_inline_task(operand, task_keys, parent_key=parent_key)
355+
return casted
356+
357+
# Unknown/unsafe typ: lift so the worker executes the real op.
358+
return self._lift_inline_task(operand, task_keys, parent_key=parent_key)
335359

336360
if self._is_pure_value_op(func):
337361
reduced, used_lift = self._reduce_inline_task(operand, task_keys, parent_key=parent_key)

taskvine/src/graph/dagvine/blueprint_graph/adaptor_test.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -501,15 +501,20 @@ def fake_identity_cast(x, *_, **__):
501501
fake_identity_cast.__module__ = "dask._fake"
502502

503503
graph = {
504-
"raw": _FakeDataNode(value=5),
504+
"raw0": _FakeDataNode(value=5),
505+
"raw1": _FakeDataNode(value=6),
505506
"outer": _FakeTask(
506507
key="outer",
507508
function=lambda x: x,
508509
args=(
509510
_FakeTask(
510511
key=None,
511512
function=fake_identity_cast,
512-
args=(_FakeTaskRef("raw"),),
513+
args=(
514+
_FakeTaskRef("raw0"),
515+
_FakeTaskRef("raw1"),
516+
),
517+
kwargs={"typ": list},
513518
),
514519
),
515520
),
@@ -519,8 +524,12 @@ def fake_identity_cast(x, *_, **__):
519524
_, outer_args, outer_kwargs = adapted["outer"]
520525
self.assertEqual(outer_kwargs, {})
521526
self.assertEqual(len(outer_args), 1)
522-
self.assertIsInstance(outer_args[0], TaskOutputRef)
523-
self.assertEqual(outer_args[0].task_key, "raw")
527+
self.assertIsInstance(outer_args[0], list)
528+
self.assertEqual(len(outer_args[0]), 2)
529+
self.assertIsInstance(outer_args[0][0], TaskOutputRef)
530+
self.assertIsInstance(outer_args[0][1], TaskOutputRef)
531+
self.assertEqual(outer_args[0][0].task_key, "raw0")
532+
self.assertEqual(outer_args[0][1].task_key, "raw1")
524533
self.assertFalse(any(str(k).startswith("__lift__") for k in adapted.keys()))
525534

526535

taskvine/src/graph/dagvine/blueprint_graph/blueprint_graph.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import cloudpickle
99

1010

11+
# Lightweight wrapper around task results that optionally pads the payload. The
12+
# padding lets tests model large outputs without altering the logical result.
1113
class TaskOutputWrapper:
1214
def __init__(self, result, extra_size_mb=None):
1315
self.result = result
@@ -24,6 +26,7 @@ def load_from_path(path):
2426
raise FileNotFoundError(f"Task result file not found at {path}")
2527

2628

29+
# A reference to a task output. This is used to represent the output of a task as a dependency of another task.
2730
class TaskOutputRef:
2831
__slots__ = ("task_key", "path")
2932

@@ -37,6 +40,8 @@ def __getitem__(self, key):
3740
return TaskOutputRef(self.task_key, self.path + (key,))
3841

3942

43+
# The BlueprintGraph is a directed acyclic graph (DAG) that represents the logical dependencies between tasks.
44+
# It is used to build the C vine graph.
4045
class BlueprintGraph:
4146

4247
_LEAF_TYPES = (str, bytes, bytearray, memoryview, int, float, bool, type(None))
@@ -55,6 +60,9 @@ def __init__(self):
5560
self.pykey2cid = {} # py_key -> c_id
5661
self.cid2pykey = {} # c_id -> py_key
5762

63+
self.extra_task_output_size_mb = {} # task_key -> extra size in MB
64+
self.extra_task_sleep_time = {} # task_key -> extra sleep time in seconds
65+
5866
def _visit_task_output_refs(self, obj, on_ref, *, rewrite: bool):
5967
seen = set()
6068

@@ -154,7 +162,7 @@ def task_consumes(self, task_key, *filenames):
154162

155163
def save_task_output(self, task_key, output):
156164
with open(self.outfile_remote_name[task_key], "wb") as f:
157-
wrapped_output = TaskOutputWrapper(output, extra_size_mb=0)
165+
wrapped_output = TaskOutputWrapper(output, extra_size_mb=self.extra_task_output_size_mb[task_key])
158166
cloudpickle.dump(wrapped_output, f)
159167

160168
def load_task_output(self, task_key):
@@ -191,6 +199,7 @@ def verify_topo(g, topo):
191199
print("topo verified: ok")
192200

193201
def finalize(self):
202+
# build the dependencies determined by files produced and consumed
194203
for file, producer in self.producer_of.items():
195204
for consumer in self.consumers_of.get(file, ()):
196205
self.parents_of[consumer].add(producer)

taskvine/src/graph/dagvine/blueprint_graph/proxy_functions.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55

66
from ndcctools.taskvine.utils import load_variable_from_library
7+
import time
78

89

910
def compute_task(bg, task_expr):
@@ -33,10 +34,6 @@ def on_ref(r):
3334
r_args = bg._visit_task_output_refs(args, on_ref, rewrite=True)
3435
r_kwargs = bg._visit_task_output_refs(kwargs, on_ref, rewrite=True)
3536

36-
print(f"func: {func}")
37-
print(f"r_args: {r_args}")
38-
print(f"r_kwargs: {r_kwargs}")
39-
4037
return func(*r_args, **r_kwargs)
4138

4239

@@ -48,4 +45,6 @@ def compute_single_key(vine_key):
4845

4946
output = compute_task(bg, task_expr)
5047

48+
time.sleep(bg.extra_task_sleep_time[task_key])
49+
5150
bg.save_task_output(task_key, output)

Commit comments: 0