 import btrdb.exceptions
 import dask
 import dask.distributed
-import pandas as pd
+import dask.dataframe
+import pyarrow
+import pandas

 # This process local connection variable is initialized in all
 # dask worker processes by the configure function.
@@ -84,3 +86,115 @@ def configure(client=None, conn_str=None, apikey=None, profile=None):
     # Configure the distributed scheduler.
     plugin = BtrdbConnectionPlugin(**creds)
     client.register_worker_plugin(plugin, name="btrdb_connection")
+
+
+@dask.delayed
+def _stream_as_dataframe_part(uuid, start, end, snap_period, data_column, version):
+    db = get_btrdb()
+    # For now we use multi values because it implements time snapping and does
+    # not return duplicate values for a single timestamp. Both of these are
+    # useful properties for how dask dataframe partitions are supposed to
+    # behave.
+    values = list(db.ep.arrowMultiValues([uuid], start, end, [version], snap_period))
+    if len(values) != 0:
+        values = pyarrow.concat_tables(values)
+    else:
+        # No points in this range: build an empty table with the schema the
+        # server would have returned.
+        schema = pyarrow.schema(
+            [
+                pyarrow.field(
+                    "time", pyarrow.timestamp("ns", tz="UTC"), nullable=False
+                ),
+                pyarrow.field(str(uuid), pyarrow.float64(), nullable=False),
+            ]
+        )
+        values = pyarrow.Table.from_arrays(
+            [
+                pyarrow.array([], type=pyarrow.timestamp("ns", tz="UTC")),
+                pyarrow.array([], type=pyarrow.float64()),
+            ],
+            schema=schema,
+        )
+    # XXX ensure this is zero copy.
+    values = values.rename_columns(["time", data_column])
+    values = values.to_pandas()
+    # XXX Can we do this from to_pandas?
+    values.set_index("time", inplace=True)
+    return values
+
+
+def stream_as_dataframe(
+    stream,
+    start=None,
+    end=None,
+    partitions=1,
+    snap_period=0,
+    data_column=None,
+    version=0,
+):
+    """
+    Converts a btrdb stream to a lazy Dask DataFrame.
+
+    Parameters
+    ----------
+    stream : btrdb.stream.Stream
+        The stream containing the data.
+
+    start : datetime-like, optional
+        The start time for the data from the stream. Defaults to the earliest
+        time in the stream.
+
+    end : datetime-like, optional
+        The end time for the data from the stream. Defaults to the latest
+        time in the stream.
+
+    partitions : int, optional
+        Number of partitions for the dask dataframe. Default is 1.
+
+    snap_period : int, optional
+        The period for data time snapping. Defaults to 0, which means no
+        snapping.
+
+    data_column : str or callable, optional
+        The name of the data column. If None, it defaults to the collection
+        and name of the stream. If callable, the function is applied to the
+        stream object to determine the data column name.
+
+    version : int, optional
+        The stream version to be used. Defaults to 0.
+
+    Returns
+    -------
+    Dask DataFrame
+        The Dask DataFrame containing the stream data.
+    """
+    if data_column is None:
+        data_column = stream.collection + "/" + stream.name
+    elif not isinstance(data_column, str):
+        # Assume data_column is callable.
+        data_column = data_column(stream)
+    if start is None:
+        start = stream.earliest()[0].time
+    if end is None:
+        end = stream.latest()[0].time
+    duration = end - start
+    # With more partitions than nanoseconds of duration, the integer division
+    # below would yield zero-length partitions; fall back to one partition.
+    if partitions >= duration:
+        partitions = 1
+    part_duration = duration // partitions
+    if snap_period != 0:
+        # N.B. Due to the way the server does time snapping, we need to ensure
+        # that our partitions are aligned to the time snapping period. The
+        # reason for this is we don't want values to be snapped into both
+        # partitions by accident.
+        remainder = part_duration % snap_period
+        if remainder != 0:
+            part_duration += snap_period - remainder
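+            # Example: part_duration=7 and snap_period=5 gives remainder=2,
+            # so part_duration rounds up to 10 and every partition boundary
+            # lands on a multiple of the snap period.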
+    parts = []
+    divisions = []
+    part_start = start
+    while part_start < end:
+        part_end = min(part_start + part_duration, end)
+        part = _stream_as_dataframe_part(
+            stream.uuid, part_start, part_end, snap_period, data_column, version
+        )
+        parts.append(part)
+        divisions.append(part_start)
+        part_start += part_duration
+    divisions.append(end)
+    meta = pandas.DataFrame(
+        index=pandas.DatetimeIndex([], tz="UTC"),
+        columns=[data_column],
+        dtype="float64",
+    )
+    return dask.dataframe.from_delayed(
+        parts, meta=meta, divisions=divisions, verify_meta=False
+    )
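
For reference, a minimal usage sketch of the new function. The local cluster, profile
name, and zeroed UUID below are illustrative placeholders, not part of this change;
configure() is the module's own helper (signature shown in the hunk header above),
and snap_period is assumed to be in nanoseconds, the same units as BTrDB timestamps.

import btrdb
import dask.distributed

# Connect a Dask cluster and register per-worker BTrDB connections.
client = dask.distributed.Client()
configure(client=client, profile="default")

# Open the stream on the client side to read its metadata.
db = btrdb.connect(profile="default")
stream = db.stream_from_uuid("00000000-0000-0000-0000-000000000000")

# Build a lazy dataframe: 4 partitions, values snapped to 1 s boundaries.
df = stream_as_dataframe(stream, partitions=4, snap_period=1_000_000_000)
print(df.head())  # computes only the first partition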