Attempt to address xdf-modules#1

agricolab · agricolab · commit 3d73e6dbbea3 · 2024-10-07T06:30:35.000+02:00
diff --git a/pyxdf/__init__.py b/pyxdf/__init__.py
@@ -3,10 +3,14 @@
 #
 # License: BSD (2-clause)
 
-from pkg_resources import get_distribution, DistributionNotFound
 try:
-    __version__ = get_distribution(__name__).version
-except DistributionNotFound:  # package is not installed
+    from pkg_resources import get_distribution, DistributionNotFound
+
+    try:
+        __version__ = get_distribution(__name__).version
+    except DistributionNotFound:  # package is not installed
+        __version__ = None
+except ImportError:  # pkg_resources is not available
     __version__ = None
 from .pyxdf import load_xdf, resolve_streams, match_streaminfos, align_streams
 
diff --git a/pyxdf/align.py b/pyxdf/align.py
@@ -31,148 +31,182 @@ def _interpolate(
 
 
 def _shift_align(old_timestamps, old_timeseries, new_timestamps):
+    # Convert inputs to numpy arrays
     old_timestamps = np.array(old_timestamps)
     old_timeseries = np.array(old_timeseries)
     new_timestamps = np.array(new_timestamps)
+
     ts_last = old_timestamps[-1]
-    ts_first = old_timestamps[0]    
-    source = list()
-    target = list()    
-    new_timeseries = np.empty((
-                               new_timestamps.shape[0],  # new sample count
-                               old_timeseries.shape[1], # old channel count
-                               ), dtype=object)
-    new_timeseries.fill(np.nan)
-    too_old = list()
-    too_young = list()
+    ts_first = old_timestamps[0]
+
+    # Initialize variables
+    source = []
+    target = []
+
+    new_timeseries = np.full((new_timestamps.shape[0], old_timeseries.shape[1]), np.nan)
+
+    too_old = []
+    too_young = []
+
+    # Loop through new timestamps to find the closest old timestamp
+    # Handle timestamps outside of the segment (too young or too old) differently from stamps within the segment
     for nix, nts in enumerate(new_timestamps):
-        closest = (np.abs(old_timestamps - nts)).argmin()
-        # remember the edge cases, 
-        if (nts>ts_last): 
+        if nts > ts_last:
             too_young.append((nix, nts))
-        elif (nts < ts_first):
-            too_old.append((nix,nts))
+        elif nts < ts_first:
+            too_old.append((nix, nts))
         else:
-            closest = (np.abs(old_timestamps - nts)).argmin()
-            source.append(closest)
-            target.append(nix)
-    # check the edge cases, 
-    for nix, nts in reversed(too_old):
-        closest = (np.abs(old_timestamps - nts)).argmin()
-        if (closest not in source):
+            closest = np.abs(old_timestamps - nts).argmin()
+            if closest not in source:  # Ensure unique mapping
+                source.append(closest)
+                target.append(nix)
+            else:
+                raise RuntimeError(
+                    f"Non-unique mapping. Closest old timestamp for {new_timestamps[nix]} is {old_timestamps[closest]} but that one was already assigned to {new_timestamps[source.index(closest)]}"
+                )
+
+    # Handle too old timestamps (those before the first old timestamp)
+    for nix, nts in too_old:
+        closest = 0  # Assign to the first timestamp
+        if closest not in source:  # Ensure unique mapping
             source.append(closest)
             target.append(nix)
-        break
+            break  # only one, because we only need the edge
+
+    # Handle too young timestamps (those after the last old timestamp)
     for nix, nts in too_young:
-        closest = (np.abs(old_timestamps - nts)).argmin()
-        if (closest not in source):
+        closest = len(old_timestamps) - 1  # Assign to the last timestamp
+        if closest not in source:  # Ensure unique mapping
             source.append(closest)
             target.append(nix)
-        break
-    
-    if len(set(source)) != len(old_timestamps):
-        missed = len(old_timestamps)-len(set(source))
-        raise RuntimeError(f"Too few new timestamps. {missed} of {len(old_timestamps)} old samples could not be assigned.")
-    if len(set(source)) != len(source): #non-unique mapping            
-        cnt = Counter(source)        
-        toomany = defaultdict(list)
-        for v,n in zip(source, target):
-            if cnt[v] != 1:
-                toomany[old_timestamps[source[v]]].append(new_timestamps[target[n]])
-        for k,v in toomany.items():
-            print("The old time_stamp ", k,
-                "is a closest neighbor of", len(v) ,"new time_stamps:", v)
-        raise RuntimeError("Can not align streams. Could not create an unique mapping")
+            break  # only one, because we only need the edge
+
+    # Sanity check: all old timestamps should be assigned to at least one new timestamp
+    missed = len(old_timestamps) - len(set(source))
+    if missed > 0:
+        unassigned_old = [i for i in range(len(old_timestamps)) if i not in source]
+        for i, ts in zip(unassigned_old, old_timestamps[unassigned_old]):
+            print(
+                f"Old timestamp {ts} was not assigned to any new timestamp. Closest new timestamp is {new_timestamps[np.abs(new_timestamps - ts).argmin()]}"
+            )
+        raise RuntimeError(
+            f"Too few new timestamps. {missed} old timestamps ({old_timestamps[unassigned_old]}) found no corresponding new timestamp because it was already taken by another old timestamp."
+        )
+
+    # Populate new timeseries with aligned values from old_timeseries
     for chan in range(old_timeseries.shape[1]):
-        new_timeseries[target, chan] = old_timeseries[source,chan]
+        new_timeseries[target, chan] = old_timeseries[source, chan]
+
     return new_timeseries
 
 
-def align_streams(streams, # List[defaultdict]
-                  align_foo=dict(), # defaultdict[int, Callable] 
-                  aligned_timestamps=None, # Optional[List[float]]
-                  sampling_rate=None # Optional[float|int]
-): # -> Tuple[np.ndarray, List[float]]
+def align_streams(
+    streams,  # List[defaultdict]
+    align_foo=dict(),  # defaultdict[int, Callable]
+    aligned_timestamps=None,  # Optional[List[float]]
+    sampling_rate=None,  # Optional[float|int]
+):  # -> Tuple[np.ndarray, List[float]]
     """
-    A function to 
+    A function to align the time series of all given streams to a common set of timestamps.
 
 
     Args:
 
-        streams: a list of defaultdicts  (i.e. streams) as returned by 
+        streams: a list of defaultdicts  (i.e. streams) as returned by
                     load_xdf
-        align_foo: a dictionary mapping streamIDs (i.e. int) to interpolation 
-                    callables. These callables must have the signature 
+        align_foo: a dictionary mapping streamIDs (i.e. int) to interpolation
+                    callables. These callables must have the signature
                     `interpolate(old_timestamps, old_timeseries, new_timestamps)` and return a np.ndarray. See `_shift_align` and `_interpolate` for examples.
-        aligned_timestamps (optional): a list of floats with the new 
+        aligned_timestamps (optional): a list of floats with the new
                     timestamps to be used for alignment/interpolation. This list of timestamps can be irregular and have gaps.
-        sampling_rate (optional): a float defining the sampling rate which 
+        sampling_rate (optional): a float defining the sampling rate which
                     will be used to calculate aligned_timestamps.
-    
+
     Return:
         (aligned_timeseries, aligned_timestamps): tuple
 
 
-    THe user can define either aligned_timestamps or sampling_rate or neither. If neither is defined, the algorithm will take the sampling_rate of the fastest stream and create aligned_timestamps from the oldest sample of all streams to the youngest. 
-        
+    The user can define either aligned_timestamps or sampling_rate or neither. If neither is defined, the algorithm will take the sampling_rate of the fastest stream and create aligned_timestamps from the oldest sample of all streams to the youngest.
+
     """
-       
+
     if sampling_rate is not None and aligned_timestamps is not None:
-        raise ValueError("You can not specify aligned_timestamps and sampling_rate at the same time")
-    
+        raise ValueError(
+            "You can not specify aligned_timestamps and sampling_rate at the same time"
+        )
+
     if sampling_rate is None:
-         # we pick the effective sampling rate from the  fastest stream
+        # we pick the effective sampling rate from the  fastest stream
         srates = [stream["info"]["effective_srate"] for stream in streams]
         sampling_rate = max(srates, default=0)
         if sampling_rate <= 0:  # either no valid stream or all streams are async
-            warnings.warn("Can not align streams: Fastest effective sampling rate was 0 or smaller.")
+            warnings.warn(
+                "Can not align streams: Fastest effective sampling rate was 0 or smaller."
+            )
             return streams
-        
-    
-    if aligned_timestamps is None:        
+
+    if aligned_timestamps is None:
         # we pick the oldest and youngest timestamp of all streams
-        stamps = [stream["time_stamps"] for stream in streams]        
-        ts_first = min((min(s) for s in stamps))      
-        ts_last = max((max(s) for s in stamps))  
-        full_dur = ts_last-ts_first
-        step = 1/sampling_rate
+        stamps = [stream["time_stamps"] for stream in streams]
+        ts_first = min((min(s) for s in stamps))
+        ts_last = max((max(s) for s in stamps))
+        full_dur = ts_last - ts_first
+        # Use np.linspace for precise control over the number of points and guaranteed inclusion of the stop value.
+        # np.arange is better when you need direct control over step size but may exclude the stop value and accumulate floating-point errors.
+        # Choose np.linspace for better precision and np.arange for efficiency with fixed steps.
         # we create new regularized timestamps
-        aligned_timestamps = np.arange(ts_first, ts_last+step/2, step)
-        # using np.linspace only differs in step if n_samples is different (as n_samples must be an integer number (see implementation below). 
-        # therefore we stick with np.arange (in spite of possible floating point error accumulation, but to make sure that ts_last is included, we add a half-step. This therefore comes at the cost of a overshoot, but i consider this acceptable considering this stamp would only be from one stream, and not part of all other and therefore is kind of arbitray anyways.
+        # arange implementation:
+        # step = 1 / sampling_rate
+        # aligned_timestamps = np.arange(ts_first, ts_last + step / 2, step)
         # linspace implementation:
-        # n_samples = int(np.round((full_dur * sampling_rate),0))+1
-        # aligned_timestamps = np.linspace(ts_first, ts_last, n_samples)       
-        
+        # add 1 to the number of samples to include the last sample
+        n_samples = int(np.round((full_dur * sampling_rate), 0)) + 1
+        aligned_timestamps = np.linspace(ts_first, ts_last, n_samples)
+
     channels = 0
     for stream in streams:
         # print(stream)
         channels += int(stream["info"]["channel_count"][0])
     # https://stackoverflow.com/questions/1704823/create-numpy-matrix-filled-with-nans The timings show a preference for ndarray.fill(..) as the faster alternative.
-    aligned_timeseries = np.empty((len(aligned_timestamps),
-                                   channels,), dtype=object)
+    aligned_timeseries = np.empty(
+        (
+            len(aligned_timestamps),
+            channels,
+        ),
+        dtype=object,
+    )
     aligned_timeseries.fill(np.nan)
 
-    chan_start = 0    
+    chan_start = 0
     chan_end = 0
     for stream in streams:
         sid = stream["info"]["stream_id"]
-        align = align_foo.get(sid, _shift_align) 
+        align = align_foo.get(sid, _shift_align)
         chan_cnt = int(stream["info"]["channel_count"][0])
         new_timeseries = np.empty((len(aligned_timestamps), chan_cnt), dtype=object)
         new_timeseries.fill(np.nan)
-        for seg_start, seg_stop in stream["info"]["segments"]:            
-            _new_timeseries = align(
-                stream["time_stamps"][seg_start:seg_stop+1], 
-                stream["time_series"][seg_start:seg_stop+1], 
-                aligned_timestamps)
+        print("Stream #", sid, " has ", len(stream["info"]["segments"]), "segments")
+        for seg_idx, (seg_start, seg_stop) in enumerate(stream["info"]["segments"]):
+            print(seg_idx, ": from index ", seg_start, "to ", seg_stop + 1)
+            # segments have been created including the stop index, so we need to add 1 to include the last sample
+            segment_old_timestamps = stream["time_stamps"][seg_start : seg_stop + 1]
+            segment_old_timeseries = stream["time_series"][seg_start : seg_stop + 1]
+            # Sanity check for duplicate timestamps
+            if len(np.unique(segment_old_timestamps)) != len(segment_old_timestamps):
+                raise RuntimeError("Duplicate timestamps found in old_timestamps")
+            # apply align function as defined by the user (or default)
+            segment_new_timeseries = align(
+                segment_old_timestamps,
+                segment_old_timeseries,
+                aligned_timestamps,
+            )
             # pick indices of the NEW timestamps closest to when segments start and stop
             a = stream["time_stamps"][seg_start]
             b = stream["time_stamps"][seg_stop]
-            aix = np.argmin(np.abs(aligned_timestamps-a))
-            bix = np.argmin(np.abs(aligned_timestamps-b))            
+            aix = np.argmin(np.abs(aligned_timestamps - a))
+            bix = np.argmin(np.abs(aligned_timestamps - b))
             # and store only this aligned segment, leaving the rest as nans (or aligned as other segments)
-            new_timeseries[aix:bix+1] = _new_timeseries[aix:bix+1]
+            new_timeseries[aix : bix + 1] = segment_new_timeseries[aix : bix + 1]
 
         # store the new timeseries at the respective channel indices in the 2D array
         chan_start = chan_end
diff --git a/test.py b/test.py
@@ -0,0 +1,15 @@
+import matplotlib.pyplot as plt
+import pyxdf
+
+if __name__ == "__main__":
+    fname = "/home/rtgugg/Downloads/sub-13_ses-S001_task-HCT_run-001_eeg.xdf"
+    # streams, header = pyxdf.load_xdf(
+    #     fname, select_streams=[2, 5]
+    # )  # EEG and ACC streams
+
+    # pyxdf.align_streams(streams)
+
+    streams, header = pyxdf.load_xdf(fname, select_streams=[2])  # EEG stream
+    plt.plot(streams[0]["time_stamps"])
+    plt.show()
+    pyxdf.align_streams(streams)