
Commit a510c1a

Revert "merging to main"

This reverts commit 58383b1.

1 parent f1c923f · commit a510c1a

7 files changed: 72 additions & 162 deletions

src/scaleflow/data/_data.py

Lines changed: 2 additions & 4 deletions
@@ -132,7 +132,6 @@ class GroupedDistributionAnnotation:
     tgt_dist_keys: list[str]
     src_dist_keys: list[str]
     dist_flag_key: str
-    condition_structure: dict[str, tuple[int, int]] | None = None  # Maps covariate name to (start, end) indices in flat array

     @classmethod
     def read_zarr(
@@ -294,9 +293,8 @@ def split_by_dist_df(self, dist_df: pd.DataFrame, column: str) -> dict[str, Grou
             .to_dict()
         )
         src_data = {int(k): self.data.src_data[k] for k in src_tgt_dist_map.keys()}
-        tgt_indices = {int(j) for tgt_list in src_tgt_dist_map.values() for j in tgt_list}
-        tgt_data = {int(k): self.data.tgt_data[k] for k in tgt_indices}
-        conditions = {int(k): self.data.conditions[k] for k in tgt_indices}
+        tgt_data = {int(k): self.data.tgt_data[k] for k in src_tgt_dist_map.keys()}
+        conditions = {int(k): self.data.conditions[k] for k in src_tgt_dist_map.keys()}
         split_data[value] = GroupedDistributionData(
             src_to_tgt_dist_map=src_tgt_dist_map,
             src_data=src_data,
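
For reference, a minimal sketch (toy dicts, not the scaleflow API) contrasting the two keying strategies in split_by_dist_df: the reverted code keyed tgt_data and conditions by the union of mapped target indices, while the restored code keys them by the source indices directly.

src_to_tgt_dist_map = {0: [0, 1], 1: [2]}  # source idx -> list of target idxs
tgt_data = {0: "t0", 1: "t1", 2: "t2"}

# Pre-revert: collect every target index reachable from the kept sources.
tgt_indices = {int(j) for tgt_list in src_to_tgt_dist_map.values() for j in tgt_list}
by_target = {int(k): tgt_data[k] for k in tgt_indices}  # keys 0, 1, 2

# Post-revert: reuse the source keys as-is.
by_source = {int(k): tgt_data[k] for k in src_to_tgt_dist_map.keys()}  # keys 0, 1

print(by_target, by_source)
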

src/scaleflow/data/_dataloader.py

Lines changed: 38 additions & 65 deletions
@@ -66,34 +66,32 @@ def __init__(

         if pool_fraction is None and replacement_prob is None:
             self._cache_all = True
-            self._pool_fraction = None
-            self._pool_size = None
-            self._replacement_prob = None
         else:
             if pool_fraction is None:
                 raise ValueError("pool_fraction must be provided if replacement_prob is provided.")
             if replacement_prob is None:
                 raise ValueError("replacement_prob must be provided if pool_fraction is provided.")
-            if not (0 < pool_fraction <= 1):
-                raise ValueError("pool_fraction must be in (0, 1].")
-            self._pool_fraction = pool_fraction
-            self._pool_size = math.ceil(pool_fraction * self.n_source_dists)
-            self._replacement_prob = replacement_prob
-            if pool_fraction == 1.0:
-                self._cache_all = True
-
-        self._pool_usage_count = {}
+            # Compute pool size from fraction
+            if not (0 < pool_fraction <= 1):
+                raise ValueError("pool_fraction must be in (0, 1].")
+            self._pool_fraction = pool_fraction
+            self._pool_size = math.ceil(pool_fraction * self.n_source_dists)
+            self._replacement_prob = replacement_prob
+        self._pool_usage_count = np.zeros(self.n_source_dists, dtype=int)
         self._initialized = False
         self._src_idx_pool = None

+        if pool_fraction == 1.0:
+            self._cache_all = True
+
         self._lock = nullcontext() if self._cache_all else threading.RLock()
         self._executor = None
         self._pending_replacements = {}
         if not self._cache_all:
             self._executor = ThreadPoolExecutor(max_workers=2)  # TODO: avoid magic numbers
             self._pending_replacements: dict[int, dict[str, Any]] = {}

-
+

     def init_sampler(self, rng) -> None:
         if self._initialized:
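
A minimal standalone sketch of the pool-sizing logic restored in __init__ (n_source_dists stands in for the loader's attribute; pool_size as a free function is hypothetical):

import math

def pool_size(pool_fraction: float | None, replacement_prob: float | None,
              n_source_dists: int) -> int | None:
    """Return the cache-pool size, or None when everything is cached."""
    if pool_fraction is None and replacement_prob is None:
        return None  # cache_all: no pool needed
    if pool_fraction is None or replacement_prob is None:
        raise ValueError("pool_fraction and replacement_prob must be provided together.")
    if not (0 < pool_fraction <= 1):
        raise ValueError("pool_fraction must be in (0, 1].")
    return math.ceil(pool_fraction * n_source_dists)

print(pool_size(0.25, 0.1, 10))  # ceil(2.5) -> 3
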
@@ -104,11 +102,10 @@ def init_sampler(self, rng) -> None:
         return None

     def _init_src_idx_pool(self, rng) -> None:
-        src_indices = np.array(list(self._data.data.src_data.keys()))
         if self._cache_all:
-            self._src_idx_pool = src_indices
+            self._src_idx_pool = np.arange(self.n_source_dists)
         else:
-            self._src_idx_pool = rng.choice(src_indices, size=self._pool_size, replace=False)
+            self._src_idx_pool = rng.choice(self.n_source_dists, size=self._pool_size, replace=False)
         return None

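The sampling change above relies on NumPy's Generator.choice accepting an int, in which case it draws from np.arange(n); a quick sketch (toy sizes):

import numpy as np

rng = np.random.default_rng(0)
n_source_dists, pool_size = 10, 4
# Dense 0..n-1 source indices no longer need to be materialized from data keys.
pool = rng.choice(n_source_dists, size=pool_size, replace=False)
print(pool)  # a random size-4 subset of 0..9
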
@@ -126,32 +123,24 @@ def sample(self, rng) -> dict[str, Any]:
         """
         source_dist_idx = self._sample_source_dist_idx(rng)
         target_dist_idx = self._sample_target_dist_idx(rng, source_dist_idx)
+        print(f"sampled source dist idx: {source_dist_idx} and target dist idx: {target_dist_idx}")
         source_batch = self._sample_source_cells(rng, source_dist_idx)
+        print(f"sampled source batch: {source_batch.shape}")
         target_batch = self._sample_target_cells(rng, source_dist_idx, target_dist_idx)
-
-        flat_condition = self._data.data.conditions[target_dist_idx]
-
-        if hasattr(self._data, 'annotation') and self._data.annotation.condition_structure:
-            condition = {}
-            max_combination_length = getattr(self._data, 'max_combination_length', 1)
-            for cov_name, (start, end) in self._data.annotation.condition_structure.items():
-                condition[cov_name] = flat_condition[start:end].reshape(1, max_combination_length, -1)
-        else:
-            condition = flat_condition
-
+        print(f"sampled target batch: {target_batch.shape}")
         res = {
             "src_cell_data": source_batch,
-            "tgt_cell_data": target_batch,
-            "condition": condition
+            "tgt_cell_data": target_batch
         }
+        res["condition"] = self._data.data.conditions[target_dist_idx]
         return res


     def _load_targets_parallel(self, tgt_indices):
         """Load multiple target distributions in parallel."""
         def _load_tgt(j: int):
             return j, self._data.data.tgt_data[j][...]
-
+
         max_workers = min(32, (os.cpu_count() or 4))  # TODO: avoid magic numbers
         with ThreadPoolExecutor(max_workers=max_workers) as ex:
             results = list(ex.map(_load_tgt, tgt_indices))
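
A minimal sketch of the condition_structure slicing that this hunk deletes: a flat condition vector is cut into named covariate blocks via (start, end) index pairs. Values and covariate names here are toy placeholders; shapes follow the deleted reshape(1, max_combination_length, -1).

import numpy as np

condition_structure = {"drug": (0, 4), "dosage": (4, 5)}  # hypothetical covariates
max_combination_length = 1
flat_condition = np.arange(5.0)

condition = {
    cov: flat_condition[start:end].reshape(1, max_combination_length, -1)
    for cov, (start, end) in condition_structure.items()
}
print({k: v.shape for k, v in condition.items()})  # {'drug': (1, 1, 4), 'dosage': (1, 1, 1)}
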
@@ -160,12 +149,12 @@ def _load_tgt(j: int):
     def _init_cache_pool_elements(self) -> None:
         with self._lock:
             self._cached_srcs = {i: self._data.data.src_data[i][...] for i in self._src_idx_pool}
-
+
         tgt_indices = sorted({int(j) for i in self._src_idx_pool for j in self._data.data.src_to_tgt_dist_map[i]})
-
+
         with self._lock:
             self._cached_tgts = self._load_targets_parallel(tgt_indices)
-
+
         return None

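For context, the _load_targets_parallel pattern used above maps a loader over target indices on a thread pool and rebuilds the index-to-array dict; a runnable sketch with an in-memory stand-in for the zarr-backed tgt_data:

from concurrent.futures import ThreadPoolExecutor
import os

tgt_store = {0: [0.0], 1: [1.0], 2: [2.0]}  # hypothetical stand-in for tgt_data

def _load_tgt(j: int):
    return j, tgt_store[j]  # the real code slices with [...] to pull into memory

max_workers = min(32, (os.cpu_count() or 4))
with ThreadPoolExecutor(max_workers=max_workers) as ex:
    results = dict(ex.map(_load_tgt, [0, 1, 2]))
print(results)
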
@@ -189,7 +178,7 @@ def _sample_source_dist_idx(self, rng) -> int:

     def _sample_source_dist_idx_in_memory(self, rng) -> int:
         source_idx = rng.choice(sorted(self._cached_srcs.keys()))
-        self._pool_usage_count[source_idx] = self._pool_usage_count.get(source_idx, 0) + 1
+        self._pool_usage_count[source_idx] += 1
         return source_idx

     def _sample_source_dist_idx_in_pool(self, rng) -> int:
@@ -199,70 +188,54 @@ def _sample_source_dist_idx_in_pool(self, rng) -> int:
         source_idx = rng.choice(sorted(self._cached_srcs.keys()))

         # Increment usage count for monitoring
-        self._pool_usage_count[source_idx] = self._pool_usage_count.get(source_idx, 0) + 1
+        self._pool_usage_count[source_idx] += 1

         # Gradually replace elements based on replacement probability (schedule only)
         if rng.random() < self._replacement_prob:
             self._schedule_replacement(rng)
-
+
         return source_idx

     def _schedule_replacement(self, rng):
         if self._cache_all:
             return  # No replacement if everything is cached
-
-        # Get usage counts for indices in the pool
-        pool_indices = self._src_idx_pool.tolist()
-        usage_counts = np.array([self._pool_usage_count.get(idx, 0) for idx in pool_indices])
-
-        if len(usage_counts) == 0:
-            return
-
-        max_usage = usage_counts.max()
-        most_used_weight = (usage_counts == max_usage).astype(float)
+        # weights same as previous logic
+        most_used_weight = (self._pool_usage_count == self._pool_usage_count.max()).astype(float)
         if most_used_weight.sum() == 0:
             return
         most_used_weight /= most_used_weight.sum()
-        replaced_pool_slot = rng.choice(len(pool_indices), p=most_used_weight)
-        replaced_pool_idx = pool_indices[replaced_pool_slot]
+        replaced_pool_idx = rng.choice(self.n_source_dists, p=most_used_weight)

         with self._lock:
-            # If there's already a pending replacement for this pool slot, skip
-            if replaced_pool_slot in self._pending_replacements:
+            pool_set = set(self._src_idx_pool.tolist())
+            if replaced_pool_idx not in pool_set:
                 return
+            in_pool_idx = int(np.where(self._src_idx_pool == replaced_pool_idx)[0][0])

-            # Find all available source indices (not currently in pool)
-            all_src_indices = list(self._data.data.src_data.keys())
-            pool_set = set(pool_indices)
-            available_indices = [idx for idx in all_src_indices if idx not in pool_set]
-
-            if not available_indices:
+            # If there's already a pending replacement for this pool slot, skip
+            if in_pool_idx in self._pending_replacements:
                 return

-            # Get usage counts for available indices
-            available_usage = np.array([self._pool_usage_count.get(idx, 0) for idx in available_indices])
-            min_usage = available_usage.min()
-            least_used_weight = (available_usage == min_usage).astype(float)
+            least_used_weight = (self._pool_usage_count == self._pool_usage_count.min()).astype(float)
             if least_used_weight.sum() == 0:
                 return
             least_used_weight /= least_used_weight.sum()
-            new_idx_position = rng.choice(len(available_indices), p=least_used_weight)
-            new_pool_idx = available_indices[new_idx_position]
+            new_pool_idx = int(rng.choice(self.n_source_dists, p=least_used_weight))

             # Kick off background load for new indices
             fut: Future = self._executor.submit(self._load_new_cache, new_pool_idx)
-            self._pending_replacements[replaced_pool_slot] = {
+            self._pending_replacements[in_pool_idx] = {
                 "old": replaced_pool_idx,
                 "new": new_pool_idx,
                 "future": fut,
             }
-            print(f"scheduled replacement of {replaced_pool_idx} with {new_pool_idx} (slot {replaced_pool_slot})")
+            print(f"scheduled replacement of {replaced_pool_idx} with {new_pool_idx} (slot {in_pool_idx})")

     def _load_targets_parallel(self, tgt_indices):
         """Load multiple target distributions in parallel."""
         def _load_tgt(j: int):
             return j, self._data.data.tgt_data[j][...]
-
+
         max_workers = min(32, (os.cpu_count() or 4))  # TODO: avoid magic numbers
         with ThreadPoolExecutor(max_workers=max_workers) as ex:
             results = list(ex.map(_load_tgt, tgt_indices))
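
A minimal sketch of the restored replacement weighting in _schedule_replacement: usage counts live in a dense array indexed by source distribution, the most-used index becomes the eviction candidate and the least-used the load candidate (the restored code additionally bails out if the eviction candidate is not currently in the pool). Counts here are toy values.

import numpy as np

rng = np.random.default_rng(0)
pool_usage_count = np.array([5, 0, 3, 5, 1])

most_used = (pool_usage_count == pool_usage_count.max()).astype(float)
most_used /= most_used.sum()
evict = rng.choice(len(pool_usage_count), p=most_used)  # index 0 or 3

least_used = (pool_usage_count == pool_usage_count.min()).astype(float)
least_used /= least_used.sum()
load = rng.choice(len(pool_usage_count), p=least_used)  # index 1
print(evict, load)
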

src/scaleflow/data/_datamanager.py

Lines changed: 1 addition & 25 deletions
@@ -114,31 +114,8 @@ def prepare_data(
         tgt_dist_labels = dict(zip(tgt_dist_labels.index, tgt_dist_labels.itertuples(index=False, name=None), strict=True))


-        # prepare conditions and structure metadata
+        # prepare conditions
         col_to_repr = {key: adata.uns[self.rep_keys[key]] for key in self.rep_keys.keys()}
-
-        # Compute condition_structure from first available label
-        condition_structure = {}
-        offset = 0
-        first_src_label = next(iter(src_dist_labels.values()))
-        first_tgt_label = next(iter(tgt_dist_labels.values()))
-
-        for col, label in zip(self.src_dist_keys, first_src_label, strict=True):
-            if col in col_to_repr:
-                dim = len(col_to_repr[col][label])
-                condition_structure[col] = (offset, offset + dim)
-                offset += dim
-
-        for col, label in zip(self.tgt_dist_keys, first_tgt_label, strict=True):
-            if col in col_to_repr:
-                dim = len(col_to_repr[col][label])
-                condition_structure[col] = (offset, offset + dim)
-                offset += dim
-            elif isinstance(label, (int, float)):
-                # Scalar value (like dosage)
-                condition_structure[col] = (offset, offset + 1)
-                offset += 1
-
         with timer("Getting conditions", verbose=verbose):
             conditions = {}
             for src_dist_idx, tgt_dist_idxs in src_to_tgt_dist_map.items():
@@ -183,7 +160,6 @@ def prepare_data(
                 src_dist_idx_to_labels=src_dist_labels,
                 tgt_dist_idx_to_labels=tgt_dist_labels,
                 default_values=default_values,
-                condition_structure=condition_structure,
             ),
         )
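
The offset bookkeeping deleted above is the producer side of the condition_structure consumed in the dataloader: each covariate claims a (start, end) slice of the flat condition vector. A minimal sketch with hypothetical covariate dimensions:

col_to_dim = {"drug": 4, "cell_line": 8, "dosage": 1}  # hypothetical dims

condition_structure, offset = {}, 0
for col, dim in col_to_dim.items():
    condition_structure[col] = (offset, offset + dim)
    offset += dim
print(condition_structure)  # {'drug': (0, 4), 'cell_line': (4, 12), 'dosage': (12, 13)}
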
