Asyncable functions (#258)

JoshLoecker · web-flow · commit dfaee415b895 · 2026-03-16T14:01:05.000-05:00
diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
@@ -32,7 +32,7 @@
     Solver,
     _BoundaryReactions,
 )
-from como.utils import set_up_logging, split_gene_expression_data
+from como.utils import asyncable, set_up_logging, split_gene_expression_data
 
 
 def _reaction_indices_to_ids(
@@ -1014,3 +1014,6 @@ def create_context_specific_model(  # noqa: C901
         f"metabolites={len(context_model.metabolites)}"
     )
     return context_model
+
+
+async_create_context_specific_model = asyncable(create_context_specific_model)
diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py
@@ -22,7 +22,7 @@
     _SourceWeights,
 )
 from como.project import Config
-from como.utils import get_missing_gene_data, read_file, return_placeholder_data, set_up_logging
+from como.utils import asyncable, get_missing_gene_data, read_file, return_placeholder_data, set_up_logging
 
 
 class _MergedHeaderNames:
@@ -616,3 +616,6 @@ def merge_xomics(  # noqa: C901
         output_final_model_scores_filepath=output_final_model_scores_filepath,
         output_figure_dirpath=output_figure_dirpath,
     )
+
+
+async_merge_xomics = asyncable(merge_xomics)
diff --git a/main/como/proteomics_gen.py b/main/como/proteomics_gen.py
@@ -3,7 +3,7 @@
 import itertools
 import sys
 from pathlib import Path
-from typing import TextIO, cast
+from typing import TextIO
 
 import numpy as np
 import pandas as pd
@@ -13,7 +13,7 @@
 from como.data_types import LogLevel
 from como.project import Config
 from como.proteomics_preprocessing import protein_transform_main
-from como.utils import return_placeholder_data, set_up_logging
+from como.utils import asyncable, return_placeholder_data, set_up_logging
 
 
 # Load Proteomics
@@ -43,7 +43,7 @@ def process_proteomics_data(path: Path) -> pd.DataFrame:
 
 
 # read map to convert to entrez
-async def load_gene_symbol_map(gene_symbols: list[str], entrez_map: Path | None = None):
+def load_gene_symbol_map(gene_symbols: list[str], entrez_map: Path | None = None):
     """Load a mapping from gene symbols to Entrez IDs.
 
     Args:
@@ -188,8 +188,9 @@ def load_empty_dict():
         )
         return load_empty_dict()
 
+
 # TODO: Convert to synchronous function
-async def proteomics_gen(
+def proteomics_gen(
     context_name: str,
     config_filepath: Path,
     matrix_filepath: Path,
@@ -230,9 +231,9 @@ async def proteomics_gen(
     for group in groups:
         indices = np.where([g == group for g in config_df["group"]])
         sample_columns = [*np.take(config_df["sample_name"].to_numpy(), indices).ravel().tolist(), "gene_symbol"]
-        matrix = cast(pd.DataFrame, matrix.loc[:, sample_columns])
-
-        symbols_to_gene_ids = await load_gene_symbol_map(
+        matrix = matrix.loc[:, sample_columns]
+        
+        symbols_to_gene_ids = load_gene_symbol_map(
             gene_symbols=matrix["gene_symbol"].tolist(),
             entrez_map=input_entrez_map,
         )
@@ -264,3 +265,6 @@ async def proteomics_gen(
         hi_group_ratio=high_confidence_batch_ratio,
         group_names=groups,
     )
+
+
+async_proteomics_gen = asyncable(proteomics_gen)
diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py
@@ -25,7 +25,7 @@
 from como.migrations import gene_info_migrations
 from como.pipelines.identifier import contains_identical_gene_types, determine_gene_type
 from como.project import Config
-from como.utils import read_file, set_up_logging
+from como.utils import asyncable, read_file, set_up_logging
 
 
 class _FilteringOptions(NamedTuple):
@@ -185,6 +185,9 @@ def _build_matrix_results(
     elif isinstance(matrix, sc.AnnData):
         if not isinstance(matrix.var, pd.DataFrame):
             raise TypeError(f"Expected matrix.var object to be 'pd.DataFrame', got '{type(matrix.var)}'")
+        
+        if matrix.raw is not None:
+            matrix = matrix.raw.to_adata()
 
         gene_info = gene_info.sort_values(["entrez_gene_id", "size"], ascending=[True, True]).drop_duplicates(
             subset=["entrez_gene_id"], keep="first"
@@ -538,8 +541,6 @@ def umi_filter(
         adata: sc.AnnData = metric.count_matrix.copy()
 
         if perform_normalization:
-            if adata.raw is not None:
-                adata.X = adata.raw.X.copy()
             sc.pp.filter_cells(adata, min_genes=10)
             sc.pp.filter_genes(adata, min_cells=1)
             sc.pp.normalize_total(adata, target_sum=target_sum)
@@ -549,8 +550,8 @@ def umi_filter(
 
         adata_x = adata.X
         n_cells, n_genes = adata.shape
-
-        min_samples: float = round(min_sample_expression * n_cells)
+        
+        min_samples = round(min_sample_expression * n_cells)
         min_func = k_over_a(min_samples, cut_off)
         min_genes_mask = np.zeros(n_genes, dtype=bool)
         for j in range(n_genes):
@@ -709,7 +710,7 @@ def _process(
 
         merged_zscores = merged_zscores.reindex(columns=sorted(merged_zscores.columns))
         merged_zscores = merged_zscores.groupby(by=merged_zscores.index.name).mean()
-        merged_zscores.to_csv(output_zscore_normalization_filepath, index=True)
+        merged_zscores.to_csv(output_zscore_normalization_filepath.with_suffix(".csv"), index=True)
     elif isinstance(rnaseq_matrix, sc.AnnData):
         merged_zscores = ad.concat([m.z_score_matrix for m in metrics.values()], axis="obs")
         merged_zscores.var.index.name = "entrez_gene_id"
@@ -905,55 +906,4 @@ def rnaseq_gen(  # noqa: C901
     )
 
 
-if __name__ == "__main__":
-    import matplotlib.pyplot as plt
-
-    data = pd.read_csv("/Users/joshl/Downloads/fpkm_example_data/CD8.genes.results.txt", sep="\t")
-    data["gene_id"] = data["gene_id"].str.partition(".")[0]
-    counts = (
-        data[["gene_id", "expected_count"]]
-        .copy()
-        .set_index("gene_id")
-        .sort_index()
-        .rename(columns={"expected_count": "actual"})
-    )
-    eff_len = (
-        data[["gene_id", "effective_length"]]
-        .copy()
-        .set_index("gene_id")
-        .sort_index()
-        .rename(columns={"effective_length": "actual"})
-    )
-    expected_fpkm = (
-        data[["gene_id", "FPKM"]].copy().set_index("gene_id").sort_index().rename(columns={"FPKM": "expected"})
-    )
-
-    metrics = {
-        "S1": _StudyMetrics(
-            study="S1",
-            num_samples=1,
-            count_matrix=counts,
-            eff_length=eff_len,
-            sample_names=[""],
-            layout=[LayoutMethod.paired_end],
-            entrez_gene_ids=np.ndarray([0]),
-            gene_sizes=np.ndarray([0]),
-        )
-    }
-    calculated_fpkm = _calculate_fpkm(metrics)["S1"].normalization_matrix
-    calculated_fpkm = calculated_fpkm.round(2)
-
-    joined = calculated_fpkm.join(expected_fpkm, how="inner")
-    joined["actual"] = joined["actual"].replace([np.nan, np.inf], 0)
-
-    zfpkm_df, _ = zFPKM(joined, remove_na=True)
-    zfpkm_df = zfpkm_df.replace(-np.inf, np.nan)
-
-    fig, axes = cast(tuple[plt.Figure, list[plt.Axes]], plt.subplots(nrows=2, ncols=1))
-    axes[0].hist(zfpkm_df["actual"].to_numpy())
-    axes[0].set_title("Expected zFPKM")
-
-    axes[1].hist(zfpkm_df["expected"].to_numpy())
-    axes[1].set_title("Actual zFPKM")
-    fig.tight_layout()
-    fig.show()
+async_rnaseq_gen = asyncable(rnaseq_gen)
diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py
@@ -15,7 +15,7 @@
 
 from como.data_types import LogLevel, RNAType
 from como.pipelines.identifier import build_gene_info, get_remaining_identifiers
-from como.utils import read_file, set_up_logging
+from como.utils import asyncable, read_file, set_up_logging
 
 
 @dataclass
@@ -739,3 +739,6 @@ def rnaseq_preprocess(  # noqa: C901
         cache=cache,
         create_gene_info_only=create_gene_info_only,
     )
+
+
+async_rnaseq_preprocess = asyncable(rnaseq_preprocess)
diff --git a/main/como/utils.py b/main/como/utils.py
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
+import asyncio
 import contextlib
+import functools
 import io
 import sys
-from collections.abc import Iterator, Sequence
+from collections.abc import Awaitable, Callable, Iterator, Sequence
 from pathlib import Path
-from typing import Any, Literal, NoReturn, TextIO, TypeVar, overload
+from typing import Any, Literal, ParamSpec, TextIO, TypeVar, overload
 
 import numpy.typing as npt
 import pandas as pd
@@ -15,8 +17,11 @@
 from como.data_types import LOG_FORMAT, Algorithm, LogLevel
 from como.pipelines.identifier import get_remaining_identifiers
 
+P = ParamSpec("P")
 T = TypeVar("T")
+
 __all__ = [
+    "asyncable",
     "get_missing_gene_data",
     "num_columns",
     "num_rows",
@@ -309,3 +314,23 @@ def set_up_logging(
     with contextlib.suppress(ValueError):
         logger.remove(0)
         logger.add(sink=location, level=level.value, format=formatting)
+
+
+def asyncable(func: Callable[P, T]) -> Callable[P, Awaitable[T]]:
+    """Converts a synchronous function to asynchronous.
+
+    This wrapper functions by running the synchronous function in a separate thread using asyncio's run_in_executor
+    This allows the synchronous function to be called in an asynchronous context without blocking the event loop.
+
+    :param func: The synchronous function to convert.
+    :return: An asynchronous version of the input function that runs in a separate thread.
+    """
+    
+    @functools.wraps(func)
+    async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
+        loop = asyncio.get_running_loop()
+        call = functools.partial(func, *args, **kwargs)
+        return await loop.run_in_executor(None, call)
+        # return await loop.run_in_executor(None, lambda: func(*args, **kwargs))
+    
+    return wrapper

Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,7 @@`
`32`	`32`	`Solver,`
`33`	`33`	`_BoundaryReactions,`
`34`	`34`	`)`
`35`		`-from como.utils import set_up_logging, split_gene_expression_data`
	`35`	`+from como.utils import asyncable, set_up_logging, split_gene_expression_data`
`36`	`36`
`37`	`37`
`38`	`38`	`def _reaction_indices_to_ids(`
`@@ -1014,3 +1014,6 @@ def create_context_specific_model( # noqa: C901`
`1014`	`1014`	`f"metabolites={len(context_model.metabolites)}"`
`1015`	`1015`	`)`
`1016`	`1016`	`return context_model`
	`1017`	`+`
	`1018`	`+`
	`1019`	`+async_create_context_specific_model = asyncable(create_context_specific_model)`
Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@`
`22`	`22`	`_SourceWeights,`
`23`	`23`	`)`
`24`	`24`	`from como.project import Config`
`25`		`-from como.utils import get_missing_gene_data, read_file, return_placeholder_data, set_up_logging`
	`25`	`+from como.utils import asyncable, get_missing_gene_data, read_file, return_placeholder_data, set_up_logging`
`26`	`26`
`27`	`27`
`28`	`28`	`class _MergedHeaderNames:`
`@@ -616,3 +616,6 @@ def merge_xomics( # noqa: C901`
`616`	`616`	`output_final_model_scores_filepath=output_final_model_scores_filepath,`
`617`	`617`	`output_figure_dirpath=output_figure_dirpath,`
`618`	`618`	`)`
	`619`	`+`
	`620`	`+`
	`621`	`+async_merge_xomics = asyncable(merge_xomics)`