From c97e6e796ba27f3bfc71b150f984cedef711f077 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 16 Mar 2026 15:52:53 -0500 Subject: [PATCH 1/8] fix: allow `None` type for figure dirpath Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index f08430fd..88ae6a69 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -28,7 +28,7 @@ def _combine_z_distribution_for_batch( matrix: pd.DataFrame, source: SourceTypes, output_combined_matrix_filepath: Path, - output_figure_dirpath: Path, + output_figure_dirpath: Path | None, ) -> pd.DataFrame: """Combine z-score distributions across samples for a single batch. @@ -42,7 +42,9 @@ def _combine_z_distribution_for_batch( :returns: A pandas dataframe of the weighted z-distributions """ output_combined_matrix_filepath.parent.mkdir(parents=True, exist_ok=True) - output_figure_dirpath.mkdir(parents=True, exist_ok=True) + + if output_figure_dirpath is not None: + output_figure_dirpath.mkdir(parents=True, exist_ok=True) logger.trace( f"Combining z-score distributions: batch #{batch.batch_num}, " @@ -107,7 +109,7 @@ def _combine_z_distribution_for_source( context_name: str, replicates_per_batch: Sequence[int], output_combined_matrix_filepath: Path, - output_figure_filepath: Path, + output_figure_filepath: Path | None, weighted_z_floor: int = -6, weighted_z_ceiling: int = 6, ) -> pd.DataFrame: @@ -135,7 +137,9 @@ def _combine_z_distribution_for_source( return out_df output_combined_matrix_filepath.parent.mkdir(parents=True, exist_ok=True) - output_figure_filepath.parent.mkdir(parents=True, exist_ok=True) + + if output_figure_filepath is not None: + output_figure_filepath.parent.mkdir(parents=True, exist_ok=True) # alidate alignment between columns and replicate vector batch_column_names: list[str] = list(merged_source_data.columns) @@ -190,7 +194,7 @@ def _combine_z_distribution_for_source( def _combine_z_distribution_for_context( context: str, zscore_results: list[_CombineOmicsInput], - output_graph_filepath: Path, + output_graph_filepath: Path | None, ) -> pd.DataFrame: if not zscore_results: logger.warning("No zscore results exist, returning empty dataframe") @@ -275,14 +279,12 @@ def _begin_combining_distributions( batch_names: _BatchNames, source_weights: _SourceWeights, output_filepaths: _OutputCombinedSourceFilepath, - output_figure_dirpath: Path, + output_figure_dirpath: Path | None, output_final_model_scores: Path, weighted_z_floor: int = -6, weighted_z_ceiling: int = 6, ): logger.info(f"Starting to combine z-scores for context '{context_name}'") - output_figure_dirpath.mkdir(parents=True, exist_ok=True) - z_score_results: list[_CombineOmicsInput] = [] matrix: pd.DataFrame | None @@ -372,7 +374,9 @@ def _begin_combining_distributions( / f"{context_name}_{source.value}_combined_zscore_distribution.csv" ), output_figure_filepath=( - output_figure_dirpath / f"{context_name}_{source.value}_combined_zscore_distribution.pdf" + (output_figure_dirpath / f"{context_name}_{source.value}_combined_zscore_distribution.pdf") + if output_figure_dirpath is not None + else None ), weighted_z_floor=weighted_z_floor, weighted_z_ceiling=weighted_z_ceiling, @@ -394,7 +398,11 @@ def _begin_combining_distributions( merged_context_results = _combine_z_distribution_for_context( context=context_name, zscore_results=z_score_results, - output_graph_filepath=output_figure_dirpath / f"{context_name}_combined_omics_distribution.pdf", + output_graph_filepath=( + (output_figure_dirpath / f"{context_name}_combined_omics_distribution.pdf") + if output_figure_dirpath is not None + else None + ), ) merged_context_results.to_csv(output_final_model_scores, index=len(merged_context_results.columns) <= 1) logger.success(f"Wrote combined z-scores for context '{context_name}' to {output_final_model_scores}") From 06251eb61f8fb8dc2c823991545268e49c9d6491 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 16 Mar 2026 15:53:04 -0500 Subject: [PATCH 2/8] fix: allow `None` for output matrix filepaths Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 91 ++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 43 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index cf7d0842..93141b6a 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -307,9 +307,9 @@ def _write_matrices( fragment_lengths: pd.DataFrame, tpm_df: pd.DataFrame, como_context_dir: Path, - out_counts_filepath: Path, - out_frag_length_filepath: Path, - out_tpm_filepath: Path, + out_counts_filepath: Path | None, + out_frag_length_filepath: Path | None, + out_tpm_filepath: Path | None, rna: RNAType, ) -> pd.DataFrame: """Create a counts matrix file by reading gene counts table(s). @@ -326,25 +326,30 @@ def _write_matrices( :return: A pandas DataFrame representing the final counts matrix. """ study_metrics = _organize_gene_counts_files(data_dir=como_context_dir) - counts: list[pd.DataFrame] = [_create_sample_counts_matrix(metric) for metric in study_metrics] rna_specific_sample_names = sorted( set(config_df.loc[config_df["library_prep"].str.lower() == rna.value.lower(), "sample_name"].tolist()) ) - - final_matrix: pd.DataFrame = functools.reduce( - lambda left, right: pd.merge(left, right, on="ensembl_gene_id", how="outer"), counts - ) - final_matrix.fillna(value=0, inplace=True) - final_matrix = final_matrix[["ensembl_gene_id", *rna_specific_sample_names]] - final_matrix = final_matrix.reindex(columns=["ensembl_gene_id", *rna_specific_sample_names]) - fragment_lengths = fragment_lengths.reindex(columns=rna_specific_sample_names).sort_index() - tpm_df = tpm_df.reindex(columns=rna_specific_sample_names).sort_index() - - out_counts_filepath.parent.mkdir(parents=True, exist_ok=True) - out_frag_length_filepath.parent.mkdir(parents=True, exist_ok=True) - final_matrix.to_csv(out_counts_filepath, index=False, float_format="%.15g") - fragment_lengths[rna_specific_sample_names].to_csv(out_frag_length_filepath, index=True, float_format="%.15g") - tpm_df[rna_specific_sample_names].to_csv(out_tpm_filepath, index=True, float_format="%.15g") + + if out_counts_filepath: + out_counts_filepath.parent.mkdir(parents=True, exist_ok=True) + counts: list[pd.DataFrame] = [_create_sample_counts_matrix(metric) for metric in study_metrics] + final_matrix: pd.DataFrame = functools.reduce( + lambda left, right: pd.merge(left, right, on="ensembl_gene_id", how="outer"), counts + ) + final_matrix.fillna(value=0, inplace=True) + final_matrix = final_matrix[["ensembl_gene_id", *rna_specific_sample_names]] + final_matrix = final_matrix.reindex(columns=["ensembl_gene_id", *rna_specific_sample_names]) + final_matrix.to_csv(out_counts_filepath, index=False, float_format="%.15g") + + if out_frag_length_filepath: + out_frag_length_filepath.parent.mkdir(parents=True, exist_ok=True) + fragment_lengths = fragment_lengths.reindex(columns=rna_specific_sample_names).sort_index() + fragment_lengths[rna_specific_sample_names].to_csv(out_frag_length_filepath, index=True, float_format="%.15g") + + if out_tpm_filepath: + out_tpm_filepath.parent.mkdir(parents=True, exist_ok=True) + tpm_df = tpm_df.reindex(columns=rna_specific_sample_names).sort_index() + tpm_df[rna_specific_sample_names].to_csv(out_tpm_filepath, index=True, float_format="%.15g") logger.success(f"Wrote gene count matrix for '{rna.value}' RNA at '{out_counts_filepath}'") @@ -499,33 +504,33 @@ def _process_como_input( out_trna_tpm: Path | None, ) -> None: config_df, fragment_lengths, tpm_df = _create_config_df(context_name, como_context_dir=como_context_dir) - - if out_trna_config and out_trna_counts and out_trna_fragments and out_trna_tpm: - _write_matrices( - config_df=config_df, - fragment_lengths=fragment_lengths, - tpm_df=tpm_df, - como_context_dir=como_context_dir, - out_counts_filepath=out_trna_counts, - out_frag_length_filepath=out_trna_fragments, - out_tpm_filepath=out_trna_tpm, - rna=RNAType.TRNA, - ) + + _write_matrices( + config_df=config_df, + fragment_lengths=fragment_lengths, + tpm_df=tpm_df, + como_context_dir=como_context_dir, + out_counts_filepath=out_trna_counts, + out_frag_length_filepath=out_trna_fragments, + out_tpm_filepath=out_trna_tpm, + rna=RNAType.TRNA, + ) + if out_trna_config: with pd.ExcelWriter(out_trna_config) as writer: subset_config = config_df[config_df["library_prep"].str.lower() == RNAType.TRNA.value.lower()] subset_config.to_excel(writer, sheet_name=context_name, header=True, index=False) - - if out_mrna_config and out_mrna_counts and out_mrna_fragments and out_mrna_tpm: - _write_matrices( - config_df=config_df, - fragment_lengths=fragment_lengths, - tpm_df=tpm_df, - como_context_dir=como_context_dir, - out_counts_filepath=out_mrna_counts, - out_frag_length_filepath=out_mrna_fragments, - out_tpm_filepath=out_mrna_tpm, - rna=RNAType.MRNA, - ) + + _write_matrices( + config_df=config_df, + fragment_lengths=fragment_lengths, + tpm_df=tpm_df, + como_context_dir=como_context_dir, + out_counts_filepath=out_mrna_counts, + out_frag_length_filepath=out_mrna_fragments, + out_tpm_filepath=out_mrna_tpm, + rna=RNAType.MRNA, + ) + if out_mrna_config: with pd.ExcelWriter(out_mrna_config) as writer: subset_config = config_df[config_df["library_prep"].str.lower() == RNAType.MRNA.value.lower()] subset_config.to_excel(writer, sheet_name=context_name, header=True, index=False) From fdba79cb18342f5330087a80b0d75f986175ac6c Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 16 Mar 2026 21:11:36 -0500 Subject: [PATCH 3/8] fix: only use gurobi-specific solver settings if gurobi is being used Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index d479bd03..1beba460 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -292,10 +292,12 @@ def _build_with_imat( cfg.verbosity = build_settings.solver_verbosity cfg.timeout = build_settings.solver_timeout cfg.tolerances.feasibility = build_settings.solver_feasibility - cfg.tolerances.optimality = build_settings.solver_optimality cfg.tolerances.integrality = build_settings.solver_integrality + if solver.upper() == "GUROBI": + cfg.tolerances.optimality = build_settings.solver_optimality + grb_model = getattr(getattr(cfg, "problem", None), "problem", None) # `getattr` to handle non-gurobi solvers - if grb_model is not None: + if grb_model is not None and solver.upper() == "GUROBI": grb_model.Params.MIPGap = build_settings.gurobi_mipgap grb_model.Params.IntegralityFocus = build_settings.gurobi_integrality_focus grb_model.Params.NumericFocus = build_settings.gurobi_numeric_focus From 8f21380b8d6d99ef67638afe73a8463be401758c Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 16 Mar 2026 21:13:28 -0500 Subject: [PATCH 4/8] refactor: import required modules Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 1beba460..d3c76074 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -7,9 +7,8 @@ from collections.abc import Sequence from io import TextIOWrapper from math import isfinite -from multiprocessing import Value from pathlib import Path -from typing import Literal, Never, TextIO, cast, reveal_type +from typing import Literal, TextIO, cast import cobamp.core.optimization import cobra @@ -17,6 +16,7 @@ import numpy as np import numpy.typing as npt import pandas as pd +import scanpy as sc from loguru import logger from troppo.methods.reconstruction.corda import CORDA, CORDAProperties from troppo.methods.reconstruction.fastcore import FASTcore, FastcoreProperties From bddf2a3e80810b52e3516d455b91a19b7c333c56 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 16 Mar 2026 21:15:03 -0500 Subject: [PATCH 5/8] feat: allow zFPKM cutoffs (e.g., -3) Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 74 +++++++++++++++++----- 1 file changed, 57 insertions(+), 17 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index d3c76074..2ba73bd7 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -412,6 +412,8 @@ def _map_expression_to_reaction( taxon: int | str, low_percentile: int, high_percentile: int, + low_bin_cutoff: int, + high_bin_cutoff: int, ) -> tuple[collections.OrderedDict[str, float], float, float, float]: """Map gene ids to a reaction based on GPR (gene to protein to reaction) association rules. @@ -424,6 +426,9 @@ def _map_expression_to_reaction( taxon: Taxon ID or Taxon object for gene ID conversion. low_percentile: Low percentile for threshold calculation high_percentile: High percentile for threshold calculation + low_bin_cutoff: Values lower than this (e.g., -3 zFPKM or 25th percentile) will be placed in the low bin. + high_bin_cutoff: Values equal to or greater than this (e.g., 0 zFPKM or 75th percentile) will be placed in the high bin. + activity_is_zfpkm_data: Does `gene_expression_file` contain zFPKM-normalized data? Returns: [0]: An ordered dictionary mapping reaction IDs to their corresponding expression values. @@ -434,7 +439,15 @@ def _map_expression_to_reaction( Raises: ValueError: If neither 'entrez_gene_id' nor 'ensembl_gene_id' columns are found in the gene expression file. """ - expression_data = pd.read_csv(gene_expression_file) + if gene_expression_file.suffix in {".csv", ".tsv"}: + expression_data = pd.read_csv(gene_expression_file) + elif gene_expression_file.suffix == ".h5ad": + adata = sc.read_h5ad(gene_expression_file) + expression_data = adata.to_df() + else: + raise ValueError( + f"Unknown file extension '{gene_expression_file.suffix}' for gene expression file: {gene_expression_file}" + ) gene_activity = split_gene_expression_data( expression_data, identifier_column="entrez_gene_id", recon_algorithm=recon_algorithm ) @@ -444,10 +457,30 @@ def _map_expression_to_reaction( min_val = float(gene_arr[np.isfinite(gene_arr)].min()) if min_val < 0: gene_arr[np.isfinite(gene_arr)] += abs(min_val) - - # We are computing percentiles on non-zero data because we need to make sure that - # zero values are definitely placed in the low bin - low_expr, high_expr = np.nanpercentile(gene_arr[gene_arr > 0], [low_percentile, high_percentile]) + + if not activity_is_zfpkm_data and low_bin_cutoff < 1: + logger.warning( + f"Low bin cutoff was less than 1, but not using zFPKM data (got: {low_bin_cutoff}). " + f"Assuming this is a percentile value, it should be in the range [1, 100], converting now." + ) + low_bin_cutoff *= 100 + if not activity_is_zfpkm_data and high_bin_cutoff < 1: + logger.warning( + f"High bin cutoff was less than 1, but not with zFPKM data (got: {high_bin_cutoff}). " + "Assuming this is a percentile value, it should be in the range [1, 100], converting now." + ) + high_bin_cutoff *= 100 + + # If the user provided zFPKM-normalized data, the low and high bin cutoffs should be kept constant + # Otherwise, we will calculate percentile expression values + if activity_is_zfpkm_data: + low_expr = low_bin_cutoff + high_expr = high_bin_cutoff + else: + # We are computing percentiles on non-zero data because we need to make sure that + # zero values are definitely placed in the low bin + low_expr, high_expr = np.nanpercentile(gene_arr[gene_arr > 0], [low_bin_cutoff, high_bin_cutoff]) + gene_activity["active"] = gene_arr gene_activity.index = gene_activity.index.astype(str) # maintain strings for reference model gene_ids @@ -546,14 +579,15 @@ def _build_model( lower_bounds: list[float], upper_bounds: list[float], solver: str, - low_percentile: int, - high_percentile: int, + low_bin_cutoff: int, + high_bin_cutoff: int, output_flux_result_filepath: Path, taxon: int | str, objective_direction: Literal["min", "max"], build_settings: ModelBuildSettings, *, force_boundary_rxn_inclusion: bool, + activity_is_zfpkm_data: bool, ) -> cobra.Model: """Seed a context specific reference_model. @@ -573,9 +607,12 @@ def _build_model( lower_bounds: List of lower bounds corresponding to boundary reactions upper_bounds: List of upper bounds corresponding to boundary reactions solver: Solver to use (e.g., 'glpk', 'cplex', 'gurobi') + low_bin_cutoff: Values lower than this (e.g., -3 zFPKM or 25th percentile) will be placed in the low bin. + high_bin_cutoff: Values equal to or greater than this (e.g., 0 zFPKM or 75th percentile) will be placed in the high bin. output_flux_result_filepath: Path to save flux results (for iMAT only) taxon: Taxon ID or Taxon object for gene ID conversion. force_boundary_rxn_inclusion: If True, ensure that all boundary reactions are included in the final model. + activity_is_zfpkm_data: Does the `gene_expression_file` contain zFPKM-normalized data? Returns: A _BuildResults object containing: @@ -621,6 +658,8 @@ def _build_model( taxon=taxon, low_percentile=low_percentile, high_percentile=high_percentile, + low_bin_cutoff=low_bin_cutoff, + high_bin_cutoff=high_bin_cutoff, ) expression_vector: npt.NDArray[np.floating] = np.asarray(list(reaction_expression.values()), dtype=float) for rxn_id in force_reactions: @@ -842,6 +881,9 @@ def create_context_specific_model( # noqa: C901 output_infeasible_reactions_filepath: Path, output_flux_result_filepath: Path, output_model_filepaths: Path | list[Path], + activity_is_zfpkm_data: bool, + low_bin_cutoff: int, + high_bin_cutoff: int, output_fastcore_expression_index_filepath: Path | None = None, objective: str = "biomass_reaction", objective_direction: Literal["min", "max"] = "max", @@ -850,8 +892,6 @@ def create_context_specific_model( # noqa: C901 exclude_rxns_filepath: str | Path | None = None, force_rxns_filepath: str | Path | None = None, algorithm: Algorithm = Algorithm.GIMME, - low_percentile: int | None = None, - high_percentile: int | None = None, solver: Solver = Solver.GLPK, log_level: LogLevel = LogLevel.INFO, log_location: str | TextIO | TextIOWrapper = sys.stderr, @@ -877,8 +917,8 @@ def create_context_specific_model( # noqa: C901 :param exclude_rxns_filepath: Optional path to reactions to exclude file (csv, tsv, or Excel). :param force_rxns_filepath: Optional path to reactions to force include file (csv, tsv, or Excel). :param algorithm: Algorithm to use for reconstruction. - :param low_percentile: Low percentile for gene expression threshold calculation. - :param high_percentile: High percentile for gene expression threshold calculation. + :param low_bin_cutoff: Values lower than this (e.g., -3 zFPKM or 25th percentile) will be placed in the low bin. + :param high_bin_cutoff: Values equal to or greater than this (e.g., 0 zFPKM or 75th percentile) will be placed in the high bin. :param solver: Solver to use. One of Solver.GLPK, Solver.CPLEX, Solver.GUROBI :param log_level: Logging level :param log_location: Location for log output. Can be a file path or sys.stderr/sys.stdout. @@ -889,10 +929,10 @@ def create_context_specific_model( # noqa: C901 :raises ImportError: If Gurobi solver is selected but gurobipy is not installed. """ # noqa: E501 set_up_logging(level=log_level, location=log_location) - if low_percentile is None: - raise ValueError("low_percentile must be provided") - if high_percentile is None: - raise ValueError("high_percentile must be provided") + if not activity_is_zfpkm_data and low_bin_cutoff < 0: + raise ValueError(f"Percentiles should be in the range [1, 100]. Got `{low_bin_cutoff=}`") + if not activity_is_zfpkm_data and high_bin_cutoff < 0: + raise ValueError(f"Percentiles should be in the range [1, 100]. Got `{high_bin_cutoff=}`") boundary_rxns_filepath = Path(boundary_rxns_filepath) if boundary_rxns_filepath else None output_model_filepaths = ( @@ -987,8 +1027,8 @@ def create_context_specific_model( # noqa: C901 lower_bounds=boundary_reactions.lower_bounds, upper_bounds=boundary_reactions.upper_bounds, solver=solver.value.lower(), - low_percentile=low_percentile, - high_percentile=high_percentile, + low_bin_cutoff=low_bin_cutoff, + high_bin_cutoff=high_bin_cutoff, output_flux_result_filepath=output_flux_result_filepath, force_boundary_rxn_inclusion=force_boundary_rxn_inclusion, taxon=taxon_id, From 6617b6234c2852f571a73a7ad4dc4558de0f7f72 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 16 Mar 2026 21:15:12 -0500 Subject: [PATCH 6/8] feat: allow zFPKM cutoffs (e.g., -3) Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 2ba73bd7..250afd67 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -410,10 +410,11 @@ def _map_expression_to_reaction( gene_expression_file: Path, recon_algorithm: Algorithm, taxon: int | str, - low_percentile: int, - high_percentile: int, low_bin_cutoff: int, high_bin_cutoff: int, + *, + force_add_base_exchange_reactions: bool, + activity_is_zfpkm_data: bool, ) -> tuple[collections.OrderedDict[str, float], float, float, float]: """Map gene ids to a reaction based on GPR (gene to protein to reaction) association rules. @@ -424,10 +425,12 @@ def _map_expression_to_reaction( gene_expression_file: Path to a gene expression file (.csv, .tsv, .xlsx, or .xls) recon_algorithm: Algorithm to use for reconstruction (GIMME, FASTCORE, iMAT, or tINIT) taxon: Taxon ID or Taxon object for gene ID conversion. - low_percentile: Low percentile for threshold calculation - high_percentile: High percentile for threshold calculation low_bin_cutoff: Values lower than this (e.g., -3 zFPKM or 25th percentile) will be placed in the low bin. high_bin_cutoff: Values equal to or greater than this (e.g., 0 zFPKM or 75th percentile) will be placed in the high bin. + force_add_base_exchange_reactions: If True, add a set of basic exchange reactions to the model + before mapping expression to reactions. + This is required for some algorithms (e.g., FASTCORE) to ensure that key exchange reactions are included + in the output model, but can be skipped if these reactions are already included in the reference model. activity_is_zfpkm_data: Does `gene_expression_file` contain zFPKM-normalized data? Returns: @@ -587,6 +590,7 @@ def _build_model( build_settings: ModelBuildSettings, *, force_boundary_rxn_inclusion: bool, + force_add_base_exchange_reactions: bool, activity_is_zfpkm_data: bool, ) -> cobra.Model: """Seed a context specific reference_model. @@ -656,10 +660,10 @@ def _build_model( gene_expression_file, recon_algorithm, taxon=taxon, - low_percentile=low_percentile, - high_percentile=high_percentile, low_bin_cutoff=low_bin_cutoff, high_bin_cutoff=high_bin_cutoff, + force_add_base_exchange_reactions=force_add_base_exchange_reactions, + activity_is_zfpkm_data=activity_is_zfpkm_data, ) expression_vector: npt.NDArray[np.floating] = np.asarray(list(reaction_expression.values()), dtype=float) for rxn_id in force_reactions: From b721bcaefa03c305095c08c74a8edf921a683c31 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 16 Mar 2026 21:15:45 -0500 Subject: [PATCH 7/8] feat: allow forcibly adding basal exchange reactions Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 52 +++++++++++++++++++--- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 250afd67..cde23957 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -504,6 +504,31 @@ def _map_expression_to_reaction( else 0 if recon_algorithm == Algorithm.FASTCORE else 1 ) + + # Some models use bracket notation for reactions: EX_glc_D[e] + # and others use underscores: EX_glc_D_e + # As a result, we need to make sure we can match either of these identifiers + basal_metabolite_ids: tuple[str, ...] = ( + # Ions + "fe3", "ca2", "na1", "cl", "k", "h", "h2o", + "o2", "pi", "ppi", "co2", "nh4", "no2", "co", + # Amino acids + # We are including single-underscore and double-underscore amino acids here because + # some models (e.g., Recon3D) use single underscores in the extracellular compartment, + # while others (e.g., Human-GEM) use double underscores + "ala_L", "arg_L", "asn_L", "asp_L", "Lcystin", "glu_L", "gln_L", + "gly", "his_L", "ile_L", "leu_L", "lys_L", "met_L", "phe_L", + "pro_L", "ser_L", "thr_L", "trp_L", "tyr_L", "val_L", "4hpro_LT", + # Dunder amino acids + "ala__L", "arg__L", "asn__L", "asp__L", "Lcystin", "glu__L", "gln__L", + "gly", "his__L", "ile__L", "leu__L", "lys__L", "met__L", "phe__L", + "pro__L", "ser__L", "thr__L", "trp__L", "tyr__L", "val__L", "4hpro__LT", + # Lipids + "lnlc", "lnlnca", "eicostet", "5eipenc", "hdca", + # Other + "ac", + ) + basal_exchange_reactions: set[str] = {f"EX_{i}{suffix}" for i in basal_metabolite_ids for suffix in ("[e]", "_e")} # fmt: on reaction_expression: collections.OrderedDict[str, float] = collections.OrderedDict() @@ -513,7 +538,12 @@ def _map_expression_to_reaction( for rxn in reference_model.reactions: rxn: cobra.Reaction gene_reaction_rule = rxn.gene_reaction_rule - + + # Forcibly set exchange reaction expressions in the high bin, irrespective of its GPR + if force_add_base_exchange_reactions and rxn.id in basal_exchange_reactions: + reaction_expression[rxn.id] = high_expr + 1 + continue + if not gene_reaction_rule: reaction_expression[rxn.id] = missing_gpr continue @@ -656,9 +686,9 @@ def _build_model( raise ValueError("Upper bounds contains unfilled values!") reaction_expression, min_expr_val, low_expr, high_expr = _map_expression_to_reaction( - reference_model, - gene_expression_file, - recon_algorithm, + reference_model=reference_model, + gene_expression_file=gene_expression_file, + recon_algorithm=recon_algorithm, taxon=taxon, low_bin_cutoff=low_bin_cutoff, high_bin_cutoff=high_bin_cutoff, @@ -751,8 +781,12 @@ def _build_model( cast(cobra.Reaction, context_model_cobra.reactions.get_by_id(rxn.id)).subsystem = cast( cobra.Reaction, reference_model.reactions.get_by_id(rxn.id) ).subsystem - - context_model_cobra.objective = objective + + try: + context_model_cobra.objective = objective + except ValueError: + logger.critical(f"Unable to set the model's objective value to '{objective}'.") + flux_sol: cobra.Solution = context_model_cobra.optimize() fluxes = flux_sol.fluxes model_reactions: list[str] = [reaction.id for reaction in context_model_cobra.reactions] @@ -901,6 +935,7 @@ def create_context_specific_model( # noqa: C901 log_location: str | TextIO | TextIOWrapper = sys.stderr, build_settings: ModelBuildSettings | None = None, *, + force_add_base_exchange_reactions: bool = False, force_boundary_rxn_inclusion: bool = False, close_unlisted_exchanges: bool = False, ) -> cobra.Model: @@ -913,6 +948,7 @@ def create_context_specific_model( # noqa: C901 :param output_infeasible_reactions_filepath: Path to save infeasible reactions (csv). :param output_flux_result_filepath: Path to save flux results (csv). :param output_model_filepaths: Path or list of paths to save the context-specific model (.xml, .mat, or .json). + :param activity_is_zfpkm_data: Does the `active_genes_filepath` file contains zFPKM-normalized data :param output_fastcore_expression_index_filepath: Path to save Fastcore expression indices (txt). :param objective: Objective function reaction ID. :param objective_direction: Direction of the objective function, either 'min' or 'max'. @@ -926,6 +962,8 @@ def create_context_specific_model( # noqa: C901 :param solver: Solver to use. One of Solver.GLPK, Solver.CPLEX, Solver.GUROBI :param log_level: Logging level :param log_location: Location for log output. Can be a file path or sys.stderr/sys.stdout. + :param force_add_base_exchange_reactions: If True, ensure that basal metabolite exchange reactions are included in the final model. + View the file `add_reactions` to see which reactions are added to include basal metabolites. :param force_boundary_rxn_inclusion: If True, ensure that all provided boundary reactions are included in the final model. :param build_settings: Optional ModelBuildSettings object to customize model building parameters. :param close_unlisted_exchanges: If True, all exchanges not listed in the boundary reactions input will be closed @@ -1037,6 +1075,8 @@ def create_context_specific_model( # noqa: C901 force_boundary_rxn_inclusion=force_boundary_rxn_inclusion, taxon=taxon_id, build_settings=build_settings or ModelBuildSettings(), + force_add_base_exchange_reactions=force_add_base_exchange_reactions, + activity_is_zfpkm_data=activity_is_zfpkm_data, ) context_model.id = contextualized_model_id or context_name From 76bfcd8746bf7518b3a65174d041aee65b94cae9 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 16 Mar 2026 21:16:01 -0500 Subject: [PATCH 8/8] refactor: re-add legacy column names for CC integration Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 52 +++++++++++++--------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index cde23957..3f7c583e 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -815,28 +815,34 @@ def _collect_boundary_reactions( close_unlisted_exchanges: bool, ) -> _BoundaryReactions: df: pd.DataFrame = _create_df(path, lowercase_col_names=True) - - for column in df.columns: - if column not in [ - "reaction", - "abbreviation", - "compartment", - "lower bound", - "upper bound", - ]: - raise ValueError( - f"Boundary reactions file must have columns named 'reaction', 'abbreviation', 'compartment', " - f"'lower bound', and 'upper bound'. Found: '{column}'" - ) - - for compartment in df["compartment"].unique(): + + required = { + "reaction", + "abbreviation", + "compartment", + "lower bound", + "upper bound", + # The following columns are legacy column names retained for backwards compatibility. + # All future code should work on the presumption of 'lower bound' and 'upper bound' + "minimum reaction rate", + "maximum reaction rate", + } + extra_columns = set(df.columns) - required + if extra_columns: + raise ValueError(f"Boundary reactions file has invalid columns. Extra: {', '.join(extra_columns)}. ") + df = df.rename( + columns={"minimum reaction rate": "lower bound", "maximum reaction rate": "upper bound"}, + errors="ignore", + ) + + for compartment in df["compartment"].str.lower().unique(): as_shorthand = CobraCompartments.get_shorthand(compartment) if len(compartment) > 2 else compartment - if as_shorthand not in reference_model.compartments: - raise ValueError(f"Unknown compartment: '{compartment}'") + if not as_shorthand: + raise ValueError(f"Unknown compartment: {compartment}") boundary_map = {"exchange": "EX", "demand": "DM", "sink": "SK"} for exchange in df["reaction"].str.lower().unique(): - if boundary_map.get(exchange) is None: + if not boundary_map.get(exchange): raise ValueError( f"Unable to determine reaction type for: '{exchange}'\n" f"Validate the boundary reactions file has a 'Reaction' value of: 'Exchange', 'Demand', or 'Sink'." @@ -1019,17 +1025,21 @@ def create_context_specific_model( # noqa: C901 exclude_rxns: list[str] = [] if exclude_rxns_filepath: exclude_rxns_filepath = Path(exclude_rxns_filepath) - df = _create_df(exclude_rxns_filepath) - if "abbreviation" not in df.columns: + df = _create_df(exclude_rxns_filepath, lowercase_col_names=True) + # `reaction id` is a legacy column name; all assumptions going forward should use `abbreviation` + if "abbreviation" not in df.columns and "reaction id" not in df.columns: raise ValueError("The exclude reactions file should have a single column with a header named Abbreviation") + df = df.rename(columns={"reaction id": "abbreviation"}, errors="ignore") exclude_rxns = df["abbreviation"].tolist() force_rxns: list[str] = [] if force_rxns_filepath: force_rxns_filepath = Path(force_rxns_filepath) df = _create_df(force_rxns_filepath, lowercase_col_names=True) - if "abbreviation" not in df.columns: + # `reaction id` is a legacy column name; all assumptions going forward should use `abbreviation` + if "abbreviation" not in df.columns and "reaction id" not in df.columns: raise ValueError("The force reactions file should have a single column with a header named Abbreviation") + df = df.rename(columns={"reaction id": "abbreviation"}, errors="ignore") force_rxns = df["abbreviation"].tolist() # Test that gurobi is using a valid license file