Update Jupyter Notebook (#254)

JoshLoecker · web-flow · commit d5bdf807cbde · 2026-03-06T14:23:07.000-06:00
diff --git a/main/COMO.ipynb b/main/COMO.ipynb
@@ -256,7 +256,7 @@
     "            raise FileNotFoundError(\"Root directory could not be determined; unable to find 'COMO.ipynb'\")\n",
     "        current_dir = current_dir.parent"
    ],
-   "id": "2656ec5201a33b9f"
+   "id": "f58450da8bd97732"
   },
   {
    "metadata": {},
@@ -299,7 +299,7 @@
     "mrna_metadata_filepath = Path(notebook_dir / \"data/config_sheets/mrna_config.xlsx\")\n",
     "proteomics_metadata_filepath = Path(notebook_dir / \"data/config_sheets/proteomics_config.xlsx\")\n"
    ],
-   "id": "3aa04b2bf1798c20"
+   "id": "c1d957a21a4b5393"
   },
   {
    "metadata": {},
@@ -311,7 +311,7 @@
     "- `taxon_id`: The [NCBI Taxon ID](https://www.ncbi.nlm.nih.gov/taxonomy) to use\n",
     "- `preprocess_mode`: This should be set to `\"create-matrix\"` if you are **not** providing a matrix, otherwise set it to `\"provide-matrix\"`"
    ],
-   "id": "11852d88cdc2ee32"
+   "id": "7d813235940a2e89"
   },
   {
    "metadata": {},
@@ -322,7 +322,7 @@
     "for context in context_names:\n",
     "    if context not in {*trna_matrix_filepath, *mrna_matrix_filepath}:\n",
     "        continue\n",
-    "    await rnaseq_preprocess(\n",
+    "    rnaseq_preprocess(\n",
     "        context_name=context,\n",
     "        taxon=taxon_id,\n",
     "        como_context_dir=como_context_dir[context],\n",
@@ -336,7 +336,7 @@
     "        log_level=\"INFO\",\n",
     "    )"
    ],
-   "id": "49a9206c10732797"
+   "id": "b2cb28bba57ed02f"
   },
   {
    "metadata": {},
@@ -369,7 +369,7 @@
     "\n",
     "This method is not recommended, as zFPKM is much more robust for a similar level of \"hands-off\" model building\n"
    ],
-   "id": "addc4f6dd55a33f3"
+   "id": "b9227a4f747356f2"
   },
   {
    "metadata": {},
@@ -390,7 +390,7 @@
     "#### Single Cell RNA Sequencing\n",
     "While the Snakemake pipeline does not yet support single-cell alignment, and COMO does not yet support automated configuration file and counts matrix file creation for single-cell alignment output from STAR, it is possible to use single-cell data to create a model with COMO. Because normalization strategies can be applied to single-cell data in the same way it is applied to bulk RNA sequencing, `como/rnaseq_gen.py` can be used with a provided counts matrix and configuration file, from [Step 1](Step-1:-Initialize-and-Preprocess-RNA-seq-data), above. Just like `\"total\"` and `\"mRNA\"`, `como/rnaseq_gen.py` can be executed with `\"SC\"` as the \"`--library-prep`\" argument to help COMO differentiate it from any bulk RNA sequencing data if multiple strategies are being used."
    ],
-   "id": "ed35195f4278ae5c"
+   "id": "4c960cd5ddcdd542"
   },
   {
    "metadata": {},
@@ -409,7 +409,7 @@
     "- `min_zfpkm`: The cutoff for Counts-Per-Million filtering\n",
     "- `prep_method`: The library method used for preparation. Options are: `\"total\"`, `\"mRNA\"`, or `\"SC\"`,\n"
    ],
-   "id": "6dd118e06e9e4da0"
+   "id": "cbacf586116ce040"
   },
   {
    "metadata": {},
@@ -429,15 +429,17 @@
     "for context in context_names:\n",
     "    if context not in trna_matrix_filepath:\n",
     "        continue\n",
-    "    output_zscore_norm_filepath = Path(get_notebook_dir() / f\"data/results/{context}/z_score_normalization.csv\")\n",
-    "    await rnaseq_gen(\n",
+    "    output_zscore_norm_filepath = Path(\n",
+    "        get_notebook_dir() / \"data\" / \"results\" / context / \"trna_z_score_normalization.csv\"\n",
+    "    )\n",
+    "    rnaseq_gen(\n",
     "        context_name=context,\n",
+    "        input_metadata_filepath_or_df=trna_metadata_filepath,\n",
     "        input_rnaseq_filepath=trna_matrix_filepath[context],\n",
     "        input_gene_info_filepath=gene_info_filepath[context],\n",
     "        output_boolean_activity_filepath=trna_matrix_filepath[context],\n",
     "        prep=RNAType.TRNA,\n",
     "        taxon_id=taxon_id,\n",
-    "        input_metadata_filepath_or_df=trna_metadata_filepath,\n",
     "        replicate_ratio=replicate_ratio,\n",
     "        high_replicate_ratio=high_confidence_replicate_ratio,\n",
     "        batch_ratio=batch_ratio,\n",
@@ -447,7 +449,7 @@
     "        output_zscore_normalization_filepath=output_zscore_norm_filepath,\n",
     "    )"
    ],
-   "id": "bd15ec97dd0a38a8"
+   "id": "6f7e1634d7a912ba"
   },
   {
    "metadata": {},
@@ -467,7 +469,7 @@
     "- `min_zfpkm`: The cutoff for Counts-Per-Million filtering\n",
     "- `prep_method`: The library method used for preparation. Options are: `\"total\"`, `\"mRNA\"`, or `\"SC\"`,\n"
    ],
-   "id": "71fd3eab25176aad"
+   "id": "ca2918f7d6e23e5c"
   },
   {
    "metadata": {},
@@ -487,26 +489,27 @@
     "for context in context_names:\n",
     "    if context not in mrna_matrix_filepath:\n",
     "        continue\n",
-    "    await rnaseq_gen(\n",
+    "    rnaseq_gen(\n",
     "        context_name=context,\n",
     "        input_metadata_filepath_or_df=mrna_metadata_filepath,\n",
     "        input_rnaseq_filepath=mrna_matrix_filepath[context],\n",
     "        input_gene_info_filepath=gene_info_filepath[context],\n",
     "        output_boolean_activity_filepath=mrna_matrix_filepath[context],\n",
     "        prep=RNAType.MRNA,\n",
     "        taxon_id=taxon_id,\n",
-    "        input_metadata_filepath=mrna_metadata_filepath,\n",
     "        replicate_ratio=replicate_ratio,\n",
     "        high_replicate_ratio=high_confidence_replicate_ratio,\n",
     "        batch_ratio=batch_ratio,\n",
     "        high_batch_ratio=high_confidence_batch_ratio,\n",
     "        technique=technique,\n",
     "        cutoff=cutoff,\n",
-    "        output_zscore_normalization_filepath=Path(get_notebook_dir(),\n",
-    "                                                  f\"data/results/{context}/z_score_normalization.csv\"),\n",
+    "        output_zscore_normalization_filepath=Path(\n",
+    "            get_notebook_dir(),\n",
+    "            f\"data/results/{context}/z_score_normalization.csv\"\n",
+    "        ),\n",
     "    )"
    ],
-   "id": "925f939b1f318673"
+   "id": "af4293b08391ad2f"
   },
   {
    "metadata": {},
@@ -526,7 +529,7 @@
     "- `min_zfpkm`: The cutoff for Counts-Per-Million filtering\n",
     "- `prep_method`: The library method used for preparation. Options are: `\"total\"`, `\"mRNA\"`, or `\"scrna\"`,\n"
    ],
-   "id": "ddc6b8d634feaacc"
+   "id": "24d6510e09b88018"
   },
   {
    "metadata": {},
@@ -546,24 +549,27 @@
     "for context in context_names:\n",
     "    if context not in scrna_matrix_filepath:\n",
     "        continue\n",
-    "    await rnaseq_gen(\n",
+    "    output_zscore_norm_filepath = Path(\n",
+    "        get_notebook_dir() / \"data\" / \"results\" / context / \"mrna_zscore_normalization.csv\"\n",
+    "    )\n",
+    "    rnaseq_gen(\n",
     "        context_name=context,\n",
+    "        input_metadata_filepath_or_df=mrna_metadata_filepath,\n",
     "        input_rnaseq_filepath=scrna_matrix_filepath[context],\n",
     "        input_gene_info_filepath=gene_info_filepath[context],\n",
     "        output_boolean_activity_filepath=scrna_matrix_filepath[context],\n",
     "        prep=RNAType.SCRNA,\n",
     "        taxon_id=taxon_id,\n",
-    "        input_metadata_filepath=Path(\"./data/config_sheets/scrna_config.xlsx\"),\n",
     "        replicate_ratio=replicate_ratio,\n",
     "        high_replicate_ratio=high_confidence_replicate_ratio,\n",
     "        batch_ratio=batch_ratio,\n",
     "        high_batch_ratio=high_confidence_batch_ratio,\n",
     "        technique=technique,\n",
     "        cutoff=cutoff,\n",
-    "        output_zscore_normalization_filepath=None,\n",
+    "        output_zscore_normalization_filepath=output_zscore_norm_filepath\n",
     "    )"
    ],
-   "id": "ff137d18eed6995b"
+   "id": "ef57ea1b08c1b121"
   },
   {
    "metadata": {},
@@ -580,7 +586,7 @@
     "- `high_batch_ratio`: The ratio required before a gene is considered \"high-confidence\" in the study\n",
     "- `quantile`: The cutoff Transcripts-Per-Million quantile for filtering"
    ],
-   "id": "8ca2a08af58c517d"
+   "id": "c2c5cc7eb9d2e44f"
   },
   {
    "metadata": {},
@@ -592,7 +598,7 @@
     "\n",
     "for context in context_names:\n",
     "    await proteomics_gen(\n",
-    "        context_name=context_names,\n",
+    "        context_name=context,\n",
     "        config_filepath=proteomics_metadata_filepath,\n",
     "        matrix_filepath=proteomics_matrix_filepath[context],\n",
     "        output_boolean_filepath=Path(f\"data/results/{context}/proteomics/{context}_proteomics_boolean_matrix.csv\"),\n",
@@ -607,7 +613,7 @@
     "        quantile=25,\n",
     "    )"
    ],
-   "id": "2124f206489b1002"
+   "id": "ca39fe2e0744a401"
   },
   {
    "metadata": {},
@@ -633,7 +639,7 @@
     "- `n_neighbors_context`: N nearest neighbors for context clustering. The default is `\"default\"`, which is the total number of contexts\n",
     "- `seed`: The random seed for clustering algorithm initialization. If not specified, `np.random.randint(0, 100000)` is used"
    ],
-   "id": "50c625b676e3d643"
+   "id": "6a7f40910eefa1cc"
   },
   {
    "metadata": {},
@@ -677,7 +683,7 @@
     "\n",
     "!{cmd}"
    ],
-   "id": "9f1a7a85673fc17"
+   "id": "1728cd710f834c2f"
   },
   {
    "metadata": {},
@@ -717,7 +723,7 @@
     "\n",
     "Each of the \"weights\" (`total_rna_weight`, `mrna_weight`, etc.) are used to place a significance on each method. Becuase there are many steps in the Dogma from transcription to translation, the gene expression as seen by total RNA or mRNA sequencing may not be representative of the gene's protein expression, and this its metabolic impact. Because of this, you are able to weight each source more (or less) than another."
    ],
-   "id": "5bb309f8d441ddcf"
+   "id": "db5681f5e8c4111e"
   },
   {
    "metadata": {},
@@ -746,16 +752,13 @@
     "trna_batches = {\"naiveB\": [\"naiveB_S3R1\", \"naiveB_S3R2\", \"naiveB_S3R3\"]}\n",
     "\n",
     "for context in context_names:\n",
-    "    await merge_xomics(\n",
+    "    merge_xomics(\n",
     "        context_name=context,\n",
+    "        taxon_id=taxon_id,\n",
     "        trna_matrix_or_filepath=trna_matrix_filepath[context],\n",
     "        mrna_matrix_or_filepath=mrna_matrix_filepath[context],\n",
     "        scrna_matrix_or_filepath=None,  # scrna_matrix_filepath[context],\n",
     "        proteomic_matrix_or_filepath=None,  # proteomics_matrix_filepath[context],\n",
-    "        trna_batches=trna_batches,\n",
-    "        mrna_batches=mrna_batches,\n",
-    "        scrna_batches=None,\n",
-    "        proteomic_batches=None,\n",
     "        trna_weight=total_rna_weight,\n",
     "        mrna_weight=mrna_weight,\n",
     "        scrna_weight=single_cell_weight,\n",
@@ -765,10 +768,7 @@
     "        adjust_method=requirement_adjustment_method,\n",
     "        force_activate_high_confidence=force_activate_high_confidence,\n",
     "        adjust_for_na=adjust_for_na_sources,\n",
-    "        merge_zfpkm_distribution=merge_zfpkm_distrubution,\n",
-    "        keep_transcriptomics_score=keep_transcriptomics_score,\n",
     "        output_merge_activity_filepath=Path(f\"data/results/{context}/ActiveGenes_{context}_Merged.csv\"),\n",
-    "        output_transcriptomic_details_filepath=Path(f\"data/results/{context}/TranscriptomicDetails_{context}.csv\"),\n",
     "        output_trna_activity_filepath=Path(f\"data/results/{context}/total-rna/trna_activity_{context}.csv\"),\n",
     "        output_mrna_activity_filepath=Path(f\"data/results/{context}/mrna/mrna_activity_{context}.csv\"),\n",
     "        output_scrna_activity_filepath=Path(f\"data/results/{context}/scrna/scrna_activity_{context}.csv\"),\n",
@@ -777,7 +777,7 @@
     "        output_figure_dirpath=Path(f\"data/results/{context}/figures\")\n",
     "    )"
    ],
-   "id": "5cf72339439acf79"
+   "id": "c48d8b8d6ab11e16"
   },
   {
    "metadata": {},
@@ -844,7 +844,7 @@
     "- `force_reactions_filename`: The filename of the force reactions to be used. Force reactions will (as the name implies) force the optimizer to use these reactions, **no matter their expression**\n",
     "- `exclude_reactions_filename`: The filename of reactions to exclude from the model, no matter their expression"
    ],
-   "id": "691335a66e36ee7c"
+   "id": "e80c7864a129ea83"
   },
   {
    "metadata": {},
@@ -870,31 +870,31 @@
     "recon_algorithms = [\"IMAT\"]\n",
     "solver = \"GUROBI\"\n",
     "\n",
-    "config = Config()\n",
-    "\n",
     "for recon_alg in recon_algorithms:\n",
     "    for context in context_names:\n",
-    "        await create_context_specific_model(\n",
+    "        create_context_specific_model(\n",
     "            context_name=context,\n",
-    "            reference_model=Path(\n",
-    "                \"/Users/satominakamura/Desktop/Dr.Helikar Lab/COMO/main/data/GeneralModelUpdatedV2.mat\"),\n",
+    "            taxon_id=taxon_id,\n",
+    "            reference_model_filepath=Path(\n",
+    "                get_notebook_dir() / \"data\" / \"reference_models\" / \"GeneralModelUpdatedV3.json\"\n",
+    "            ),\n",
     "            active_genes_filepath=Path(f\"{notebook_dir}/data/results/{context}/ActiveGenes_{context}_Merged.csv\"),\n",
     "            output_infeasible_reactions_filepath=Path(\n",
-    "                f\"{notebook_dir}/data/results/{context}/infeasible_reactions_{context}.csv\"),\n",
+    "                f\"{notebook_dir}/data/results/{context}/infeasible_reactions_{context}.csv\"\n",
+    "            ),\n",
     "            output_flux_result_filepath=Path(f\"{notebook_dir}/data/results/{context}/FluxResults_{context}.csv\"),\n",
     "            output_model_filepaths=Path(\n",
-    "                f\"{notebook_dir}/data/results/{context}/{context}_{recon_alg}_model.json\"),\n",
+    "                f\"{notebook_dir}/data/results/{context}/{context}_{recon_alg}_model.json\"\n",
+    "            ),\n",
     "            objective=\"biomass_maintenance\",\n",
     "            boundary_rxns_filepath=Path(f\"{notebook_dir}/data/boundary_rxns/{context}_boundary_rxns.csv\"),\n",
     "            exclude_rxns_filepath=Path(f\"{notebook_dir}/data/exclude_rxns/{context}_exclude_rxns.csv\"),\n",
     "            force_rxns_filepath=Path(f\"{notebook_dir}/data/force_rxns/{context}_force_rxns.csv\"),\n",
     "            algorithm=Algorithm.IMAT,\n",
     "            solver=Solver.GUROBI,\n",
-    "        )\n",
-    "        # fmt: on\n",
-    "        !{cmd}"
+    "        )"
    ],
-   "id": "961737f5e356c2b6"
+   "id": "4c9747c67bc80e88"
   },
   {
    "metadata": {},
@@ -919,7 +919,7 @@
     "- `exampleTissue`: This is the name of the tissue context\n",
     "- `ALGORITHM`: This is the algorithm (`recon_algorithm`) used in the above model creation step\n"
    ],
-   "id": "d5ce2da2d3b27868"
+   "id": "6e60a22a03b54803"
   },
   {
    "metadata": {},
@@ -997,7 +997,7 @@
     "\n",
     "        !{cmd}"
    ],
-   "id": "a8ed4f4f6f10fec8"
+   "id": "14d4ca54309049b0"
   },
   {
    "metadata": {},
@@ -1016,7 +1016,7 @@
     "- `data_source`: The datasource you are using for disease analysis. This should be`\"rnaseq\"`\n",
     "- `taxon_id`: The [NCBI Taxon ID](https://www.ncbi.nlm.nih.gov/taxonomy) to use for disease analysis"
    ],
-   "id": "84dd6096f89ad000"
+   "id": "51cd2ec29bbbd4a0"
   },
   {
    "metadata": {},
@@ -1047,7 +1047,7 @@
     "\n",
     "    !{cmd}"
    ],
-   "id": "6daddfb1edac03ab"
+   "id": "ead8911867f0e352"
   },
   {
    "metadata": {},
@@ -1082,7 +1082,7 @@
     "\n",
     "- `solver`: The solver you would like to use. Available options are `\"gurobi\"` or `\"glpk\"`\n"
    ],
-   "id": "645d829a6b3266cd"
+   "id": "a1ba8eca6e7c6991"
   },
   {
    "metadata": {},
@@ -1153,7 +1153,7 @@
     "            cmd = \" \".join(cmd)\n",
     "            !{cmd}"
    ],
-   "id": "9c1731579d3711b2"
+   "id": "fcdf3a87b72665a6"
   }
  ],
  "metadata": {},
diff --git a/main/como/proteomics_gen.py b/main/como/proteomics_gen.py
@@ -188,7 +188,7 @@ def load_empty_dict():
         )
         return load_empty_dict()
 
-
+# TODO: Convert to synchronous function
 async def proteomics_gen(
     context_name: str,
     config_filepath: Path,

Original file line number	Diff line number	Diff line change
`@@ -188,7 +188,7 @@ def load_empty_dict():`
`188`	`188`	`)`
`189`	`189`	`return load_empty_dict()`
`190`	`190`
`191`		`-`
	`191`	`+# TODO: Convert to synchronous function`
`192`	`192`	`async def proteomics_gen(`
`193`	`193`	`context_name: str,`
`194`	`194`	`config_filepath: Path,`