all metrics is drafted

janursa · janursa · commit 4ae3959f947d · 2025-10-28T13:54:37.000+07:00
diff --git a/common b/common
@@ -1 +1 @@
-Subproject commit 876036f71713cbd79285b108ab0a9a8238f2b5e1
+Subproject commit f01ff2170161295e89014ee5453c61b29b4e4e77
diff --git a/dockers/dictys_0/Dockerfile b/dockers/dictys_0/Dockerfile
@@ -0,0 +1,40 @@
+# Lingfei Wang, 2022-2023. All rights reserved.
+FROM continuumio/miniconda3
+USER root
+SHELL ["/bin/bash", "-c"]
+
+# System update, then install the command-line tools used in this image; apt lists are removed in the same layer.
+RUN DEBIAN_FRONTEND=noninteractive apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get upgrade -y \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+        curl gawk man pkg-config python3 python3-pip git wget zip unzip xzip \
+        awscli gzip samtools tabix \
+    && rm -Rf /var/lib/apt/lists/*
+
+# Install dictys by running its install.sh, configured through the build arguments below.
+# Name of conda environment to create
+ARG CONDAENV_NAME=dictys
+# Commit version to install. If empty, uses local version (./local). ./local must exist in the build context either way because of the COPY below.
+ARG COMMIT_VERSION=master
+# Python version
+ARG PYTHONVERSION_CONDA=3.9
+# CUDA version. When empty, uses CPU instead
+ARG CUDAVERSION_CONDA=
+COPY local /dictys/local
+RUN cd /dictys \
+    && if [ "a${COMMIT_VERSION}" != "a" ]; then wget -O install.sh https://raw.githubusercontent.com/pinellolab/dictys/"${COMMIT_VERSION}"/doc/scripts/install.sh; localpath=""; else cp local/doc/scripts/install.sh ./; localpath="/dictys/local"; fi \
+    && chmod u+x install.sh \
+    && COMMIT_VERSION="${COMMIT_VERSION}" CONDAENV_NAME="${CONDAENV_NAME}" PYTHONVERSION_CONDA="${PYTHONVERSION_CONDA}" CUDAVERSION_CONDA="${CUDAVERSION_CONDA}" LOCAL_VERSION="$localpath" ./install.sh \
+    && cd / \
+    && rm -Rf /dictys
+
+# Create entry point: a wrapper that activates the conda env, then exec's dictys so it replaces the shell and receives signals; a+rx lets non-root users (docker run --user) execute it.
+RUN echo '#!/bin/bash' > /usr/bin/run_dictys \
+    && echo "source activate ${CONDAENV_NAME}" >> /usr/bin/run_dictys \
+    && echo 'exec dictys "$@"' >> /usr/bin/run_dictys \
+    && chmod a+rx /usr/bin/run_dictys
+
+ENTRYPOINT ["/usr/bin/run_dictys"]
+# NOTE(review): with the ENTRYPOINT above, the CMD below is passed to dictys as its arguments (dictys /bin/bash) - confirm this default is intended.
+
+CMD ["/bin/bash"]
diff --git a/dockers/dictys_0/Dockerfile_0 b/dockers/dictys_0/Dockerfile_0
@@ -0,0 +1,38 @@
+# Lingfei Wang, 2022-2023. All rights reserved.
+FROM continuumio/miniconda3
+USER root
+SHELL ["/bin/bash", "-c"]
+
+# System update, then install the command-line tools used in this image; apt lists are removed in the same layer.
+RUN DEBIAN_FRONTEND=noninteractive apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get upgrade -y \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y curl gawk man pkg-config python3 python3-pip git wget zip unzip xzip \
+    && rm -Rf /var/lib/apt/lists/*
+
+# Install dictys by running its install.sh, configured through the build arguments below.
+# Name of conda environment to create
+ARG CONDAENV_NAME=dictys
+# Commit version to install. If empty, uses local version (./local). ./local must exist in the build context either way because of the COPY below.
+ARG COMMIT_VERSION=master
+# Python version
+ARG PYTHONVERSION_CONDA=3.9
+# CUDA version. When empty, uses CPU instead
+ARG CUDAVERSION_CONDA=
+COPY local /dictys/local
+RUN cd /dictys \
+    && if [ "a${COMMIT_VERSION}" != "a" ]; then wget -O install.sh https://raw.githubusercontent.com/pinellolab/dictys/"${COMMIT_VERSION}"/doc/scripts/install.sh; localpath=""; else cp local/doc/scripts/install.sh ./; localpath="/dictys/local"; fi \
+    && chmod u+x install.sh \
+    && COMMIT_VERSION="${COMMIT_VERSION}" CONDAENV_NAME="${CONDAENV_NAME}" PYTHONVERSION_CONDA="${PYTHONVERSION_CONDA}" CUDAVERSION_CONDA="${CUDAVERSION_CONDA}" LOCAL_VERSION="$localpath" ./install.sh \
+    && cd / \
+    && rm -Rf /dictys
+
+# Create the run_dictys wrapper: it activates the conda env, then exec's dictys so it replaces the shell and receives signals; a+rx lets non-root users (docker run --user) execute it.
+RUN echo '#!/bin/bash' > /usr/bin/run_dictys \
+    && echo ". activate ${CONDAENV_NAME}" >> /usr/bin/run_dictys \
+    && echo 'exec dictys "$@"' >> /usr/bin/run_dictys \
+    && chmod a+rx /usr/bin/run_dictys
+
+#ENTRYPOINT ["/usr/bin/run_dictys"]
+# The ENTRYPOINT above is disabled in this variant, so this file does not wire run_dictys in; invoke /usr/bin/run_dictys explicitly to run dictys.
+
+CMD ["/bin/bash"]
diff --git a/docs/source/dataset.rst b/docs/source/dataset.rst
@@ -1,25 +1,16 @@
 Datasets
 ========
-Here, we explain how to access datasets without installing geneRNIB. The available datasets include **OPSCA, Nakatake, Replogle, Adamson, Norman, Xaira_HCT116, Xaira_HEK293T** and **ParseBioscience**. 
-It should be noted that three datasets of **Xaira_HCT116, Xaira_HEK293T** and **ParseBioscience** are not added to the initial manuscript yet.
-All datasets provide RNA data, while the `OPSCA` dataset also includes ATAC data. 
-The perturbation signature of these datasets are given below. 
-You need `awscli` to download the datasets. If you don't have it installed, you can download it from [here](https://aws.amazon.com/cli/). You do not need to sign in to download the datasets.
-
+The list of datasets integrated into geneRNIB is provided below with their perturbation signatures as well as the type of perturbation used in each dataset.
 .. image:: images/datasets.png
    :width: 80%
    :align: center
 ----
 
-Downloading the test datasets
----------------------------------------------
+All datasets provide RNA data, while the `OPSCA` and `IBD` datasets also include scATAC data.
 
+You need `awscli` to download the datasets. 
 .. code-block:: bash
-
-   aws s3 sync s3://openproblems-data/resources_test/grn resources_test/ --no-sign-request
-
-This command downloads the data to `resources_test/`. The content of this folder is needed for testing component integration.
-
+   pip install awscli
 
 Downloading the main datasets
 ---------------------------------------------
@@ -30,7 +21,8 @@ Downloading the main datasets
 
 This command downloads the data to `resources/grn_benchmark/`, which is the default directory for geneRNIB for further GRN inference and evaluation.
 
-Additionally, you will find the `resources/grn_benchmark/prior/` folder, which contains supplementary files such as the list of known transcription factors (TFs). This list is used for GRN inference (causal TF-gene masking) and in the evaluation metrics to include only edges where the source gene is among these TFs. Additional files in this folder, such as those with `consensus` tags, are used in the evaluation metrics to standardize permitted edges per different metric.
+Additionally, you will find the `resources/grn_benchmark/prior/` folder, which contains supplementary files such as the list of known transcription factors (TFs). 
+Files containing `consensus` tags are used in the evaluation metrics to standardize comparisons.
 
 Downloading the extended datasets
 -----------------------------
@@ -46,6 +38,7 @@ To download the extended datasets, use:
 
    aws s3 sync s3://openproblems-data/resources/grn/extended_data/ resources/extended_data/ --no-sign-request
 
+
 Downloading the raw/unprocessed data
 --------------------------------
 
@@ -57,18 +50,6 @@ All previously mentioned datasets are processed versions. To access the raw, unp
 
 We have not provided raw data for a few recent datasets due to very large file sizes. Pls contact us if you need the raw data for these datasets.
 
-Downloading the GRN models
----------------------------------------------
-To download the GRN models used in geneRNIB, run:
-
-.. code-block:: bash
-
-   aws s3 sync s3://openproblems-data/resources/grn/grn_models resources/grn_models/ --no-sign-request
-
-These models are not necessarily the updated models as we are currently making changes to the results. To obtain a specific model, 
-you should run the inference method or reach out to us for the latest model.
-
-
 Downloading the results
 ---------------------------------------------
 To download the results of geneRNIB (needed for the leaderboard and the paper):
diff --git a/docs/source/evaluation.rst b/docs/source/evaluation.rst
@@ -3,45 +3,31 @@ GRN evaluation
 =================
 The evaluation metrics used in geneRNIB are summarized below. For a detailed description of each metric, refer to the geneRNIB paper.
 
-We originally defined **eight evaluation metrics**, grouped into three categories: **Regression 1, Regression 2, and Wasserstein Distance**. 
-However, we recently removed **Regression 1** as it did not prove to be effective for perturbational settings. 
-
-- The **regression-based metrics** assess the predictive power of an inferred GRN by using regression models to predict perturbation data (evaluation data) based on the feature space constructed from the inferred network.  
-- The **Wasserstein distance-based metric** evaluates GRN edges by measuring the distributional shift in target gene expression between observations and perturbation data for a given transcription factor (TF).
-
-Wasserstein distance-based metrics are only applicable for datasets that are gene perturbations and are in single cell format. Thus, currently the following datasets are supported:
-- Replogle
-- Xaira:HEK293T
-- Xaira:HCT116
-- Norman
-- Adamson
+
   
 .. image:: images/metrics.png
    :width: 90%
    :align: center
 ----
 
-The evaluation metrics expect the inferred network to be in the form of an AnnData object with specific format as explained here. It should be noted that the metric currently evaluate only the **top TF-gene pairs**, currently limited to **50,000 edges**, ranked by their assigned weight.  
+The evaluation metrics expect the inferred network to be in the form of an AnnData object with specific format as explained here. 
+Note that the metrics currently evaluate only the **top TF-gene pairs**, limited to **50,000 edges**, ranked by their assigned weight.
 
 The inferred network should have a tabular format with the following columns:  
 
   - `source`: TF gene name
   - `target`: Target gene gene  
   - `weight`: Regulatory importance/likelihood score/etc.  
 
-See `resources_test/grn_models/op/collectri.h5ad` for an example of the expected format.
-
-For the regression based approaches, we used the pseudobulk version of the perturbation data while for the Wasserstein distance, the single cell data are used.
+See `resources/grn_benchmark/prior/collectri.h5ad` for an example of the expected format.
 
-It should be noted that for Wasserstein distance, we have already computed all possible combination of TF-gene pairs and stored it in the `resources/grn_benchmark/prior/` folder.
-This substantially reduces the computation time during evaluation.
 
 To run the evalution for a given GRN and dataset, use the following command:
 ```bash
-bash scripts/run_grn_evaluation.sh --prediction=<inferred GRN (e.g.collectri.h5ad)> --save_dir=<e.g.output/> --dataset=<e.g. replogle> --build_images=<true or false. true for the first time running> --run_test=<true or false. true to run on test data>
+bash scripts/run_grn_evaluation.sh --prediction=<inferred GRN (e.g.collectri.h5ad)> --save_dir=<e.g.output/> --dataset=<e.g. replogle> --build_images=<true or false. true for the first time running> 
 ```
 
 example command:
 ```bash
-bash scripts/run_grn_evaluation.sh --prediction=resources/grn_models/op/collectri.h5ad --save_dir=output/ --dataset=op --build_images=true --test_run=false
+bash scripts/run_grn_evaluation.sh --prediction=resources/grn_models/op/collectri.h5ad --save_dir=output/ --dataset=op --build_images=true 
 ```
diff --git a/docs/source/images/datasets.png b/docs/source/images/datasets.png
diff --git a/docs/source/images/metrics.png b/docs/source/images/metrics.png
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -28,6 +28,7 @@ To see the comparitive performance of the integrated GRN inference methods, refe
    :align: center
 ----
 
+
 Pls see the GitHub page for the list of currently integrated methods. The methods are implemented in Python and R, and they can be used to infer GRNs from the datasets provided by geneRNIB.
 
 In addition, three baseline methods are integrated into geneRNIB. These methods are used to evaluate the performance of new methods. The baseline methods are:
@@ -53,9 +54,6 @@ In addition, three baseline methods are integrated into geneRNIB. These methods
 ..      - author
 
 
-.. note::
-
-   This project is under active development and this documentation is still a draft.
 
 Contents
 --------
diff --git a/src/metrics/all_metrics/config.vsh.yaml b/src/metrics/all_metrics/config.vsh.yaml
@@ -21,6 +21,7 @@ engines:
       - type: python
         packages: [ lightgbm==4.3.0, numpy==1.26.4 , tqdm_joblib==0.0.5]
 
+
 runners:
   - type: executable
   - type: nextflow
diff --git a/src/metrics/all_metrics/script.py b/src/metrics/all_metrics/script.py
@@ -36,8 +36,8 @@
     sys.path.append(meta["util_dir"])
     sys.path.append(meta["resources_dir"])
 print(meta["resources_dir"])
-from helper_ws_distance import main as main_reg2
-aaa
+from helper_ws_distance import main as main_ws_distance
+
 from helper import main_all
 
 from util import parse_args, format_save_score
diff --git a/src/metrics/replica_consistency/helper.py b/src/metrics/replica_consistency/helper.py
@@ -233,8 +233,8 @@ def main(par):
             C, A, "tgtg",
             tf_tg=False,
             tg_tg=True,
-            n_tfs=40, 
-            max_targets_per_tf=300,
+            n_tfs=100, 
+            max_targets_per_tf=100,
             signed=signed
         ))
 
diff --git a/src/metrics/tf_recovery/helper.py b/src/metrics/tf_recovery/helper.py
@@ -45,7 +45,6 @@ def main(par):
     tf_counts = net['source'].value_counts()
     tfs_to_keep = tf_counts[tf_counts >= 3].index
     net = net[net['source'].isin(tfs_to_keep)]
-    n_tfs = net['source'].nunique()
 
     all_genes = list(df_de.columns)
     acts_in_net, pvals_in_net, acts_random_in_net, pvals_random_in_net = [], [], [], []