
Commit b59166d

Merge pull request #87 from openproblems-bio/jalil
Restructuring process data folder + fixing bugs in geneformer
2 parents 41b4cb0 + 1d736c9 commit b59166d

48 files changed

Lines changed: 1816 additions & 147 deletions

Note: this is a large commit; only a subset of the 48 changed files is shown below.

scripts/run_all.sh

Lines changed: 4 additions & 4 deletions
@@ -1,11 +1,11 @@
 set -e

-datasets=( 'op' ) #'replogle' 'op' 'nakatake' 'adamson' 'norman' 'xaira_HEK293T' 'xaira_HCT116' 'parsebioscience' 'ibd' '300BCG'
+datasets=( 'replogle') #'replogle' 'op' 'nakatake' 'adamson' 'norman' 'xaira_HEK293T' 'xaira_HCT116' 'parsebioscience' 'ibd' '300BCG'

-run_local=false # set to true to run locally, false to run on AWS
+run_local=true # set to true to run locally, false to run on AWS

-run_grn_inference=true
-run_grn_evaluation=false
+run_grn_inference=false
+run_grn_evaluation=true
 run_download=false

scripts/run_grn_evaluation.sh

Lines changed: 3 additions & 5 deletions
@@ -65,7 +65,7 @@ if [ -z "${DATASET:-}" ]; then
 fi

 num_workers=10
-metric_ids="[regression_2, ws_distance, sem]" #regression_1, regression_2, ws_distance
+metric_ids="[regression_2, ws_distance, sem, tf_recovery, tf_binding, replica_consistency]" #regression_1, regression_2, ws_distance
 RUN_ID="${DATASET}_evaluation"
 models_folder="${DATASET}/"
 apply_skeleton=false
@@ -109,8 +109,6 @@ param_list:
 HERE
 fi

-# Write YAML header
-
 append_entry() {
   local grn_name="$1"
   local prediction="$2"
@@ -121,7 +119,7 @@ append_entry() {
     layer_=$layer
   fi
   cat >> "$param_local" << HERE
-  - id: ${reg_type}_${grn_name}_${dataset}
+  - id: ${grn_name}_${dataset}
     metric_ids: ${metric_ids}
     evaluation_data: ${resources_dir}/grn_benchmark/evaluation_data/${dataset}_bulk.h5ad
     tf_all: ${resources_dir}/grn_benchmark/prior/tf_all.csv
@@ -134,12 +132,12 @@ append_entry() {
     layer: $layer_

 HERE
-
   # Additional fields for specific datasets
   if [[ "$dataset" =~ ^(norman|replogle|adamson|xaira_HCT116|xaira_HEK293T)$ ]]; then
     cat >> "$param_local" << HERE
     ws_consensus: ${resources_dir}/grn_benchmark/prior/ws_consensus_${dataset}.csv
     ws_distance_background: ${resources_dir}/grn_benchmark/prior/ws_distance_background_${dataset}.csv
+    evaluation_data_de: ${resources_dir}/grn_benchmark/evaluation_data/${dataset}_de.h5ad
 HERE
   fi
 }

scripts/run_grn_inference.sh

Lines changed: 1 addition & 2 deletions
@@ -118,8 +118,7 @@ if [[ "$DATASET" =~ ^(replogle|parsescience|xaira_HEK293T|xaira_HCT116)$ ]]; then
 append_entry "$DATASET" "[pearson_corr, negative_control, positive_control, grnboost, ppcor, portia, scenic]"
 append_entry "$DATASET" "[scprint]" "true"
 elif [ "$DATASET" = "op" ] || [ "$DATASET" = "ibd" ]; then
-# append_entry "$DATASET" "[pearson_corr, negative_control, positive_control, grnboost, ppcor, portia, scenic, scprint, figr, scenicplus, celloracle, granie, scglue]"
-append_entry "$DATASET" "[ scenicplus, celloracle, geneformer]"
+append_entry "$DATASET" "[pearson_corr, spearman_corr, negative_control, positive_control, grnboost, ppcor, portia, scenic, scprint, geneformer, scgpt, figr, scenicplus, celloracle, granie, scglue]"

 else
 append_entry "$DATASET" "[pearson_corr, negative_control, positive_control, grnboost, ppcor, portia, scenic, scprint]"

scripts/sync_resources.sh

Lines changed: 4 additions & 1 deletion
@@ -26,4 +26,7 @@ set -e
 # aws s3 sync resources/extended_data/ s3://openproblems-data/resources/grn/extended_data --delete
 # aws s3 sync resources/results/experiment s3://openproblems-data/resources/grn/results/experiment --delete
 # aws s3 sync resources_test s3://openproblems-data/resources_test/grn/ --delete
-aws s3 sync resources/grn_benchmark/ground_truth s3://openproblems-data/resources/grn/grn_benchmark/ground_truth
+# aws s3 sync resources/grn_benchmark/ground_truth s3://openproblems-data/resources/grn/grn_benchmark/ground_truth
+aws s3 sync resources/grn_benchmark/evaluation_data s3://openproblems-data/resources/grn/grn_benchmark/evaluation_data --delete
+
+# aws s3 sync s3://openproblems-data/resources/grn/grn_benchmark/ground_truth resources/grn_benchmark/ground_truth --no-sign-request

src/methods/geneformer/helper.py

Lines changed: 8 additions & 6 deletions
@@ -23,7 +23,7 @@
 logger = logging.getLogger(__name__)


-def tokenize_data(nproc, model_details=None, gene_median=None, token=None, gene_mapping_file=None, tokenized_dir=None):
+def tokenize_data(nproc, temp_dir, model_details=None, gene_median=None, token=None, gene_mapping_file=None, tokenized_dir=None):
     """Tokenize data with required parameters"""
     if not all([model_details, gene_median, token, gene_mapping_file, tokenized_dir]):
         raise ValueError("Missing required parameters for tokenization")
@@ -37,7 +37,7 @@ def tokenize_data(nproc, model_details=None, gene_median=None, token=None, gene_mapping_file=None, tokenized_dir=None):
     )

     tokenizer.tokenize_data(
-        "/tmp/geneformer/", tokenized_dir, "tokenized", file_format="h5ad"
+        temp_dir, tokenized_dir, "tokenized", file_format="h5ad"
     )
 # extract embeddings
 def get_embs(
@@ -970,6 +970,7 @@ def tryParallelFunction(func, label, **kwargs):

 def compute_geneformer_network(
     adata,
+    temp_dir,
     forward_batch_size=4,
     max_ncells=1000,
     n_processors=20,
@@ -990,19 +991,19 @@
     ]
     adata.obs["n_counts"] = adata.X.sum(1)
     # Create the geneformer folder if it doesn't exist
-    geneformer_folder = "/tmp/geneformer"
+    geneformer_folder = f"{temp_dir}/geneformer"
     if not os.path.exists(geneformer_folder):
         os.makedirs(geneformer_folder)
-    adata.write_h5ad("/tmp/geneformer/to_token.h5ad")
+    adata.write_h5ad(f"{temp_dir}/geneformer/to_token.h5ad")

     genelist = [gene_mapping_dict[u] for u in adata.var.index]

-    tokenized_data_path = "/tmp/geneformer/tokenized_data.dataset"
+    tokenized_data_path = f"{temp_dir}/geneformer/tokenized_data.dataset"
     if os.path.exists(tokenized_data_path):
         shutil.rmtree(tokenized_data_path)

     # Note: This would need proper model_details, gene_median, gene_mapping_file parameters
-    # tryParallelFunction(tokenize_data, "Tokenizing data")
+    tryParallelFunction(tokenize_data, "Tokenizing data", temp_dir=geneformer_folder)

     embex = EmbExtractor(
         model_type="Pretrained",  # CellClassifier
@@ -1153,6 +1154,7 @@ def main(par):
     ]
     subadata, net = compute_geneformer_network(
         subadata,
+        temp_dir=par["temp_dir"],
         forward_batch_size=par["batch_size"],
         n_processors=n_processors,
         max_ncells=par["max_cells"],
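
To make the new plumbing concrete, here is a minimal sketch of how the refactored entry point is driven, with the scratch directory passed in instead of being hard-coded to /tmp/geneformer. The import path and the par values are illustrative; the parameter names come from the diff above.

import anndata as ad

# Illustrative import; in the repo this function lives in src/methods/geneformer/helper.py.
from helper import compute_geneformer_network

# Example Viash-style parameters; the values here are placeholders.
par = {
    "rna": "resources_test/grn_benchmark/inference_data/op_rna.h5ad",
    "temp_dir": "output/geneformer",
    "batch_size": 4,
    "max_cells": 1000,
}

adata = ad.read_h5ad(par["rna"])

# temp_dir is now threaded through to tokenize_data and the tokenizer output
# paths, so concurrent runs no longer collide on a shared /tmp/geneformer folder.
subadata, net = compute_geneformer_network(
    adata,
    temp_dir=par["temp_dir"],
    forward_batch_size=par["batch_size"],
    max_ncells=par["max_cells"],
)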

src/methods/geneformer/run.sh

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+viash run src/methods/geneformer/config.vsh.yaml -- \
+    --rna resources_test/grn_benchmark/inference_data/op_rna.h5ad \
+    --tf_all resources_test/grn_benchmark/prior/tf_all.csv \
+    --prediction output/prediction.h5ad \
+    --temp_dir output/geneformer
+

src/metrics/replica_consistency/script.py

Lines changed: 10 additions & 2 deletions
@@ -33,6 +33,14 @@
 method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id']
 dataset_id = ad.read_h5ad(par['evaluation_data'], backed='r').uns['dataset_id']

-output = main(par)
-
+try:
+    output = main(par)
+except Exception as e:
+    print({'error': str(e)})
+
+    output = pd.DataFrame({
+        'key': [None],
+        'value': [None],
+    })
+
 format_save_score(output, method_id, dataset_id, par['score'])
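
For illustration, the same fail-soft pattern in a self-contained sketch: a failing metric still emits a placeholder score instead of aborting the evaluation run. Here main and format_save_score are stand-ins for the real functions used by this script.

import pandas as pd

def main(par):
    # Stand-in for the actual replica-consistency computation.
    raise RuntimeError("metric could not be computed")

def format_save_score(output, method_id, dataset_id, score_path):
    # Stand-in: the real helper formats the score table and writes it to disk.
    print(f"saving score for {method_id} on {dataset_id} to {score_path}")
    print(output)

par = {"score": "output/score.h5ad"}

try:
    output = main(par)
except Exception as e:
    # Log the failure, then fall back to a null score so downstream
    # aggregation of benchmark results does not break on one bad metric.
    print({"error": str(e)})
    output = pd.DataFrame({"key": [None], "value": [None]})

format_save_score(output, "example_method", "example_dataset", par["score"])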

src/metrics/experimental/sem/config.novsh.yaml renamed to src/metrics/sem/config.vsh.yaml

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ engines:
 __merge__: /src/api/base_requirements.yaml
 setup:
   - type: python
-    packages: [ tqdm_joblib==0.0.5]
+    packages: [ ]
 runners:
   - type: executable
   - type: nextflow
Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ for dataset in "${datasets[@]}"; do
 fi

 echo -e "\nProcessing method: $method\n"
-python src/metrics/experimental/sem/script.py \
+python src/metrics/sem/script.py \
     --prediction "$prediction" \
     --evaluation_data "$evaluation_data" \
     --score "$score"
