Skip to content

Commit 7257e4a

Browse files
committed
Add dataset workflow for 2021Wu_human_breast_cancer_sc
1 parent 78e1927 commit 7257e4a

3 files changed

Lines changed: 230 additions & 0 deletions

File tree

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
# Viash config for the workflow that loads and processes the
# 2021Wu_human_breast_cancer_sc dataset.
name: process_wu_human_breast_cancer_sc
namespace: datasets/workflows

argument_groups:
  - name: Inputs
    arguments:
      - name: --cancer_subtypes
        type: string
        required: false
        description: The cancer subtypes to download data for.
        multiple: true
        default:
          - HER2+
          - TNBC
          - ER+
  - name: Caching settings
    arguments:
      - name: --keep_files
        type: boolean
        required: false
        # The flag is named "keep" and defaults to false, so the description
        # is phrased in terms of keeping the files.
        description: Whether to keep the downloaded files after processing.
        default: false
  - name: Metadata
    arguments:
      - name: --dataset_id
        type: string
        description: "A unique identifier for the dataset"
        required: false
        default: "2021Wu_human_breast_cancer_sc"
      - name: --dataset_name
        type: string
        description: Nicely formatted name.
        required: false
        default: "2021Wu_human_breast_cancer_sc"
      - name: --dataset_url
        type: string
        description: Link to the original source of the dataset.
        required: false
        default: "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE176078"
      - name: --dataset_reference
        type: string
        # NOTE(review): the description asks for a Bibtex reference but the
        # default is a DOI URL -- confirm which form downstream consumers expect.
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
        default: "https://doi.org/10.1038/s41588-021-00911-1"
      - name: --dataset_summary
        type: string
        description: Short description of the dataset.
        required: false
        default: "This dataset contains scRNA-seq data from human breast cancer cells."
      - name: --dataset_description
        type: string
        description: Long description of the dataset.
        required: false
        default: "This dataset contains scRNA-seq data from human breast cancer cells."
      - name: --dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
        default: "Homo sapiens"
  - name: Outputs
    arguments:
      - name: "--output_dataset"
        # Remaining fields of this argument are merged in from the shared API file.
        __merge__: /src/api/file_common_scrnaseq.yaml
        direction: output
        required: true
        default: "$id/dataset.h5ad"
      - name: "--output_meta"
        direction: "output"
        type: file
        description: "Dataset metadata"
        default: "$id/dataset_meta.yaml"

resources:
  - type: nextflow_script
    path: main.nf
    entrypoint: run_wf
  - path: /common/nextflow_helpers/helper.nf

dependencies:
  - name: datasets/loaders/wu_human_breast_cancer_sc
  # - name: datasets/processors/subsample
  #   repository: openproblems
  - name: datasets/normalization/log_cp
    repository: openproblems
  - name: datasets/processors/pca
    repository: openproblems
  - name: datasets/processors/hvg
    repository: openproblems
  - name: datasets/processors/knn
    repository: openproblems
  - name: h5ad/extract_uns_metadata
    repository: core

runners:
  - type: nextflow
    directives:
      label: [midcpu, midmem, hightime]
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
include { findArgumentSchema } from "${meta.resources_dir}/helper.nf"
2+
3+
// Standalone entry point: feeds the channel returned by
// findStates(params, meta.config) into this component's workflow,
// run with its `auto.publish` option set to "state".
workflow auto {
  findStates(params, meta.config)
    | meta.workflow.run(
      auto: [publish: "state"]
    )
}
9+
10+
// Main workflow: runs the dataset loader, then log_cp -> hvg -> pca -> knn,
// extracts the dataset metadata and emits [id, state] tuples whose state
// holds only `output_dataset` and `output_meta`.
workflow run_wf {
  take:
  input_ch

  main:
  output_ch = input_ch

    // Record the channel id in the state as `dataset_id`
    // (this replaces any `dataset_id` already present in the state).
    | map { id, state ->
      [id, state + [dataset_id: id]]
    }

    // Load the raw dataset.
    // NOTE(review): `keep_files` is not in this list, so it is not forwarded
    // to the loader from here -- confirm whether that is intended.
    | wu_human_breast_cancer_sc.run(
      fromState: [
        "cancer_subtypes",
        "dataset_id",
        "dataset_name",
        "dataset_url",
        "dataset_reference",
        "dataset_summary",
        "dataset_description",
        "dataset_organism",
      ],
      toState: [output_raw: "output"]
    )

    // Normalize the raw output with log_cp (n_cp = 10000).
    | log_cp.run(
      key: "log_cp10k",
      fromState: [input: "output_raw"],
      args: [
        normalization_id: "log_cp10k",
        n_cp: 10000
      ],
      toState: [output_normalized: "output"]
    )

    // Each of the following steps consumes the previous step's output.
    | hvg.run(
      fromState: [input: "output_normalized"],
      toState: [output_hvg: "output"]
    )

    | pca.run(
      fromState: [input: "output_hvg"],
      toState: [output_pca: "output"]
    )

    | knn.run(
      fromState: [input: "output_pca"],
      toState: [output_knn: "output"]
    )

    // Expose the knn output under the name of the workflow's output argument.
    | map { id, state ->
      [id, state + [output_dataset: state.output_knn]]
    }

    | extract_uns_metadata.run(
      fromState: { id, state ->
        // Look up the schema of the `output_dataset` argument; GString values
        // are converted to String before writing (workaround).
        def datasetSchema = iterateMap(
          findArgumentSchema(meta.config, "output_dataset"),
          { it instanceof GString ? it.toString() : it }
        )
        def schemaFile = tempFile("schema.yaml")
        writeYaml(datasetSchema, schemaFile)
        [
          input: state.output_dataset,
          schema: schemaFile
        ]
      },
      toState: [output_meta: "output"]
    )

    // Keep only the two declared outputs in the final state.
    | setState([
      output_dataset: "output_dataset",
      output_meta: "output_meta"
    ])

  emit:
  output_ch
}
94+
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/bin/bash

# Runs the process_wu_human_breast_cancer_sc workflow locally through Nextflow,
# publishing the results under resources/datasets in the repository.

# Abort on the first failing command. Enabled before anything else so that a
# failing `git rev-parse` or `cd` cannot leave the script running in the
# wrong directory.
set -e

# get the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the commands below are run from the root of the repository
cd "$REPO_ROOT"

# Create local output directory (mkdir -p is a no-op when it already exists)
output_dir="resources/datasets"
mkdir -p "$output_dir"

# Write the parameters to a private temporary directory rather than a fixed
# path in /tmp, so concurrent runs cannot overwrite each other's file. The
# .yaml file name is kept so the params file is still recognised as YAML.
params_dir=$(mktemp -d)
trap 'rm -rf "$params_dir"' EXIT
params_file="$params_dir/params.yaml"

# The heredoc is unquoted on purpose: \$id stays literal for Nextflow, while
# $output_dir is expanded by the shell.
cat > "$params_file" << HERE
param_list:
  - id: wu_human_breast_cancer_sc/2021Wu_human_breast_cancer_sc
    cancer_subtypes:
      - HER2+
      - TNBC
      - ER+
    keep_files: false

output_dataset: "\$id/dataset.h5ad"
output_meta: "\$id/dataset_meta.yaml"
output_state: "\$id/state.yaml"
publish_dir: "$output_dir"
HERE

# Run nextflow workflow locally
nextflow run . \
  -main-script target/nextflow/datasets/workflows/process_wu_human_breast_cancer_sc/main.nf \
  -params-file "$params_file" \
  -profile docker \
  -c common/nextflow_helpers/labels_ci.config

0 commit comments

Comments
 (0)