-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdot_product_scatter.wdl
More file actions
123 lines (100 loc) · 3.14 KB
/
dot_product_scatter.wdl
File metadata and controls
123 lines (100 loc) · 3.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
version 1.0
# This workflow is an example of the pair() input type. It is based upon
# an LD pruning workflow, but should not be used for actual scientific
# analysis -- use this instead:
# https://dockstore.org/workflows/github.com/DataBiosphere/analysis_pipeline_WDL/ld-pruning-wdl
# Runs the ld_pruning.R script from the uwgac/topmed-master image on a single
# GDS file.
#
# Inputs:
#   gds_file  - the GDS file to prune
#   addldisk  - extra disk (GB) added on top of the size of gds_file
#   cpu       - number of CPUs requested
#   memory    - memory requested, in GB
#   preempt   - value passed to the "preemptible" runtime attribute
#
# Output:
#   ld_pruning_output - the first *.RData file found in the working directory
#                       once the R script has finished
task ld_pruning {
    input {
        File gds_file

        # runtime attributes
        Int addldisk = 5
        Int cpu = 2
        Int memory = 4
        Int preempt = 3
    }

    # Estimate disk size required: input size rounded up, plus headroom
    Int gds_size = ceil(size(gds_file, "GB"))
    Int final_disk_size = gds_size + addldisk

    command {
        set -eux -o pipefail

        # Generate a configuration file -- this is specific to the R script that this
        # task uses; generally, you wouldn't do this for most workflows.
        python << CODE
        import os

        # Only the file name (minus extension) decides the output name. Testing the
        # whole localized path would match a "chr" that appears in a directory name
        # and then fail on parts[1] when the file name itself has no "chr" in it.
        gds_stem = os.path.splitext(os.path.basename("~{gds_file}"))[0]

        # The R script expects the GDS files to contain "chr*" where * is chr number/X/Y,
        # so use some string manipulation to determine the output file name. Again, this
        # is one of those tricks that is specific to this particular R script.
        if "chr" in gds_stem:
            parts = gds_stem.split("chr")
            outfile_temp = "pruned_variants_chr" + parts[1] + ".RData"
        else:
            outfile_temp = "pruned_variants.RData"

        # Write (not append) so the config always holds exactly one set of keys,
        # and let the with-block close the file even if a write fails.
        with open("ld_pruning.config", "w") as f:
            f.write('gds_file "~{gds_file}"\n')
            f.write('genome_build hg38\n')
            f.write('out_file "' + outfile_temp + '"\n')
        CODE

        echo "Calling R script ld_pruning.R"
        Rscript /usr/local/analysis_pipeline/R/ld_pruning.R ld_pruning.config
    }

    runtime {
        cpu: cpu
        docker: "uwgac/topmed-master@sha256:0bb7f98d6b9182d4e4a6b82c98c04a244d766707875ddfd8a48005a9f5c5481e"
        disks: "local-disk " + final_disk_size + " HDD"
        memory: "${memory} GB"
        preemptible: "${preempt}"
    }

    output {
        File ld_pruning_output = glob("*.RData")[0]
    }
}
# Prints the two members of a Pair[File, File] to stdout. This task exists to
# demonstrate passing a Pair into a task; it declares no outputs.
#
# Inputs:
#   gds_n_varinc - pair of files; .left is the GDS file, .right is the
#                  variants-to-prune file that goes with it
#   addldisk     - extra disk (GB) added on top of the size of the GDS file
#   cpu          - number of CPUs requested
#   memory       - memory requested, in GB
#   preempt      - value passed to the "preemptible" runtime attribute
task echo_pairs {
    input {
        Pair[File, File] gds_n_varinc  # [gds, variants to prune]

        # runtime attributes
        Int addldisk = 5
        Int cpu = 2
        Int memory = 4
        Int preempt = 3
    }

    # Estimate disk size required; only the GDS (left) file is measured
    Int gds_size = ceil(size(gds_n_varinc.left, "GB"))
    Int final_disk_size = gds_size + addldisk

    command {
        printf "GDS file: ~{gds_n_varinc.left}\n\n"
        printf "Resulting variant file it output: ~{gds_n_varinc.right}\n\n"
        printf "We can now call another R script to subset each GDS file via the variants file..."
        printf "...but we won't, because I want to encourage you to use this workflow instead: "
        printf "https://dockstore.org/workflows/github.com/DataBiosphere/analysis_pipeline_WDL/ld-pruning-wdl"
    }

    runtime {
        cpu: cpu
        docker: "uwgac/topmed-master@sha256:0bb7f98d6b9182d4e4a6b82c98c04a244d766707875ddfd8a48005a9f5c5481e"
        disks: "local-disk " + final_disk_size + " HDD"
        memory: "${memory} GB"
        # The attribute name is "preemptible" (singular); the misspelled
        # "preemptibles" key is not a recognized attribute, so the preempt
        # input never took effect.
        preemptible: "${preempt}"
    }
}
# Example workflow: runs ld_pruning once per GDS file, then hands each GDS file
# together with the output produced from it to echo_pairs as a Pair.
workflow dot_product_scatter {
    meta {
        author: "Ash O'Farrell"
        email: "aofarrel@ucsc.edu"
    }

    input {
        Array[File] gds_files
    }

    # First pass: one ld_pruning call per input file
    scatter(single_gds in gds_files) {
        call ld_pruning {
            input:
                gds_file = single_gds
        }
    }

    # Second pass: zip() lines up each input file with the ld_pruning output at
    # the same index. CWL would express this as a dotproduct scatter; zipping
    # the two arrays and scattering over the pairs is the closest WDL equivalent.
    scatter(gds_and_variants in zip(gds_files, ld_pruning.ld_pruning_output)) {
        call echo_pairs {
            input:
                gds_n_varinc = gds_and_variants
        }
    }
}