Stuart-WDL/dot_product_scatter.wdl at main · aofarrel/Stuart-WDL · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
version 1.0

# This workflow is an example of using the Pair[X, Y] type as a task input. It
# is based upon an LD pruning workflow, but should not be used for actual
# scientific analysis -- use the following workflow instead:
# https://dockstore.org/workflows/github.com/DataBiosphere/analysis_pipeline_WDL/ld-pruning-wdl

# Runs the ld_pruning.R script from the analysis pipeline image on one GDS file.
#
# Inputs:
#   gds_file  - the GDS file to prune
#   addldisk  - extra disk (GB) added on top of the size of gds_file
#   cpu       - number of CPUs requested
#   memory    - memory requested, in GB
#   preempt   - value passed to the "preemptible" runtime attribute
#
# Output:
#   ld_pruning_output - the first file matching *.RData in the working
#                       directory after the R script finishes
task ld_pruning {
	input {
		File gds_file

		# runtime attributes
		Int addldisk = 5
		Int cpu = 2
		Int memory = 4
		Int preempt = 3
	}

	# Estimate disk size required: input size rounded up to whole GB, plus headroom
	Int gds_size = ceil(size(gds_file, "GB"))
	Int final_disk_size = gds_size + addldisk

	command {
		set -eux -o pipefail

		# Generate a configuration file -- this is specific to the R script that this
		# task uses; generally, you wouldn't do this for most workflows.
		python << CODE
		import os

		# The R script expects the GDS files to contain "chr*" where * is chr number/X/Y,
		# so use some string manipulation to determine the output file name. Again, this
		# is one of those tricks that is specific to this particular R script.
		#
		# Only the file's basename is inspected: the localized path may contain "chr"
		# in a directory name, which must not be mistaken for a chromosome tag.
		gds_basename = os.path.splitext(os.path.basename("~{gds_file}"))[0]
		if "chr" in gds_basename:
			outfile_temp = "pruned_variants_chr" + gds_basename.split("chr")[1] + ".RData"
		else:
			outfile_temp = "pruned_variants.RData"

		# The with-block guarantees the config is flushed and closed before R reads it.
		with open("ld_pruning.config", "a") as f:
			f.write('gds_file "~{gds_file}"\n')
			f.write('genome_build hg38\n')
			f.write('out_file "' + outfile_temp + '"\n')
		CODE

		echo "Calling R script ld_pruning.R"
		Rscript /usr/local/analysis_pipeline/R/ld_pruning.R ld_pruning.config
	}

	runtime {
		cpu: cpu
		docker: "uwgac/topmed-master@sha256:0bb7f98d6b9182d4e4a6b82c98c04a244d766707875ddfd8a48005a9f5c5481e"
		disks: "local-disk " + final_disk_size + " HDD"
		memory: "${memory} GB"
		preemptible: "${preempt}"
	}
	output {
		# The config above names exactly one out_file, so take the first match.
		File ld_pruning_output = glob("*.RData")[0]
	}

}

# Prints the two members of a Pair[File, File] to stdout. This task exists only
# to demonstrate passing a Pair into a task; it does no analysis and declares
# no outputs.
#
# Inputs:
#   gds_n_varinc - pair of [GDS file, variants file produced for that GDS file]
#   addldisk     - extra disk (GB) added on top of the size of the GDS file
#   cpu          - number of CPUs requested
#   memory       - memory requested, in GB
#   preempt      - value passed to the "preemptible" runtime attribute
task echo_pairs {
	input {
		Pair[File, File] gds_n_varinc  # [gds, variants to prune]

		# runtime attributes
		Int addldisk = 5
		Int cpu = 2
		Int memory = 4
		Int preempt = 3
	}

	# Estimate disk size required: GDS size rounded up to whole GB, plus headroom
	Int gds_size = ceil(size(gds_n_varinc.left, "GB"))
	Int final_disk_size = gds_size + addldisk

	command {

		# File paths are passed as printf arguments rather than embedded in the
		# format string, so a '%' or backslash in a path is printed literally.
		printf "GDS file: %s\n\n" "~{gds_n_varinc.left}"
		printf "Resulting variant file it output: %s\n\n" "~{gds_n_varinc.right}"
		printf "We can now call another R script to subset each GDS file via the variants file..."
		printf "...but we won't, because I want to encourage you to use this workflow instead: "
		printf "https://dockstore.org/workflows/github.com/DataBiosphere/analysis_pipeline_WDL/ld-pruning-wdl"

	}

	runtime {
		cpu: cpu
		docker: "uwgac/topmed-master@sha256:0bb7f98d6b9182d4e4a6b82c98c04a244d766707875ddfd8a48005a9f5c5481e"
		disks: "local-disk " + final_disk_size + " HDD"
		memory: "${memory} GB"
		# Same attribute name as ld_pruning uses; the previous spelling
		# "preemptibles" did not match it.
		preemptible: "${preempt}"
	}
}


# Demonstration workflow: runs ld_pruning once per GDS file, then hands each
# (GDS file, ld_pruning output) pair to echo_pairs.
workflow dot_product_scatter {
	input {
		Array[File] gds_files
	}

	# First pass: one ld_pruning call per input file. Outside the scatter,
	# ld_pruning.ld_pruning_output is referenced as an array of the per-call results.
	scatter(one_gds in gds_files) {
		call ld_pruning { input: gds_file = one_gds }
	}

	# Second pass: zip() pairs the two arrays element by element, and each
	# resulting Pair goes to its own echo_pairs call. CWL would express this
	# with a dotproduct scatter; zip() is the closest WDL equivalent.
	scatter(gds_and_variants in zip(gds_files, ld_pruning.ld_pruning_output)) {
		call echo_pairs { input: gds_n_varinc = gds_and_variants }
	}

	meta {
		author: "Ash O'Farrell"
		email: "aofarrel@ucsc.edu"
	}
}