Skip to content

Commit d2325f3

Browse files
authored
Merge pull request #36 from fhdsl/argh!
Just paste the chapter 6 workflow in
2 parents 4e58d3a + abccc03 commit d2325f3

1 file changed

Lines changed: 218 additions & 1 deletion

File tree

06-arrays.Rmd

Lines changed: 218 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ A scatter is essentially the WDL version of a [for loop](https://en.wikipedia.or
118118
}
119119
```
120120

121-
You can reference a full copy of this workflow [here](../resources/chapter-6.wdl).
121+
You can reference a full copy of this workflow at the end of this chapter.
122122

123123

124124
## Referencing an array in a task
@@ -163,3 +163,220 @@ task count_words_python {
163163
}
164164
}
165165
```
166+
167+
## The workflow so far
168+
```
169+
version 1.0
170+
171+
## Workflow developed by Sitapriya Moorthi @ Fred Hutch LMD: 01/10/24 for use by DaSL @ Fred Hutch.
172+
## Workflow updated on: 02/28/24 by Ash O'Farrell (UCSC)
173+
174+
struct referenceGenome {
175+
File ref_fasta
176+
File ref_fasta_index
177+
File ref_dict
178+
File ref_amb
179+
File ref_ann
180+
File ref_bwt
181+
File ref_pac
182+
File ref_sa
183+
String ref_name
184+
}
185+
186+
187+
workflow minidata_mutation_calling_chapter6 {
188+
input {
189+
Array[File] tumorSamples
190+
191+
referenceGenome refGenome
192+
193+
File dbSNP_vcf
194+
File dbSNP_vcf_index
195+
File known_indels_sites_VCFs
196+
File known_indels_sites_indices
197+
198+
}
199+
200+
# Scatter for tumor samples
201+
scatter (tumorFastq in tumorSamples) {
202+
call BwaMem as tumorBwaMem {
203+
input:
204+
input_fastq = tumorFastq,
205+
refGenome = refGenome
206+
}
207+
208+
call MarkDuplicates as tumorMarkDuplicates {
209+
input:
210+
input_bam = tumorBwaMem.analysisReadySorted
211+
}
212+
213+
call ApplyBaseRecalibrator as tumorApplyBaseRecalibrator{
214+
input:
215+
input_bam = tumorMarkDuplicates.markDuplicates_bam,
216+
input_bam_index = tumorMarkDuplicates.markDuplicates_bai,
217+
dbSNP_vcf = dbSNP_vcf,
218+
dbSNP_vcf_index = dbSNP_vcf_index,
219+
known_indels_sites_VCFs = known_indels_sites_VCFs,
220+
known_indels_sites_indices = known_indels_sites_indices,
221+
refGenome = refGenome
222+
}
223+
}
224+
225+
output {
226+
Array[File] tumoralignedBamSorted = tumorBwaMem.analysisReadySorted
227+
Array[File] tumorMarkDuplicates_bam = tumorMarkDuplicates.markDuplicates_bam
228+
Array[File] tumorMarkDuplicates_bai = tumorMarkDuplicates.markDuplicates_bai
229+
Array[File] tumoranalysisReadyBam = tumorApplyBaseRecalibrator.recalibrated_bam
230+
Array[File] tumoranalysisReadyIndex = tumorApplyBaseRecalibrator.recalibrated_bai
231+
}
232+
}
233+
# TASK DEFINITIONS
234+
235+
# Align fastq file to the reference genome
236+
task BwaMem {
237+
input {
238+
File input_fastq
239+
referenceGenome refGenome
240+
Int threads = 16
241+
}
242+
243+
String base_file_name = basename(input_fastq, ".fastq")
244+
String ref_fasta_local = basename(refGenome.ref_fasta)
245+
246+
String read_group_id = "ID:" + base_file_name
247+
String sample_name = "SM:" + base_file_name
248+
String platform = "illumina"
249+
String platform_info = "PL:" + platform # Create the platform information
250+
251+
252+
command <<<
253+
set -eo pipefail
254+
255+
mv ~{refGenome.ref_fasta} .
256+
mv ~{refGenome.ref_fasta_index} .
257+
mv ~{refGenome.ref_dict} .
258+
mv ~{refGenome.ref_amb} .
259+
mv ~{refGenome.ref_ann} .
260+
mv ~{refGenome.ref_bwt} .
261+
mv ~{refGenome.ref_pac} .
262+
mv ~{refGenome.ref_sa} .
263+
264+
bwa mem \
265+
-p -v 3 -t ~{threads} -M -R '@RG\t~{read_group_id}\t~{sample_name}\t~{platform_info}' \
266+
~{ref_fasta_local} ~{input_fastq} > ~{base_file_name}.sam
267+
samtools view -1bS -@ 15 -o ~{base_file_name}.aligned.bam ~{base_file_name}.sam
268+
samtools sort -@ 15 -o ~{base_file_name}.sorted_query_aligned.bam ~{base_file_name}.aligned.bam
269+
>>>
270+
271+
output {
272+
File analysisReadySorted = "~{base_file_name}.sorted_query_aligned.bam"
273+
}
274+
275+
runtime {
276+
memory: "48 GB"
277+
cpu: 16
278+
docker: "ghcr.io/getwilds/bwa:0.7.17"
279+
}
280+
}
281+
282+
# Mark duplicates
283+
task MarkDuplicates {
284+
input {
285+
File input_bam
286+
}
287+
288+
String base_file_name = basename(input_bam, ".sorted_query_aligned.bam")
289+
String output_bam = "~{base_file_name}.duplicates_marked.bam"
290+
String output_bai = "~{base_file_name}.duplicates_marked.bai"
291+
String metrics_file = "~{base_file_name}.duplicate_metrics"
292+
293+
command <<<
294+
gatk MarkDuplicates \
295+
--INPUT ~{input_bam} \
296+
--OUTPUT ~{output_bam} \
297+
--METRICS_FILE ~{metrics_file} \
298+
--CREATE_INDEX true \
299+
--OPTICAL_DUPLICATE_PIXEL_DISTANCE 100 \
300+
--VALIDATION_STRINGENCY SILENT
301+
>>>
302+
303+
runtime {
304+
docker: "ghcr.io/getwilds/gatk:4.3.0.0"
305+
memory: "48 GB"
306+
cpu: 4
307+
}
308+
309+
output {
310+
File markDuplicates_bam = "~{output_bam}"
311+
File markDuplicates_bai = "~{output_bai}"
312+
File duplicate_metrics = "~{metrics_file}"
313+
}
314+
}
315+
316+
# Base quality recalibration
317+
task ApplyBaseRecalibrator {
318+
input {
319+
File input_bam
320+
File input_bam_index
321+
File dbSNP_vcf
322+
File dbSNP_vcf_index
323+
File known_indels_sites_VCFs
324+
File known_indels_sites_indices
325+
referenceGenome refGenome
326+
}
327+
328+
String base_file_name = basename(input_bam, ".duplicates_marked.bam")
329+
330+
String ref_fasta_local = basename(refGenome.ref_fasta)
331+
String dbSNP_vcf_local = basename(dbSNP_vcf)
332+
String known_indels_sites_VCFs_local = basename(known_indels_sites_VCFs)
333+
334+
335+
command <<<
336+
set -eo pipefail
337+
338+
mv ~{refGenome.ref_fasta} .
339+
mv ~{refGenome.ref_fasta_index} .
340+
mv ~{refGenome.ref_dict} .
341+
342+
mv ~{dbSNP_vcf} .
343+
mv ~{dbSNP_vcf_index} .
344+
345+
mv ~{known_indels_sites_VCFs} .
346+
mv ~{known_indels_sites_indices} .
347+
348+
samtools index ~{input_bam} #redundant? markduplicates already does this?
349+
350+
gatk --java-options "-Xms8g" \
351+
BaseRecalibrator \
352+
-R ~{ref_fasta_local} \
353+
-I ~{input_bam} \
354+
-O ~{base_file_name}.recal_data.csv \
355+
--known-sites ~{dbSNP_vcf_local} \
356+
--known-sites ~{known_indels_sites_VCFs_local} \
357+
358+
359+
gatk --java-options "-Xms8g" \
360+
ApplyBQSR \
361+
-bqsr ~{base_file_name}.recal_data.csv \
362+
-I ~{input_bam} \
363+
-O ~{base_file_name}.recal.bam \
364+
-R ~{ref_fasta_local} \
365+
366+
367+
#finds the current sort order of this bam file
368+
samtools view -H ~{base_file_name}.recal.bam | grep @SQ | sed 's/@SQ\tSN:\|LN://g' > ~{base_file_name}.sortOrder.txt
369+
>>>
370+
371+
output {
372+
File recalibrated_bam = "~{base_file_name}.recal.bam"
373+
File recalibrated_bai = "~{base_file_name}.recal.bai"
374+
File sortOrder = "~{base_file_name}.sortOrder.txt"
375+
}
376+
runtime {
377+
memory: "36 GB"
378+
cpu: 2
379+
docker: "ghcr.io/getwilds/gatk:4.3.0.0"
380+
}
381+
}
382+
```

0 commit comments

Comments
 (0)