@@ -118,7 +118,7 @@ A scatter is essentially the WDL version of a [for loop](https://en.wikipedia.or
118118 }
119119```
120120
121- You can reference a full copy of this workflow [ here ] ( ../resources/ chapter-6.wdl ) .
121+ You can reference a full copy of this workflow at the end of this chapter.
122122
123123
124124## Referencing an array in a task
@@ -163,3 +163,220 @@ task count_words_python {
163163 }
164164}
165165```
166+
167+ ## The workflow so far
168+ ```
169+ version 1.0
170+
171+ ## Workflow developed by Sitapriya Moorthi @ Fred Hutch LMD: 01/10/24 for use by DaSL @ Fred Hutch.
172+ ## Workflow updated on: 02/28/24 by Ash O'Farrell (UCSC)
173+
174+ struct referenceGenome {
175+ File ref_fasta
176+ File ref_fasta_index
177+ File ref_dict
178+ File ref_amb
179+ File ref_ann
180+ File ref_bwt
181+ File ref_pac
182+ File ref_sa
183+ String ref_name
184+ }
185+
186+
187+ workflow minidata_mutation_calling_chapter6 {
188+ input {
189+ Array[File] tumorSamples
190+
191+ referenceGenome refGenome
192+
193+ File dbSNP_vcf
194+ File dbSNP_vcf_index
195+ File known_indels_sites_VCFs
196+ File known_indels_sites_indices
197+
198+ }
199+
200+ # Scatter for tumor samples
201+ scatter (tumorFastq in tumorSamples) {
202+ call BwaMem as tumorBwaMem {
203+ input:
204+ input_fastq = tumorFastq,
205+ refGenome = refGenome
206+ }
207+
208+ call MarkDuplicates as tumorMarkDuplicates {
209+ input:
210+ input_bam = tumorBwaMem.analysisReadySorted
211+ }
212+
213+ call ApplyBaseRecalibrator as tumorApplyBaseRecalibrator{
214+ input:
215+ input_bam = tumorMarkDuplicates.markDuplicates_bam,
216+ input_bam_index = tumorMarkDuplicates.markDuplicates_bai,
217+ dbSNP_vcf = dbSNP_vcf,
218+ dbSNP_vcf_index = dbSNP_vcf_index,
219+ known_indels_sites_VCFs = known_indels_sites_VCFs,
220+ known_indels_sites_indices = known_indels_sites_indices,
221+ refGenome = refGenome
222+ }
223+ }
224+
225+ output {
226+ Array[File] tumoralignedBamSorted = tumorBwaMem.analysisReadySorted
227+ Array[File] tumorMarkDuplicates_bam = tumorMarkDuplicates.markDuplicates_bam
228+ Array[File] tumorMarkDuplicates_bai = tumorMarkDuplicates.markDuplicates_bai
229+ Array[File] tumoranalysisReadyBam = tumorApplyBaseRecalibrator.recalibrated_bam
230+ Array[File] tumoranalysisReadyIndex = tumorApplyBaseRecalibrator.recalibrated_bai
231+ }
232+ }
233+ # TASK DEFINITIONS
234+
235+ # Align fastq file to the reference genome
236+ task BwaMem {
237+ input {
238+ File input_fastq
239+ referenceGenome refGenome
240+ Int threads = 16
241+ }
242+
243+ String base_file_name = basename(input_fastq, ".fastq")
244+ String ref_fasta_local = basename(refGenome.ref_fasta)
245+
246+ String read_group_id = "ID:" + base_file_name
247+ String sample_name = "SM:" + base_file_name
248+ String platform = "illumina"
249+ String platform_info = "PL:" + platform # Create the platform information
250+
251+
252+ command <<<
253+ set -eo pipefail
254+
255+ mv ~{refGenome.ref_fasta} .
256+ mv ~{refGenome.ref_fasta_index} .
257+ mv ~{refGenome.ref_dict} .
258+ mv ~{refGenome.ref_amb} .
259+ mv ~{refGenome.ref_ann} .
260+ mv ~{refGenome.ref_bwt} .
261+ mv ~{refGenome.ref_pac} .
262+ mv ~{refGenome.ref_sa} .
263+
264+ bwa mem \
265+ -p -v 3 -t ~{threads} -M -R '@RG\t~{read_group_id}\t~{sample_name}\t~{platform_info}' \
266+ ~{ref_fasta_local} ~{input_fastq} > ~{base_file_name}.sam
267+ samtools view -1bS -@ 15 -o ~{base_file_name}.aligned.bam ~{base_file_name}.sam
268+ samtools sort -@ 15 -o ~{base_file_name}.sorted_query_aligned.bam ~{base_file_name}.aligned.bam
269+ >>>
270+
271+ output {
272+ File analysisReadySorted = "~{base_file_name}.sorted_query_aligned.bam"
273+ }
274+
275+ runtime {
276+ memory: "48 GB"
277+ cpu: 16
278+ docker: "ghcr.io/getwilds/bwa:0.7.17"
279+ }
280+ }
281+
282+ # Mark duplicates
283+ task MarkDuplicates {
284+ input {
285+ File input_bam
286+ }
287+
288+ String base_file_name = basename(input_bam, ".sorted_query_aligned.bam")
289+ String output_bam = "~{base_file_name}.duplicates_marked.bam"
290+ String output_bai = "~{base_file_name}.duplicates_marked.bai"
291+ String metrics_file = "~{base_file_name}.duplicate_metrics"
292+
293+ command <<<
294+ gatk MarkDuplicates \
295+ --INPUT ~{input_bam} \
296+ --OUTPUT ~{output_bam} \
297+ --METRICS_FILE ~{metrics_file} \
298+ --CREATE_INDEX true \
299+ --OPTICAL_DUPLICATE_PIXEL_DISTANCE 100 \
300+ --VALIDATION_STRINGENCY SILENT
301+ >>>
302+
303+ runtime {
304+ docker: "ghcr.io/getwilds/gatk:4.3.0.0"
305+ memory: "48 GB"
306+ cpu: 4
307+ }
308+
309+ output {
310+ File markDuplicates_bam = "~{output_bam}"
311+ File markDuplicates_bai = "~{output_bai}"
312+ File duplicate_metrics = "~{metrics_file}"
313+ }
314+ }
315+
316+ # Base quality recalibration
317+ task ApplyBaseRecalibrator {
318+ input {
319+ File input_bam
320+ File input_bam_index
321+ File dbSNP_vcf
322+ File dbSNP_vcf_index
323+ File known_indels_sites_VCFs
324+ File known_indels_sites_indices
325+ referenceGenome refGenome
326+ }
327+
328+ String base_file_name = basename(input_bam, ".duplicates_marked.bam")
329+
330+ String ref_fasta_local = basename(refGenome.ref_fasta)
331+ String dbSNP_vcf_local = basename(dbSNP_vcf)
332+ String known_indels_sites_VCFs_local = basename(known_indels_sites_VCFs)
333+
334+
335+ command <<<
336+ set -eo pipefail
337+
338+ mv ~{refGenome.ref_fasta} .
339+ mv ~{refGenome.ref_fasta_index} .
340+ mv ~{refGenome.ref_dict} .
341+
342+ mv ~{dbSNP_vcf} .
343+ mv ~{dbSNP_vcf_index} .
344+
345+ mv ~{known_indels_sites_VCFs} .
346+ mv ~{known_indels_sites_indices} .
347+
348+ samtools index ~{input_bam} #redundant? markduplicates already does this?
349+
350+ gatk --java-options "-Xms8g" \
351+ BaseRecalibrator \
352+ -R ~{ref_fasta_local} \
353+ -I ~{input_bam} \
354+ -O ~{base_file_name}.recal_data.csv \
355+ --known-sites ~{dbSNP_vcf_local} \
356+ --known-sites ~{known_indels_sites_VCFs_local} \
357+
358+
359+ gatk --java-options "-Xms8g" \
360+ ApplyBQSR \
361+ -bqsr ~{base_file_name}.recal_data.csv \
362+ -I ~{input_bam} \
363+ -O ~{base_file_name}.recal.bam \
364+ -R ~{ref_fasta_local} \
365+
366+
367+ #finds the current sort order of this bam file
368+ samtools view -H ~{base_file_name}.recal.bam | grep @SQ | sed 's/@SQ\tSN:\|LN://g' > ~{base_file_name}.sortOrder.txt
369+ >>>
370+
371+ output {
372+ File recalibrated_bam = "~{base_file_name}.recal.bam"
373+ File recalibrated_bai = "~{base_file_name}.recal.bai"
374+ File sortOrder = "~{base_file_name}.sortOrder.txt"
375+ }
376+ runtime {
377+ memory: "36 GB"
378+ cpu: 2
379+ docker: "ghcr.io/getwilds/gatk:4.3.0.0"
380+ }
381+ }
382+ ```
0 commit comments