Merge pull request #36 from fhdsl/argh!

aofarrel · web-flow · commit d2325f382887 · 2024-02-29T13:32:06.000-08:00
Just paste the chapter 6 workflow in
diff --git a/06-arrays.Rmd b/06-arrays.Rmd
@@ -118,7 +118,7 @@ A scatter is essentially the WDL version of a [for loop](https://en.wikipedia.or
   }
 ```
 
-You can reference a full copy of this workflow [here](../resources/chapter-6.wdl).
+You can reference a full copy of this workflow at the end of this chapter.
 
 
 ## Referencing an array in a task
@@ -163,3 +163,220 @@ task count_words_python {
   }
 }
 ```
+
+## The workflow so far
+```
+version 1.0
+
+## Workflow developed by Sitapriya Moorthi @ Fred Hutch LMD: 01/10/24 for use by DaSL @ Fred Hutch.
+## Workflow updated on: 02/28/24 by Ash O'Farrell (UCSC)
+
+struct referenceGenome {
+    File ref_fasta
+    File ref_fasta_index
+    File ref_dict
+    File ref_amb
+    File ref_ann
+    File ref_bwt
+    File ref_pac
+    File ref_sa
+    String ref_name
+}
+
+
+workflow minidata_mutation_calling_chapter6 {
+  input {
+    Array[File] tumorSamples
+
+    referenceGenome refGenome
+    
+    File dbSNP_vcf
+    File dbSNP_vcf_index
+    File known_indels_sites_VCFs
+    File known_indels_sites_indices
+
+  }
+ 
+  # Scatter for tumor samples   
+  scatter (tumorFastq in tumorSamples) {
+    call BwaMem as tumorBwaMem {
+      input:
+        input_fastq = tumorFastq,
+        refGenome = refGenome
+    }
+    
+    call MarkDuplicates as tumorMarkDuplicates {
+      input:
+        input_bam = tumorBwaMem.analysisReadySorted
+    }
+
+    call ApplyBaseRecalibrator as tumorApplyBaseRecalibrator{
+      input:
+        input_bam = tumorMarkDuplicates.markDuplicates_bam,
+        input_bam_index = tumorMarkDuplicates.markDuplicates_bai,
+        dbSNP_vcf = dbSNP_vcf,
+        dbSNP_vcf_index = dbSNP_vcf_index,
+        known_indels_sites_VCFs = known_indels_sites_VCFs,
+        known_indels_sites_indices = known_indels_sites_indices,
+        refGenome = refGenome
+    }
+  }
+
+  output {
+    Array[File] tumoralignedBamSorted = tumorBwaMem.analysisReadySorted
+    Array[File] tumorMarkDuplicates_bam = tumorMarkDuplicates.markDuplicates_bam
+    Array[File] tumorMarkDuplicates_bai = tumorMarkDuplicates.markDuplicates_bai
+    Array[File] tumoranalysisReadyBam = tumorApplyBaseRecalibrator.recalibrated_bam 
+    Array[File] tumoranalysisReadyIndex = tumorApplyBaseRecalibrator.recalibrated_bai
+  }
+}
+# TASK DEFINITIONS
+
+# Align fastq file to the reference genome
+task BwaMem {
+  input {
+    File input_fastq
+    referenceGenome refGenome
+    Int threads = 16
+  }
+  
+  String base_file_name = basename(input_fastq, ".fastq")
+  String ref_fasta_local = basename(refGenome.ref_fasta)
+
+  String read_group_id = "ID:" + base_file_name
+  String sample_name = "SM:" + base_file_name
+  String platform = "illumina"
+  String platform_info = "PL:" + platform   # Create the platform information
+
+
+  command <<<
+    set -eo pipefail
+
+    mv ~{refGenome.ref_fasta} .
+    mv ~{refGenome.ref_fasta_index} .
+    mv ~{refGenome.ref_dict} .
+    mv ~{refGenome.ref_amb} .
+    mv ~{refGenome.ref_ann} .
+    mv ~{refGenome.ref_bwt} .
+    mv ~{refGenome.ref_pac} .
+    mv ~{refGenome.ref_sa} .
+
+    bwa mem \
+      -p -v 3 -t ~{threads} -M -R '@RG\t~{read_group_id}\t~{sample_name}\t~{platform_info}' \
+      ~{ref_fasta_local} ~{input_fastq} > ~{base_file_name}.sam 
+    samtools view -1bS -@ 15 -o ~{base_file_name}.aligned.bam ~{base_file_name}.sam
+    samtools sort -@ 15 -o ~{base_file_name}.sorted_query_aligned.bam ~{base_file_name}.aligned.bam
+  >>>
+
+  output {
+    File analysisReadySorted = "~{base_file_name}.sorted_query_aligned.bam"
+  }
+  
+  runtime {
+    memory: "48 GB"
+    cpu: 16
+    docker: "ghcr.io/getwilds/bwa:0.7.17"
+  }
+}
+
+# Mark duplicates
+task MarkDuplicates {
+  input {
+    File input_bam
+  }
+
+  String base_file_name = basename(input_bam, ".sorted_query_aligned.bam")
+  String output_bam = "~{base_file_name}.duplicates_marked.bam"
+  String output_bai = "~{base_file_name}.duplicates_marked.bai"
+  String metrics_file = "~{base_file_name}.duplicate_metrics"
+
+  command <<<
+    gatk MarkDuplicates \
+      --INPUT ~{input_bam} \
+      --OUTPUT ~{output_bam} \
+      --METRICS_FILE ~{metrics_file} \
+      --CREATE_INDEX true \
+      --OPTICAL_DUPLICATE_PIXEL_DISTANCE 100 \
+      --VALIDATION_STRINGENCY SILENT
+  >>>
+
+  runtime {
+    docker: "ghcr.io/getwilds/gatk:4.3.0.0"
+    memory: "48 GB"
+    cpu: 4
+  }
+
+  output {
+    File markDuplicates_bam = "~{output_bam}"
+    File markDuplicates_bai = "~{output_bai}"
+    File duplicate_metrics = "~{metrics_file}"
+  }
+}
+
+# Base quality recalibration
+task ApplyBaseRecalibrator {
+  input {
+    File input_bam
+    File input_bam_index
+    File dbSNP_vcf
+    File dbSNP_vcf_index
+    File known_indels_sites_VCFs
+    File known_indels_sites_indices
+    referenceGenome refGenome
+  }
+  
+  String base_file_name = basename(input_bam, ".duplicates_marked.bam")
+  
+  String ref_fasta_local = basename(refGenome.ref_fasta)
+  String dbSNP_vcf_local = basename(dbSNP_vcf)
+  String known_indels_sites_VCFs_local = basename(known_indels_sites_VCFs)
+
+
+  command <<<
+  set -eo pipefail
+
+  mv ~{refGenome.ref_fasta} .
+  mv ~{refGenome.ref_fasta_index} .
+  mv ~{refGenome.ref_dict} .
+
+  mv ~{dbSNP_vcf} .
+  mv ~{dbSNP_vcf_index} .
+
+  mv ~{known_indels_sites_VCFs} .
+  mv ~{known_indels_sites_indices} .
+
+  samtools index ~{input_bam} #redundant? markduplicates already does this?
+
+  gatk --java-options "-Xms8g" \
+      BaseRecalibrator \
+      -R ~{ref_fasta_local} \
+      -I ~{input_bam} \
+      -O ~{base_file_name}.recal_data.csv \
+      --known-sites ~{dbSNP_vcf_local} \
+      --known-sites ~{known_indels_sites_VCFs_local} \
+      
+
+  gatk --java-options "-Xms8g" \
+      ApplyBQSR \
+      -bqsr ~{base_file_name}.recal_data.csv \
+      -I ~{input_bam} \
+      -O ~{base_file_name}.recal.bam \
+      -R ~{ref_fasta_local} \
+      
+
+  #finds the current sort order of this bam file
+  samtools view -H ~{base_file_name}.recal.bam | grep @SQ | sed 's/@SQ\tSN:\|LN://g' > ~{base_file_name}.sortOrder.txt
+>>>
+
+  output {
+    File recalibrated_bam = "~{base_file_name}.recal.bam"
+    File recalibrated_bai = "~{base_file_name}.recal.bai"
+    File sortOrder = "~{base_file_name}.sortOrder.txt"
+  }
+  runtime {
+    memory: "36 GB"
+    cpu: 2
+    docker: "ghcr.io/getwilds/gatk:4.3.0.0"
+  }
+}
+```