@@ -230,6 +230,10 @@ task BwaMem {
230230 # basename() is a built-in WDL function that acts like bash's basename
231231 String base_file_name = basename(input_fastq, ".fastq")
232232 String ref_fasta_local = basename(ref_fasta)
233+ String read_group_id = "ID:" + base_file_name
234+ String sample_name = "SM:" + base_file_name
235+ String platform = "illumina"
236+ String platform_info = "PL:" + platform # Create the platform information
233237
234238 command <<<
235239 set -eo pipefail
@@ -245,10 +249,10 @@ task BwaMem {
245249
246250
247251 bwa mem \
248- -p -v 3 -t ~{threads} -M -R '@RG\tID:foo\tSM:foo2 ' \
249- " ~{ref_fasta_local}" " ~{input_fastq}" > " ~{base_file_name}.sam"
250- samtools view -1bS -@ 15 -o " ~{base_file_name}.aligned.bam" " ~{base_file_name}.sam"
251- samtools sort -n - @ 15 -o " ~{base_file_name}.sorted_query_aligned.bam" " ~{base_file_name}.aligned.bam"
252+ -p -v 3 -t ~{threads} -M -R '@RG\t~{read_group_id}\t~{sample_name}\t~{platform_info} ' \
253+ ~{ref_fasta_local} ~{input_fastq} > ~{base_file_name}.sam
254+ samtools view -1bS -@ 15 -o ~{base_file_name}.aligned.bam ~{base_file_name}.sam
255+ samtools sort -@ 15 -o ~{base_file_name}.sorted_query_aligned.bam ~{base_file_name}.aligned.bam
252256
253257 >>>
254258}
@@ -261,7 +265,7 @@ The runtime attributes of a task tell the WDL executor important information abo
261265 runtime {
262266 memory: "48 GB"
263267 cpu: 16
264- docker: "fredhutch /bwa:0.7.17"
268+ docker: "ghcr.io/getwilds /bwa:0.7.17"
265269 disks: "local-disk 100 SSD"
266270 }
267271```
@@ -320,7 +324,6 @@ The outputs of a task are defined in the `output` section of your task. Typicall
320324
321325```
322326 output {
323- File analysisReadyBam = "~{base_file_name}.aligned.bam"
324327 File analysisReadySorted = "~{base_file_name}.sorted_query_aligned.bam"
325328 }
326329```
@@ -329,12 +332,11 @@ Another way of writing this is with string concatenation. This is equivalent to
329332
330333```
331334 output {
332- File analysisReadyBam = base_file_name + ".aligned.bam"
333335 File analysisReadySorted = base_file_name + ".sorted_query_aligned.bam"
334336 }
335337```
336338
337- If the output was not in the working directory, we would need to change the output to point to the file's path relative to the working directory, such as ` File analysisReadyBam = "some_folder/~{base_file_name}.aligned .bam" ` .
339+ If the output was not in the working directory, we would need to change the output to point to the file's path relative to the working directory, such as ` File analysisReadySorted = "some_folder/~{base_file_name}.sorted_query_aligned .bam" ` .
338340
339341Below are some some additional ways you can handle task outputs.
340342
@@ -405,48 +407,47 @@ We've now designed a bwa mem task that can run on essentially any backend that s
405407task BwaMem {
406408 input {
407409 File input_fastq
408- File ref_fasta
409- File ref_fasta_index
410- File ref_dict
411- File ref_amb
412- File ref_ann
413- File ref_bwt
414- File ref_pac
415- File ref_sa
410+ referenceGenome refGenome
416411 Int threads = 16
417412 }
418413
419414 String base_file_name = basename(input_fastq, ".fastq")
420- String ref_fasta_local = basename(ref_fasta)
415+ String ref_fasta_local = basename(refGenome.ref_fasta)
416+
417+ String read_group_id = "ID:" + base_file_name
418+ String sample_name = "SM:" + base_file_name
419+ String platform = "illumina"
420+ String platform_info = "PL:" + platform # Create the platform information
421+
421422
422423 command <<<
423424 set -eo pipefail
424425
425- mv "~{ref_fasta}" .
426- mv "~{ref_fasta_index}" .
427- mv "~{ref_dict}" .
428- mv "~{ref_amb}" .
429- mv "~{ref_ann}" .
430- mv "~{ref_bwt}" .
431- mv "~{ref_pac}" .
432- mv "~{ref_sa}" .
426+ #can we iterate through a struct??
427+ mv ~{refGenome.ref_fasta} .
428+ mv ~{refGenome.ref_fasta_index} .
429+ mv ~{refGenome.ref_dict} .
430+ mv ~{refGenome.ref_amb} .
431+ mv ~{refGenome.ref_ann} .
432+ mv ~{refGenome.ref_bwt} .
433+ mv ~{refGenome.ref_pac} .
434+ mv ~{refGenome.ref_sa} .
433435
434436 bwa mem \
435- -p -v 3 -t ~{threads} -M -R '@RG\tID:foo\tSM:foo2' \
436- "~{ref_fasta_local}" "~{input_fastq}" > "~{base_file_name}.sam"
437- samtools view -1bS -@ 15 -o "~{base_file_name}.aligned.bam" "~{base_file_name}.sam"
438- samtools sort -n -@ 15 -o "~{base_file_name}.sorted_query_aligned.bam" "~{base_file_name}.aligned.bam"
439-
437+ -p -v 3 -t ~{threads} -M -R '@RG\t~{read_group_id}\t~{sample_name}\t~{platform_info}' \
438+ ~{ref_fasta_local} ~{input_fastq} > ~{base_file_name}.sam
439+ samtools view -1bS -@ 15 -o ~{base_file_name}.aligned.bam ~{base_file_name}.sam
440+ samtools sort -@ 15 -o ~{base_file_name}.sorted_query_aligned.bam ~{base_file_name}.aligned.bam
440441 >>>
442+
441443 output {
442- File analysisReadyBam = "~{base_file_name}.aligned.bam"
443444 File analysisReadySorted = "~{base_file_name}.sorted_query_aligned.bam"
444445 }
446+
445447 runtime {
446448 memory: "48 GB"
447449 cpu: 16
448- docker: "fredhutch/bwa:0.7.17"
449- disks: "local-disk 100 SSD"
450+ docker: "ghcr.io/getwilds/bwa:0.7.17"
450451 }
451452}
452453```
@@ -459,10 +460,10 @@ For the workflow to actually "see" the task, the task will either need to be imp
459460```
460461version 1.0
461462
462- workflow minidata_test_alignment {
463+ workflow mutation_calling {
463464 input {
464465 # Sample info
465- File sampleFastq
466+ File tumorFastq
466467 # Reference Genome information
467468 File ref_fasta
468469 File ref_fasta_index
@@ -479,7 +480,7 @@ workflow minidata_test_alignment {
479480 # Map reads to reference
480481 call BwaMem {
481482 input:
482- input_fastq = sampleFastq ,
483+ input_fastq = input_fastq ,
483484 ref_fasta = ref_fasta,
484485 ref_fasta_index = ref_fasta_index,
485486 ref_dict = ref_dict,
@@ -525,8 +526,10 @@ task BwaMem {
525526 Int threads = 16
526527 }
527528
528- String base_file_name = basename(input_fastq, ".fastq")
529- String ref_fasta_local = basename(ref_fasta)
529+ String read_group_id = "ID:" + base_file_name
530+ String sample_name = "SM:" + base_file_name
531+ String platform = "illumina"
532+ String platform_info = "PL:" + platform # Create the platform information
530533
531534 command <<<
532535 set -eo pipefail
@@ -541,7 +544,7 @@ task BwaMem {
541544 mv "~{ref_sa}" .
542545
543546 bwa mem \
544- -p -v 3 -t ~{threads} -M -R '@RG\tID:foo\tSM:foo2 ' \
547+ -p -v 3 -t ~{threads} -M -R '@RG\t~{read_group_id}\t~{sample_name}\t~{platform_info} ' \
545548 "~{ref_fasta_local}" "~{input_fastq}" > "~{base_file_name}.sam"
546549 samtools view -1bS -@ 15 -o "~{base_file_name}.aligned.bam" "~{base_file_name}.sam"
547550 samtools sort -n -@ 15 -o "~{base_file_name}.sorted_query_aligned.bam" "~{base_file_name}.aligned.bam"
@@ -554,7 +557,7 @@ task BwaMem {
554557 runtime {
555558 memory: "48 GB"
556559 cpu: 16
557- docker: "fredhutch /bwa:0.7.17"
560+ docker: "ghcr.io/getwilds /bwa:0.7.17"
558561 disks: "local-disk 100 SSD"
559562 }
560563}
@@ -563,3 +566,21 @@ task BwaMem {
563566## Testing your first task
564567
565568To test your first task and your workflow, you should have expectation of output is. For this first ` BwaMem ` task, we just care that the BAM file is created with aligned reads. You can use ` samtools view output.sorted_query_aligned.bam ` to examine the reads and pipe it to wordcount ` wc ` to get the number of total reads. This number should be almost identical as the number of reads from your input FASTQ file if you run ` wc input.fastq ` . In other tasks, we might have a more precise expectation of what the output file should be, such as containing the specific somatic mutation call that we have curated.
569+
570+ Here is an example JSON with the test data needed to run this single-task workflow:
571+ ```
572+ {
573+ "mutation_calling.tumorFastq": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/workflow_testing_data/WDL/wdl_101/HCC4006_final.fastq",
574+ "mutation_calling.ref_fasta": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/genome_data/human/hg19/Homo_sapiens_assembly19.fasta",
575+ "mutation_calling.ref_fasta_index": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/genome_data/human/hg19/Homo_sapiens_assembly19.fasta.fai",
576+ "mutation_calling.ref_dict": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/genome_data/human/hg19/Homo_sapiens_assembly19.dict",
577+ "mutation_calling.ref_pac": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/genome_data/human/hg19/Homo_sapiens_assembly19.fasta.pac",
578+ "mutation_calling.ref_sa": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/genome_data/human/hg19/Homo_sapiens_assembly19.fasta.sa",
579+ "mutation_calling.ref_amb": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/genome_data/human/hg19/Homo_sapiens_assembly19.fasta.amb",
580+ "mutation_calling.ref_ann": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/genome_data/human/hg19/Homo_sapiens_assembly19.fasta.ann",
581+ "mutation_calling.ref_bwt": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/genome_data/human/hg19/Homo_sapiens_assembly19.fasta.bwt",
582+ "mutation_calling.ref_name": "hg19"
583+ }
584+ ```
585+
586+ If you are not running on the Fred Hutch HPC, you'll need to modify your JSON file to point to wherever you have the data files stored. You can download the same fastq we're using from [ our sandbox repo] ( https://github.com/fhdsl/WDL-sandbox/tree/main/data ) , and the reference files can be generated via ` samtools index ` or [ downloaded from the Broad Institute's mirror] ( https://data.broadinstitute.org/snowman/hg19/ ) .
0 commit comments