Skip to content

Commit b4f87c6

Browse files
committed
Merge branch 'main' of https://github.com/fhdsl/WDL_Workflows_Guide into main
2 parents f1f872a + 2fd0f2d commit b4f87c6

7 files changed

Lines changed: 238 additions & 162 deletions

File tree

docs/03-first-task.md

Lines changed: 61 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,10 @@ task BwaMem {
230230
# basename() is a built-in WDL function that acts like bash's basename
231231
String base_file_name = basename(input_fastq, ".fastq")
232232
String ref_fasta_local = basename(ref_fasta)
233+
String read_group_id = "ID:" + base_file_name
234+
String sample_name = "SM:" + base_file_name
235+
String platform = "illumina"
236+
String platform_info = "PL:" + platform # Create the platform information
233237
234238
command <<<
235239
set -eo pipefail
@@ -245,10 +249,10 @@ task BwaMem {
245249
246250
247251
bwa mem \
248-
-p -v 3 -t ~{threads} -M -R '@RG\tID:foo\tSM:foo2' \
249-
"~{ref_fasta_local}" "~{input_fastq}" > "~{base_file_name}.sam"
250-
samtools view -1bS -@ 15 -o "~{base_file_name}.aligned.bam" "~{base_file_name}.sam"
251-
samtools sort -n -@ 15 -o "~{base_file_name}.sorted_query_aligned.bam" "~{base_file_name}.aligned.bam"
252+
-p -v 3 -t ~{threads} -M -R '@RG\t~{read_group_id}\t~{sample_name}\t~{platform_info}' \
253+
~{ref_fasta_local} ~{input_fastq} > ~{base_file_name}.sam
254+
samtools view -1bS -@ 15 -o ~{base_file_name}.aligned.bam ~{base_file_name}.sam
255+
samtools sort -@ 15 -o ~{base_file_name}.sorted_query_aligned.bam ~{base_file_name}.aligned.bam
252256
253257
>>>
254258
}
@@ -261,7 +265,7 @@ The runtime attributes of a task tell the WDL executor important information abo
261265
runtime {
262266
memory: "48 GB"
263267
cpu: 16
264-
docker: "fredhutch/bwa:0.7.17"
268+
docker: "ghcr.io/getwilds/bwa:0.7.17"
265269
disks: "local-disk 100 SSD"
266270
}
267271
```
@@ -320,7 +324,6 @@ The outputs of a task are defined in the `output` section of your task. Typicall
320324

321325
```
322326
output {
323-
File analysisReadyBam = "~{base_file_name}.aligned.bam"
324327
File analysisReadySorted = "~{base_file_name}.sorted_query_aligned.bam"
325328
}
326329
```
@@ -329,12 +332,11 @@ Another way of writing this is with string concatenation. This is equivalent to
329332

330333
```
331334
output {
332-
File analysisReadyBam = base_file_name + ".aligned.bam"
333335
File analysisReadySorted = base_file_name + ".sorted_query_aligned.bam"
334336
}
335337
```
336338

337-
If the output was not in the working directory, we would need to change the output to point to the file's path relative to the working directory, such as `File analysisReadyBam = "some_folder/~{base_file_name}.aligned.bam"`.
339+
If the output was not in the working directory, we would need to change the output to point to the file's path relative to the working directory, such as `File analysisReadySorted = "some_folder/~{base_file_name}.sorted_query_aligned.bam"`.
338340

339341
Below are some some additional ways you can handle task outputs.
340342

@@ -405,48 +407,47 @@ We've now designed a bwa mem task that can run on essentially any backend that s
405407
task BwaMem {
406408
input {
407409
File input_fastq
408-
File ref_fasta
409-
File ref_fasta_index
410-
File ref_dict
411-
File ref_amb
412-
File ref_ann
413-
File ref_bwt
414-
File ref_pac
415-
File ref_sa
410+
referenceGenome refGenome
416411
Int threads = 16
417412
}
418413
419414
String base_file_name = basename(input_fastq, ".fastq")
420-
String ref_fasta_local = basename(ref_fasta)
415+
String ref_fasta_local = basename(refGenome.ref_fasta)
416+
417+
String read_group_id = "ID:" + base_file_name
418+
String sample_name = "SM:" + base_file_name
419+
String platform = "illumina"
420+
String platform_info = "PL:" + platform # Create the platform information
421+
421422
422423
command <<<
423424
set -eo pipefail
424425
425-
mv "~{ref_fasta}" .
426-
mv "~{ref_fasta_index}" .
427-
mv "~{ref_dict}" .
428-
mv "~{ref_amb}" .
429-
mv "~{ref_ann}" .
430-
mv "~{ref_bwt}" .
431-
mv "~{ref_pac}" .
432-
mv "~{ref_sa}" .
426+
#can we iterate through a struct??
427+
mv ~{refGenome.ref_fasta} .
428+
mv ~{refGenome.ref_fasta_index} .
429+
mv ~{refGenome.ref_dict} .
430+
mv ~{refGenome.ref_amb} .
431+
mv ~{refGenome.ref_ann} .
432+
mv ~{refGenome.ref_bwt} .
433+
mv ~{refGenome.ref_pac} .
434+
mv ~{refGenome.ref_sa} .
433435
434436
bwa mem \
435-
-p -v 3 -t ~{threads} -M -R '@RG\tID:foo\tSM:foo2' \
436-
"~{ref_fasta_local}" "~{input_fastq}" > "~{base_file_name}.sam"
437-
samtools view -1bS -@ 15 -o "~{base_file_name}.aligned.bam" "~{base_file_name}.sam"
438-
samtools sort -n -@ 15 -o "~{base_file_name}.sorted_query_aligned.bam" "~{base_file_name}.aligned.bam"
439-
437+
-p -v 3 -t ~{threads} -M -R '@RG\t~{read_group_id}\t~{sample_name}\t~{platform_info}' \
438+
~{ref_fasta_local} ~{input_fastq} > ~{base_file_name}.sam
439+
samtools view -1bS -@ 15 -o ~{base_file_name}.aligned.bam ~{base_file_name}.sam
440+
samtools sort -@ 15 -o ~{base_file_name}.sorted_query_aligned.bam ~{base_file_name}.aligned.bam
440441
>>>
442+
441443
output {
442-
File analysisReadyBam = "~{base_file_name}.aligned.bam"
443444
File analysisReadySorted = "~{base_file_name}.sorted_query_aligned.bam"
444445
}
446+
445447
runtime {
446448
memory: "48 GB"
447449
cpu: 16
448-
docker: "fredhutch/bwa:0.7.17"
449-
disks: "local-disk 100 SSD"
450+
docker: "ghcr.io/getwilds/bwa:0.7.17"
450451
}
451452
}
452453
```
@@ -459,10 +460,10 @@ For the workflow to actually "see" the task, the task will either need to be imp
459460
```
460461
version 1.0
461462
462-
workflow minidata_test_alignment {
463+
workflow mutation_calling {
463464
input {
464465
# Sample info
465-
File sampleFastq
466+
File tumorFastq
466467
# Reference Genome information
467468
File ref_fasta
468469
File ref_fasta_index
@@ -479,7 +480,7 @@ workflow minidata_test_alignment {
479480
# Map reads to reference
480481
call BwaMem {
481482
input:
482-
input_fastq = sampleFastq,
483+
input_fastq = input_fastq,
483484
ref_fasta = ref_fasta,
484485
ref_fasta_index = ref_fasta_index,
485486
ref_dict = ref_dict,
@@ -525,8 +526,10 @@ task BwaMem {
525526
Int threads = 16
526527
}
527528
528-
String base_file_name = basename(input_fastq, ".fastq")
529-
String ref_fasta_local = basename(ref_fasta)
529+
String read_group_id = "ID:" + base_file_name
530+
String sample_name = "SM:" + base_file_name
531+
String platform = "illumina"
532+
String platform_info = "PL:" + platform # Create the platform information
530533
531534
command <<<
532535
set -eo pipefail
@@ -541,7 +544,7 @@ task BwaMem {
541544
mv "~{ref_sa}" .
542545
543546
bwa mem \
544-
-p -v 3 -t ~{threads} -M -R '@RG\tID:foo\tSM:foo2' \
547+
-p -v 3 -t ~{threads} -M -R '@RG\t~{read_group_id}\t~{sample_name}\t~{platform_info}' \
545548
"~{ref_fasta_local}" "~{input_fastq}" > "~{base_file_name}.sam"
546549
samtools view -1bS -@ 15 -o "~{base_file_name}.aligned.bam" "~{base_file_name}.sam"
547550
samtools sort -n -@ 15 -o "~{base_file_name}.sorted_query_aligned.bam" "~{base_file_name}.aligned.bam"
@@ -554,7 +557,7 @@ task BwaMem {
554557
runtime {
555558
memory: "48 GB"
556559
cpu: 16
557-
docker: "fredhutch/bwa:0.7.17"
560+
docker: "ghcr.io/getwilds/bwa:0.7.17"
558561
disks: "local-disk 100 SSD"
559562
}
560563
}
@@ -563,3 +566,21 @@ task BwaMem {
563566
## Testing your first task
564567

565568
To test your first task and your workflow, you should have expectation of output is. For this first `BwaMem` task, we just care that the BAM file is created with aligned reads. You can use `samtools view output.sorted_query_aligned.bam` to examine the reads and pipe it to wordcount `wc` to get the number of total reads. This number should be almost identical as the number of reads from your input FASTQ file if you run `wc input.fastq`. In other tasks, we might have a more precise expectation of what the output file should be, such as containing the specific somatic mutation call that we have curated.
569+
570+
Here is an example JSON with the test data needed to run this single-task workflow:
571+
```
572+
{
573+
"mutation_calling.tumorFastq": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/workflow_testing_data/WDL/wdl_101/HCC4006_final.fastq",
574+
"mutation_calling.ref_fasta": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/genome_data/human/hg19/Homo_sapiens_assembly19.fasta",
575+
"mutation_calling.ref_fasta_index": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/genome_data/human/hg19/Homo_sapiens_assembly19.fasta.fai",
576+
"mutation_calling.ref_dict": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/genome_data/human/hg19/Homo_sapiens_assembly19.dict",
577+
"mutation_calling.ref_pac": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/genome_data/human/hg19/Homo_sapiens_assembly19.fasta.pac",
578+
"mutation_calling.ref_sa": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/genome_data/human/hg19/Homo_sapiens_assembly19.fasta.sa",
579+
"mutation_calling.ref_amb": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/genome_data/human/hg19/Homo_sapiens_assembly19.fasta.amb",
580+
"mutation_calling.ref_ann": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/genome_data/human/hg19/Homo_sapiens_assembly19.fasta.ann",
581+
"mutation_calling.ref_bwt": "/fh/fast/paguirigan_a/pub/ReferenceDataSets/genome_data/human/hg19/Homo_sapiens_assembly19.fasta.bwt",
582+
"mutation_calling.ref_name": "hg19"
583+
}
584+
```
585+
586+
If you are not running on the Fred Hutch HPC, you'll need to modify your JSON file to point to wherever you have the data files stored. You can download the same fastq we're using from [our sandbox repo](https://github.com/fhdsl/WDL-sandbox/tree/main/data), and the reference files can be generated via `samtools index` or [downloaded from the Broad Institute's mirror](https://data.broadinstitute.org/snowman/hg19/).

docs/WDL-Workflows-Guide.docx

803 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)