adna-workflow/demultiplex_broad.wdl at master · DReichLab/adna-workflow · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import "demultiplex.wdl" as demultiplex_master

workflow demultiplex_align_bams {
	# Broad-input variant of the demultiplex-and-align workflow.
	# Read files are gathered per lane by the local intake_fastq task; every
	# other step is a task imported from demultiplex.wdl under the
	# demultiplex_master namespace.

	# ------------------------------------------------------------------
	# Inputs
	# ------------------------------------------------------------------

	# Not referenced by any call or expression in this workflow.
	String blc_input_directory
	# date and dataset_label together name the output directory and the
	# renamed lane statistics file.
	String dataset_label
	String date

	File i5_indices
	File i7_indices
	File barcodeSets

	File barcodes_q_only
	File labeled_i5
	File labeled_i7

	File adna_screen_jar
	File picard_jar

	# Alignment settings, passed unchanged to both alignment calls.
	Float missing_alignments_fraction
	Int max_open_gaps
	Int seed_length

	Int samples_to_demultiplex

	File index_barcode_keys

	# Helper scripts handed to the imported tasks.
	# python_lane_name is not referenced by any call in this workflow.
	File python_lane_name
	File python_index_pairs_without_barcodes
	File python_common_unknown_barcodes
	File python_kmer_analysis
	File python_prepare_report

	# A reference must sit in the same directory as the files derived from it.
	# prepare_reference places them all in one directory, so every later use
	# of a reference has to go through that copy.
	File reference_in
	File mt_reference_rsrs_in

	# ------------------------------------------------------------------
	# Output locations
	# ------------------------------------------------------------------

	String output_path_parent
	String output_path = output_path_parent + "/" + date + "_" + dataset_label
	# These subdirectory names duplicate the ones in demultiplex.wdl; no way
	# of sharing them between the two files was found.
	String nuclear_demultiplex_subdirectory = "nuclear_aligned_unfiltered"
	String mt_demultiplex_subdirectory = "rsrs_aligned_filtered"
	String output_path_nuclear_aligned_unfiltered = output_path + "/" + nuclear_demultiplex_subdirectory
	String output_path_rsrs_aligned_filtered = output_path + "/" + mt_demultiplex_subdirectory

	# ------------------------------------------------------------------
	# References and tool versions
	# ------------------------------------------------------------------

	call demultiplex_master.prepare_reference as prepare_reference_nuclear {
		input:
			reference = reference_in
	}
	call demultiplex_master.prepare_reference as prepare_reference_rsrs {
		input:
			reference = mt_reference_rsrs_in
	}
	call demultiplex_master.versions {
		input:
			adna_screen_jar = adna_screen_jar,
			picard_jar = picard_jar,
			index_barcode_keys_to_stop_call_caching = index_barcode_keys
	}

	# ------------------------------------------------------------------
	# Broad-specific intake
	# ------------------------------------------------------------------

	call intake_fastq {
		# all inputs for this call come from the inputs json
	}

	# ------------------------------------------------------------------
	# Per-lane barcode counting, then merging and trimming
	# ------------------------------------------------------------------

	scatter (lane_for_counting in intake_fastq.read_files_by_lane) {
		call demultiplex_master.barcode_count_check {
			input:
				adna_screen_jar = adna_screen_jar,
				i5_indices = i5_indices,
				i7_indices = i7_indices,
				barcodeSets = barcodeSets,
				read_files_by_lane = lane_for_counting
		}
	}
	call demultiplex_master.aggregate_statistics as aggregate_barcode_count_statistics {
		input:
			adna_screen_jar = adna_screen_jar,
			statistics_by_group = barcode_count_check.barcode_count_statistics
	}

	# The merge step consumes the barcode counts aggregated over all lanes,
	# so it runs as a second pass over the same lanes.
	scatter (lane_for_merging in intake_fastq.read_files_by_lane) {
		call demultiplex_master.merge_and_trim_lane {
			input:
				adna_screen_jar = adna_screen_jar,
				i5_indices = i5_indices,
				i7_indices = i7_indices,
				barcodeSets = barcodeSets,
				read_files_by_lane = lane_for_merging,
				label = "merged",
				barcode_count_statistics = aggregate_barcode_count_statistics.statistics,
				index_barcode_keys = index_barcode_keys
		}
	}
	call demultiplex_master.collect_read_group_info {
		input:
			read_groups_by_lane = merge_and_trim_lane.read_group
	}
	call demultiplex_master.aggregate_statistics as aggregate_lane_statistics {
		input:
			adna_screen_jar = adna_screen_jar,
			statistics_by_group = merge_and_trim_lane.statistics
	}
	call demultiplex_master.prepare_demultiplex_report {
		input:
			python_prepare_report = python_prepare_report,
			demultiplex_statistics = aggregate_lane_statistics.statistics,
			index_barcode_keys = index_barcode_keys,
			dataset_label = dataset_label,
			date = date
	}
	call demultiplex_master.collect_filenames {
		input:
			filename_arrays = merge_and_trim_lane.fastq_to_align
	}

	# Not referenced by any call in this workflow.
	String read_group = dataset_label

	# ------------------------------------------------------------------
	# Alignment: nuclear reference per fastq, RSRS reference as one pool
	# ------------------------------------------------------------------

	scatter (merged_fastq in collect_filenames.filenames) {
		call demultiplex_master.align as align_nuclear {
			input:
				missing_alignments_fraction = missing_alignments_fraction,
				max_open_gaps = max_open_gaps,
				seed_length = seed_length,
				fastq_to_align = merged_fastq,
				reference = prepare_reference_nuclear.reference_fa,
				reference_amb = prepare_reference_nuclear.reference_amb,
				reference_ann = prepare_reference_nuclear.reference_ann,
				reference_bwt = prepare_reference_nuclear.reference_bwt,
				reference_pac = prepare_reference_nuclear.reference_pac,
				reference_sa = prepare_reference_nuclear.reference_sa
		}
	}
	call demultiplex_master.align_pool as align_rsrs {
		input:
			missing_alignments_fraction = missing_alignments_fraction,
			max_open_gaps = max_open_gaps,
			seed_length = seed_length,
			fastq_to_align = collect_filenames.filenames,
			reference = prepare_reference_rsrs.reference_fa,
			reference_amb = prepare_reference_rsrs.reference_amb,
			reference_ann = prepare_reference_rsrs.reference_ann,
			reference_bwt = prepare_reference_rsrs.reference_bwt,
			reference_pac = prepare_reference_rsrs.reference_pac,
			reference_sa = prepare_reference_rsrs.reference_sa
	}

	# ------------------------------------------------------------------
	# Clean the aligned bams
	# ------------------------------------------------------------------

	call demultiplex_master.clean_sam as clean_sam_nuclear {
		input:
			picard_jar = picard_jar,
			bams = align_nuclear.bam
	}
	call demultiplex_master.clean_sam as clean_sam_rsrs {
		input:
			picard_jar = picard_jar,
			bams = align_rsrs.bam
	}

	# ------------------------------------------------------------------
	# Demultiplex: nuclear bams are sorted, RSRS bams are filtered
	# ------------------------------------------------------------------

	call demultiplex_master.demultiplex as demultiplex_nuclear {
		input:
			adna_screen_jar = adna_screen_jar,
			prealignment_statistics = aggregate_lane_statistics.statistics,
			aligned_bam_files = clean_sam_nuclear.cleaned,
			samples_to_demultiplex = samples_to_demultiplex,
			index_barcode_keys = index_barcode_keys
	}
	call demultiplex_master.sort as sort_nuclear {
		input:
			picard_jar = picard_jar,
			bams = demultiplex_nuclear.demultiplexed_bam
	}
	call demultiplex_master.demultiplex as demultiplex_rsrs {
		input:
			adna_screen_jar = adna_screen_jar,
			prealignment_statistics = aggregate_lane_statistics.statistics,
			aligned_bam_files = clean_sam_rsrs.cleaned,
			samples_to_demultiplex = samples_to_demultiplex,
			index_barcode_keys = index_barcode_keys
	}
	call demultiplex_master.filter_aligned_only as filter_aligned_only_rsrs {
		input:
			picard_jar = picard_jar,
			bams = demultiplex_rsrs.demultiplexed_bam,
			minutes = 50
	}

	# ------------------------------------------------------------------
	# Unknown-barcode and k-mer analysis
	# ------------------------------------------------------------------

	call demultiplex_master.index_pairs_without_barcodes {
		input:
			python_index_pairs_without_barcodes = python_index_pairs_without_barcodes,
			barcode_count_statistics = aggregate_barcode_count_statistics.statistics
	}
	# Third demultiplex pass over the cleaned RSRS bams, keyed by the index
	# pairs produced above instead of index_barcode_keys, with
	# samples_to_demultiplex fixed at 0.
	call demultiplex_master.demultiplex as demultiplex_for_unknown_barcodes {
		input:
			adna_screen_jar = adna_screen_jar,
			prealignment_statistics = aggregate_lane_statistics.statistics,
			aligned_bam_files = clean_sam_rsrs.cleaned,
			samples_to_demultiplex = 0,
			index_barcode_keys = index_pairs_without_barcodes.index_pairs
	}
	call demultiplex_master.common_unknown_barcodes {
		input:
			python_common_unknown_barcodes = python_common_unknown_barcodes,
			bams_without_known_barcodes = demultiplex_for_unknown_barcodes.demultiplexed_bam
	}
	call demultiplex_master.kmer_analysis {
		input:
			python_kmer_analysis = python_kmer_analysis,
			python_prepare_report = python_prepare_report,
			barcodes_q_only = barcodes_q_only,
			labeled_i5 = labeled_i5,
			labeled_i7 = labeled_i7,
			counts_by_index_barcode_key = aggregate_lane_statistics.statistics,
			index_barcode_keys = index_barcode_keys,
			unknown_barcodes = common_unknown_barcodes.unknown_barcodes,
			dataset_label = dataset_label,
			date = date
	}

	# ------------------------------------------------------------------
	# Copy results under output_path
	# ------------------------------------------------------------------

	call demultiplex_master.copy_output as copy_nuclear_aligned_unfiltered {
		input:
			files = sort_nuclear.sorted,
			output_path = output_path_nuclear_aligned_unfiltered
	}
	call demultiplex_master.copy_output as copy_rsrs_aligned_filtered {
		input:
			files = filter_aligned_only_rsrs.filtered,
			output_path = output_path_rsrs_aligned_filtered
	}
	call demultiplex_master.copy_and_rename as copy_and_rename_demultiplex_nuclear_statistics {
		input:
			source_file = demultiplex_nuclear.statistics,
			output_path = output_path,
			output_filename_no_path = "nuclear_statistics"
	}
	call demultiplex_master.copy_and_rename as copy_and_rename_demultiplex_mt_statistics {
		input:
			source_file = demultiplex_rsrs.statistics,
			output_path = output_path,
			output_filename_no_path = "mt_statistics"
	}

	Array[File] misc_output_files = [collect_read_group_info.read_groups, kmer_analysis.analysis, versions.versions, prepare_demultiplex_report.report]
	call demultiplex_master.copy_output as copy_misc_output_files {
		input:
			files = misc_output_files,
			output_path = output_path
	}
	call demultiplex_master.copy_and_rename as copy_and_rename_lane_statistics {
		input:
			source_file = aggregate_lane_statistics.statistics,
			output_path = output_path,
			output_filename_no_path = date + "_" + dataset_label + ".demultiplex_statistics"
	}

	# ------------------------------------------------------------------
	# Database update
	# ------------------------------------------------------------------

	# `unused` is fed the sum of every copy call's `copied` output.
	# NOTE(review): presumably this exists only so the database update waits
	# for all copies to finish - confirm against demultiplex.wdl.
	call demultiplex_master.update_database_with_demultiplexed {
		input:
			date_string = date,
			name = dataset_label,
			nuclear_demultiplex_subdirectory = nuclear_demultiplex_subdirectory,
			mt_demultiplex_subdirectory = mt_demultiplex_subdirectory,
			flowcell_by_lane = true,
			unused = (copy_nuclear_aligned_unfiltered.copied + copy_rsrs_aligned_filtered.copied + copy_and_rename_demultiplex_nuclear_statistics.copied + copy_and_rename_demultiplex_mt_statistics.copied + copy_misc_output_files.copied + copy_and_rename_lane_statistics.copied)
	}

	# ------------------------------------------------------------------
	# Workflow outputs
	# ------------------------------------------------------------------

	output {
		Array[File] nuclear_bams = sort_nuclear.sorted
		Array[File] rsrs_bams = filter_aligned_only_rsrs.filtered
		File kmer_analysis_report = kmer_analysis.analysis
		File aggregated_statistics = aggregate_lane_statistics.statistics
		File demultiplex_report = prepare_demultiplex_report.report
	}
}

task intake_fastq {
	# Runs the lane-arrangement script over the *.fastq.gz files directly
	# inside fastq_directory and reads its tab-separated stdout back as one
	# array of files per row.

	# Directory whose *.fastq.gz entries are handed to the script. The glob
	# is left unquoted so that the shell expands it.
	String fastq_directory
	# Script that receives the fastq paths and prints the rows read below.
	File python_arrange_lane_fastq
	# When false, --no_index_reads is added to the script's command line.
	Boolean has_index_reads = true

	command {
		python3 ${python_arrange_lane_fastq} ${if has_index_reads then "" else "--no_index_reads"} ${fastq_directory}/*.fastq.gz > files_by_lane
	}
	output {
		# one row of files_by_lane per inner array
		Array[Array[File]] read_files_by_lane = read_tsv("files_by_lane")
	}
	runtime {
		requested_memory_mb_per_core: 100
		runtime_minutes: 2
	}
}