-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathprepare_report.py
More file actions
350 lines (321 loc) · 13.5 KB
/
prepare_report.py
File metadata and controls
350 lines (321 loc) · 13.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
# combine statistics from samples with damage reports into a tab-separated file readable by MS Excel
import sys
import math
import argparse
READS_THRESHOLD_TO_REPORT_KEY = 25000
headersToReport = [
'sample_sheet_key',
'library_id',
'plate_id',
'experiment',
'i5',
'i7',
'p5_barcode',
'p7_barcode',
'sample_sheet_i5',
'sample_sheet_i7',
'sample_sheet_p5_barcode',
'sample_sheet_p7_barcode',
'raw',
'merged',
'oligo',
'endogenous_pre',
'autosome_pre', 'autosome_pre-coverageLength',
'autosome_post', 'autosome_post-coverageLength',
'X_pre', 'X_pre-coverageLength',
'X_post', 'X_post-coverageLength',
'Y_pre', 'Y_pre-coverageLength',
'Y_post', 'Y_post-coverageLength',
'duplicates_nuclear',
'median_nuclear',
'mean_nuclear',
'damage_nuclear_ct1',
'damage_nuclear_ct2',
'damage_nuclear_ga1',
'damage_nuclear_ga2',
'damage_nuclear_last_base_avg',
'damage_nuclear_penultimate_base_avg',
'MT_pre', 'MT_pre-coverageLength',
'MT_post', 'MT_post-coverageLength',
'MT_nuclear_ratio',
'duplicates_rsrs',
'median_rsrs',
'mean_rsrs',
'damage_rsrs_ct1',
'damage_rsrs_ct2',
'damage_rsrs_ga1',
'damage_rsrs_ga2',
'damage_rsrs_last_base_avg',
'damage_rsrs_penultimate_base_avg',
'MT_Haplogroup',
'MT_Haplogroup_rank',
'MT_Haplogroup_NotFoundPolys',
'MT_Haplogroup_FoundPolys',
'MT_Haplogroup_RemainingPolys',
# 'contamination_schmutzi',
# 'contamination_schmutzi_lower',
# 'contamination_schmutzi_upper',
# 'contamination_rare_variant',
# 'contamination_rare_variant_lower',
# 'contamination_rare_variant_upper',
'contamination_contammix',
'contamination_contammix_lower',
'contamination_contammix_upper',
'contamination_contammix_gelman',
'contamination_contammix_inferred_error',
'1240k_pre_autosome',
'1240k_post_autosome',
'1240k_pre_x',
'1240k_post_x',
'1240k_pre_y',
'1240k_post_y',
'1240k_post_sex',
'angsd_nsites',
'angsd_MoM_z',
'angsd_MoM',
'angsd_SE(MoM)',
'angsd_ML_z',
'angsd_ML',
'angsd_SE(ML)',
'1240k_unique_target_frac',
'preseq_unique_targets_hit',
'preseq_marginal_uniqueness',
'preseq_raw_reads_inverse_e',
'preseq_raw_reads_tenth',
'preseq_coverage_at_marginal_uniqueness_0.368',
'preseq_coverage_at_marginal_uniqueness_0.10',
]
preseq_unique_target_per_raw_read_thresholds = ['0.005']
for threshold in preseq_unique_target_per_raw_read_thresholds:
headersToReport.append('preseq_total_reads_required_{}'.format(threshold))
headersToReport.append('preseq_additional_reads_required_{}'.format(threshold))
headersToReport.append('preseq_expected_unique_targets_at_threshold_{}'.format(threshold))
headersToReport.append('preseq_target_coverage_at_threshold_{}'.format(threshold))
samples = dict()
def eprint(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)
def addToSamples(f):
for line in f:
#key label1 value1 label2 value2 ...
tokens = str.strip(line).split('\t')
# to workaround Cromwell problem where task output filenames cannot contain ':'
# we re-replace alternate character with ':' for analysis
sampleID = tokens[0].replace('-', ':')
labels = tokens[1:len(tokens):2]
values = tokens[2:len(tokens):2]
if sampleID not in samples:
samples[sampleID] = {}
for n in range(0, len(labels)):
try:
samples[sampleID][labels[n]] = values[n]
except Exception as e:
print('Error for {}\t{}'.format(sampleID, labels[n]), file=sys.stderr)
raise e
def isCoverageLabel(label):
return "coverageLength" in label
# if a coverage label, normalize the coverage length sum to a coverage multiplier
# otherwise, do nothing
def coverageNormalization(originalLabel, value):
label = originalLabel
length = 1
if isCoverageLabel(label):
label = originalLabel.replace('Length', '')
if label.startswith('MT'):
length = 16569
elif label.startswith('autosome'):
length = 2881033286
elif label.startswith('X'):
length = 155270560
elif label.startswith('Y'):
length = 59373566
return label, (float(value) / length)
def printSample(sampleID, thisSample):
print(sampleID, end='\t')
for label in headersToReport:
if thisSample is not None and label in thisSample:
# adjust coverage values, and pass through non-coverage values
value = thisSample[label]
if isCoverageLabel(label):
ignoredLabel, value = coverageNormalization(label, value)
value = '%.4g' % value
print(value, end='')
print('\t', end='')
print('')
# match a barcode pair with the sample sheet barcodes
# each of the Q barcodes comprises 4 7-base-pair sequences
# for example: Q2 = {13, 14, 15, 16}
# the demultiplexing will use the Q sequences, so 13 will appear as Q2
# if the sample sheet uses 13, match a Q2 with 13
# returns:
# 1: index-barcode 4 tuple
# 2: sample/extract/library id if any, or empty string
def findSampleSheetEntry(sampleID, keyMapping):
libraryID, plateID, experiment = keyMapping.get(sampleID, ['','',''])
sampleSheetID = ''
if libraryID != '':
sampleSheetID = sampleID
else:
i5, i7, p5_set, p7_set = sampleID.split('_')
for p5 in p5_set.split(':'):
for p7 in p7_set.split(':'):
trialSampleID = '{}_{}_{}_{}'.format(i5, i7, p5, p7)
trialLibraryID, trialPlateID, trialExperiment = keyMapping.get(trialSampleID, ['','',''])
if libraryID == '':
if trialLibraryID != '':
libraryID = trialLibraryID
sampleSheetID = trialSampleID
plateID = trialPlateID
experiment = trialExperiment
# if there is more than one libraryID that matches, we have a nonprogramming problem
elif trialLibraryID != '':
MULTIPLE = 'MULTIPLE'
libraryID = MULTIPLE
sampleSheetID = MULTIPLE
plateID = MULTIPLE
experiment = MULTIPLE
return sampleSheetID, libraryID, plateID, experiment
# parse an index-barcode key into labels
def parse_index_barcode_key_into_labels(key, i5_labels, i7_labels, barcode_labels):
i5 = i7 = p5 = p7 = ''
try:
i5_sequence, i7_sequence, p5_sequence_set, p7_sequence_set = key.split('_')
i5 = sequence_to_label(i5_sequence, i5_labels)
i7 = sequence_to_label(i7_sequence, i7_labels)
p5 = sequence_to_label(p5_sequence_set, barcode_labels)
p7 = sequence_to_label(p7_sequence_set, barcode_labels)
except:
pass
return i5, i7, p5, p7
# if any component sequences are not in the dictionary, then return the sequence
def sequence_to_label(sequence, labels):
if sequence in labels:
return labels[sequence]
else:
return sequence
# return a map of sequences to labels
# Labels are much easier to read than the sequences
def sequence_labels(filename):
mapping = {}
try:
with open(filename) as f:
for line in f:
fields = line.split()
sequence_string = fields[0]
label = fields[1]
mapping[sequence_string] = label # set (or singleton) is always labeled
if ':' in sequence_string: # for Q barcodes, each individual sequence has a name
sequences = sequence_string.split(':')
for i in range(len(sequences)):
mapping[sequences[i]] = fields[2+i]
except:
pass
return mapping
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Combine ancient DNA analysis outputs into a single file keyed by index-barcode key")
parser.add_argument("--barcode_labels_filename", help="File containing mapping of barcode sequences (P5 and P7) to labels")
parser.add_argument("--i5_labels_filename", help="File containing mapping of i5 sequences to labels")
parser.add_argument("--i7_labels_filename", help="File containing mapping of i7 sequences to labels")
parser.add_argument("statistics_filename", help="File containing read count total and per sample")
parser.add_argument("index_barcode_filename", help="Map from index-barcode key to library ID, plate ID, and experiment")
parser.add_argument("keyed_values", help="Any number of files containing keyed values. First column is index-barcode key, followed by pairs of (field name, value) columns", nargs='*')
args = parser.parse_args()
barcode_labels = sequence_labels(args.barcode_labels_filename)
i5_labels = sequence_labels(args.i5_labels_filename)
i7_labels = sequence_labels(args.i7_labels_filename)
# read from stats
statsFilename = args.statistics_filename
with open(statsFilename, "r") as f:
line = f.readline()
total_reads = int(line)
print('Total reads: ', total_reads)
addToSamples(f)
# mapping from index and barcodes to sample/extract/library ID and plate ID
keyMapping = dict()
keyMappingFilename = args.index_barcode_filename
with open(keyMappingFilename, "r") as f:
for line in f:
fields = [x.strip() for x in line.split('\t')]
index_barcode_key = fields[0]
sample_extract_library_id = fields[1]
plate_id = fields[2]
experiment = fields[3]
keyMapping[index_barcode_key] = [sample_extract_library_id, plate_id, experiment]
# read from damages, medians, haplogroups
# these are all files with the index barcode keys and additional keyed fields
filenames = args.keyed_values
for filename in filenames:
with open(filename, "r") as f:
addToSamples(f)
# populate additional sample fields derivable from other fields
for sampleID in samples:
singleSample = samples[sampleID]
# parse index-barcode fields into components
singleSample['i5'], singleSample['i7'], singleSample['p5_barcode'], singleSample['p7_barcode'] = parse_index_barcode_key_into_labels(sampleID, i5_labels, i7_labels, barcode_labels)
# add sample/extract/library ID, if available
samples[sampleID]['sample_sheet_key'], samples[sampleID]['library_id'], samples[sampleID]['plate_id'], samples[sampleID]['experiment'] = findSampleSheetEntry(sampleID, keyMapping)
singleSample['sample_sheet_i5'], singleSample['sample_sheet_i7'], singleSample['sample_sheet_p5_barcode'], singleSample['sample_sheet_p7_barcode'] = parse_index_barcode_key_into_labels(singleSample.get('sample_sheet_key', ''), i5_labels, i7_labels, barcode_labels)
# add % endogenous
if ('autosome_pre' in singleSample
or 'X_pre' in singleSample
or 'Y_pre' in singleSample
or 'MT_pre' in singleSample):
merged_count = int(samples[sampleID].get('merged', '0'))
samples[sampleID]['endogenous_pre'] = float(
int(samples[sampleID].get('autosome_pre', '0'))
+ int(samples[sampleID].get('X_pre', '0'))
+ int(samples[sampleID].get('Y_pre', '0'))
+ int(samples[sampleID].get('MT_pre', '0'))) / merged_count if merged_count > 0 else 0.0
# % MT_nuclear_ratio
chromosome_count = int(singleSample.get('autosome_post', '0')) + int(singleSample.get('X_post', '0')) + int(singleSample.get('Y_post', '0'))
MT_count = int(singleSample.get('MT_post', '0'))
if MT_count == 0:
singleSample['MT_nuclear_ratio'] = 0
else:
singleSample['MT_nuclear_ratio'] = MT_count / chromosome_count if chromosome_count > 0 else 100
# 1240k_unique_target_frac
num_1240k_autosomes = 1150639
singleSample['1240k_unique_target_frac'] = int(singleSample.get('preseq_unique_targets_hit', '0')) / num_1240k_autosomes
# damage for hg19 at last base and second to last base
if 'damage_nuclear_ct1' in singleSample and 'damage_nuclear_ga1' in singleSample:
singleSample['damage_nuclear_last_base_avg'] = (float(singleSample['damage_nuclear_ct1']) + float(singleSample[ 'damage_nuclear_ga1'])) / 2
if 'damage_nuclear_ct2' in singleSample and 'damage_nuclear_ga2' in singleSample:
singleSample['damage_nuclear_penultimate_base_avg'] = (float(singleSample['damage_nuclear_ct2']) + float(singleSample[ 'damage_nuclear_ga2'])) / 2
# damage for rsrs at last base and second to last base
if 'damage_rsrs_ct1' in singleSample and 'damage_rsrs_ga1' in singleSample:
singleSample['damage_rsrs_last_base_avg'] = (float(singleSample['damage_rsrs_ct1']) + float(singleSample[ 'damage_rsrs_ga1'])) / 2
if 'damage_rsrs_ct2' in singleSample and 'damage_rsrs_ga2' in singleSample:
singleSample['damage_rsrs_penultimate_base_avg'] = (float(singleSample['damage_rsrs_ct2']) + float(singleSample[ 'damage_rsrs_ga2'])) / 2
# angsd z scores
if 'angsd_MoM' in singleSample and 'angsd_SE(MoM)' in singleSample:
value = float(singleSample['angsd_MoM'])
stdev = float(singleSample['angsd_SE(MoM)'])
if stdev > 0:
singleSample['angsd_MoM_z'] = value / stdev
if 'angsd_ML' in singleSample and 'angsd_SE(ML)' in singleSample:
value = float(singleSample['angsd_ML'])
stdev = float(singleSample['angsd_SE(ML)'])
if stdev > 0:
singleSample['angsd_ML_z'] = value / stdev
# print headers
print ('Index-Barcode Key', end='\t')
for header in headersToReport:
headerToPrint = header
# if a coverage label, shorten it
if isCoverageLabel(header):
headerToPrint, unusedValue = coverageNormalization(header, 0)
print(headerToPrint, end='\t')
print ('') # includes newline
sorted_samples = sorted(samples, key=lambda x: int(samples[x].get('raw', 0)), reverse=True)
# output each sample with data using preset header order
for sampleID in sorted_samples:
thisSample = samples[sampleID]
try:
if int(samples[sampleID].get('raw', 0)) >= READS_THRESHOLD_TO_REPORT_KEY or sampleID in keyMapping:
printSample(sampleID, thisSample)
except KeyError:
eprint('KeyError', sampleID)
# samples that are expected, but do not have results
for sampleID in keyMapping:
if sampleID not in samples:
libraryID, plateID, experiment = keyMapping[sampleID]
print('{0}\t{0}\t{1}\t{2}\t{3}'.format(sampleID, libraryID, plateID, experiment) )