-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy patharrange_lane_fastq.py
More file actions
73 lines (63 loc) · 2.78 KB
/
arrange_lane_fastq.py
File metadata and controls
73 lines (63 loc) · 2.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import argparse
from pathlib import Path
import re
from collections import OrderedDict
class LaneFiles:
def __init__(self):
self.I1 = None
self.I2 = None
self.R1 = None
self.R2 = None
# this is consumed by WDL read_tsv()
# this order matches the inputs for merge_and_trim_lane and barcode_count
def __repr__(self):
if self.is_complete():
return '{}\t{}\t{}\t{}'.format(self.R1, self.R2, self.I1, self.I2)
elif self.is_complete_no_index_reads():
return '{}\t{}'.format(self.R1, self.R2)
def is_complete(self):
return self.I1 != None and self.I2 != None and self.is_complete_no_index_reads()
def is_complete_no_index_reads(self):
return self.R1 != None and self.R2 != None
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Group fastq files by lane for index-barcode analysis and merging", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--no_index_reads", help="Allow index (barcode) reads to be missing", action='store_true')
parser.add_argument("fastq", help="fastq files: index and regular reads", nargs='+')
args = parser.parse_args()
files_by_lane = OrderedDict()
# Populate LaneFiles objects with the file paths
for fullpath in args.fastq:
try:
# Broad non-demultiplexed fastq files look like:
# 1_HY352CCXY.1.1.fastq.gz
# 1_HY352CCXY.1.2.fastq.gz
# 1_HY352CCXY.1.barcode_1.fastq.gz
# 1_HY352CCXY.1.barcode_2.fastq.gz
m = re.match('(\d+)_([a-zA-Z0-9]+).(\d).([a-zA-Z0-9_]+).fastq', Path(fullpath).name)
flowcell = m.group(2)
lane_number = int(m.group(3))
read_type = m.group(4)
#print('{}\t{:d}\t{}\t{}'.format(fullpath, lane_number, read_type, flowcell))
except: # fullpaths from the output of bcl2fastq for NextSeq are expected to look like this Undetermined_S0_L001_I1_001.fastq
m = re.match('Undetermined_S0_L(\d+)_([IR]\d)_001.fastq', Path(fullpath).name)
lane_number = int(m.group(1))
read_type = m.group(2)
#print('{}\t{:d}\t{}'.format(fullpath, lane_number, read_type))
if lane_number not in files_by_lane:
files_by_lane[lane_number] = LaneFiles()
if read_type in ['I1', 'barcode_1']:
files_by_lane[lane_number].I1 = fullpath
elif read_type in ['I2', 'barcode_2']:
files_by_lane[lane_number].I2 = fullpath
elif read_type in ['R1', '1']:
files_by_lane[lane_number].R1 = fullpath
elif read_type in ['R2', '2']:
files_by_lane[lane_number].R2 = fullpath
else:
raise ValueError('Unhandled read type {}'.format(read_type))
# print out data by lane numbers
for lane_number, lane_data in files_by_lane.items():
if lane_data.is_complete() or (args.no_index_reads and lane_data.is_complete_no_index_reads()):
print(str(lane_data))
else: # all lanes should have complete data
raise ValueError('Lane {:d} is incomplete'.format(lane_number))