-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathfind_merges.py
More file actions
62 lines (54 loc) · 2.55 KB
/
find_merges.py
File metadata and controls
62 lines (54 loc) · 2.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from bam_finder import getBamPath, library_default_dir, MT_default_dir, ShopVersion
import argparse
import re
import sys
from library_id import LibraryID
def readAnnoFile(anno_filename):
    """Index the libraries listed in a tab-separated anno file by master ID.

    The first line of the file is a header that must contain the columns
    'Master ID' and 'LibraryID(s)'. Every following line contributes one
    master ID (the 'Master ID' value with its first character dropped and
    the rest parsed as an integer) and a comma-separated list of library
    ID strings.

    Lines that cannot be processed (too few fields, non-numeric master ID,
    a library string rejected by LibraryID, or a sample that would map to
    two different master IDs) are echoed to stderr and processing continues
    with the next line.

    Args:
        anno_filename: path of the anno file to read.

    Returns:
        A tuple (libraries_by_master_id, remaps):
        libraries_by_master_id maps the integer master ID to the list of
        library ID strings from that line; remaps maps
        LibraryID(library).sample to the integer master ID.

    Raises:
        ValueError: if a required header column is missing.
        OSError: if the file cannot be opened.
    """
    libraries_by_master_id = {}
    remaps = {}
    with open(anno_filename, errors='surrogateescape') as anno_file:
        # Drop the line terminator before splitting: otherwise the last
        # column name keeps a trailing newline and cannot be found by name.
        headers = anno_file.readline().rstrip('\r\n').split('\t')
        # find header indices, and map them to correct IDs
        master_id_index = headers.index('Master ID')
        libraries_index = headers.index('LibraryID(s)')
        for line in anno_file:
            try:
                fields = re.split('\t|\n', line)
                master_id = fields[master_id_index]
                libraries = fields[libraries_index].split(',')
                master_id_number = int(master_id[1:])  # remove leading 'I'
                # Recorded before the libraries are validated, so a line that
                # fails below still leaves its library list in the result.
                libraries_by_master_id[master_id_number] = libraries
                for library in libraries:
                    sample = LibraryID(library).sample
                    if sample in remaps and master_id_number != remaps[sample]:
                        # Caught just below: the conflicting line is reported
                        # and the first mapping seen for the sample is kept.
                        raise ValueError('{} maps to {} and {}'.format(sample, master_id_number, remaps[sample]))
                    remaps[sample] = master_id_number
            except Exception:
                # Best effort: report the offending line and keep going, but
                # let KeyboardInterrupt / SystemExit propagate.
                print(line, file=sys.stderr)
    return libraries_by_master_id, remaps
if __name__ == "__main__":
    # Command-line entry point: combine the libraries already recorded in the
    # anno file with the libraries listed in the given files, then print one
    # line per master ID that ends up with more than one library.
    parser = argparse.ArgumentParser(description="Find master IDs that have more than one library and list the libraries to merge for each")
    parser.add_argument("-a", "--anno", help="Use anno file for bam hints and read groups", required=True)
    parser.add_argument("-l", "--label", help="Label to apply to instance names", required=True)
    parser.add_argument("libraries", help="Tab-separated file(s) with a header line and a library ID in the first column", nargs='+')
    args = parser.parse_args()

    libraries_by_master_id, remaps = readAnnoFile(args.anno)
    # master ID -> number of libraries, only for master IDs seen in the
    # library files given on the command line
    master_ids = {}
    for libraries_file in args.libraries:
        with open(libraries_file) as f:
            f.readline()  # skip header
            for line in f:
                if not line.strip():
                    # Ignore blank lines (e.g. a trailing empty line) instead
                    # of handing an empty string to LibraryID.
                    continue
                fields = re.split('\t|\n', line)
                library_id = LibraryID(fields[0])
                # Use the master ID from the anno file when this sample has
                # one; otherwise fall back to the sample value itself.
                master_id_number = remaps.get(library_id.sample, library_id.sample)
                known_libraries = libraries_by_master_id.setdefault(master_id_number, [])
                if str(library_id) not in known_libraries:
                    known_libraries.append(str(library_id))
                master_ids[master_id_number] = len(known_libraries)

    # Output: instance name 'I<4-digit master ID>_<label>' followed by the
    # tab-separated libraries, only where there is something to merge.
    for master_id_number, count in master_ids.items():
        if count > 1:
            print('I{:04d}_{}\t{}'.format(master_id_number, args.label, '\t'.join(libraries_by_master_id[master_id_number])))