@@ -6,6 +6,39 @@ module HTS
66 class Bam < Hts
77 # A class for working with alignment header.
88 class Header
9+ HD_TAG_MAP = {
10+ version : "VN" ,
11+ sort_order : "SO" ,
12+ group_order : "GO" ,
13+ subsorting : "SS"
14+ } . freeze
15+
16+ SQ_TAG_MAP = {
17+ name : "SN" ,
18+ length : "LN" ,
19+ assembly : "AS" ,
20+ md5 : "M5" ,
21+ species : "SP" ,
22+ uri : "UR" ,
23+ alt_names : "AN"
24+ } . freeze
25+
26+ RG_TAG_MAP = {
27+ id : "ID" ,
28+ sample : "SM" ,
29+ library : "LB" ,
30+ platform : "PL" ,
31+ platform_unit : "PU" ,
32+ center : "CN" ,
33+ description : "DS" ,
34+ date : "DT" ,
35+ flow_order : "FO" ,
36+ key_sequence : "KS" ,
37+ program : "PG" ,
38+ insert_size : "PI" ,
39+ molecule_topology : "PM"
40+ } . freeze
41+
942 def self . parse ( text )
1043 new ( LibHTS . sam_hdr_parse ( text . size , text ) )
1144 end
@@ -66,6 +99,11 @@ def write(...)
6699 add_lines ( ...)
67100 end
68101
102+ def append ( line )
103+ add_lines ( ensure_newline ( line . to_s ) )
104+ self
105+ end
106+
69107 # experimental
70108 def <<( obj )
71109 case obj
@@ -89,6 +127,16 @@ def find_line(type, key, value)
89127 end
90128 end
91129
130+ def find_tag ( type , id_key , id_value , key )
131+ ks = LibHTS ::KString . new
132+ begin
133+ r = LibHTS . sam_hdr_find_tag_id ( @sam_hdr , type , id_key , id_value , key , ks )
134+ r == 0 ? ks . read_string_copy : nil
135+ ensure
136+ ks . free_buffer
137+ end
138+ end
139+
92140 # experimental
93141 def find_line_at ( type , pos )
94142 ks = LibHTS ::KString . new
@@ -110,6 +158,26 @@ def remove_line_at(type, pos)
110158 LibHTS . sam_hdr_remove_line_pos ( @sam_hdr , type , pos )
111159 end
112160
161+ def delete_line ( type , key = nil , value = nil )
162+ LibHTS . sam_hdr_remove_line_id ( @sam_hdr , type , key , value ) . zero?
163+ end
164+
165+ def delete_tag ( type , id_key , id_value , key )
166+ LibHTS . sam_hdr_remove_tag_id ( @sam_hdr , type , id_key , id_value , key ) == 1
167+ end
168+
169+ def count_lines ( type )
170+ LibHTS . sam_hdr_count_lines ( @sam_hdr , type )
171+ end
172+
173+ def line_index ( type , key )
174+ LibHTS . sam_hdr_line_index ( @sam_hdr , type , key )
175+ end
176+
177+ def line_name ( type , pos )
178+ LibHTS . sam_hdr_line_name ( @sam_hdr , type , pos )
179+ end
180+
113181 def to_s
114182 LibHTS . sam_hdr_str ( @sam_hdr )
115183 end
@@ -119,6 +187,46 @@ def get_tid(name)
119187 name2tid ( name )
120188 end
121189
190+ def update_hd ( **tags )
191+ pairs = merge_sam_pairs ( find_line_pairs ( "HD" , nil , nil ) , normalize_hd_tags ( tags ) )
192+ replace_sam_line ( "HD" , nil , nil , pairs , %w[ VN SO GO SS ] )
193+ self
194+ end
195+
196+ def add_sq ( name , length :, **tags )
197+ pairs = [ [ "SN" , name . to_s ] , [ "LN" , length . to_s ] ]
198+ pairs . concat normalize_sq_tags ( tags )
199+ add_structured_sam_line ( "SQ" , pairs , %w[ SN LN AS M5 SP UR AN ] )
200+ self
201+ end
202+
203+ def update_sq ( name , **tags )
204+ pairs = merge_identified_sam_line ( "SQ" , "SN" , name . to_s , normalize_sq_tags ( tags ) , protected_keys : [ "SN" ] )
205+ replace_sam_line ( "SQ" , "SN" , name . to_s , pairs , %w[ SN LN AS M5 SP UR AN ] )
206+ self
207+ end
208+
209+ def remove_sq ( name )
210+ delete_line ( "SQ" , "SN" , name . to_s )
211+ end
212+
213+ def add_rg ( id , **tags )
214+ pairs = [ [ "ID" , id . to_s ] ]
215+ pairs . concat normalize_rg_tags ( tags )
216+ add_structured_sam_line ( "RG" , pairs , %w[ ID SM LB PL PU CN DS DT FO KS PG PI PM ] )
217+ self
218+ end
219+
220+ def update_rg ( id , **tags )
221+ pairs = merge_identified_sam_line ( "RG" , "ID" , id . to_s , normalize_rg_tags ( tags ) , protected_keys : [ "ID" ] )
222+ replace_sam_line ( "RG" , "ID" , id . to_s , pairs , %w[ ID SM LB PL PU CN DS DT FO KS PG PI PM ] )
223+ self
224+ end
225+
226+ def remove_rg ( id )
227+ delete_line ( "RG" , "ID" , id . to_s )
228+ end
229+
122230 # Add a @PG (program) line to the header
123231 # @param program_name [String] Name of the program
124232 # @param options [Hash] Key-value pairs for @PG tags (ID, PN, VN, CL, PP, etc.)
@@ -141,6 +249,93 @@ def add_pg(program_name, **options)
141249
142250 private
143251
252+ def normalize_hd_tags ( tags )
253+ normalize_sam_tags ( tags , HD_TAG_MAP )
254+ end
255+
256+ def normalize_sq_tags ( tags )
257+ normalize_sam_tags ( tags , SQ_TAG_MAP )
258+ end
259+
260+ def normalize_rg_tags ( tags )
261+ normalize_sam_tags ( tags , RG_TAG_MAP )
262+ end
263+
264+ def normalize_sam_tags ( tags , tag_map )
265+ tags . each_with_object ( [ ] ) do |( key , value ) , pairs |
266+ sam_key = tag_map . fetch ( key . to_sym , key . to_s . upcase )
267+ sam_value = value . is_a? ( Array ) ? value . join ( "," ) : value . to_s
268+ raise ArgumentError , "Header tag keys must not be empty" if sam_key . empty?
269+ if sam_value . include? ( "\t " ) || sam_value . include? ( "\n " ) || sam_value . include? ( "\r " )
270+ raise ArgumentError , "Header tag values must not contain tabs or newlines"
271+ end
272+
273+ pairs << [ sam_key , sam_value ]
274+ end
275+ end
276+
277+ def parse_sam_pairs ( line )
278+ line . to_s . chomp . split ( "\t " ) [ 1 ..] . to_a . map do |field |
279+ key , value = field . split ( ":" , 2 )
280+ [ key , value . to_s ]
281+ end
282+ end
283+
284+ def find_line_pairs ( type , id_key , id_value )
285+ line = find_line ( type , id_key , id_value )
286+ line ? parse_sam_pairs ( line ) : [ ]
287+ end
288+
289+ def merge_identified_sam_line ( type , id_key , id_value , updates , protected_keys : [ ] )
290+ line = find_line ( type , id_key , id_value )
291+ raise ArgumentError , "Header line not found: @#{ type } #{ id_key } :#{ id_value } " unless line
292+
293+ merge_sam_pairs ( parse_sam_pairs ( line ) , updates , protected_keys :)
294+ end
295+
296+ def merge_sam_pairs ( existing_pairs , updates , protected_keys : [ ] )
297+ pairs = existing_pairs . map ( &:dup )
298+ updates . each do |key , value |
299+ if protected_keys . include? ( key )
300+ raise ArgumentError , "Header tag #{ key } cannot be updated" unless existing_pairs . none? { |pair | pair [ 0 ] == key && pair [ 1 ] == value }
301+
302+ next
303+ end
304+
305+ index = pairs . index { |pair | pair [ 0 ] == key }
306+ if index
307+ pairs [ index ] = [ key , value ]
308+ else
309+ pairs << [ key , value ]
310+ end
311+ end
312+ pairs
313+ end
314+
315+ def add_structured_sam_line ( type , pairs , preferred_order )
316+ append ( build_sam_line ( type , pairs , preferred_order ) )
317+ end
318+
319+ def replace_sam_line ( type , id_key , id_value , pairs , preferred_order )
320+ delete_line ( type , id_key , id_value )
321+ append ( build_sam_line ( type , pairs , preferred_order ) )
322+ end
323+
324+ def build_sam_line ( type , pairs , preferred_order )
325+ ordered_pairs = preferred_order . filter_map do |key |
326+ pairs . find { |pair | pair [ 0 ] == key }
327+ end
328+ pairs . each do |pair |
329+ ordered_pairs << pair unless preferred_order . include? ( pair [ 0 ] )
330+ end
331+
332+ "@#{ type } \t #{ ordered_pairs . map { |key , value | "#{ key } :#{ value } " } . join ( "\t " ) } \n "
333+ end
334+
335+ def ensure_newline ( text )
336+ text . end_with? ( "\n " ) ? text : "#{ text } \n "
337+ end
338+
144339 def build_pg_line ( program_name , options )
145340 ordered_tags = normalize_pg_tags ( program_name , options )
146341 "@PG\t #{ ordered_tags . map { |key , value | "#{ key } :#{ value } " } . join ( "\t " ) } \n "
0 commit comments