Skip to content

Commit 1c979c8

Browse files
committed
BCF and BAM header enhancements: add structured methods for contigs, filters, and info/format fields; improve error handling and synchronization logic; update tests for new functionalities
1 parent c1489e2 commit 1c979c8

4 files changed

Lines changed: 497 additions & 8 deletions

File tree

lib/hts/bam/header.rb

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,39 @@ module HTS
66
class Bam < Hts
77
# A class for working with alignment header.
88
class Header
9+
HD_TAG_MAP = {
10+
version: "VN",
11+
sort_order: "SO",
12+
group_order: "GO",
13+
subsorting: "SS"
14+
}.freeze
15+
16+
SQ_TAG_MAP = {
17+
name: "SN",
18+
length: "LN",
19+
assembly: "AS",
20+
md5: "M5",
21+
species: "SP",
22+
uri: "UR",
23+
alt_names: "AN"
24+
}.freeze
25+
26+
RG_TAG_MAP = {
27+
id: "ID",
28+
sample: "SM",
29+
library: "LB",
30+
platform: "PL",
31+
platform_unit: "PU",
32+
center: "CN",
33+
description: "DS",
34+
date: "DT",
35+
flow_order: "FO",
36+
key_sequence: "KS",
37+
program: "PG",
38+
insert_size: "PI",
39+
molecule_topology: "PM"
40+
}.freeze
41+
942
def self.parse(text)
1043
new(LibHTS.sam_hdr_parse(text.size, text))
1144
end
@@ -66,6 +99,11 @@ def write(...)
6699
add_lines(...)
67100
end
68101

102+
def append(line)
103+
add_lines(ensure_newline(line.to_s))
104+
self
105+
end
106+
69107
# experimental
70108
def <<(obj)
71109
case obj
@@ -89,6 +127,16 @@ def find_line(type, key, value)
89127
end
90128
end
91129

130+
def find_tag(type, id_key, id_value, key)
131+
ks = LibHTS::KString.new
132+
begin
133+
r = LibHTS.sam_hdr_find_tag_id(@sam_hdr, type, id_key, id_value, key, ks)
134+
r == 0 ? ks.read_string_copy : nil
135+
ensure
136+
ks.free_buffer
137+
end
138+
end
139+
92140
# experimental
93141
def find_line_at(type, pos)
94142
ks = LibHTS::KString.new
@@ -110,6 +158,26 @@ def remove_line_at(type, pos)
110158
LibHTS.sam_hdr_remove_line_pos(@sam_hdr, type, pos)
111159
end
112160

161+
def delete_line(type, key = nil, value = nil)
162+
LibHTS.sam_hdr_remove_line_id(@sam_hdr, type, key, value).zero?
163+
end
164+
165+
def delete_tag(type, id_key, id_value, key)
166+
LibHTS.sam_hdr_remove_tag_id(@sam_hdr, type, id_key, id_value, key) == 1
167+
end
168+
169+
def count_lines(type)
170+
LibHTS.sam_hdr_count_lines(@sam_hdr, type)
171+
end
172+
173+
def line_index(type, key)
174+
LibHTS.sam_hdr_line_index(@sam_hdr, type, key)
175+
end
176+
177+
def line_name(type, pos)
178+
LibHTS.sam_hdr_line_name(@sam_hdr, type, pos)
179+
end
180+
113181
def to_s
114182
LibHTS.sam_hdr_str(@sam_hdr)
115183
end
@@ -119,6 +187,46 @@ def get_tid(name)
119187
name2tid(name)
120188
end
121189

190+
def update_hd(**tags)
191+
pairs = merge_sam_pairs(find_line_pairs("HD", nil, nil), normalize_hd_tags(tags))
192+
replace_sam_line("HD", nil, nil, pairs, %w[VN SO GO SS])
193+
self
194+
end
195+
196+
def add_sq(name, length:, **tags)
197+
pairs = [["SN", name.to_s], ["LN", length.to_s]]
198+
pairs.concat normalize_sq_tags(tags)
199+
add_structured_sam_line("SQ", pairs, %w[SN LN AS M5 SP UR AN])
200+
self
201+
end
202+
203+
def update_sq(name, **tags)
204+
pairs = merge_identified_sam_line("SQ", "SN", name.to_s, normalize_sq_tags(tags), protected_keys: ["SN"])
205+
replace_sam_line("SQ", "SN", name.to_s, pairs, %w[SN LN AS M5 SP UR AN])
206+
self
207+
end
208+
209+
def remove_sq(name)
210+
delete_line("SQ", "SN", name.to_s)
211+
end
212+
213+
def add_rg(id, **tags)
214+
pairs = [["ID", id.to_s]]
215+
pairs.concat normalize_rg_tags(tags)
216+
add_structured_sam_line("RG", pairs, %w[ID SM LB PL PU CN DS DT FO KS PG PI PM])
217+
self
218+
end
219+
220+
def update_rg(id, **tags)
221+
pairs = merge_identified_sam_line("RG", "ID", id.to_s, normalize_rg_tags(tags), protected_keys: ["ID"])
222+
replace_sam_line("RG", "ID", id.to_s, pairs, %w[ID SM LB PL PU CN DS DT FO KS PG PI PM])
223+
self
224+
end
225+
226+
def remove_rg(id)
227+
delete_line("RG", "ID", id.to_s)
228+
end
229+
122230
# Add a @PG (program) line to the header
123231
# @param program_name [String] Name of the program
124232
# @param options [Hash] Key-value pairs for @PG tags (ID, PN, VN, CL, PP, etc.)
@@ -141,6 +249,93 @@ def add_pg(program_name, **options)
141249

142250
private
143251

252+
def normalize_hd_tags(tags)
253+
normalize_sam_tags(tags, HD_TAG_MAP)
254+
end
255+
256+
def normalize_sq_tags(tags)
257+
normalize_sam_tags(tags, SQ_TAG_MAP)
258+
end
259+
260+
def normalize_rg_tags(tags)
261+
normalize_sam_tags(tags, RG_TAG_MAP)
262+
end
263+
264+
def normalize_sam_tags(tags, tag_map)
265+
tags.each_with_object([]) do |(key, value), pairs|
266+
sam_key = tag_map.fetch(key.to_sym, key.to_s.upcase)
267+
sam_value = value.is_a?(Array) ? value.join(",") : value.to_s
268+
raise ArgumentError, "Header tag keys must not be empty" if sam_key.empty?
269+
if sam_value.include?("\t") || sam_value.include?("\n") || sam_value.include?("\r")
270+
raise ArgumentError, "Header tag values must not contain tabs or newlines"
271+
end
272+
273+
pairs << [sam_key, sam_value]
274+
end
275+
end
276+
277+
def parse_sam_pairs(line)
278+
line.to_s.chomp.split("\t")[1..].to_a.map do |field|
279+
key, value = field.split(":", 2)
280+
[key, value.to_s]
281+
end
282+
end
283+
284+
def find_line_pairs(type, id_key, id_value)
285+
line = find_line(type, id_key, id_value)
286+
line ? parse_sam_pairs(line) : []
287+
end
288+
289+
def merge_identified_sam_line(type, id_key, id_value, updates, protected_keys: [])
290+
line = find_line(type, id_key, id_value)
291+
raise ArgumentError, "Header line not found: @#{type} #{id_key}:#{id_value}" unless line
292+
293+
merge_sam_pairs(parse_sam_pairs(line), updates, protected_keys:)
294+
end
295+
296+
def merge_sam_pairs(existing_pairs, updates, protected_keys: [])
297+
pairs = existing_pairs.map(&:dup)
298+
updates.each do |key, value|
299+
if protected_keys.include?(key)
300+
raise ArgumentError, "Header tag #{key} cannot be updated" unless existing_pairs.none? { |pair| pair[0] == key && pair[1] == value }
301+
302+
next
303+
end
304+
305+
index = pairs.index { |pair| pair[0] == key }
306+
if index
307+
pairs[index] = [key, value]
308+
else
309+
pairs << [key, value]
310+
end
311+
end
312+
pairs
313+
end
314+
315+
def add_structured_sam_line(type, pairs, preferred_order)
316+
append(build_sam_line(type, pairs, preferred_order))
317+
end
318+
319+
def replace_sam_line(type, id_key, id_value, pairs, preferred_order)
320+
delete_line(type, id_key, id_value)
321+
append(build_sam_line(type, pairs, preferred_order))
322+
end
323+
324+
def build_sam_line(type, pairs, preferred_order)
325+
ordered_pairs = preferred_order.filter_map do |key|
326+
pairs.find { |pair| pair[0] == key }
327+
end
328+
pairs.each do |pair|
329+
ordered_pairs << pair unless preferred_order.include?(pair[0])
330+
end
331+
332+
"@#{type}\t#{ordered_pairs.map { |key, value| "#{key}:#{value}" }.join("\t")}\n"
333+
end
334+
335+
def ensure_newline(text)
336+
text.end_with?("\n") ? text : "#{text}\n"
337+
end
338+
144339
def build_pg_line(program_name, options)
145340
ordered_tags = normalize_pg_tags(program_name, options)
146341
"@PG\t#{ordered_tags.map { |key, value| "#{key}:#{value}" }.join("\t")}\n"

0 commit comments

Comments
 (0)