Skip to content

Commit c1489e2

Browse files
committed
Enhance BCF format handling with new decoding methods for integers and floats, and add tests for high-level value retrieval and raw buffer preservation
1 parent 09ec3cb commit c1489e2

2 files changed

Lines changed: 209 additions & 9 deletions

File tree

lib/hts/bcf/format.rb

Lines changed: 97 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,23 @@ def initialize(record)
1212
# which provides methods like `get_int`, `get_float`, etc.
1313
# I think they are better than `fetch_int`` and `fetch_float`.
1414
def get(key, type = nil)
15+
return get_raw(key, type) unless type.nil?
16+
17+
return decode_genotypes if key == "GT"
18+
19+
case header_format_type(key)
20+
when :int
21+
decode_integer_values(key)
22+
when :float
23+
decode_float_values(key)
24+
when :flag
25+
raise_unsupported_format_flag(key)
26+
when :string
27+
get_string_values(key)
28+
end
29+
end
30+
31+
def get_raw(key, type = nil)
1532
# The GT FORMAT field is special in that it is marked as a string in the header,
1633
# but it is actually encoded as an integer.
1734
type = if type.nil?
@@ -39,22 +56,22 @@ def get(key, type = nil)
3956

4057
# For compatibility with HTS.cr.
4158
def get_int(key)
42-
get(key, :int)
59+
get_raw(key, :int)
4360
end
4461

4562
# For compatibility with HTS.cr.
4663
def get_float(key)
47-
get(key, :float)
64+
get_raw(key, :float)
4865
end
4966

5067
# For compatibility with HTS.cr.
5168
def get_flag(key)
52-
get(key, :flag)
69+
get_raw(key, :flag)
5370
end
5471

5572
# For compatibility with HTS.cr.
5673
def get_string(key)
57-
get(key, :string)
74+
get_raw(key, :string)
5875
end
5976

6077
# For compatibility with HTS.cr.
@@ -201,6 +218,38 @@ def get_string_values(key)
201218
end
202219
end
203220

221+
def decode_integer_values(key)
222+
values = get_raw(key, :int)
223+
return nil unless values
224+
225+
sample_values = split_sample_values(values)
226+
if scalar_format?(key)
227+
sample_values.map do |values_per_sample|
228+
map_integer_missing_value(trim_integer_vector_end(values_per_sample).first)
229+
end
230+
else
231+
sample_values.map do |values_per_sample|
232+
map_integer_missing(trim_integer_vector_end(values_per_sample))
233+
end
234+
end
235+
end
236+
237+
def decode_float_values(key)
238+
values = get_float_words(key)
239+
return nil unless values
240+
241+
sample_values = split_sample_values(values)
242+
if scalar_format?(key)
243+
sample_values.map do |values_per_sample|
244+
decode_float_word(trim_float_vector_end(values_per_sample).first)
245+
end
246+
else
247+
sample_values.map do |values_per_sample|
248+
map_float_words(trim_float_vector_end(values_per_sample))
249+
end
250+
end
251+
end
252+
204253
def decode_genotypes
205254
genotypes = get_genotypes
206255
return nil unless genotypes
@@ -245,6 +294,38 @@ def trim_genotype_vector_end(values)
245294
values[0, end_index]
246295
end
247296

297+
def trim_integer_vector_end(values)
298+
end_index = values.index { |value| value == LibHTS.bcf_int32_vector_end } || values.size
299+
values[0, end_index]
300+
end
301+
302+
def trim_float_vector_end(values)
303+
end_index = values.index(0x7f80_0002) || values.size
304+
values[0, end_index]
305+
end
306+
307+
def map_integer_missing(values)
308+
values.map { |value| map_integer_missing_value(value) }
309+
end
310+
311+
def map_integer_missing_value(value)
312+
value == LibHTS.bcf_int32_missing ? nil : value
313+
end
314+
315+
def map_float_words(values)
316+
values.map { |value| decode_float_word(value) }
317+
end
318+
319+
def decode_float_word(value)
320+
return nil if value == 0x7f80_0001
321+
322+
[value].pack("V").unpack1("e")
323+
end
324+
325+
def get_float_words(key)
326+
get_numeric_values(key, LibHTS::BCF_HT_REAL, "float") { |dst, len| dst.get_array_of_uint32(0, len) }
327+
end
328+
248329
def normalize_format_rc(rc, key, expected_type)
249330
case rc
250331
when -1, -3
@@ -364,6 +445,17 @@ def get_fmt_type(qname)
364445
nil
365446
end
366447

448+
def scalar_format?(key)
449+
header_format_number(key) == 1
450+
end
451+
452+
def header_format_number(key)
453+
id = LibHTS.bcf_hdr_id2int(@record.header.struct, LibHTS::BCF_DT_ID, key)
454+
return nil unless LibHTS.bcf_hdr_idinfo_exists(@record.header.struct, LibHTS::BCF_HL_FMT, id)
455+
456+
LibHTS.bcf_hdr_id2number(@record.header.struct, LibHTS::BCF_HL_FMT, id)
457+
end
458+
367459
def header_format_type_code(key)
368460
id = LibHTS.bcf_hdr_id2int(@record.header.struct, LibHTS::BCF_DT_ID, key)
369461
return nil unless LibHTS.bcf_hdr_idinfo_exists(@record.header.struct, LibHTS::BCF_HL_FMT, id)
@@ -388,6 +480,7 @@ def ht_type_to_sym(t)
388480
def int32_range?(value)
389481
value >= -2_147_483_648 && value <= 2_147_483_647
390482
end
483+
391484
end
392485
end
393486
end

test/bcf/format_test.rb

Lines changed: 112 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,69 @@ def with_temp_character_format_vcf
3636
end
3737
end
3838

39+
def with_temp_bcf
40+
Tempfile.create(["format_test", ".bcf"]) do |file|
41+
path = file.path
42+
file.close
43+
44+
header = HTS::Bcf::Header.new
45+
header.set_version("VCFv4.3")
46+
header.append("##contig=<ID=1,length=100>")
47+
header.append('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">')
48+
header.append('##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred likelihoods">')
49+
header.append('##FORMAT=<ID=MISSI,Number=1,Type=Integer,Description="defined but absent integer">')
50+
header.append('##FORMAT=<ID=MISSF,Number=1,Type=Float,Description="defined but absent float">')
51+
header.append('##FORMAT=<ID=IV,Number=.,Type=Integer,Description="integer with sentinels">')
52+
header.append('##FORMAT=<ID=FV,Number=.,Type=Float,Description="float with sentinels">')
53+
header.add_sample("S1", sync: false)
54+
header.add_sample("S2", sync: true)
55+
56+
HTS::Bcf.open(path, "wb") do |bcf|
57+
bcf.write_header(header)
58+
59+
record = HTS::Bcf::Record.new(header)
60+
record.rid = HTS::LibHTS.bcf_hdr_name2id(header.struct, "1")
61+
record.pos = 9
62+
63+
rc = HTS::LibHTS.bcf_update_alleles_str(header.struct, record.struct, "A,C")
64+
raise "bcf_update_alleles_str failed (rc=#{rc})" if rc < 0
65+
66+
genotypes = [
67+
HTS::LibHTS.bcf_gt_unphased(0),
68+
HTS::LibHTS.bcf_gt_unphased(1),
69+
HTS::LibHTS.bcf_gt_unphased(1),
70+
HTS::LibHTS.bcf_gt_unphased(1)
71+
]
72+
genotype_ptr = FFI::MemoryPointer.new(:int32, genotypes.size)
73+
genotype_ptr.write_array_of_int32(genotypes)
74+
rc = HTS::LibHTS.bcf_update_genotypes(header.struct, record.struct, genotype_ptr, genotypes.size)
75+
raise "bcf_update_genotypes failed (rc=#{rc})" if rc < 0
76+
77+
likelihoods = [10, 20, 30, 40, 50, 60]
78+
likelihood_ptr = FFI::MemoryPointer.new(:int32, likelihoods.size)
79+
likelihood_ptr.write_array_of_int32(likelihoods)
80+
rc = HTS::LibHTS.bcf_update_format_int32(header.struct, record.struct, "PL", likelihood_ptr, likelihoods.size)
81+
raise "bcf_update_format_int32 failed (rc=#{rc})" if rc < 0
82+
83+
int_with_sentinels = [10, HTS::LibHTS.bcf_int32_vector_end, HTS::LibHTS.bcf_int32_missing, HTS::LibHTS.bcf_int32_vector_end]
84+
int_ptr = FFI::MemoryPointer.new(:int32, int_with_sentinels.size)
85+
int_ptr.write_array_of_int32(int_with_sentinels)
86+
rc = HTS::LibHTS.bcf_update_format_int32(header.struct, record.struct, "IV", int_ptr, int_with_sentinels.size)
87+
raise "bcf_update_format_int32 failed for IV (rc=#{rc})" if rc < 0
88+
89+
float_words = [0x3fc0_0000, 0x7f80_0002, 0x7f80_0001, 0x7f80_0002]
90+
float_ptr = FFI::MemoryPointer.new(:uint32, float_words.size)
91+
float_ptr.write_array_of_uint32(float_words)
92+
rc = HTS::LibHTS.bcf_update_format_float(header.struct, record.struct, "FV", float_ptr, float_words.size)
93+
raise "bcf_update_format_float failed for FV (rc=#{rc})" if rc < 0
94+
95+
bcf.write(record)
96+
end
97+
98+
yield path
99+
end
100+
end
101+
39102
def with_temp_gt_vcf
40103
Tempfile.create(["format_gt", ".vcf"]) do |file|
41104
file.write <<~VCF
@@ -112,13 +175,15 @@ def test_get_like_crystal
112175
def test_get_without_type
113176
assert_equal [409, 409], @fmt.get("GQ")
114177
assert_equal [35, 35], @fmt.get("DP")
115-
assert_equal [-20.0, -5.0, -20.0, -20.0, -5.0, -20.0], @fmt.get("GL")
178+
assert_equal [[-20.0, -5.0, -20.0], [-20.0, -5.0, -20.0]], @fmt.get("GL")
179+
assert_equal ["0/1", "0/1"], @fmt.get("GT")
116180
end
117181

118182
def test_get_square_brackets
119183
assert_equal [409, 409], @fmt["GQ"]
120184
assert_equal [35, 35], @fmt["DP"]
121-
assert_equal [-20.0, -5.0, -20.0, -20.0, -5.0, -20.0], @fmt["GL"]
185+
assert_equal [[-20.0, -5.0, -20.0], [-20.0, -5.0, -20.0]], @fmt["GL"]
186+
assert_equal ["0/1", "0/1"], @fmt["GT"]
122187
end
123188

124189
def test_get_unknown_key
@@ -189,13 +254,55 @@ def test_fields
189254

190255
def test_to_h
191256
assert_equal(
192-
# FIXME: Maybe GT should be string?
193-
{ "GT" => [2, 4, 2, 4], "GQ" => [409, 409], "DP" => [35, 35],
194-
"GL" => [-20.0, -5.0, -20.0, -20.0, -5.0, -20.0] },
257+
{ "GT" => ["0/1", "0/1"], "GQ" => [409, 409], "DP" => [35, 35],
258+
"GL" => [[-20.0, -5.0, -20.0], [-20.0, -5.0, -20.0]] },
195259
@fmt.to_h
196260
)
197261
end
198262

263+
def test_get_high_level_shapes_values_by_sample
264+
with_temp_bcf do |path|
265+
HTS::Bcf.open(path) do |bcf|
266+
format = bcf.first.format
267+
268+
assert_equal ["0/1", "1/1"], format.get("GT")
269+
assert_equal [[10, 20, 30], [40, 50, 60]], format.get("PL")
270+
assert_equal [[10], [nil]], format.get("IV")
271+
272+
floats = format.get("FV")
273+
assert_equal 2, floats.size
274+
assert_equal [1.5], floats[0]
275+
assert_equal [nil], floats[1]
276+
assert_nil format.get("MISSI")
277+
assert_nil format.get("MISSF")
278+
end
279+
end
280+
end
281+
282+
def test_get_raw_preserves_flat_numeric_buffers
283+
with_temp_bcf do |path|
284+
HTS::Bcf.open(path) do |bcf|
285+
format = bcf.first.format
286+
287+
assert_equal [10, 20, 30, 40, 50, 60], format.get_raw("PL")
288+
289+
ints = format.get_raw("IV")
290+
assert_equal 4, ints.size
291+
assert_equal 10, ints[0]
292+
assert_equal HTS::LibHTS.bcf_int32_vector_end, ints[1]
293+
assert_equal HTS::LibHTS.bcf_int32_missing, ints[2]
294+
assert_equal HTS::LibHTS.bcf_int32_vector_end, ints[3]
295+
296+
floats = format.get_raw("FV")
297+
assert_equal 4, floats.size
298+
assert_in_delta 1.5, floats[0], 0.001
299+
assert_predicate floats[1], :nan?
300+
assert_predicate floats[2], :nan?
301+
assert_predicate floats[3], :nan?
302+
end
303+
end
304+
end
305+
199306
def test_update_methods_round_trip
200307
with_temp_format_source_vcf do |source_path|
201308
with_temp_output_path do |output_path|

0 commit comments

Comments
 (0)