Skip to content

Commit 189b85f

Browse files
committed
BCF and header enhancements: introduce error classes for better error handling; implement sample subsetting functionality in Bcf::Header; add tests for subset behavior and error conditions
1 parent 1c979c8 commit 189b85f

5 files changed

Lines changed: 214 additions & 20 deletions

File tree

lib/hts/bcf.rb

Lines changed: 43 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
require_relative "../htslib"
44

55
require_relative "hts"
6+
require_relative "bcf/errors"
67
require_relative "bcf/header"
78
require_relative "bcf/info"
89
require_relative "bcf/format"
@@ -38,16 +39,16 @@ def self.build_index(file_name, index_name = nil, min_shift = 14, threads = 0, v
3839

3940
case LibHTS.bcf_index_build3(file_name, index_name, min_shift, threads)
4041
when 0 # successful
41-
when -1 then raise "indexing failed"
42-
when -2 then raise "opening #{file_name} failed"
43-
when -3 then raise "format not indexable"
44-
when -4 then raise "failed to create and/or save the index"
45-
else raise "unknown error"
42+
when -1 then raise IndexError, "Indexing failed for #{file_name}"
43+
when -2 then raise IndexError, "Opening #{file_name} failed while building the index"
44+
when -3 then raise IndexError, "#{file_name} is not in an indexable format"
45+
when -4 then raise IndexError, "Failed to create or save the index for #{file_name}"
46+
else raise IndexError, "Unknown index build error for #{file_name}"
4647
end
4748
end
4849

4950
def initialize(file_name, mode = "r", index: nil, threads: nil,
50-
build_index: false)
51+
build_index: false, subset: nil)
5152
if block_given?
5253
message = "HTS::Bcf.new() does not take block; Please use HTS::Bcf.open() instead"
5354
raise message
@@ -61,13 +62,16 @@ def initialize(file_name, mode = "r", index: nil, threads: nil,
6162
@nthreads = threads
6263
@hts_file = LibHTS.hts_open(@file_name, mode)
6364

64-
raise Errno::ENOENT, "Failed to open #{@file_name}" if @hts_file.null?
65+
raise OpenError, "Failed to open #{@file_name}" if @hts_file.null?
6566

6667
set_threads(threads) if threads
6768

69+
raise SubsetError, "Sample subsetting is only available when reading BCF/VCF files" if subset && @mode[0] == "w"
70+
6871
return if @mode[0] == "w"
6972

70-
@header = Bcf::Header.new(@hts_file)
73+
@read_header = Bcf::Header.new(@hts_file)
74+
@header = subset ? @read_header.subset(subset) : @read_header
7175
build_index(index) if build_index
7276
@idx = load_index(index)
7377
@start_position = tell
@@ -219,8 +223,8 @@ def each(copy: false, &block)
219223
def query(region, beg = nil, end_ = nil, copy: false, &block)
220224
check_closed
221225

222-
raise "query is only available for BCF files" unless file_format == "bcf"
223-
raise "Index file is required to call the query method." unless index_loaded?
226+
raise QueryError, "Query is only available for BCF files" unless file_format == "bcf"
227+
raise MissingIndexError, "Index file is required to call the query method for #{@file_name}" unless index_loaded?
224228

225229
case region
226230
when Array
@@ -269,7 +273,7 @@ def queryi_reuse(tid, beg, end_, &block)
269273
return to_enum(__method__, tid, beg, end_) unless block_given?
270274

271275
qiter = LibHTS.bcf_itr_queryi(@idx, tid, beg, end_)
272-
raise "Failed to query region #{tid} #{beg} #{end_}" if qiter.null?
276+
raise QueryError, "Failed to query region #{tid}:#{beg}-#{end_} in #{@file_name}" if qiter.null?
273277

274278
query_reuse_yield(qiter, &block)
275279
self
@@ -278,8 +282,8 @@ def queryi_reuse(tid, beg, end_, &block)
278282
def querys_reuse(region, &block)
279283
return to_enum(__method__, region) unless block_given?
280284

281-
qiter = LibHTS.bcf_itr_querys(@idx, header, region)
282-
raise "Failed to query region #{region}" if qiter.null?
285+
qiter = LibHTS.bcf_itr_querys(@idx, read_header, region)
286+
raise QueryError, "Failed to query region #{region.inspect} in #{@file_name}" if qiter.null?
283287

284288
query_reuse_yield(qiter, &block)
285289
self
@@ -303,6 +307,7 @@ def query_reuse_yield(qiter)
303307
break if slen == -1
304308
raise if slen < -1
305309

310+
apply_subset!(record)
306311
yield record
307312
end
308313
ensure
@@ -314,7 +319,7 @@ def queryi_copy(tid, beg, end_, &block)
314319
return to_enum(__method__, tid, beg, end_) unless block_given?
315320

316321
qiter = LibHTS.bcf_itr_queryi(@idx, tid, beg, end_)
317-
raise "Failed to query region #{tid} #{beg} #{end_}" if qiter.null?
322+
raise QueryError, "Failed to query region #{tid}:#{beg}-#{end_} in #{@file_name}" if qiter.null?
318323

319324
query_copy_yield(qiter, &block)
320325
self
@@ -323,8 +328,8 @@ def queryi_copy(tid, beg, end_, &block)
323328
def querys_copy(region, &block)
324329
return to_enum(__method__, region) unless block_given?
325330

326-
qiter = LibHTS.bcf_itr_querys(@idx, header, region)
327-
raise "Failed to query region #{region}" if qiter.null?
331+
qiter = LibHTS.bcf_itr_querys(@idx, read_header, region)
332+
raise QueryError, "Failed to query region #{region.inspect} in #{@file_name}" if qiter.null?
328333

329334
query_copy_yield(qiter, &block)
330335
self
@@ -346,7 +351,9 @@ def query_copy_yield(qiter)
346351
break if slen == -1
347352
raise if slen < -1
348353

349-
yield Record.new(header, bcf1)
354+
record = Record.new(header, bcf1)
355+
apply_subset!(record)
356+
yield record
350357
end
351358
ensure
352359
LibHTS.bcf_itr_destroy(qiter)
@@ -359,7 +366,10 @@ def each_record_reuse
359366

360367
bcf1 = LibHTS.bcf_init
361368
record = Record.new(header, bcf1)
362-
yield record while LibHTS.bcf_read(@hts_file, header, bcf1) != -1
369+
while LibHTS.bcf_read(@hts_file, read_header, bcf1) != -1
370+
apply_subset!(record)
371+
yield record
372+
end
363373
self
364374
end
365375

@@ -368,11 +378,25 @@ def each_record_copy
368378

369379
return to_enum(__method__) unless block_given?
370380

371-
while LibHTS.bcf_read(@hts_file, header, bcf1 = LibHTS.bcf_init) != -1
381+
while LibHTS.bcf_read(@hts_file, read_header, bcf1 = LibHTS.bcf_init) != -1
372382
record = Record.new(header, bcf1)
383+
apply_subset!(record)
373384
yield record
374385
end
375386
self
376387
end
388+
389+
def read_header
390+
@read_header || header
391+
end
392+
393+
def apply_subset!(record)
394+
return unless header.subset?
395+
396+
rc = LibHTS.bcf_subset(header.struct, record.struct, header.subset_sample_count, header.subset_imap_pointer || ::FFI::Pointer::NULL)
397+
return if rc >= 0
398+
399+
raise SubsetError, "Failed to subset samples #{header.subset_samples.inspect} while reading #{@file_name}"
400+
end
377401
end
378402
end

lib/hts/bcf/errors.rb

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# frozen_string_literal: true
2+
3+
module HTS
4+
class Bcf < Hts
5+
class Error < HTS::Error; end
6+
7+
class OpenError < Error; end
8+
class IndexError < Error; end
9+
class MissingIndexError < IndexError; end
10+
class QueryError < Error; end
11+
class HeaderError < Error; end
12+
class SubsetError < HeaderError; end
13+
class UnknownSampleError < SubsetError; end
14+
end
15+
end

lib/hts/bcf/header.rb

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ def initialize(arg = nil)
3535

3636
@sync_depth = 0
3737
@sync_needed = false
38+
@subset_samples = nil
39+
@subset_imap = nil
40+
@subset_imap_pointer = nil
3841

3942
yield self if block_given?
4043
end
@@ -87,6 +90,44 @@ def samples
8790
.map(&:read_string)
8891
end
8992

93+
attr_reader :subset_samples
94+
95+
def subset?( )
96+
!@subset_imap.nil?
97+
end
98+
99+
def subset_sample_count
100+
subset? ? @subset_samples.length : 0
101+
end
102+
103+
def subset_imap_pointer
104+
@subset_imap_pointer
105+
end
106+
107+
def subset(samples)
108+
subset_samples = normalize_subset_samples(samples)
109+
validate_subset_samples!(subset_samples)
110+
111+
sample_pointers = nil
112+
imap_pointer = nil
113+
if subset_samples.empty?
114+
subset_hdr = LibHTS.bcf_hdr_subset(@bcf_hdr, 0, ::FFI::Pointer::NULL, ::FFI::Pointer::NULL)
115+
else
116+
encoded_samples = subset_samples.map { |name| FFI::MemoryPointer.from_string(name) }
117+
sample_pointers = FFI::MemoryPointer.new(:pointer, subset_samples.length)
118+
sample_pointers.write_array_of_pointer(encoded_samples)
119+
imap_pointer = FFI::MemoryPointer.new(:int, subset_samples.length)
120+
subset_hdr = LibHTS.bcf_hdr_subset(@bcf_hdr, subset_samples.length, sample_pointers, imap_pointer)
121+
end
122+
123+
raise SubsetError, "Failed to subset BCF header samples #{subset_samples.inspect}" if subset_hdr.to_ptr.null?
124+
125+
composed_imap = compose_subset_imap(read_subset_imap(imap_pointer, subset_samples.length))
126+
self.class.new(subset_hdr).tap do |header|
127+
header.send(:set_subset_state, subset_samples, composed_imap)
128+
end
129+
end
130+
90131
def add_sample(sample, sync: true)
91132
rc = LibHTS.bcf_hdr_add_sample(@bcf_hdr, sample)
92133
raise "Failed to add sample #{sample}" if rc.negative?
@@ -341,6 +382,58 @@ def initialize_copy(orig)
341382
@bcf_hdr = LibHTS.bcf_hdr_dup(orig.struct)
342383
@sync_depth = 0
343384
@sync_needed = false
385+
set_subset_state(orig.subset_samples, orig.send(:subset_imap))
386+
end
387+
388+
protected
389+
390+
attr_reader :subset_imap
391+
392+
def set_subset_state(samples, imap)
393+
@subset_samples = samples&.dup
394+
@subset_imap = imap&.dup
395+
@subset_imap_pointer = build_subset_imap_pointer(@subset_imap)
396+
end
397+
398+
private
399+
400+
def normalize_subset_samples(samples)
401+
case samples
402+
when String
403+
[samples]
404+
else
405+
Array(samples).map(&:to_s)
406+
end
407+
rescue TypeError
408+
raise SubsetError, "Sample subset must be a String or an Array of sample names"
409+
end
410+
411+
def validate_subset_samples!(subset_samples)
412+
duplicates = subset_samples.group_by(&:itself).select { |_name, group| group.length > 1 }.keys
413+
raise SubsetError, "Duplicate sample names in subset: #{duplicates.join(', ')}" unless duplicates.empty?
414+
415+
missing = subset_samples.reject { |name| samples.include?(name) }
416+
raise UnknownSampleError, "Unknown sample names: #{missing.join(', ')}" unless missing.empty?
417+
end
418+
419+
def read_subset_imap(pointer, length)
420+
return [] if length.zero?
421+
422+
pointer.read_array_of_int(length)
423+
end
424+
425+
def compose_subset_imap(imap)
426+
base_imap = @subset_imap || Array.new(samples.length, &:itself)
427+
imap.map { |index| base_imap.fetch(index) }
428+
end
429+
430+
def build_subset_imap_pointer(imap)
431+
return nil unless imap
432+
return nil if imap.empty?
433+
434+
FFI::MemoryPointer.new(:int, imap.length).tap do |pointer|
435+
pointer.write_array_of_int(imap)
436+
end
344437
end
345438
end
346439
end

test/bcf/header_test.rb

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,30 @@ def test_add_sample
6868
assert_equal %w[A B kojix2 kojix3], hdr2.samples
6969
end
7070

71+
def test_subset_returns_new_header
72+
subset = @hdr.subset(["B"])
73+
74+
assert_equal %w[A B], @hdr.samples
75+
assert_equal ["B"], subset.samples
76+
assert_equal 1, subset.nsamples
77+
end
78+
79+
def test_subset_rejects_unknown_samples
80+
error = assert_raises(HTS::Bcf::UnknownSampleError) do
81+
@hdr.subset(["missing"])
82+
end
83+
84+
assert_match(/missing/, error.message)
85+
end
86+
87+
def test_subset_rejects_duplicates
88+
error = assert_raises(HTS::Bcf::SubsetError) do
89+
@hdr.subset(["A", "A"])
90+
end
91+
92+
assert_match(/Duplicate sample names/, error.message)
93+
end
94+
7195
def test_sync
7296
hdr2 = @hdr.clone
7397
hdr2.add_sample("kojix1", sync: false)

test/bcf_test.rb

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ def test_bcf_path
77
File.expand_path("../htslib/test/index.vcf", __dir__)
88
end
99

10+
def test_multi_sample_bcf_path
11+
File.expand_path("../htslib/test/tabix/vcf_file.bcf", __dir__)
12+
end
13+
1014
def setup
1115
@bcf = HTS::Bcf.new(test_bcf_path)
1216
end
@@ -128,11 +132,45 @@ def test_format
128132
def test_initialize_no_file_bcf
129133
stderr_old = $stderr.dup
130134
$stderr.reopen(File::NULL)
131-
assert_raises(Errno::ENOENT) { HTS::Bcf.new("/tmp/no_such_file") }
135+
assert_raises(HTS::Bcf::OpenError) { HTS::Bcf.new("/tmp/no_such_file") }
132136
$stderr.flush
133137
$stderr.reopen(stderr_old)
134138
end
135139

140+
def test_initialize_with_subset
141+
bcf = HTS::Bcf.new(test_multi_sample_bcf_path, subset: ["B"])
142+
143+
assert_equal ["B"], bcf.samples
144+
assert_equal 1, bcf.nsamples
145+
assert_equal ["0/1"], bcf.first.format("GT")
146+
ensure
147+
bcf&.close
148+
end
149+
150+
def test_query_requires_index
151+
bcf = HTS::Bcf.new(Fixtures["test.bcf"], index: "/tmp/no_such_test_bcf_index.csi")
152+
153+
error = assert_raises(HTS::Bcf::MissingIndexError) do
154+
bcf.query("poo:4000-4100").first
155+
end
156+
157+
assert_match(/Index file is required/, error.message)
158+
ensure
159+
bcf&.close
160+
end
161+
162+
def test_query_invalid_region_raises_query_error
163+
bcf = HTS::Bcf.open(Fixtures["test.bcf"])
164+
165+
error = assert_raises(HTS::Bcf::QueryError) do
166+
bcf.query("unknown:1-10").first
167+
end
168+
169+
assert_match(/unknown:1-10/, error.message)
170+
ensure
171+
bcf&.close
172+
end
173+
136174
def test_query
137175
bcf = HTS::Bcf.open(Fixtures["test.bcf"])
138176
assert_equal 4021, bcf.query("poo", 4000, 4100).first.pos + 1

0 commit comments

Comments
 (0)