Skip to content

Commit e051bc8

Browse files
Adding more comments and some const declarations.
1 parent dc38a4e commit e051bc8

1 file changed

Lines changed: 52 additions & 29 deletions

File tree

src/utils/format-reads.cpp

Lines changed: 52 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -163,29 +163,44 @@ bam_set1_wrapper(bam1_t *bam,
163163
const int32_t mtid, const hts_pos_t mpos,
164164
const hts_pos_t isize, const size_t l_seq,
165165
const size_t l_aux) {
166-
/* This is a modified version of bam_set1.
167-
* It assigns the attributes of bam1_t object except for the sequence.
168-
* Many checks are skipped, as we assume that many quantities have
169-
* been validaded.
166+
/* This is based on how assignment is done in the `bam_set1`
167+
function defined in `sam.c` from htslib */
168+
169+
/*
170+
* This modification assigns variables of bam1_t struct but not the sequence.
171+
*
172+
* Many checks have been removed because they are checked in code
173+
* that calls this function, mostly because they already come from a
174+
* valid `bam1_t` struct and so the values have been individually
175+
* validated.
176+
*
170177
* Assumptions:
171-
* cigar has been computed
178+
* cigar has been computed and is in the right format
172179
* rlen = isize
173180
* qlen = l_seq
174181
* l_qname <= 254
175182
* HTS_POS_MAX - rlen > pos
183+
*
184+
* ADS: what is HTS_POS_MAX?
185+
*
176186
* Number of bytes needed for the data is smaller than INT32_MAX
177-
* qual = Null
187+
*
188+
* qual = NULL, because we do not keep the quality scores through
189+
* formatting the reads.
178190
*/
179191

180-
size_t qname_nuls = 4 - l_qname % 4;
192+
// `qname_nuls` below is the number of '\0' to use to pad the qname
193+
// so that the cigar has 4-byte alignment.
194+
const size_t qname_nuls = 4 - l_qname % 4;
181195
bam_set1_core(bam->core, l_qname, flag, tid, pos, mapq, n_cigar,
182-
mtid, mpos, isize, l_seq, qname_nuls);
196+
mtid, mpos, isize, l_seq, qname_nuls);
197+
198+
const size_t data_len =
199+
(l_qname + qname_nuls + n_cigar*sizeof(uint32_t) + (l_seq + 1) / 2 + l_seq);
183200

184-
size_t data_len = l_qname + qname_nuls + n_cigar * 4 +
185-
(l_seq + 1) / 2 + l_seq;
186201
bam->l_data = data_len;
187202
if (data_len + l_aux > bam->m_data) {
188-
int ret = sam_realloc_bam_data(bam, data_len + l_aux);
203+
const int ret = sam_realloc_bam_data(bam, data_len + l_aux);
189204
if (ret < 0) {
190205
throw fr_expt(ret, "Failed to allocate memory for BAM record");
191206
}
@@ -196,14 +211,19 @@ bam_set1_wrapper(bam1_t *bam,
196211
std::fill_n(data_iter + l_qname, qname_nuls, '\0');
197212
data_iter += l_qname + qname_nuls;
198213

214+
// ADS: reinterpret here because we know the cigar is originally an
215+
// array of uint32_t and has been aligned for efficiency
199216
std::copy_n(cigar, n_cigar, reinterpret_cast<uint32_t *>(data_iter));
200217
data_iter += n_cigar * sizeof(uint32_t);
201218

202219
// skipping sequece assignment
203220
data_iter += (l_seq + 1) / 2;
204221

205222
std::fill(data_iter, data_iter + l_seq, '\xff');
206-
return (int)data_len;
223+
// ADS: this will never return a negative value, and either should
224+
// have the return type of this function changed, or change the type
225+
// of `data_len` so that it can be used to signal error.
226+
return static_cast<int>(data_len);
207227
}
208228

209229
static inline bool
@@ -363,23 +383,26 @@ get_full_and_partial_ops(const uint32_t *cig_in, const uint32_t in_ops,
363383
return i;
364384
}
365385

386+
387+
// ADS: MN the table below needs some comments
366388
const uint8_t byte_revcom_table[] = {
367-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
368-
8, 136, 72, 0, 40, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 248,
369-
4, 132, 68, 0, 36, 0, 0, 0, 20, 0, 0, 0, 0, 0, 0, 244,
370-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
371-
2, 130, 66, 0, 34, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0, 242,
372-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
373-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
374-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
375-
1, 129, 65, 0, 33, 0, 0, 0, 17, 0, 0, 0, 0, 0, 0, 241,
376-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
377-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
378-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
379-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
380-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
381-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
382-
15, 143, 79, 0, 47, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 255};
389+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
390+
8, 136, 72, 0, 40, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 248,
391+
4, 132, 68, 0, 36, 0, 0, 0, 20, 0, 0, 0, 0, 0, 0, 244,
392+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
393+
2, 130, 66, 0, 34, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0, 242,
394+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
395+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
396+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
397+
1, 129, 65, 0, 33, 0, 0, 0, 17, 0, 0, 0, 0, 0, 0, 241,
398+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
399+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
400+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
401+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
402+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
403+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
404+
15, 143, 79, 0, 47, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 255
405+
};
383406

384407

385408
static inline void
@@ -399,7 +422,7 @@ static void
399422
revcomp_seq_by_byte(bam1_t *aln) {
400423
const size_t l_qseq = get_qlen(aln);
401424
auto seq = bam_get_seq(aln);
402-
size_t num_bytes = ceil(l_qseq / 2.0);
425+
const size_t num_bytes = ceil(l_qseq / 2.0);
403426
auto seq_end = seq + num_bytes;
404427
revcom_byte_then_reverse(seq, seq_end);
405428
if (l_qseq % 2 == 1) {

0 commit comments

Comments
 (0)