Skip to content

Commit c63d2b7

Browse files
Adding comments
1 parent cfe7350 commit c63d2b7

1 file changed

Lines changed: 32 additions & 30 deletions

File tree

src/utils/format-reads.cpp

Lines changed: 32 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -130,17 +130,15 @@ bam_set1_core(bam1_core_t &core,
130130
const hts_pos_t pos, const uint8_t mapq, const size_t n_cigar,
131131
const int32_t mtid, const hts_pos_t mpos, const hts_pos_t isize,
132132
const size_t l_seq, const size_t qname_nuls) {
133-
/* ADS: need to clarify what these mean. They are used in
134-
`hts_reg2bin` from `htslib/hts.h` and likely mean "region to bin"
135-
for indexing */
136-
// MN: hts_reg2bin categorizes the size of the reference region.
137-
// Here, we use the numbers used in htslib/cram/cram_samtools.h
133+
/* ADS: These are used in `hts_reg2bin` from `htslib/hts.h` and
134+
likely mean "region to bin" for indexing */
135+
/* MN: hts_reg2bin categorizes the size of the reference region.
136+
Here, we use the numbers used in htslib/cram/cram_samtools.h */
138137
static const int min_shift = 14;
139138
static const int n_lvls = 5;
140139

141140
core.pos = pos;
142141
core.tid = tid;
143-
/* ADS: MN I recall we migth not have needed this core.bin below */
144142
core.bin = hts_reg2bin(pos, pos + isize, min_shift, n_lvls);
145143
core.qual = mapq;
146144
core.l_extranul = qname_nuls - 1;
@@ -383,12 +381,12 @@ get_full_and_partial_ops(const uint32_t *cig_in, const uint32_t in_ops,
383381

384382
/* This table converts 2 bases packed in a byte to their reverse
385383
* complement. The input is therefore a unit8_t representing 2 bases.
386-
* It is assumed that the input uint8_t value is of form "xx" or "x-", where
387-
* 'x' a 4-bit number representing either A, C, G, T, or N and '-' is 0000.
388-
* For example, the ouptut for "AG" is "CT". The format "x-" is often used
389-
* at the end of an odd-length sequence.
390-
* The output of "A-" is "-T", and the output of "C-" is "-G", and so forth.
391-
* The user must handle this case separately.
384+
* It is assumed that the input uint8_t value is of form "xx" or "x-",
385+
* where 'x' a 4-bit number representing either A, C, G, T, or N and
386+
* '-' is 0000. For example, the ouptut for "AG" is "CT". The format
387+
* "x-" is often used at the end of an odd-length sequence. The
388+
* output of "A-" is "-T", and the output of "C-" is "-G", and so
389+
* forth. The user must handle this case separately.
392390
*/
393391
const uint8_t byte_revcom_table[] = {
394392
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -430,8 +428,10 @@ revcomp_seq_by_byte(bam1_t *aln) {
430428
const size_t num_bytes = ceil(l_qseq / 2.0);
431429
auto seq_end = seq + num_bytes;
432430
revcom_byte_then_reverse(seq, seq_end);
433-
if (l_qseq % 2 == 1) {
431+
if (l_qseq % 2 == 1) { // for odd-length sequences
434432
for (size_t i = 0; i < num_bytes - 1; i++) {
433+
// swap 4-bit chunks within consecutive bytes like this:
434+
// (aaaabbbb ccccdddd) => (....aaaa bbbbcccc dddd....)
435435
seq[i] = (seq[i] << 4) | (seq[i + 1] >> 4);
436436
}
437437
seq[num_bytes - 1] <<= 4;
@@ -446,15 +446,17 @@ revcomp_seq_by_byte(bam1_t *aln) {
446446
// a_used_len = c_seq_len - b_seq_len
447447
static void
448448
merge_by_byte(const bam1_t *a, const bam1_t *b, bam1_t *c) {
449+
// ADS: (todo) need some functions for int_ceil and is_odd
449450
const size_t b_seq_len = get_qlen(b);
450451
const size_t c_seq_len = get_qlen(c);
451452
const size_t a_used_len = c_seq_len - b_seq_len;
453+
452454
const bool is_a_odd = a_used_len % 2 == 1;
453455
const bool is_b_odd = b_seq_len % 2 == 1;
454456
const bool is_c_odd = c_seq_len % 2 == 1;
457+
455458
const size_t a_num_bytes = ceil(a_used_len / 2.0);
456459
const size_t b_num_bytes = ceil(b_seq_len / 2.0);
457-
// const size_t c_num_bytes = ceil(c_seq_len / 2.0);
458460

459461
const size_t b_offset = is_b_odd;
460462

@@ -463,33 +465,33 @@ merge_by_byte(const bam1_t *a, const bam1_t *b, bam1_t *c) {
463465
auto c_seq = bam_get_seq(c);
464466

465467
std::copy_n(a_seq, a_num_bytes, c_seq);
466-
// Here, c_seq looks either like aa aa aa aa
467-
// or like aa aa aa a-
468468
if (is_a_odd) {
469+
// c_seq looks like either [ aa aa aa aa ]
470+
// or [ aa aa aa a- ]
469471
c_seq[a_num_bytes - 1] &= 0xf0;
470472
c_seq[a_num_bytes - 1] |= is_b_odd ?
471473
byte_revcom_table[b_seq[b_num_bytes - 1]] :
472474
byte_revcom_table[b_seq[b_num_bytes - 1]] >> 4;
473475
}
474-
// Here, c_seq looks either like aa aa aa aa
475-
// or like aa aa aa ab
476476
if (is_c_odd) {
477+
// c_seq looks like either [ aa aa aa aa ]
478+
// or [ aa aa aa ab ]
477479
for (size_t i = 0; i < b_num_bytes - 1; i++) {
478480
c_seq[a_num_bytes + i] =
479481
(byte_revcom_table[b_seq[b_num_bytes - i - 1]] << 4) |
480482
(byte_revcom_table[b_seq[b_num_bytes - i - 2]] >> 4);
481483
}
482484
c_seq[a_num_bytes + b_num_bytes - 1] = byte_revcom_table[b_seq[0]] << 4;
483-
// Here, c_seq looks either like aa aa aa aa bb bb bb b- (a even and b odd)
484-
// or like aa aa aa ab bb bb bb b- (a odd and b odd)
485+
// Here, c_seq is either [ aa aa aa aa bb bb bb b- ] (a even; b odd)
486+
// or [ aa aa aa ab bb bb bb b- ] (a odd; b odd)
485487
}
486488
else {
487489
for (size_t i = 0; i < b_num_bytes - b_offset; i++) {
488490
c_seq[a_num_bytes + i] =
489491
byte_revcom_table[b_seq[b_num_bytes - i - 1 - b_offset]];
490492
}
491-
// Here, c_seq looks either like aa aa aa aa bb bb bb bb (a even and b even)
492-
// or like aa aa aa ab bb bb bb (a odd and b odd)
493+
// Here, c_seq is either [ aa aa aa aa bb bb bb bb ] (a even and b even)
494+
// or [ aa aa aa ab bb bb bb ] (a odd and b odd)
493495
}
494496
}
495497

@@ -537,7 +539,7 @@ truncate_overlap(const bam1_t *a, const uint32_t overlap, bam1_t *c) {
537539

538540
uint32_t part_op = 0;
539541
const uint32_t c_cur =
540-
get_full_and_partial_ops(a_cig, a_ops, overlap, &part_op);
542+
get_full_and_partial_ops(a_cig, a_ops, overlap, &part_op);
541543

542544
// ADS: hack here because the get_full_and_partial_ops doesn't do
543545
// exactly what is needed for this.
@@ -575,13 +577,14 @@ truncate_overlap(const bam1_t *a, const uint32_t overlap, bam1_t *c) {
575577
isize, // rlen from new cigar
576578
c_seq_len, // truncated seq length
577579
8); // enough for the 2 tags?
580+
if (ret < 0) throw fr_expt(ret, "bam_set1_wrapper");
581+
// ADS: might it be better to fill `c->data` directly?
578582
free(c_cig);
583+
579584
auto c_seq = bam_get_seq(c);
580-
size_t num_bytes_to_copy = (c_seq_len + 1) / 2;
585+
size_t num_bytes_to_copy = (c_seq_len + 1)/2;
581586
std::copy_n(bam_get_seq(a), num_bytes_to_copy, c_seq);
582587

583-
if (ret < 0) throw fr_expt(ret, "bam_set1");
584-
585588
/* add the tags */
586589
const int64_t nm = bam_aux2i(bam_aux_get(a, "NM")); // ADS: do better here!
587590
// "udpate" for "int" because it determines the right size
@@ -646,9 +649,8 @@ merge_overlap(const bam1_t *a, const bam1_t *b,
646649
bam_cigar_oplen(b_cig[0]),
647650
bam_cigar_op(b_cig[0]));
648651
// copy the cigar from b into c
649-
memcpy(c_cig + c_cur,
650-
b_cig + merge_mid,
651-
(b_ops - merge_mid) * sizeof(uint32_t));
652+
memcpy(c_cig + c_cur, b_cig + merge_mid,
653+
(b_ops - merge_mid)*sizeof(uint32_t));
652654
/* done with cigar string here */
653655

654656
const uint32_t c_seq_len = a_seq_len + b->core.l_qseq;
@@ -676,7 +678,7 @@ merge_overlap(const bam1_t *a, const bam1_t *b,
676678
c_seq_len, // merged sequence length
677679
8); // enough for 2 tags?
678680
free(c_cig);
679-
if (ret < 0) throw fr_expt(ret, "bam_set1 in merge_overlap");
681+
if (ret < 0) throw fr_expt(ret, "bam_set1_wrapper in merge_overlap");
680682
// Merge the sequences by bytes
681683
merge_by_byte(a, b, c);
682684

0 commit comments

Comments
 (0)