@@ -130,17 +130,15 @@ bam_set1_core(bam1_core_t &core,
130130 const hts_pos_t pos, const uint8_t mapq, const size_t n_cigar,
131131 const int32_t mtid, const hts_pos_t mpos, const hts_pos_t isize,
132132 const size_t l_seq, const size_t qname_nuls) {
133- /* ADS: need to clarify what these mean. They are used in
134- `hts_reg2bin` from `htslib/hts.h` and likely mean "region to bin"
135- for indexing */
136- // MN: hts_reg2bin categorizes the size of the reference region.
137- // Here, we use the numbers used in htslib/cram/cram_samtools.h
133+ /* ADS: These are used in `hts_reg2bin` from `htslib/hts.h` and
134+ likely mean "region to bin" for indexing */
135+ /* MN: hts_reg2bin categorizes the size of the reference region.
136+ Here, we use the numbers used in htslib/cram/cram_samtools.h */
138137 static const int min_shift = 14 ;
139138 static const int n_lvls = 5 ;
140139
141140 core.pos = pos;
142141 core.tid = tid;
143- /* ADS: MN I recall we migth not have needed this core.bin below */
144142 core.bin = hts_reg2bin (pos, pos + isize, min_shift, n_lvls);
145143 core.qual = mapq;
146144 core.l_extranul = qname_nuls - 1 ;
@@ -383,12 +381,12 @@ get_full_and_partial_ops(const uint32_t *cig_in, const uint32_t in_ops,
383381
384382/* This table converts 2 bases packed in a byte to their reverse
385383 * complement. The input is therefore a unit8_t representing 2 bases.
386- * It is assumed that the input uint8_t value is of form "xx" or "x-", where
387- * 'x' a 4-bit number representing either A, C, G, T, or N and '-' is 0000.
388- * For example, the ouptut for "AG" is "CT". The format "x-" is often used
389- * at the end of an odd-length sequence.
390- * The output of "A-" is "-T", and the output of "C-" is "-G", and so forth.
391- * The user must handle this case separately.
384+ * It is assumed that the input uint8_t value is of form "xx" or "x-",
385+ * where 'x' a 4-bit number representing either A, C, G, T, or N and
386+ * '-' is 0000. For example, the ouptut for "AG" is "CT". The format
387+ * "x-" is often used at the end of an odd-length sequence. The
388+ * output of "A-" is "-T", and the output of "C-" is "-G", and so
389+ * forth. The user must handle this case separately.
392390 */
393391const uint8_t byte_revcom_table[] = {
394392 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
@@ -430,8 +428,10 @@ revcomp_seq_by_byte(bam1_t *aln) {
430428 const size_t num_bytes = ceil (l_qseq / 2.0 );
431429 auto seq_end = seq + num_bytes;
432430 revcom_byte_then_reverse (seq, seq_end);
433- if (l_qseq % 2 == 1 ) {
431+ if (l_qseq % 2 == 1 ) { // for odd-length sequences
434432 for (size_t i = 0 ; i < num_bytes - 1 ; i++) {
433+ // swap 4-bit chunks within consecutive bytes like this:
434+ // (aaaabbbb ccccdddd) => (....aaaa bbbbcccc dddd....)
435435 seq[i] = (seq[i] << 4 ) | (seq[i + 1 ] >> 4 );
436436 }
437437 seq[num_bytes - 1 ] <<= 4 ;
@@ -446,15 +446,17 @@ revcomp_seq_by_byte(bam1_t *aln) {
446446// a_used_len = c_seq_len - b_seq_len
447447static void
448448merge_by_byte (const bam1_t *a, const bam1_t *b, bam1_t *c) {
449+ // ADS: (todo) need some functions for int_ceil and is_odd
449450 const size_t b_seq_len = get_qlen (b);
450451 const size_t c_seq_len = get_qlen (c);
451452 const size_t a_used_len = c_seq_len - b_seq_len;
453+
452454 const bool is_a_odd = a_used_len % 2 == 1 ;
453455 const bool is_b_odd = b_seq_len % 2 == 1 ;
454456 const bool is_c_odd = c_seq_len % 2 == 1 ;
457+
455458 const size_t a_num_bytes = ceil (a_used_len / 2.0 );
456459 const size_t b_num_bytes = ceil (b_seq_len / 2.0 );
457- // const size_t c_num_bytes = ceil(c_seq_len / 2.0);
458460
459461 const size_t b_offset = is_b_odd;
460462
@@ -463,33 +465,33 @@ merge_by_byte(const bam1_t *a, const bam1_t *b, bam1_t *c) {
463465 auto c_seq = bam_get_seq (c);
464466
465467 std::copy_n (a_seq, a_num_bytes, c_seq);
466- // Here, c_seq looks either like aa aa aa aa
467- // or like aa aa aa a-
468468 if (is_a_odd) {
469+ // c_seq looks like either [ aa aa aa aa ]
470+ // or [ aa aa aa a- ]
469471 c_seq[a_num_bytes - 1 ] &= 0xf0 ;
470472 c_seq[a_num_bytes - 1 ] |= is_b_odd ?
471473 byte_revcom_table[b_seq[b_num_bytes - 1 ]] :
472474 byte_revcom_table[b_seq[b_num_bytes - 1 ]] >> 4 ;
473475 }
474- // Here, c_seq looks either like aa aa aa aa
475- // or like aa aa aa ab
476476 if (is_c_odd) {
477+ // c_seq looks like either [ aa aa aa aa ]
478+ // or [ aa aa aa ab ]
477479 for (size_t i = 0 ; i < b_num_bytes - 1 ; i++) {
478480 c_seq[a_num_bytes + i] =
479481 (byte_revcom_table[b_seq[b_num_bytes - i - 1 ]] << 4 ) |
480482 (byte_revcom_table[b_seq[b_num_bytes - i - 2 ]] >> 4 );
481483 }
482484 c_seq[a_num_bytes + b_num_bytes - 1 ] = byte_revcom_table[b_seq[0 ]] << 4 ;
483- // Here, c_seq looks either like aa aa aa aa bb bb bb b- (a even and b odd)
484- // or like aa aa aa ab bb bb bb b- (a odd and b odd)
485+ // Here, c_seq is either [ aa aa aa aa bb bb bb b- ] (a even; b odd)
486+ // or [ aa aa aa ab bb bb bb b- ] (a odd; b odd)
485487 }
486488 else {
487489 for (size_t i = 0 ; i < b_num_bytes - b_offset; i++) {
488490 c_seq[a_num_bytes + i] =
489491 byte_revcom_table[b_seq[b_num_bytes - i - 1 - b_offset]];
490492 }
491- // Here, c_seq looks either like aa aa aa aa bb bb bb bb (a even and b even)
492- // or like aa aa aa ab bb bb bb (a odd and b odd)
493+ // Here, c_seq is either [ aa aa aa aa bb bb bb bb ] (a even and b even)
494+ // or [ aa aa aa ab bb bb bb ] (a odd and b odd)
493495 }
494496}
495497
@@ -537,7 +539,7 @@ truncate_overlap(const bam1_t *a, const uint32_t overlap, bam1_t *c) {
537539
538540 uint32_t part_op = 0 ;
539541 const uint32_t c_cur =
540- get_full_and_partial_ops (a_cig, a_ops, overlap, &part_op);
542+ get_full_and_partial_ops (a_cig, a_ops, overlap, &part_op);
541543
542544 // ADS: hack here because the get_full_and_partial_ops doesn't do
543545 // exactly what is needed for this.
@@ -575,13 +577,14 @@ truncate_overlap(const bam1_t *a, const uint32_t overlap, bam1_t *c) {
575577 isize, // rlen from new cigar
576578 c_seq_len, // truncated seq length
577579 8 ); // enough for the 2 tags?
580+ if (ret < 0 ) throw fr_expt (ret, " bam_set1_wrapper" );
581+ // ADS: might it be better to fill `c->data` directly?
578582 free (c_cig);
583+
579584 auto c_seq = bam_get_seq (c);
580- size_t num_bytes_to_copy = (c_seq_len + 1 ) / 2 ;
585+ size_t num_bytes_to_copy = (c_seq_len + 1 )/ 2 ;
581586 std::copy_n (bam_get_seq (a), num_bytes_to_copy, c_seq);
582587
583- if (ret < 0 ) throw fr_expt (ret, " bam_set1" );
584-
585588 /* add the tags */
586589 const int64_t nm = bam_aux2i (bam_aux_get (a, " NM" )); // ADS: do better here!
587590 // "udpate" for "int" because it determines the right size
@@ -646,9 +649,8 @@ merge_overlap(const bam1_t *a, const bam1_t *b,
646649 bam_cigar_oplen (b_cig[0 ]),
647650 bam_cigar_op (b_cig[0 ]));
648651 // copy the cigar from b into c
649- memcpy (c_cig + c_cur,
650- b_cig + merge_mid,
651- (b_ops - merge_mid) * sizeof (uint32_t ));
652+ memcpy (c_cig + c_cur, b_cig + merge_mid,
653+ (b_ops - merge_mid)*sizeof (uint32_t ));
652654 /* done with cigar string here */
653655
654656 const uint32_t c_seq_len = a_seq_len + b->core .l_qseq ;
@@ -676,7 +678,7 @@ merge_overlap(const bam1_t *a, const bam1_t *b,
676678 c_seq_len, // merged sequence length
677679 8 ); // enough for 2 tags?
678680 free (c_cig);
679- if (ret < 0 ) throw fr_expt (ret, " bam_set1 in merge_overlap" );
681+ if (ret < 0 ) throw fr_expt (ret, " bam_set1_wrapper in merge_overlap" );
680682 // Merge the sequences by bytes
681683 merge_by_byte (a, b, c);
682684
0 commit comments