Skip to content

Commit 43639de

Browse files
Merge branch 'master' of github.com:smithlabcode/dnmtools into selectsites-input-file-type
2 parents 0c7c3f1 + eac0f9f commit 43639de

9 files changed

Lines changed: 234 additions & 84 deletions

File tree

src/analysis/bsrate.cpp

Lines changed: 96 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
* General Public License for more details.
1818
*/
1919

20+
#include <bamxx.hpp>
21+
2022
#include <algorithm>
2123
#include <fstream>
2224
#include <iostream>
@@ -32,14 +34,13 @@
3234
#include "dnmt_error.hpp"
3335
#include "smithlab_utils.hpp"
3436

35-
#include <bamxx.hpp>
36-
3737
using std::accumulate;
3838
using std::cerr;
3939
using std::cout;
4040
using std::endl;
4141
using std::max;
4242
using std::numeric_limits;
43+
using std::pair;
4344
using std::runtime_error;
4445
using std::string;
4546
using std::unordered_map;
@@ -48,10 +49,12 @@ using std::vector;
4849

4950
using bamxx::bam_rec;
5051

51-
static void
52+
static pair<uint32_t, uint32_t>
5253
count_states_pos(const bool INCLUDE_CPGS, const string &chrom,
5354
const bam_rec &aln, vector<size_t> &unconv,
5455
vector<size_t> &conv, vector<size_t> &err, size_t &hanging) {
56+
uint32_t n_conv = 0, n_uconv = 0;
57+
5558
/* iterate through reference, query/read and fragment */
5659
const auto seq = bam_get_seq(aln);
5760
const auto beg_cig = bam_get_cigar(aln);
@@ -73,10 +76,14 @@ count_states_pos(const bool INCLUDE_CPGS, const string &chrom,
7376
(rpos >= chrom_lim || !is_guanine(chrom[rpos + 1]) ||
7477
INCLUDE_CPGS)) {
7578
const auto qc = seq_nt16_str[bam_seqi(seq, qpos)];
76-
if (qc == 'C')
79+
if (qc == 'C') {
7780
++unconv[fpos];
78-
else if (qc == 'T')
81+
++n_uconv;
82+
}
83+
else if (qc == 'T') {
7984
++conv[fpos];
85+
++n_conv;
86+
}
8087
else if (qc != 'N')
8188
++err[fpos];
8289
}
@@ -90,12 +97,15 @@ count_states_pos(const bool INCLUDE_CPGS, const string &chrom,
9097
}
9198

9299
assert(qpos == get_l_qseq(aln));
100+
return {n_conv, n_conv + n_uconv};
93101
}
94102

95-
static void
103+
static pair<uint32_t, uint32_t>
96104
count_states_neg(const bool INCLUDE_CPGS, const string &chrom,
97105
const bam_rec &aln, vector<size_t> &unconv,
98106
vector<size_t> &conv, vector<size_t> &err, size_t &hanging) {
107+
uint32_t n_conv = 0, n_uconv = 0;
108+
99109
/* iterate backward over query/read positions but forward over
100110
reference and fragment positions */
101111
const auto seq = bam_get_seq(aln);
@@ -117,10 +127,14 @@ count_states_neg(const bool INCLUDE_CPGS, const string &chrom,
117127
if (is_guanine(chrom[rpos]) &&
118128
(rpos == 0 || !is_cytosine(chrom[rpos - 1]) || INCLUDE_CPGS)) {
119129
const auto qc = seq_nt16_str[bam_seqi(seq, qpos - 1)];
120-
if (qc == 'C')
130+
if (qc == 'C') {
121131
++unconv[fpos];
122-
else if (qc == 'T')
132+
++n_uconv;
133+
}
134+
else if (qc == 'T') {
123135
++conv[fpos];
136+
++n_conv;
137+
}
124138
else if (qc != 'N')
125139
++err[fpos];
126140
}
@@ -133,6 +147,7 @@ count_states_neg(const bool INCLUDE_CPGS, const string &chrom,
133147
}
134148
}
135149
assert(qpos == 0);
150+
return {n_conv, n_conv + n_uconv};
136151
}
137152

138153
static void
@@ -223,16 +238,65 @@ write_output(const string &outfile, const vector<size_t> &ucvt_count_p,
223238
}
224239
}
225240

241+
template<typename T> static inline void
242+
update_per_read_stats(const pair<T, T> &x, vector<vector<T>> &tab) {
243+
if (x.second < tab.size()) ++tab[x.second][x.first];
244+
}
245+
246+
static inline vector<double>
247+
format_histogram(const vector<vector<uint32_t>> &tab,
248+
const size_t n_hist_bins) {
249+
constexpr auto epsilon = 1e-6;
250+
vector<double> hist(n_hist_bins, 0.0);
251+
for (size_t i = 1; i < tab.size(); ++i) {
252+
const double denom = i + epsilon;
253+
for (size_t j = 0; j <= i; ++j) {
254+
const double frac = j / denom;
255+
const auto bin_id = std::floor(frac * n_hist_bins);
256+
assert(bin_id < hist.size());
257+
hist[bin_id] += tab[i][j];
258+
}
259+
}
260+
return hist;
261+
}
262+
263+
static void
264+
write_hanging_read_message(std::ostream &out, const size_t n_hanging) {
265+
out << "Warning: hanging reads detected at chromosome ends "
266+
<< "(N=" << n_hanging << ")." << endl
267+
<< "High numbers of hanging reads suggest inconsistent " << endl
268+
<< "reference genomes between stages of analysis." << endl
269+
<< "This is likely to result in analysis errors." << endl;
270+
}
271+
272+
template<typename T> static void
273+
write_per_read_histogram(const vector<vector<T>> &tab, const size_t n_hist_bins,
274+
std::ostream &out) {
275+
const auto hist = format_histogram(tab, n_hist_bins);
276+
out << std::fixed;
277+
for (auto i = 0.0; i < hist.size(); ++i)
278+
out << std::setprecision(3) << i / hist.size() << '\t'
279+
<< std::setprecision(3) << (i + 1) / hist.size() << '\t'
280+
<< std::setprecision(0) << hist[i] << endl;
281+
}
282+
226283
int
227284
main_bsrate(int argc, const char **argv) {
228285
try {
229-
// ASSUMED MAXIMUM LENGTH OF A FRAGMENT
230-
static const size_t output_size = 10000;
286+
// assumed maximum length of a fragment
287+
constexpr const size_t output_size = 10000;
288+
289+
// Assumed maximum cytosines per fragment. Currently the per-read
290+
// information is collected as counts in a 2D array, so that later
291+
// features may be able to fit distributions based these counts.
292+
constexpr const size_t max_cytosine_per_frag = 1000;
231293

232294
bool VERBOSE = false;
233295
bool INCLUDE_CPGS = false;
234296
bool reads_are_a_rich = false;
297+
bool report_per_read = false;
235298
size_t n_threads = 1;
299+
size_t n_hist_bins = 20;
236300

237301
string chroms_file;
238302
string outfile;
@@ -254,6 +318,10 @@ main_bsrate(int argc, const char **argv) {
254318
seq_to_use);
255319
opt_parse.add_opt("a-rich", 'A', "reads are A-rich", false,
256320
reads_are_a_rich);
321+
opt_parse.add_opt("per-read", 'p', "report per-read conversion to terminal",
322+
false, report_per_read);
323+
opt_parse.add_opt("bins", 'b', "number of bins for per-read", false,
324+
n_hist_bins);
257325
opt_parse.add_opt("threads", 't', "number of threads", false, n_threads);
258326
opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE);
259327
vector<string> leftover_args;
@@ -295,8 +363,7 @@ main_bsrate(int argc, const char **argv) {
295363
bamxx::bam_header hdr(hts);
296364
if (!hdr) throw dnmt_error("failed to read header");
297365

298-
if (n_threads > 1)
299-
tp.set_io(hts);
366+
if (n_threads > 1) tp.set_io(hts);
300367

301368
// map the bam header index for each "target" to a sequence in the
302369
// reference genome
@@ -307,6 +374,9 @@ main_bsrate(int argc, const char **argv) {
307374
chrom_lookup.insert({sam_hdr_name2tid(hdr.h, names[i].data()), i});
308375
}
309376

377+
vector<vector<uint32_t>> per_read_counts(
378+
max_cytosine_per_frag, vector<uint32_t>(max_cytosine_per_frag, 0));
379+
310380
vector<size_t> unconv_count_pos(output_size, 0ul);
311381
vector<size_t> conv_count_pos(output_size, 0ul);
312382
vector<size_t> unconv_count_neg(output_size, 0ul);
@@ -324,7 +394,6 @@ main_bsrate(int argc, const char **argv) {
324394
unordered_set<int32_t> chroms_seen;
325395

326396
while (hts.read(hdr, aln)) {
327-
328397
if (reads_are_a_rich) flip_conversion(aln);
329398

330399
// get the correct chrom if it has changed
@@ -351,24 +420,25 @@ main_bsrate(int argc, const char **argv) {
351420
}
352421

353422
if (use_this_chrom) {
354-
// do the work for this mapped read
355-
if (bam_is_rev(aln))
356-
count_states_neg(INCLUDE_CPGS, chroms[chrom_idx], aln,
357-
unconv_count_neg, conv_count_neg, err_neg, hanging);
358-
else
359-
count_states_pos(INCLUDE_CPGS, chroms[chrom_idx], aln,
360-
unconv_count_pos, conv_count_pos, err_pos, hanging);
423+
// do the work for the current mapped read
424+
const auto conv_result =
425+
bam_is_rev(aln) ? count_states_neg(INCLUDE_CPGS, chroms[chrom_idx],
426+
aln, unconv_count_neg,
427+
conv_count_neg, err_neg, hanging)
428+
: count_states_pos(INCLUDE_CPGS, chroms[chrom_idx],
429+
aln, unconv_count_pos,
430+
conv_count_pos, err_pos, hanging);
431+
if (report_per_read)
432+
update_per_read_stats(conv_result, per_read_counts);
361433
}
362434
}
363435
write_output(outfile, unconv_count_pos, conv_count_pos, unconv_count_neg,
364436
conv_count_neg, err_pos, err_neg);
365437

366-
if (hanging > 0) // some overhanging reads
367-
cerr << "Warning: hanging reads detected at chrom ends "
368-
<< "(N=" << hanging << ")" << endl
369-
<< "High numbers of hanging reads suggest mismatch "
370-
<< "between assembly provided here and that used for mapping"
371-
<< endl;
438+
if (hanging > 0) write_hanging_read_message(cerr, hanging);
439+
440+
if (report_per_read)
441+
write_per_read_histogram(per_read_counts, n_hist_bins, cout);
372442
}
373443
catch (const runtime_error &e) {
374444
cerr << e.what() << endl;

src/analysis/hmr-rep.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,10 @@ main_hmr_rep(int argc, const char **argv) {
415415
const vector<string> cpgs_files(leftover_args);
416416
/****************** END COMMAND LINE OPTIONS *****************/
417417

418+
for (auto &filename : cpgs_files)
419+
if (!is_msite_file(filename))
420+
throw runtime_error("malformed counts file: " + filename);
421+
418422
vector<string> params_in_file;
419423
if (!params_in_files.empty()) {
420424
params_in_file = split_comma(params_in_files);

src/analysis/hmr.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,9 @@ main_hmr(int argc, const char **argv) {
442442
const string cpgs_file = leftover_args.front();
443443
/****************** END COMMAND LINE OPTIONS *****************/
444444

445+
if (!is_msite_file(cpgs_file))
446+
throw runtime_error("malformed counts file: " + cpgs_file);
447+
445448
// separate the regions by chrom and by desert
446449
vector<MSite> cpgs;
447450
vector<pair<double, double> > meth;

src/analysis/hypermr.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,9 @@ main_hypermr(int argc, const char **argv) {
278278
const string cpgs_file = leftover_args.front();
279279
/****************** END COMMAND LINE OPTIONS *****************/
280280

281+
if (!is_msite_file(cpgs_file))
282+
throw runtime_error("malformed counts file: " + cpgs_file);
283+
281284
if (VERBOSE) cerr << "[loading_data]" << endl;
282285
vector<MSite> cpgs;
283286
vector<pair<double, double>> meth;

src/analysis/levels.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,9 @@ main_levels(int argc, const char **argv) {
120120
const string meth_file = leftover_args.front();
121121
/****************** END COMMAND LINE OPTIONS *****************/
122122

123+
if (!is_msite_file(meth_file))
124+
throw runtime_error("malformed counts file: " + meth_file);
125+
123126
const counts_file_format guessed_format =
124127
guess_counts_file_format(meth_file);
125128
if (guessed_format != counts_file_format::ordinary) {

0 commit comments

Comments
 (0)