Skip to content

Commit a5c618d

Browse files
Merge pull request #203 from smithlabcode/dmr-parse-bgzf
bgzf input format for dmr
2 parents 6c6b4fd + 336cb0a commit a5c618d

1 file changed

Lines changed: 195 additions & 86 deletions

File tree

src/radmeth/dmr.cpp

Lines changed: 195 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,20 @@
1-
/* dmr: computes DMRs based on HMRs and probability of differences
2-
* at individual CpGs
1+
/* dmr: computes DMRs based on HMRs and probability of differences at
2+
* individual CpGs
33
*
4-
* Copyright (C) 2012-2022 University of Southern California and
5-
* Andrew D. Smith
4+
* Copyright (C) 2012-2023 University of Southern California and
5+
* Andrew D. Smith
66
*
7-
* Authors: Andrew D. Smith
7+
* Authors: Andrew D. Smith
88
*
9-
* This program is free software: you can redistribute it and/or modify
10-
* it under the terms of the GNU General Public License as published by
11-
* the Free Software Foundation, either version 3 of the License, or
12-
* (at your option) any later version.
9+
* This program is free software: you can redistribute it and/or
10+
* modify it under the terms of the GNU General Public License as
11+
* published by the Free Software Foundation, either version 3 of the
12+
* License, or (at your option) any later version.
1313
*
14-
* This program is distributed in the hope that it will be useful,
15-
* but WITHOUT ANY WARRANTY; without even the implied warranty of
16-
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17-
* GNU General Public License for more details.
14+
* This program is distributed in the hope that it will be useful, but
15+
* WITHOUT ANY WARRANTY; without even the implied warranty of
16+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17+
* General Public License for more details.
1818
*/
1919

2020
#include <string>
@@ -23,11 +23,15 @@
2323
#include <fstream>
2424
#include <algorithm>
2525
#include <stdexcept>
26+
#include <charconv>
2627

2728
#include "OptionParser.hpp"
2829
#include "smithlab_utils.hpp"
2930
#include "smithlab_os.hpp"
3031
#include "GenomicRegion.hpp"
32+
#include "MSite.hpp"
33+
34+
#include <bamxx.hpp>
3135

3236
using std::string;
3337
using std::vector;
@@ -38,32 +42,141 @@ using std::pair;
3842
using std::max;
3943
using std::ifstream;
4044
using std::runtime_error;
45+
using std::from_chars;
46+
using std::find_if;
4147

48+
using bamxx::bgzf_file;
4249

43-
static void
44-
read_diffs_file(const bool VERBOSE, const string &diffs_file,
45-
vector<GenomicRegion> &cpgs) {
4650

47-
cpgs.clear();
51+
static bool
52+
parse_methdiff_line(const char *c, const char *c_end,
53+
string &chrom, uint32_t &pos, char &strand,
54+
string &context, double &diffscore,
55+
uint32_t &n_meth_a, uint32_t &n_unmeth_a,
56+
uint32_t &n_meth_b, uint32_t &n_unmeth_b) {
57+
constexpr auto is_sep = [](const char x) { return x == ' ' || x == '\t'; };
58+
constexpr auto not_sep = [](const char x) { return x != ' ' && x != '\t'; };
59+
60+
auto field_s = c;
61+
auto field_e = find_if(field_s + 1, c_end, is_sep);
62+
bool failed = field_e == c_end;
63+
64+
// chromosome name
65+
{
66+
const uint32_t d = std::distance(field_s, field_e);
67+
chrom = string{field_s, d};
68+
}
69+
70+
field_s = find_if(field_e + 1, c_end, not_sep);
71+
field_e = find_if(field_s + 1, c_end, is_sep);
72+
failed = failed || field_e == c_end;
73+
74+
// position
75+
{
76+
const auto [ptr, ec] = from_chars(field_s, field_e, pos);
77+
failed = failed || ec != std::errc();
78+
}
79+
80+
field_s = find_if(field_e + 1, c_end, not_sep);
81+
field_e = find_if(field_s + 1, c_end, is_sep);
82+
// below because strand is 1 base wide
83+
failed = failed || field_e != field_s + 1 || field_e == c_end;
84+
85+
// strand
86+
strand = *field_s;
87+
failed = failed || (strand != '-' && strand != '+');
88+
89+
field_s = find_if(field_e + 1, c_end, not_sep);
90+
field_e = find_if(field_s + 1, c_end, is_sep);
91+
failed = failed || field_e == c_end;
92+
93+
// context
94+
{
95+
const uint32_t d = std::distance(field_s, field_e);
96+
context = string{field_s, d};
97+
}
98+
99+
field_s = find_if(field_e + 1, c_end, not_sep);
100+
field_e = find_if(field_s + 1, c_end, is_sep);
101+
failed = failed || field_e == c_end;
102+
103+
// score for difference in methylation (contingency table p-value)
104+
{
105+
#ifdef __APPLE__
106+
const int ret = std::sscanf(field_s, "%lf", &diffscore);
107+
failed = failed || ret < 1;
108+
#else
109+
const auto [ptr, ec] = from_chars(field_s, field_e, diffscore);
110+
failed = failed || ec != std::errc();
111+
#endif
112+
}
113+
114+
field_s = find_if(field_e + 1, c_end, not_sep);
115+
field_e = find_if(field_s + 1, c_end, is_sep);
116+
failed = failed || (field_e == c_end);
117+
118+
// counts methylated in methylome "a"
119+
{
120+
const auto [ptr, ec] = from_chars(field_s, c_end, n_meth_a);
121+
failed = failed || ec != std::errc();
122+
}
123+
124+
field_s = find_if(field_e + 1, c_end, not_sep);
125+
field_e = find_if(field_s + 1, c_end, is_sep);
126+
failed = failed || field_e == c_end;
127+
128+
// counts unmethylated in methylome "a"
129+
{
130+
const auto [ptr, ec] = from_chars(field_s, c_end, n_unmeth_a);
131+
failed = failed || ec != std::errc();
132+
}
133+
134+
field_s = find_if(field_e + 1, c_end, not_sep);
135+
field_e = find_if(field_s + 1, c_end, is_sep);
136+
failed = failed || field_e == c_end;
137+
138+
// counts methylated in methylome "b"
139+
{
140+
const auto [ptr, ec] = from_chars(field_s, c_end, n_meth_b);
141+
failed = failed || ec != std::errc();
142+
}
143+
144+
field_s = find_if(field_e + 1, c_end, not_sep);
145+
146+
// counts unmethylated in methylome "a"
147+
{
148+
const auto [ptr, ec] = from_chars(field_s, c_end, n_unmeth_b);
149+
// final field needs to fail if we haven't reached the end
150+
failed = failed || ec != std::errc() || ptr != c_end;
151+
}
152+
153+
return !failed;
154+
}
155+
156+
157+
static vector<MSite>
158+
read_diffs_file(const string &diffs_file) {
159+
160+
bgzf_file in(diffs_file, "r");
161+
if (!in) throw runtime_error("could not open file: " + diffs_file);
162+
163+
string chrom, name;
164+
char strand{};
165+
double diffscore{};
166+
uint32_t pos{}, meth_a{}, unmeth_a{}, meth_b{}, unmeth_b{};
48167

49-
std::ifstream in(diffs_file);
50-
string chrom, strand, name;
51-
double diffscore;
52-
size_t pos, meth_a, unmeth_a, meth_b, unmeth_b;
168+
vector<MSite> cpgs;
53169
string line;
54170
while (getline(in, line)) {
55171

56-
std::istringstream iss(line);
57-
if (!(iss >> chrom >> pos >> strand >> name >>
58-
diffscore >> meth_a >> unmeth_a >> meth_b >> unmeth_b))
172+
if (!parse_methdiff_line(line.data(), line.data() + size(line),
173+
chrom, pos, strand, name, diffscore,
174+
meth_a, unmeth_a, meth_b, unmeth_b))
59175
throw runtime_error("bad methdiff line: " + line);
60176

61-
cpgs.push_back(GenomicRegion(chrom, pos, pos + 1,
62-
name, diffscore, strand[0]));
177+
cpgs.emplace_back(chrom, pos, strand, name, diffscore, 1);
63178
}
64-
if (VERBOSE)
65-
cerr << "[read " << cpgs.size()
66-
<< " sites from " + diffs_file << "]" << endl;
179+
return cpgs;
67180
}
68181

69182
static void
@@ -80,30 +193,28 @@ complement_regions(const size_t max_end, const vector<GenomicRegion> &a,
80193
cmpl.back().set_end(max_end);
81194
}
82195

83-
84-
static void
85-
get_chrom_ends(const vector<GenomicRegion> &r, vector<size_t> &ends) {
196+
static vector<size_t>
197+
get_chrom_ends(const vector<GenomicRegion> &r) {
198+
vector<size_t> ends;
86199
for (size_t i = 0; i < r.size() - 1; ++i)
87200
if (!r[i].same_chrom(r[i+1]))
88201
ends.push_back(i+1);
89202
ends.push_back(r.size());
203+
return ends;
90204
}
91205

92-
93-
static void
94-
complement_regions(const size_t max_end,
95-
const vector<GenomicRegion> &r, vector<GenomicRegion> &r_cmpl) {
96-
97-
vector<size_t> r_chroms;
98-
get_chrom_ends(r, r_chroms);
206+
static vector<GenomicRegion>
207+
complement_regions(const size_t max_end, const vector<GenomicRegion> &r) {
208+
vector<size_t> r_chroms = get_chrom_ends(r);
209+
vector<GenomicRegion> r_cmpl;
99210
size_t t = 0;
100-
for (size_t i = 0; i < r_chroms.size(); ++i) {
211+
for (size_t i = 0; i < size(r_chroms); ++i) {
101212
complement_regions(max_end, r, t, r_chroms[i], r_cmpl);
102213
t = r_chroms[i];
103214
}
215+
return r_cmpl;
104216
}
105217

106-
107218
static bool
108219
check_no_overlap(const vector<GenomicRegion> &regions) {
109220
for (size_t i = 1; i < regions.size(); ++i)
@@ -114,49 +225,50 @@ check_no_overlap(const vector<GenomicRegion> &regions) {
114225
}
115226

116227

117-
static void
118-
separate_sites(const vector<GenomicRegion> &dmrs,
119-
const vector<GenomicRegion> &sites,
120-
vector<pair<size_t, size_t> > &sep_sites) {
121-
const size_t n_dmrs = dmrs.size();
122-
123-
for (size_t i = 0; i < n_dmrs; ++i) {
124-
GenomicRegion a(dmrs[i]);
125-
a.set_end(a.get_start() + 1);
126-
GenomicRegion b(dmrs[i]);
127-
b.set_start(b.get_end());
128-
b.set_end(b.get_end() + 1);
129-
130-
auto a_insert = lower_bound(begin(sites), end(sites), a);
131-
auto b_insert = lower_bound(begin(sites), end(sites), b);
132-
133-
sep_sites.push_back(std::make_pair(a_insert - begin(sites),
134-
b_insert - begin(sites)));
135-
}
228+
static inline MSite
229+
get_left_msite(const GenomicRegion &r) {
230+
return {r.get_chrom(), r.get_start(), r.get_strand(), r.get_name(), 0.0, 1u};
231+
}
232+
233+
234+
static inline MSite
235+
get_right_msite(const GenomicRegion &r) {
236+
return {r.get_chrom(), r.get_end(), r.get_strand(), r.get_name(), 0.0, 1u};
136237
}
137238

138239

139-
template <class T> bool
140-
starts_before(const T &a, const T &b) {
141-
return (a.get_chrom() < b.get_chrom()) ||
142-
(a.same_chrom(b) && a.get_start() < b.get_start());
240+
static vector<pair<size_t, size_t> >
241+
separate_sites(const vector<GenomicRegion> &dmrs,
242+
const vector<MSite> &sites) {
243+
vector<pair<size_t, size_t>> sep_sites;
244+
for (const auto &dmr: dmrs) {
245+
const auto a = get_left_msite(dmr);
246+
const auto b = get_right_msite(dmr);
247+
const auto a_insert = lower_bound(cbegin(sites), cend(sites), a);
248+
const auto b_insert = lower_bound(cbegin(sites), cend(sites), b);
249+
sep_sites.emplace_back(distance(cbegin(sites), a_insert),
250+
distance(cbegin(sites), b_insert));
251+
}
252+
return sep_sites;
143253
}
144254

145-
template <class T> bool
146-
same_start(const T &a, const T &b) {
147-
return a.same_chrom(b) && a.get_start() == b.get_start();
255+
256+
static inline double
257+
pval_from_msite(const MSite &s) {
258+
return s.meth; // abused as a p-value here
148259
}
149260

150261

151262
static void
152263
get_cpg_stats(const bool LOW_CUTOFF, const double sig_cutoff,
153-
const vector<GenomicRegion> &cpgs,
264+
const vector<MSite> &cpgs,
154265
const size_t start_idx, const size_t end_idx,
155266
size_t &total_cpgs, size_t &total_sig) {
156267
total_cpgs = end_idx - start_idx;
157268
for (size_t i = start_idx; i < end_idx; ++i) {
158-
if ((LOW_CUTOFF && (cpgs[i].get_score() < sig_cutoff)) ||
159-
(!LOW_CUTOFF && (cpgs[i].get_score() > 1.0 - sig_cutoff)))
269+
const auto pval = pval_from_msite(cpgs[i]);
270+
if ((LOW_CUTOFF && (pval < sig_cutoff)) ||
271+
(!LOW_CUTOFF && (pval > 1.0 - sig_cutoff)))
160272
++total_sig;
161273
}
162274
}
@@ -232,16 +344,12 @@ main_dmr(int argc, const char **argv) {
232344
if (VERBOSE)
233345
cerr << "[COMPUTING SYMMETRIC DIFFERENCE]" << endl;
234346

235-
236347
size_t max_end = 0;
237-
for (size_t i = 0; i < regions_a.size(); ++i)
238-
max_end = max(max_end, regions_a[i].get_end());
239-
for (size_t i = 0; i < regions_b.size(); ++i)
240-
max_end = max(max_end, regions_b[i].get_end());
348+
for (const auto &r: regions_a) max_end = max(max_end, r.get_end());
349+
for (const auto &r: regions_b) max_end = max(max_end, r.get_end());
241350

242-
vector<GenomicRegion> a_cmpl, b_cmpl;
243-
complement_regions(max_end, regions_a, a_cmpl);
244-
complement_regions(max_end, regions_b, b_cmpl);
351+
const auto a_cmpl = complement_regions(max_end, regions_a);
352+
const auto b_cmpl = complement_regions(max_end, regions_b);
245353

246354
vector<GenomicRegion> dmrs_a, dmrs_b;
247355
genomic_region_intersection_by_base(regions_a, b_cmpl, dmrs_a);
@@ -250,15 +358,17 @@ main_dmr(int argc, const char **argv) {
250358
// separate the regions by chrom and by desert
251359
if (VERBOSE)
252360
cerr << "[READING CPG METH DIFFS]" << endl;
253-
vector<GenomicRegion> cpgs;
254-
read_diffs_file(VERBOSE, diffs_file, cpgs);
361+
const auto cpgs = read_diffs_file(diffs_file);
362+
if (VERBOSE)
363+
cerr << "[read " << size(cpgs)
364+
<< " sites from " + diffs_file << "]" << endl;
365+
255366
if (!check_sorted(cpgs))
256367
throw runtime_error("CpGs not sorted in: " + diffs_file);
257368
if (VERBOSE)
258369
cerr << "[TOTAL CPGS]: " << cpgs.size() << endl;
259370

260-
vector<pair<size_t, size_t> > sep_sites;
261-
separate_sites(dmrs_a, cpgs, sep_sites);
371+
auto sep_sites = separate_sites(dmrs_a, cpgs);
262372

263373
for (size_t i = 0; i < dmrs_a.size(); ++i) {
264374
size_t total_cpgs = 0, total_sig = 0;
@@ -269,8 +379,7 @@ main_dmr(int argc, const char **argv) {
269379
dmrs_a[i].set_score(total_sig);
270380
}
271381

272-
sep_sites.clear();
273-
separate_sites(dmrs_b, cpgs, sep_sites);
382+
sep_sites = separate_sites(dmrs_b, cpgs);
274383

275384
for (size_t i = 0; i < dmrs_b.size(); ++i) {
276385
size_t total_cpgs = 0, total_sig = 0;
@@ -282,11 +391,11 @@ main_dmr(int argc, const char **argv) {
282391
}
283392

284393
std::ofstream out_a(outfile_a);
285-
copy(begin(dmrs_a), end(dmrs_a),
394+
copy(cbegin(dmrs_a), cend(dmrs_a),
286395
std::ostream_iterator<GenomicRegion>(out_a, "\n"));
287396

288397
std::ofstream out_b(outfile_b);
289-
copy(begin(dmrs_b), end(dmrs_b),
398+
copy(cbegin(dmrs_b), cend(dmrs_b),
290399
std::ostream_iterator<GenomicRegion>(out_b, "\n"));
291400

292401
if (VERBOSE)

0 commit comments

Comments
 (0)