Skip to content

Commit 3884690

Browse files
src/gc_extrap.cpp: refactoring including how input is read
1 parent 794ee58 commit 3884690

1 file changed

Lines changed: 58 additions & 62 deletions

File tree

src/gc_extrap.cpp

Lines changed: 58 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -37,35 +37,46 @@
3737
#include <string>
3838
#include <vector>
3939

40-
// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions)
40+
// NOLINTBEGIN(*-narrowing-conversions)
41+
42+
static auto
43+
write_output(const std::string &outfile, const std::uint32_t bin_size,
44+
const std::uint32_t base_step_size,
45+
const std::vector<double> &coverage_estimates) {
46+
std::ofstream out(outfile);
47+
if (!out)
48+
throw std::runtime_error("failed to open output file: " + outfile);
49+
50+
out << "TOTAL_BASES\tEXPECTED_DISTINCT\n";
51+
52+
out << 0 << '\t' << 0 << '\n';
53+
for (auto i = 0u; i < std::size(coverage_estimates); ++i)
54+
out << (i + 1) * base_step_size << '\t' << coverage_estimates[i] * bin_size
55+
<< '\n';
56+
}
4157

4258
// ADS: functions same, header different (above and this one)
43-
static void
59+
static auto
4460
write_predicted_coverage_curve(
4561
const std::string &outfile, const double c_level, const double base_step_size,
46-
const std::size_t bin_size, const std::vector<double> &cvrg_estimates,
62+
const std::uint32_t bin_size, const std::vector<double> &cvrg_estimates,
4763
const std::vector<double> &cvrg_lower_ci_lognorm,
4864
const std::vector<double> &cvrg_upper_ci_lognorm) {
4965
static constexpr double one_hundred = 100.0;
50-
std::ofstream of;
51-
if (!outfile.empty())
52-
of.open(outfile);
53-
std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
66+
std::ofstream out(outfile);
67+
if (!out)
68+
throw std::runtime_error("failed to open output file: " + outfile);
5469

5570
const double percentile = one_hundred * c_level;
5671
// clang-format off
5772
out << "TOTAL_BASES" << '\t'
5873
<< "EXPECTED_COVERED_BASES" << '\t'
59-
<< "LOWER_" << percentile << "%CI" << '\t'
60-
<< "UPPER_" << percentile << "%CI"
61-
<< '\n';
74+
<< "LOWER_" << percentile << "_CI" << '\t'
75+
<< "UPPER_" << percentile << "_CI\n";
6276
// clang-format on
6377

64-
out.setf(std::ios_base::fixed, std::ios_base::floatfield);
65-
out.precision(1);
66-
6778
out << 0 << '\t' << 0 << '\t' << 0 << '\t' << 0 << '\n';
68-
for (std::size_t i = 0; i < std::size(cvrg_estimates); ++i) {
79+
for (auto i = 0u; i < std::size(cvrg_estimates); ++i) {
6980
// clang-format off
7081
out << (i + 1) * base_step_size << '\t'
7182
<< cvrg_estimates[i] * bin_size << '\t'
@@ -78,29 +89,30 @@ write_predicted_coverage_curve(
7889
auto
7990
gc_extrap::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
8091
try {
81-
static constexpr auto MIN_REQUIRED_COUNTS = 4;
92+
static constexpr auto min_required_counts = 4;
93+
static constexpr auto max_iter_per_bootstrap = 10;
8294

8395
std::string outfile;
8496
std::string infile;
8597
std::string histogram_outfile;
8698

99+
// NOLINTBEGIN(*-avoid-magic-numbers)
87100
int diagonal = 0;
88-
std::size_t orig_max_terms = 100;
89-
std::size_t bin_size = 10;
90-
bool verbose = false;
101+
std::uint32_t orig_max_terms = 100;
102+
std::uint32_t bin_size = 10;
91103
double base_step_size = 1.0e8;
92-
std::size_t max_width = 10000;
93-
bool SINGLE_ESTIMATE = false;
104+
std::uint32_t max_width = 10000;
94105
double max_extrap = 1.0e12;
95-
std::size_t n_bootstraps = 100;
106+
std::uint32_t n_bootstraps = 100;
96107
std::uint32_t seed = 408;
97-
bool allow_defects = false;
98-
99108
double c_level = 0.95;
100-
bool BAM_FORMAT_INPUT = false;
101-
#ifdef HAVE_HTSLIB
102109
std::uint32_t n_threads{1};
103-
#endif
110+
// NOLINTEND(*-avoid-magic-numbers)
111+
112+
bool allow_defects{false};
113+
bool verbose{false};
114+
bool single_estimate{false};
115+
104116
CLI::App app{rlstrip(about_msg)};
105117
argv = app.ensure_utf8(argv);
106118
app.usage("\nUsage: preseq gc_extrap [OPTIONS]");
@@ -124,11 +136,8 @@ gc_extrap::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
124136
app.add_option("-n,--bootstraps", n_bootstraps, "number of bootstraps");
125137
app.add_option("-c,--cval", c_level, "level for confidence intervals");
126138
app.add_option("-x,--terms", orig_max_terms, "maximum number of terms");
127-
#ifdef HAVE_HTSLIB
128-
app.add_flag("-B,--bam", BAM_FORMAT_INPUT, "input is in BAM format");
129-
#endif
130139
app.add_option("-r,--seed", seed, "seed for random number generator");
131-
app.add_flag("-Q,--quick", SINGLE_ESTIMATE,
140+
app.add_flag("-Q,--quick", single_estimate,
132141
"quick mode: run gc_extrap without bootstrapping for confidence intervals");
133142
app.add_flag("-D,--defects", allow_defects,
134143
"defects mode to extrapolate without testing for defects");
@@ -142,30 +151,30 @@ gc_extrap::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
142151
}
143152
CLI11_PARSE(app, argc, argv);
144153

145-
const auto input_format = BAM_FORMAT_INPUT ? "BAM" : "BED";
154+
const auto bam_format_input = is_sam_or_bam_format(infile);
155+
156+
const auto input_format = bam_format_input ? "BAM" : "BED";
146157
if (verbose)
147158
std::cerr << "LOADING READS (" << input_format << " format)\n";
148159

149-
const auto [n_reads, coverage_hist] = [&] {
160+
const auto [n_reads, coverage_hist] =
150161
#ifdef HAVE_HTSLIB
151-
if (BAM_FORMAT_INPUT)
152-
return load_coverage_counts_BAM(n_threads, infile, seed, bin_size,
153-
max_width);
154-
else
162+
bam_format_input
163+
? load_coverage_counts_BAM(n_threads, infile, seed, bin_size, max_width)
164+
:
155165
#endif
156-
return load_coverage_counts(infile, seed, bin_size, max_width);
157-
}();
166+
load_coverage_counts(infile, seed, bin_size, max_width);
158167

159168
const auto total_bins = get_counts_from_hist(coverage_hist);
160169
const auto distinct_bins = std::accumulate(std::cbegin(coverage_hist),
161170
std::cend(coverage_hist), 0.0);
162171
const double avg_bins_per_read = total_bins / n_reads;
163172
const double bin_step_size = base_step_size / bin_size;
164173

165-
const std::size_t max_observed_count = std::size(coverage_hist) - 1;
174+
const std::uint32_t max_observed_count = std::size(coverage_hist) - 1;
166175

167176
// ENSURE THAT THE MAX TERMS ARE ACCEPTABLE
168-
std::size_t first_zero{1};
177+
std::uint32_t first_zero{1};
169178
while (first_zero < std::size(coverage_hist) &&
170179
coverage_hist[first_zero] > 0)
171180
++first_zero;
@@ -188,7 +197,7 @@ gc_extrap::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
188197
report_histogram(histogram_outfile, coverage_hist);
189198

190199
// catch if all reads are distinct
191-
if (orig_max_terms < MIN_REQUIRED_COUNTS)
200+
if (orig_max_terms < min_required_counts)
192201
throw std::runtime_error("max count before zero is les than min required "
193202
"count (4), sample not sufficiently deep or "
194203
"duplicates removed");
@@ -204,35 +213,22 @@ gc_extrap::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
204213

205214
std::vector<double> coverage_estimates;
206215

207-
if (SINGLE_ESTIMATE) {
208-
bool SINGLE_ESTIMATE_SUCCESS = extrap_single_estimate(
216+
if (single_estimate) {
217+
const auto success = extrap_single_estimate(
209218
verbose, allow_defects, coverage_hist, orig_max_terms, diagonal,
210219
bin_step_size, max_extrap / bin_size, coverage_estimates);
211220
// IF FAILURE, EXIT
212-
if (!SINGLE_ESTIMATE_SUCCESS)
213-
throw std::runtime_error("SINGLE ESTIMATE FAILED, NEED TO RUN IN "
214-
"FULL MODE FOR ESTIMATES");
215-
216-
std::ofstream of;
217-
if (!outfile.empty())
218-
of.open(outfile);
219-
std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
220-
221-
out << "TOTAL_BASES\tEXPECTED_DISTINCT\n";
222-
223-
out.setf(std::ios_base::fixed, std::ios_base::floatfield);
224-
out.precision(1);
221+
if (!success)
222+
throw std::runtime_error(
223+
"Single estimate failed. Run in full mode for estimates");
225224

226-
out << 0 << '\t' << 0 << '\n';
227-
for (std::size_t i = 0; i < std::size(coverage_estimates); ++i)
228-
out << (i + 1) * base_step_size << '\t'
229-
<< coverage_estimates[i] * bin_size << '\n';
225+
write_output(outfile, bin_size, base_step_size, coverage_estimates);
230226
}
231227
else {
232228
if (verbose)
233229
std::cerr << "[BOOTSTRAPPING HISTOGRAM]\n";
234230

235-
const std::size_t max_iter = 10 * n_bootstraps;
231+
const std::uint32_t max_iter = max_iter_per_bootstrap * n_bootstraps;
236232

237233
std::vector<std::vector<double>> bootstrap_estimates;
238234
extrap_bootstrap(verbose, allow_defects, seed, coverage_hist,
@@ -261,4 +257,4 @@ gc_extrap::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
261257
return EXIT_SUCCESS;
262258
}
263259

264-
// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions)
260+
// NOLINTEND(*-narrowing-conversions)

0 commit comments

Comments
 (0)