Skip to content

Commit a6e7a1c

Browse files
src/pop_size.cpp: refactoring how input is read
1 parent 3884690 commit a6e7a1c

1 file changed

Lines changed: 71 additions & 108 deletions

File tree

src/pop_size.cpp

Lines changed: 71 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -36,52 +36,37 @@
3636
#include <string>
3737
#include <vector>
3838

39-
using std::cbegin;
40-
using std::cend;
41-
using std::count_if;
42-
using std::min;
43-
using std::runtime_error;
44-
using std::string;
45-
using std::to_string;
46-
using std::uint32_t;
47-
using std::vector;
48-
49-
// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions)
50-
5139
auto
5240
pop_size::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
5341
try {
54-
static const std::size_t min_required_counts = 4;
55-
static const string min_required_counts_error_message =
56-
"max count before zero is less than min required count (" +
57-
to_string(min_required_counts) + ") duplicates removed";
42+
static constexpr auto max_iter_per_bootstrap = 100;
43+
static constexpr auto default_max_extrap = 1000000000ul;
44+
static constexpr auto min_required_counts = 4ul;
45+
static constexpr auto min_required_counts_error_message =
46+
"max count before zero is less than min required count (4)";
5847

59-
string outfile;
60-
string input_file_name;
61-
string histogram_outfile;
48+
std::string outfile;
49+
std::string input_file_name;
50+
std::string histogram_outfile;
6251

52+
// NOLINTBEGIN(*-avoid-magic-numbers)
6353
std::size_t orig_max_terms = 100;
64-
double max_extrap = 0.0;
65-
double step_size = 0.0;
54+
double max_extrap{};
6655
std::size_t n_desired_steps = 50;
6756
std::size_t n_bootstraps = 100;
6857
int diagonal = 0;
6958
double c_level = 0.95;
70-
uint32_t seed = 408;
59+
std::uint32_t seed = 408;
60+
// NOLINTEND(*-avoid-magic-numbers)
7161

72-
/* FLAGS */
73-
bool verbose = false;
74-
bool VALS_INPUT = false;
75-
bool PAIRED_END = false;
76-
bool HIST_INPUT = false;
77-
bool SINGLE_ESTIMATE = false;
78-
bool allow_defects = false;
62+
// flags
63+
bool verbose{false};
64+
bool paired_end{false};
65+
bool single_estimate{false};
66+
bool allow_defects{false};
67+
68+
std::uint32_t n_threads{1};
7969

80-
#ifdef HAVE_HTSLIB
81-
bool BAM_FORMAT_INPUT = false;
82-
std::size_t MAX_SEGMENT_LENGTH = 5000;
83-
uint32_t n_threads{1};
84-
#endif
8570
CLI::App app{rlstrip(pop_size::about_msg)};
8671
argv = app.ensure_utf8(argv);
8772
app.usage("\nUsage: preseq pop_size [OPTIONS]");
@@ -93,26 +78,18 @@ pop_size::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
9378
->option_text("FILE")
9479
->required()
9580
->check(CLI::ExistingFile);
96-
app.add_option("-o,--output", outfile, "yield output file default: stdout");
81+
app.add_option("-o,--output", outfile, "output file")
82+
->option_text("FILE");
9783
app.add_option("-e,--extrap", max_extrap, "maximum extrapolation");
9884
app.add_option("-s,--steps", n_desired_steps, "number of steps");
9985
app.add_option("-n,--boots", n_bootstraps, "number of bootstraps");
100-
app.add_option("-c,--cval", c_level, "level for confidence intervals");
86+
app.add_option("-c,--ci-level", c_level, "level for confidence intervals");
10187
app.add_option("-x,--terms", orig_max_terms, "maximum terms in estimator");
102-
#ifdef HAVE_HTSLIB
103-
app.add_flag("-B,--bam", BAM_FORMAT_INPUT, "input is in BAM format");
104-
app.add_option("-l,--seg_len", MAX_SEGMENT_LENGTH,
105-
"maximum segment length when merging paired end bam reads");
106-
#endif
107-
app.add_flag("-P,--pe", PAIRED_END, "input is paired end read file");
108-
app.add_flag("-V,--vals", VALS_INPUT,
109-
"input is a text file containing only the observed counts");
110-
app.add_flag("-H,--hist", HIST_INPUT,
111-
"input is a text file containing the observed histogram");
112-
app.add_flag("-Q,--quick", SINGLE_ESTIMATE,
88+
app.add_option("-r,--seed", seed, "seed for random number generator");
89+
app.add_flag("-p,--paired-end", paired_end, "input is paired end read file");
90+
app.add_flag("-Q,--quick", single_estimate,
11391
"quick mode (no bootstraps) for confidence intervals");
11492
app.add_flag("-D,--defects", allow_defects, "no testing for defects");
115-
app.add_option("-r,--seed", seed, "seed for random number generator");
11693
app.add_flag("-v,--verbose", verbose, "print more info");
11794
// clang-format on
11895

@@ -123,38 +100,32 @@ pop_size::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
123100
}
124101
CLI11_PARSE(app, argc, argv);
125102

103+
const auto input_format = get_input_format_type(input_file_name);
104+
if (is_unknown(input_format)) {
105+
std::cerr << "unknown input format\n";
106+
return EXIT_FAILURE;
107+
}
108+
109+
if (verbose) {
110+
std::cerr << "INPUT FORMAT: " << to_string(input_format) << '\n';
111+
if (is_bam(input_format) || is_bed(input_format))
112+
std::cerr << "PAIRED END: " << std::boolalpha << paired_end << '\n';
113+
}
114+
126115
const auto [n_reads, counts_hist] = [&] {
127-
if (HIST_INPUT) {
128-
if (verbose)
129-
std::cerr << "HIST_INPUT\n";
116+
switch (input_format) {
117+
case input_format_type::hist:
130118
return load_histogram(input_file_name);
131-
}
132-
else if (VALS_INPUT) {
133-
if (verbose)
134-
std::cerr << "VALS_INPUT\n";
119+
case input_format_type::counts:
135120
return load_counts(input_file_name);
136-
}
137121
#ifdef HAVE_HTSLIB
138-
else if (BAM_FORMAT_INPUT && PAIRED_END) {
139-
if (verbose)
140-
std::cerr << "PAIRED_END_BAM_INPUT\n";
141-
return load_counts_BAM_pe(n_threads, input_file_name);
142-
}
143-
else if (BAM_FORMAT_INPUT) {
144-
if (verbose)
145-
std::cerr << "BAM_INPUT\n";
146-
return load_counts_BAM_se(n_threads, input_file_name);
147-
}
122+
case input_format_type::bam:
123+
return paired_end ? load_counts_BAM_pe(n_threads, input_file_name)
124+
: load_counts_BAM_se(n_threads, input_file_name);
148125
#endif
149-
else if (PAIRED_END) {
150-
if (verbose)
151-
std::cerr << "PAIRED_END_BED_INPUT\n";
152-
return load_counts_bed_pe(input_file_name);
153-
}
154-
else { // default is single end bed file
155-
if (verbose)
156-
std::cerr << "BED_INPUT\n";
157-
return load_counts_bed_se(input_file_name);
126+
default: // case input_format_type::vals:
127+
return paired_end ? load_counts_bed_pe(input_file_name)
128+
: load_counts_bed_se(input_file_name);
158129
}
159130
}();
160131

@@ -167,13 +138,14 @@ pop_size::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
167138
while (first_zero < std::size(counts_hist) && counts_hist[first_zero] > 0)
168139
++first_zero;
169140

170-
orig_max_terms = min(orig_max_terms, first_zero - 1);
141+
orig_max_terms = std::min(orig_max_terms, first_zero - 1);
171142
orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1);
172143

173-
if (max_extrap < 1.0)
174-
max_extrap = 1000000000 * distinct_reads;
175-
if (step_size < 1.0)
176-
step_size = (max_extrap - distinct_reads) / n_desired_steps;
144+
if (max_extrap == 0.0)
145+
max_extrap = default_max_extrap * distinct_reads;
146+
147+
const auto step_size =
148+
(max_extrap - distinct_reads) / static_cast<double>(n_desired_steps);
177149

178150
const std::size_t distinct_counts =
179151
std::count_if(std::cbegin(counts_hist), std::cend(counts_hist),
@@ -193,59 +165,52 @@ pop_size::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
193165
// check to make sure library is not overly saturated
194166
const double two_fold_extrap = GoodToulmin2xExtrap(counts_hist);
195167
if (two_fold_extrap < 0.0)
196-
throw runtime_error("Saturation expected at double initial sample size."
197-
" Unable to extrapolate");
198-
199-
// const std::size_t total_reads = get_counts_from_hist(counts_hist);
200-
201-
// assert(total_reads == n_reads); // ADS: why commented out?
168+
throw std::runtime_error(
169+
"Saturation expected at double initial sample size."
170+
" Unable to extrapolate");
202171

203172
// check that min required count is satisfied
204173
if (orig_max_terms < min_required_counts)
205-
throw runtime_error(min_required_counts_error_message);
174+
throw std::runtime_error(min_required_counts_error_message);
206175

207176
if (verbose)
208177
std::cerr << "[ESTIMATING YIELD CURVE]\n";
209178

210-
vector<double> yield_estimates;
179+
std::vector<double> yield_estimates;
211180

212-
if (SINGLE_ESTIMATE) {
213-
const bool single_estimate_success = extrap_single_estimate(
181+
if (single_estimate) {
182+
const bool success = extrap_single_estimate(
214183
verbose, allow_defects, counts_hist, orig_max_terms, diagonal,
215184
step_size, max_extrap, yield_estimates);
216-
// IF FAILURE, EXIT
217-
if (!single_estimate_success)
218-
throw runtime_error("single estimate failed, run "
219-
"full mode for estimates");
185+
if (!success)
186+
throw std::runtime_error(
187+
"single estimate failed, run full mode for estimates");
220188

221189
std::ofstream of;
222190
if (!outfile.empty())
223-
of.open(outfile.c_str());
191+
of.open(outfile.data());
224192
std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
225193

226194
out << "TOTAL_READS\tEXPECTED_DISTINCT\n";
227-
out.setf(std::ios_base::fixed, std::ios_base::floatfield);
228-
out.precision(1);
229-
230195
out << 0 << '\t' << 0 << '\n';
231-
for (std::size_t i = 0; i < std::size(yield_estimates); ++i)
196+
for (auto i = 0u; i < std::size(yield_estimates); ++i)
232197
out << (i + 1) * step_size << '\t' << yield_estimates[i] << '\n';
233198
}
234199
else {
235200
if (verbose)
236201
std::cerr << "[BOOTSTRAPPING HISTOGRAM]\n";
237202

238-
const std::size_t max_iter = 100 * n_bootstraps;
203+
const std::size_t max_iter = max_iter_per_bootstrap * n_bootstraps;
239204

240-
vector<vector<double>> bootstrap_estimates;
205+
std::vector<std::vector<double>> bootstrap_estimates;
241206
extrap_bootstrap(verbose, allow_defects, seed, counts_hist, n_bootstraps,
242207
orig_max_terms, diagonal, step_size, max_extrap,
243208
max_iter, bootstrap_estimates);
244209

245210
if (verbose)
246211
std::cerr << "[COMPUTING CONFIDENCE INTERVALS]\n";
247212
// yield ci
248-
vector<double> yield_upper_ci_lognorm, yield_lower_ci_lognorm;
213+
std::vector<double> yield_upper_ci_lognorm, yield_lower_ci_lognorm;
249214

250215
vector_median_and_ci(bootstrap_estimates, c_level, yield_estimates,
251216
yield_lower_ci_lognorm, yield_upper_ci_lognorm);
@@ -256,13 +221,13 @@ pop_size::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
256221
if (!outfile.empty())
257222
of.open(outfile);
258223
std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
259-
260-
out.setf(std::ios_base::fixed, std::ios_base::floatfield);
261-
out.precision(1);
224+
if (!outfile.empty() && !out)
225+
throw std::runtime_error("failed to open outfile file: " + outfile);
262226

263227
const std::size_t n_ests = std::size(yield_estimates) - 1;
264228
if (n_ests < 2)
265-
throw runtime_error("problem with number of estimates in pop_size");
229+
throw std::runtime_error(
230+
"problem with number of estimates in pop_size");
266231

267232
const bool converged =
268233
(yield_estimates[n_ests] - yield_estimates[n_ests - 1] < 1.0);
@@ -282,5 +247,3 @@ pop_size::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
282247
}
283248
return EXIT_SUCCESS;
284249
}
285-
286-
// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions)

0 commit comments

Comments
 (0)