Skip to content

Commit e115b55

Browse files
author
sena
committed
implemented k-mer content, no p-values yet
1 parent 40e437e commit e115b55

7 files changed

Lines changed: 84 additions & 27 deletions

File tree

src/FalcoConfig.cpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -826,7 +826,7 @@ const string FalcoConfig::html_template =
826826
" {{adaptercontentce}}"
827827
""
828828
" {{kmercontentcs}}"
829-
" <li><a class=\"{{passkmercontent}}\" href=\"#adaptercontent\">{{kmercontentname}}</a></li>"
829+
" <li><a class=\"{{passkmercontent}}\" href=\"#kmercontent\">{{kmercontentname}}</a></li>"
830830
" {{kmercontentce}}"
831831
""
832832
""
@@ -932,10 +932,10 @@ const string FalcoConfig::html_template =
932932
""
933933
"{{kmercontentcs}}"
934934
"<div class=\"module\">"
935-
" <h2 class=\"{{passkmercontent}}\" id=\"adaptercontent\">"
935+
" <h2 class=\"{{passkmercontent}}\" id=\"kmercontent\">"
936936
" {{kmercontentname}} : {{passkmercontent}}"
937937
" </h2>"
938-
" {{kmercontentdata}}"
938+
" <div id=\"kmerlineplot\"></div>"
939939
"</div>"
940940
"{{kmercontentce}}"
941941
""
@@ -1052,5 +1052,16 @@ const string FalcoConfig::html_template =
10521052
" yaxis : {title : '% sequences with adapter before position'}"
10531053
" } );"
10541054
"}"
1055+
"if (document.getElementById('kmerlineplot') !== null) {"
1056+
" Plotly.newPlot('kmerlineplot', ["
1057+
" {{kmercontentdata}}"
1058+
" ], {"
1059+
" margin: { t: 0 }, "
1060+
" showlegend: true,"
1061+
" xaxis : {title : 'Base position'},"
1062+
" yaxis : {title : 'log2(obs/ exp max)'}"
1063+
" } );"
1064+
"}"
1065+
10551066
"</script>"
10561067
"</html>";

src/FalcoConfig.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,10 +166,10 @@ namespace Constants {
166166
const size_t bit_shift_adapter = log2exact(max_adapters);
167167

168168
// we shift 14 bits when reading a kmer, two bits per base
169-
const size_t bit_shift_kmer = 2 * Constants::kmer_size;
169+
const size_t bit_shift_kmer = bit_shift_base*kmer_size;
170170

171171
// mask to get only the first 2*k bits of the sliding window
172-
const size_t kmer_mask = (1ll << (2*Constants::kmer_size)) - 1;
172+
const size_t kmer_mask = (1ull << (bit_shift_kmer)) - 1;
173173
};
174174

175175
#endif

src/FastqStats.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ FastqStats::FastqStats() {
6969
position_quality_count.fill(0);
7070
pos_kmer_count.fill(0);
7171
pos_adapter_count.fill(0);
72-
kmer_count = vector<size_t>(kNumBases*(kmer_mask + 1), 0);
72+
kmer_count = vector<size_t>(kNumBases*(Constants::kmer_mask + 1), 0);
7373
}
7474

7575
// Initialize as many gc models as fast bases

src/FastqStats.hpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -145,13 +145,6 @@ struct FastqStats {
145145
static const size_t kBitShiftNucleotide = log2exact(kNumNucleotides);
146146
static const size_t kBitShiftQuality = log2exact(kNumQualityValues);
147147

148-
/************ KMER CONSTANTS **********/
149-
150-
// we shift 14 bits when reading a kmer, two bits per base
151-
static const size_t kBitShiftKmer = 2 * Constants::kmer_size;
152-
153-
// mask to get only the first 2*k bits of the sliding window
154-
static const size_t kmer_mask = (1ll << (2*Constants::kmer_size)) - 1;
155148

156149
/************ ADAPTER CONSTANTS **********/
157150
// bit shift for adapters, log(100) = 7

src/Module.cpp

Lines changed: 62 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1985,7 +1985,7 @@ ModuleKmerContent::summarize_module(FastqStats &stats) {
19851985
for (size_t kmer = 0; kmer < num_kmers; ++kmer) {
19861986
for (size_t i = kmer_size - 1; i < num_kmer_bases; ++i) {
19871987
observed_count = stats.kmer_count[
1988-
(i << FastqStats::kBitShiftKmer) | kmer
1988+
(i << Constants::bit_shift_kmer) | kmer
19891989
];
19901990
total_kmer_counts[kmer] += observed_count;
19911991
}
@@ -1995,40 +1995,51 @@ ModuleKmerContent::summarize_module(FastqStats &stats) {
19951995
double dividend = static_cast<double>(num_seen_kmers);
19961996
for (size_t kmer = 0; kmer < num_kmers; ++kmer) {
19971997
for (size_t i = kmer_size - 1; i < num_kmer_bases; ++i) {
1998-
observed_count = stats.kmer_count[
1999-
(i << FastqStats::kBitShiftKmer) | kmer
2000-
];
1998+
observed_count =
1999+
stats.kmer_count[(i << Constants::bit_shift_kmer) | kmer];
2000+
20012001
expected_count = pos_kmer_count[i] / dividend;
20022002
obs_exp_ratio = observed_count / expected_count;
20032003

20042004
if (i == 0 || obs_exp_ratio > obs_exp_max[kmer]) {
20052005
obs_exp_max[kmer] = obs_exp_ratio;
2006-
where_obs_exp_is_max[kmer] = i;
2006+
where_obs_exp_is_max[kmer] = i + 1 - kmer_size;
20072007
}
20082008
}
20092009

2010-
if (obs_exp_max[kmer] > 5) {
2010+
if (obs_exp_max[kmer] > MIN_OBS_EXP_TO_REPORT) {
20112011
kmers_to_report.push_back(make_pair(kmer, obs_exp_max[kmer]));
20122012
}
20132013
}
20142014

20152015
sort (begin(kmers_to_report), end(kmers_to_report),
2016-
[](pair<size_t, double> &a, pair<size_t, double> &b) {
2016+
[](const pair<size_t, double> &a, const pair<size_t, double> &b) {
20172017
return a.second > b.second;
20182018
});
2019-
20202019
}
20212020

20222021
void
20232022
ModuleKmerContent::make_grade() {
2024-
grade = "fail";
2023+
const size_t lim = min(kmers_to_report.size(), MAX_KMERS_TO_REPORT);
2024+
grade = "pass";
2025+
2026+
// the worst kmer is at the top
2027+
if (lim > 0) {
2028+
const size_t kmer = kmers_to_report[0].first;
2029+
const double obs_exp = obs_exp_max[kmer];
2030+
if (obs_exp >= grade_error)
2031+
grade = "fail";
2032+
else if (obs_exp >= grade_warn)
2033+
grade = "warn";
2034+
}
20252035
}
20262036

20272037
void
20282038
ModuleKmerContent::write_module(ostream &os) {
20292039
os << "#Sequence\tCount\tPValue\tObs/Exp Max\tMax Obs/Exp Position\n";
2030-
for (size_t i = 0; i < 20 && i < kmers_to_report.size(); ++i) {
2031-
size_t kmer = kmers_to_report[i].first;
2040+
const size_t lim = min(kmers_to_report.size(), MAX_KMERS_TO_REPORT);
2041+
for (size_t i = 0; i < lim; ++i) {
2042+
const size_t kmer = kmers_to_report[i].first;
20322043
os << size_t_to_seq(kmer, kmer_size) << "\t"
20332044
<< total_kmer_counts[kmer] << "\t"
20342045
<< "0.0" << "\t"
@@ -2039,5 +2050,44 @@ ModuleKmerContent::write_module(ostream &os) {
20392050

20402051
// Builds the comma-separated list of Plotly trace objects that is
// substituted into the k-mer content plot of the HTML report. One
// trace is emitted for each of the first MAX_KMERS_TO_PLOT entries of
// kmers_to_report. Every trace shares the same x axis (1-based read
// positions) and is zero everywhere except at the position where the
// k-mer's obs/exp ratio is maximal, where it takes the value
// log2(obs/exp max). Returns an empty string if there is nothing to
// plot.
string
ModuleKmerContent::make_html_data() {
  // copy the class constant to a local: min() takes its arguments by
  // reference, which would otherwise odr-use the in-class static member
  const size_t max_to_plot = MAX_KMERS_TO_PLOT;
  const size_t lim = min(kmers_to_report.size(), max_to_plot);

  // number of positions to plot: one past the largest (0-based)
  // position at which a plotted k-mer peaks, so that every peak,
  // including the right-most one, falls inside the plotted range
  size_t xlim = 0;
  for (size_t i = 0; i < lim; ++i)
    xlim = max(xlim, where_obs_exp_is_max[kmers_to_report[i].first] + 1);

  // X values : read position. Identical for every trace, so the
  // string is built only once
  ostringstream x_values;
  for (size_t j = 0; j < xlim; ++j) {
    if (j > 0)
      x_values << ",";
    x_values << j + 1;
  }
  const string x_str = x_values.str();

  ostringstream data;
  for (size_t i = 0; i < lim; ++i) {
    const size_t kmer = kmers_to_report[i].first;
    const size_t peak_pos = where_obs_exp_is_max[kmer];
    const double log_obs_exp = log(kmers_to_report[i].second)/log(2);

    // traces are separated by commas
    if (i > 0)
      data << ",";
    data << "{";
    data << "x : [" << x_str << "]";

    // Y values : A peak wherever the k-mer is seen the most
    data << ", y : [";
    for (size_t j = 0; j < xlim; ++j) {
      // the separator depends on the position index j, not on the
      // index of the k-mer being written
      if (j > 0)
        data << ",";
      data << ((j == peak_pos) ? log_obs_exp : 0.0);
    }
    data << "]";

    data << ", type : 'line', ";
    data << "name : \"" << size_t_to_seq(kmer, Constants::kmer_size) << "\"}";
  }
  return data.str();
}

src/Module.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,9 @@ class ModuleKmerContent : public Module {
377377
std::vector<std::pair<size_t, double>> kmers_to_report;
378378
public:
379379
static const std::string module_name;
380+
static const size_t MIN_OBS_EXP_TO_REPORT = 5;
381+
static const size_t MAX_KMERS_TO_REPORT = 20;
382+
static const size_t MAX_KMERS_TO_PLOT = 10;
380383
ModuleKmerContent(const FalcoConfig &config);
381384
~ModuleKmerContent(){}
382385
void summarize_module(FastqStats &stats);

src/StreamReader.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -256,10 +256,10 @@ StreamReader::process_sequence_base_from_buffer(FastqStats &stats) {
256256
cur_kmer = ((cur_kmer << Constants::bit_shift_base) | base_ind);
257257

258258
// registers k-mer if seen at least k nucleotides since the last n
259-
if (do_kmer && (num_bases_after_n == Constants::kmer_size)) {
259+
if (do_kmer && (num_bases_after_n >= Constants::kmer_size)) {
260260

261261
stats.kmer_count[(read_pos << Constants::bit_shift_kmer)
262-
| (cur_kmer & Constants::bit_shift_kmer)]++;
262+
| (cur_kmer & Constants::kmer_mask)]++;
263263
stats.pos_kmer_count[read_pos]++;
264264
}
265265

0 commit comments

Comments
 (0)