-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathIMM.H
More file actions
87 lines (78 loc) · 3.5 KB
/
IMM.H
File metadata and controls
87 lines (78 loc) · 3.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
/****************************************************************
IMM.H : Interpolated Markov Models
Copyright (C)2013 William H. Majoros (martiandna@gmail.com).
This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
License (GPL) version 3, as described at www.opensource.org.
****************************************************************/
#ifndef INCL_IMM_H
#define INCL_IMM_H
#include <iosfwd>
#include "ContentSensor.H"
#include "BOOM/Vector.H"
#include "BOOM/StringMap.H"
#include "BOOM/Chi2Table.H"
#include "TrainingSequence.H"
/****************************************************************
class IMM
Implements an Interpolated Markov Model as described in:
Salzberg SL, Pertea M, Delcher AL, Gardner MJ, Tettelin H:
Interpolated Markov models for eukaryotic gene finding.
Genomics 59, 1999:24-31.
There is only one known difference between this implementation and
that described in the paper: the "minSampleSize" parameter given
during parameter estimation is used here to enforce a strict lower
bound on sample size. The paper calls for probabilities based on
sample sizes less than 400 to be interpolated with a lower-order
estimate. This software does that too, but additionally enforces
the rule that any estimate based on a sample size less than
"minSampleSize" is discarded completely, so that a lower-order
estimate must be used instead. That lower-order estimate may in turn
be interpolated with a still lower-order model, if appropriate.
****************************************************************/
class IMM : public ContentSensor
{
BOOM::Vector<BOOM::StringMap<double>*> *models; // indexed by order
BOOM::Vector<BOOM::StringMap<int>*> *counts; // indexed by order
BOOM::Chi2Table chiSquaredTable;
int N, phase;
int alphabetSize;
IMM *revComp;
double chiTest(BOOM::Vector<int>&,BOOM::Vector<int>&);
void revCompSeqs(BOOM::Vector<TrainingSequence*>&,
BOOM::Vector<TrainingSequence*>&);
void buildModels(BOOM::Vector<TrainingSequence*> &,int minSampleSize);
void updateCounts_fw(BOOM::String &,int order,int sequencePhase,
int boostCount);
void updateCounts_rev(BOOM::String &,int order,int seqPhase,
int boostCount);
void undefine_fw(BOOM::String &history,BOOM::StringMap<int> &model);
void undefine_rev(BOOM::String &future,BOOM::StringMap<int> &model);
void computeProbabilities_fw(int minSampleSize);
void computeProbabilities_rev(int minSampleSize);
void load(istream &);
void interpolate_fw();
void interpolate_rev();
public:
IMM(const IMM &);
IMM(const BOOM::String &filename);
IMM(istream &,Strand=FORWARD_STRAND);
IMM(BOOM::Vector<TrainingSequence*> &,int order,int minSampleSize,
int phase,ContentType,Strand strand=EITHER_STRAND);
virtual ~IMM();
virtual double scoreSingleBase(const Sequence &,const BOOM::String &,
int index,Symbol,char);
virtual void scoreSingleBase(const Sequence &,const BOOM::String &,
int index,Symbol,char,double &scorePhase0,
double &scorePhase1,double &scorePhase2);
virtual double scoreSubsequence(const Sequence &,const BOOM::String &,
int begin,int length,int seqPhase);
virtual ContentSensor *reverseComplement();
virtual bool save(const BOOM::String &filename);
virtual bool save(ostream &os);
virtual void useLogOdds(ContentSensor &nullModel);
virtual void useLogOdds_anonymous(ContentSensor &nullModel);
virtual int getOrder() {return N;}
virtual bool isPhased() {return false;}
virtual ContentSensor *compile();
};
#endif