Skip to content

Commit cfc1d8d

Browse files
ppavlidisarteymix
authored andcommitted
add support for missing data in eBayes
1 parent 7682d83 commit cfc1d8d

6 files changed

Lines changed: 548 additions & 98 deletions

File tree

src/ubic/basecode/math/linearmodels/LeastSquaresFit.java

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ public LeastSquaresFit( DesignMatrix designMatrix, DoubleMatrix<String, String>
234234
* Preferred interface for weighted least squares fit between two matrices
235235
*
236236
* @param designMatrix
237-
* @param data
237+
* @param b the data
238238
* @param weights to be used in modifying the influence of the observations in vectorB.
239239
*/
240240
public LeastSquaresFit( DesignMatrix designMatrix, DoubleMatrix2D b, final DoubleMatrix2D weights ) {
@@ -338,7 +338,7 @@ public LeastSquaresFit( DoubleMatrix2D A, DoubleMatrix2D b, final DoubleMatrix2D
338338
}
339339

340340
/**
341-
* @param sample information that will be converted to a design matrix; intercept term is added.
341+
* @param sampleInfo information that will be converted to a design matrix; intercept term is added.
342342
* @param data Data matrix
343343
*/
344344
public LeastSquaresFit( ObjectMatrix<String, String, Object> sampleInfo, DenseDoubleMatrix2D data ) {
@@ -378,7 +378,7 @@ public LeastSquaresFit( ObjectMatrix<String, String, Object> sampleInfo, DenseDo
378378
/**
379379
* NamedMatrix allows easier handling of the results.
380380
*
381-
* @param sample information that will be converted to a design matrix; intercept term is added.
381+
* @param design information that will be converted to a design matrix; intercept term is added.
382382
* @param b Data matrix
383383
*/
384384
public LeastSquaresFit( ObjectMatrix<String, String, Object> design, DoubleMatrix<String, String> b ) {
@@ -396,7 +396,7 @@ public LeastSquaresFit( ObjectMatrix<String, String, Object> design, DoubleMatri
396396
/**
397397
* NamedMatrix allows easier handling of the results.
398398
*
399-
* @param sample information that will be converted to a design matrix; intercept term is added.
399+
* @param design information that will be converted to a design matrix; intercept term is added.
400400
* @param data Data matrix
401401
*/
402402
public LeastSquaresFit( ObjectMatrix<String, String, Object> design, DoubleMatrix<String, String> data,
@@ -756,9 +756,9 @@ protected List<GenericAnovaResult> anova() {
756756
/**
757757
* Provide results of limma eBayes algorithm. These will be used next time summarize is called on this.
758758
*
759-
* @param dfPrior
760-
* @param varPrior
761-
* @param varPost
759+
* @param d dfPrior
760+
* @param v varPrior
761+
* @param vp varPost
762762
*/
763763
protected void ebayesUpdate( double d, double v, DoubleMatrix1D vp ) {
764764
this.dfPrior = d;

src/ubic/basecode/math/linearmodels/ModeratedTstat.java

Lines changed: 175 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
/*
22
* The baseCode project
3-
*
3+
*
44
* Copyright (c) 2017 University of British Columbia
5-
*
5+
*
66
* Licensed under the Apache License, Version 2.0 (the "License");
77
* you may not use this file except in compliance with the License.
88
* You may obtain a copy of the License at
@@ -21,6 +21,9 @@
2121

2222
import java.util.List;
2323

24+
import cern.colt.function.DoubleFunction;
25+
import cern.colt.list.BooleanArrayList;
26+
import cern.colt.list.DoubleArrayList;
2427
import org.apache.commons.math3.special.Gamma;
2528

2629
import cern.colt.matrix.DoubleMatrix1D;
@@ -34,120 +37,238 @@
3437
* Smyth, G. K. (2004). Linear models and empirical Bayes methods for assessing differential expression in microarray
3538
* experiments. Statistical Applications in Genetics and Molecular Biology Volume 3, Issue 1, Article 3.
3639
* <p>
37-
* Important: does not handle missing values in the original data.
38-
*
40+
* R code snippets in comments are from squeezeVar.R in the limma source code.
41+
*
3942
* @author paul
4043
*/
4144
public class ModeratedTstat {
4245

46+
private final static DoubleFunction digamma = new DoubleFunction() {
47+
@Override
48+
public double apply(double v) {
49+
return Gamma.digamma(v);
50+
}
51+
};
52+
private final static DoubleFunction trigammainverse = new DoubleFunction() {
53+
@Override
54+
public double apply(double v) {
55+
return SpecFunc.trigammaInverse(v);
56+
}
57+
};
58+
private final static DoubleFunction trigamma = new DoubleFunction() {
59+
@Override
60+
public double apply(double v) {
61+
return Gamma.trigamma(v);
62+
}
63+
};
64+
public static final double TOOSMALL = Math.pow(10, -15);
65+
4366
/**
44-
* Does essentially the same thing as limma::ebayes. However: does not handle missing values; it assumes a single
45-
* value of residual degrees of freedom.
46-
*
67+
* Does essentially the same thing as limma::ebayes
68+
*
4769
* @param fit which will be modified
4870
*/
49-
public static void ebayes( LeastSquaresFit fit ) {
50-
51-
if ( fit.isHasMissing() ) {
52-
throw new UnsupportedOperationException( "Ebayes not supported yet for data with missing values" );
53-
}
54-
71+
public static void ebayes(LeastSquaresFit fit) {
5572
List<LinearModelSummary> summaries = fit.summarize();
56-
DoubleMatrix1D sigmas = new DenseDoubleMatrix1D( new double[summaries.size()] );
73+
DoubleMatrix1D sigmas = new DenseDoubleMatrix1D(new double[summaries.size()]);
74+
DoubleMatrix1D dofs = new DenseDoubleMatrix1D(new double[summaries.size()]);
5775
int i = 0;
58-
Integer dof = 0;
59-
// corner case can get nulls, example: GSE10778
60-
for ( LinearModelSummary lms : summaries ) {
76+
// corner case can get nulls, example: GSE10778
77+
for (LinearModelSummary lms : summaries) {
6178
assert lms.getSigma() != null;
62-
63-
sigmas.set( i, lms.getSigma() );
79+
sigmas.set(i, lms.getSigma());
6480
Integer residualDof = lms.getResidualDof();
65-
dof = residualDof;
81+
Integer dof = residualDof;
82+
dofs.set(i, dof); // collect vector of dofs instead of assuming fixed value.
6683
i++;
6784

6885
}
86+
squeezeVar(sigmas.copy().assign(Functions.square), dofs, fit);
87+
}
88+
89+
/**
90+
* defining 'ok' as non-missing and non-infinite and not very close to zero as per limma implementation
91+
*
92+
* @param x
93+
* @return
94+
*/
95+
private static final BooleanArrayList ok(DoubleMatrix1D x) {
96+
assert x.size() > 0;
97+
BooleanArrayList answer = new BooleanArrayList(x.size());
6998

70-
// need to handle a vector of dofs, really (missing values). or at least check it's constant
71-
squeezeVar( sigmas.copy().assign( Functions.square ), dof, fit );
99+
for (int i = 0; i < x.size(); i++) {
100+
double a = x.getQuick(i);
101+
answer.add(!(Double.isNaN(a) || Double.isInfinite(a) || a < TOOSMALL));
102+
}
103+
return answer;
72104
}
73105

106+
107+
private static final BooleanArrayList conjunction(BooleanArrayList a, BooleanArrayList b) {
108+
assert a.size() == b.size();
109+
BooleanArrayList answer = new BooleanArrayList();
110+
for (int i = 0; i < a.size(); i++) {
111+
answer.add(a.get(i) && b.get(i));
112+
}
113+
return answer;
114+
}
115+
116+
117+
private static final DoubleMatrix1D stripNonOK(DoubleMatrix1D x, BooleanArrayList ok) {
118+
119+
assert ok.size() == x.size();
120+
121+
DoubleArrayList okvals = new DoubleArrayList();
122+
for (int i = 0; i < x.size(); i++) {
123+
if (ok.get(i)) {
124+
okvals.add(x.get(i));
125+
}
126+
}
127+
DoubleMatrix1D answer = new DenseDoubleMatrix1D(okvals.size());
128+
129+
for (int i = 0; i < answer.size(); i++) {
130+
answer.set(i, okvals.get(i));
131+
}
132+
return answer;
133+
}
134+
135+
74136
/*
75137
* Return the scale and df2
76-
* TODO deal with missing/bad values.
77-
* TODO support covariate?
78138
*/
79-
protected static double[] fitFDist( DoubleMatrix1D x, double df1 ) {
139+
protected static double[] fitFDist(final DoubleMatrix1D vars, final DoubleMatrix1D df1s) {
140+
141+
BooleanArrayList ok = conjunction(ok(vars), ok(df1s));
142+
143+
DoubleMatrix1D x = stripNonOK(vars, ok);
144+
DoubleMatrix1D df1 = stripNonOK(df1s, ok);
145+
146+
if (x.size() == 0) {
147+
throw new IllegalStateException("There were no valid values of variance to perform eBayes parameter estimation");
148+
}
149+
150+
// stay away from zero values of variance
151+
x = x.assign(Functions.max(1e-5));
152+
153+
// z <- log(x)
154+
DoubleMatrix1D z = x.copy().assign(Functions.log);
155+
156+
// e <- z-digamma(df1/2)+log(df1/2)
157+
DoubleMatrix1D e1 = df1.copy().assign(Functions.div(2.0)).assign(digamma);
158+
DoubleMatrix1D e2 = df1.copy().assign(Functions.div(2.0)).assign(Functions.log);
159+
DoubleMatrix1D e = z.copy().assign(e1, Functions.minus).assign(e2, Functions.plus);
160+
161+
int n = x.size();
162+
163+
164+
if (n < 2) {
165+
throw new IllegalStateException("Too few valid variance values to do eBayes parameter estimation (require at least 2)");
166+
}
167+
168+
// emean <- mean(e)
169+
double emean = e.zSum() / n;
170+
// evar <- sum((e-emean)^2)/(n-1)
171+
double evar = e.copy().assign(Functions.minus(emean)).aggregate(Functions.plus, Functions.square)
172+
/ (n - 1);
173+
174+
// evar <- evar - mean(trigamma(df1/2))
175+
evar = evar - df1.copy().assign(Functions.div(2.0)).assign(trigamma).zSum() / df1.size();
176+
double df2;
177+
double s20;
178+
if (evar > 0.0) {
179+
// df2 <- 2*trigammaInverse(evar)
180+
df2 = 2 * SpecFunc.trigammaInverse(evar);
181+
182+
// s20 <- exp(emean+digamma(df2/2)-log(df2/2))
183+
s20 = Math.exp(emean + Gamma.digamma(df2 / 2.0) - Math.log(df2 / 2.0));
184+
185+
} else {
186+
df2 = Double.POSITIVE_INFINITY;
187+
// s20 <- mean(x)
188+
s20 = x.copy().aggregate(Functions.plus, Functions.identity) / x.size();
189+
}
190+
191+
return new double[]{s20, df2};
192+
}
193+
194+
195+
/*
196+
* Return the scale and df2. Original implementation, does not handle missing values, kept here for posterity/comparison/debugging.
197+
*/
198+
protected static double[] fitFDistNoMissing(DoubleMatrix1D x, double df1) {
80199

81200
// stay away from zero valuess
82-
x = x.copy().assign( Functions.max( 1e-5 ) );
201+
x = x.copy().assign(Functions.max(1e-5));
83202
// z <- log(x)
84203
// e <- z-digamma(df1/2)+log(df1/2)
85-
DoubleMatrix1D e = x.copy().assign( Functions.log ).assign( Functions.minus( Gamma.digamma( df1 / 2.0 ) ) )
86-
.assign( Functions.plus( Math.log( df1 / 2.0 ) ) );
204+
DoubleMatrix1D e = x.copy().assign(Functions.log).assign(Functions.minus(Gamma.digamma(df1 / 2.0)))
205+
.assign(Functions.plus(Math.log(df1 / 2.0)));
87206

88207
int nok = x.size(); // number of oks - need to implement checks
89208
// emean <- mean(e)
90-
double emean = e.copy().aggregate( Functions.plus, Functions.identity ) / nok;
209+
double emean = e.copy().zSum() / nok;
91210
// evar <- sum((e-emean)^2)/(nok-1L)
92-
double evar = e.copy().assign( Functions.minus( emean ) ).aggregate( Functions.plus, Functions.square )
93-
/ ( nok - 1 );
211+
double evar = e.copy().assign(Functions.minus(emean)).aggregate(Functions.plus, Functions.square)
212+
/ (nok - 1);
94213

95-
evar = evar - Gamma.trigamma( df1 / 2.0 );
214+
evar = evar - Gamma.trigamma(df1 / 2.0);
96215

97216
double df2;
98217
double s20;
99-
if ( evar > 0 ) {
218+
if (evar > 0) {
100219
// df2 <- 2*trigammaInverse(evar)
101-
df2 = 2 * SpecFunc.trigammaInverse( evar );
220+
df2 = 2 * SpecFunc.trigammaInverse(evar);
102221

103222
// s20 <- exp(emean+digamma(df2/2)-log(df2/2))
104-
s20 = Math.exp( emean + Gamma.digamma( df2 / 2.0 ) - Math.log( df2 / 2.0 ) );
223+
s20 = Math.exp(emean + Gamma.digamma(df2 / 2.0) - Math.log(df2 / 2.0));
105224

106225
} else {
107226
df2 = Double.POSITIVE_INFINITY;
108227
// s20 <- mean(x)
109-
s20 = x.copy().aggregate( Functions.plus, Functions.identity ) / x.size();
228+
s20 = x.copy().zSum() / x.size();
110229
}
111230

112-
return new double[] { s20, df2 };
231+
return new double[]{s20, df2};
113232
}
114233

115234
/**
116235
* Ignoring robust and covariate for now
117-
*
236+
*
118237
* @param var initial values of estimated residual variance = sigma^2 = rssq/rdof; this will be moderated
119-
* @param df
238+
* @param df vector of degrees of freedom
120239
* @param fit will be updated with new info; call fit.summarize() to get updated pvalues etc.
121240
* @return varPost for testing mostly
122241
*/
123-
protected static DoubleMatrix1D squeezeVar( DoubleMatrix1D var, double df, LeastSquaresFit fit ) {
242+
protected static DoubleMatrix1D squeezeVar(DoubleMatrix1D var, DoubleMatrix1D df, LeastSquaresFit fit) {
124243

125-
double[] ffit = fitFDist( var, df );
244+
double[] ffit = fitFDist(var, df);
126245
double dfPrior = ffit[1];
127246

128-
DoubleMatrix1D varPost = squeezeVar( var, df, ffit );
247+
DoubleMatrix1D varPost = squeezeVar(var, df, ffit);
129248

130-
if ( fit != null )
131-
fit.ebayesUpdate( dfPrior, ffit[0], varPost );
249+
if (fit != null)
250+
fit.ebayesUpdate(dfPrior, ffit[0], varPost);
132251

133252
return varPost;
134253
}
135254

136255
/**
137-
* @param var
138-
* @param df should be a vector of dfs but not sure if necessary.
139-
* @param fit
140-
* @return
256+
* @param var vector of estimated residual variances
257+
* @param df vector of dfs
258+
* @param fit result of fitFDist()
259+
* @return vector of squeezed variances
141260
*/
142-
private static DoubleMatrix1D squeezeVar( DoubleMatrix1D var, double df, double[] fit ) {
261+
private static DoubleMatrix1D squeezeVar(DoubleMatrix1D var, DoubleMatrix1D df, double[] fit) {
143262
double varPrior = fit[0];
144263
double dfPrior = fit[1];
145264

146-
if ( Double.isInfinite( df ) ) {
147-
throw new IllegalStateException( "not implemented case of infinite dof" );
148-
}
149-
return var.copy().assign( Functions.mult( df ) )
150-
.assign( Functions.plus( dfPrior * varPrior ) ).assign( Functions.div( df + dfPrior ) );
265+
266+
// out$var.post <- (df*var + out$df.prior*out$var.prior) / df.total
267+
268+
DoubleMatrix1D dfTotal = df.copy().assign(Functions.plus(dfPrior));
269+
270+
return var.copy().assign(df, Functions.mult)
271+
.assign(Functions.plus(dfPrior * varPrior)).assign(dfTotal, Functions.div);
151272

152273
}
153274

0 commit comments

Comments
 (0)