Skip to content

Commit dfdb3ed

Browse files
committed
add method for counting nearly identical values but ignoring NAs
1 parent a09c43b commit dfdb3ed

2 files changed

Lines changed: 156 additions & 86 deletions

File tree

src/ubic/basecode/math/Stats.java

Lines changed: 138 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
/*
22
* The baseCode project
3-
*
4-
* Copyright (c) 2006 University of British Columbia
5-
*
3+
*
4+
* Copyright (c) 2006-2021 University of British Columbia
5+
*
66
* Licensed under the Apache License, Version 2.0 (the "License");
77
* you may not use this file except in compliance with the License.
88
* You may obtain a copy of the License at
@@ -23,49 +23,50 @@
2323

2424
import cern.colt.list.DoubleArrayList;
2525
import cern.jet.stat.Descriptive;
26+
import org.apache.commons.math3.util.DoubleArray;
2627

2728
/**
2829
* Miscellaneous functions used for statistical analysis. Some are optimized or specialized versions of methods that can
2930
* be found elsewhere.
30-
*
31+
*
32+
* @author Paul Pavlidis
3133
* @see <a href="http://hoschek.home.cern.ch/hoschek/colt/V1.0.3/doc/cern/jet/math/package-summary.html">cern.jet.math
32-
* </a>
34+
* </a>
3335
* @see <a href="http://hoschek.home.cern.ch/hoschek/colt/V1.0.3/doc/cern/jet/stat/package-summary.html">cern.jet.stat
34-
* </a>
35-
* @author Paul Pavlidis
36+
* </a>
3637
*/
3738
public class Stats {
3839

3940

4041
/**
4142
* Convert an array into a cumulative density function (CDF). This assumes that the input contains counts
4243
* representing the distribution in question.
43-
*
44+
*
4445
* @param x The input of counts (i.e. a histogram).
4546
* @return DoubleArrayList the CDF.
4647
*/
47-
public static DoubleArrayList cdf( DoubleArrayList x ) {
48-
return cumulateRight( normalize( x ) );
48+
public static DoubleArrayList cdf(DoubleArrayList x) {
49+
return cumulateRight(normalize(x));
4950
}
5051

5152
/**
5253
* Convert an array into a cumulative array. Summing is from the left hand side. Use this to make CDFs where the
5354
* concern is the left tail.
54-
*
55+
*
5556
* @param x DoubleArrayList
5657
* @return cern.colt.list.DoubleArrayList
5758
*/
58-
public static DoubleArrayList cumulate( DoubleArrayList x ) {
59-
if ( x.size() == 0 ) {
60-
return new DoubleArrayList( 0 );
59+
public static DoubleArrayList cumulate(DoubleArrayList x) {
60+
if (x.size() == 0) {
61+
return new DoubleArrayList(0);
6162
}
6263

6364
DoubleArrayList r = new DoubleArrayList();
6465

6566
double sum = 0.0;
66-
for ( int i = 0; i < x.size(); i++ ) {
67-
sum += x.get( i );
68-
r.add( sum );
67+
for (int i = 0; i < x.size(); i++) {
68+
sum += x.get(i);
69+
r.add(sum);
6970
}
7071
return r;
7172
}
@@ -74,21 +75,21 @@ public static DoubleArrayList cumulate( DoubleArrayList x ) {
7475
* Convert an array into a cumulative array. Summing is from the right hand side. This is useful for creating
7576
* upper-tail cumulative density histograms from count histograms, where the upper tail is expected to have very
7677
* small numbers that could be lost to rounding.
77-
*
78+
*
7879
* @param x the array of data to be cumulated.
7980
* @return cern.colt.list.DoubleArrayList
8081
*/
81-
public static DoubleArrayList cumulateRight( DoubleArrayList x ) {
82-
if ( x.size() == 0 ) {
83-
return new DoubleArrayList( 0 );
82+
public static DoubleArrayList cumulateRight(DoubleArrayList x) {
83+
if (x.size() == 0) {
84+
return new DoubleArrayList(0);
8485
}
8586

86-
DoubleArrayList r = new DoubleArrayList( new double[x.size()] );
87+
DoubleArrayList r = new DoubleArrayList(new double[x.size()]);
8788

8889
double sum = 0.0;
89-
for ( int i = x.size() - 1; i >= 0; i-- ) {
90-
sum += x.get( i );
91-
r.set( i, sum );
90+
for (int i = x.size() - 1; i >= 0; i--) {
91+
sum += x.get(i);
92+
r.set(i, sum);
9293
}
9394
return r;
9495
}
@@ -97,33 +98,33 @@ public static DoubleArrayList cumulateRight( DoubleArrayList x ) {
9798
* Compute the coefficient of variation of an array (standard deviation / mean). If the variance is zero, this
9899
* returns zero. If the mean is zero, NaN is returned. If the mean is negative, the CV is computed relative to the
99100
* absolute value of the mean; that is, negative values are treated as magnitudes.
100-
*
101+
*
101102
* @param data DoubleArrayList
102103
* @return the cv
103104
* @todo offer a regularized version of this function.
104105
*/
105-
public static double cv( DoubleArrayList data ) {
106-
double mean = DescriptiveWithMissing.mean( data );
106+
public static double cv(DoubleArrayList data) {
107+
double mean = DescriptiveWithMissing.mean(data);
107108

108-
double sampleVariance = DescriptiveWithMissing.sampleVariance( data, mean );
109+
double sampleVariance = DescriptiveWithMissing.sampleVariance(data, mean);
109110

110-
if ( sampleVariance == 0.0 ) return 0.0;
111+
if (sampleVariance == 0.0) return 0.0;
111112

112-
if ( mean == 0.0 ) {
113+
if (mean == 0.0) {
113114
return 0.0;
114115
}
115116

116-
return Math.sqrt( sampleVariance ) / Math.abs( mean );
117+
return Math.sqrt(sampleVariance) / Math.abs(mean);
117118
}
118119

119120
/**
120121
* Test whether a value is a valid fractional or probability value.
121-
*
122+
*
122123
* @param value
123124
* @return true if the value is in the interval 0 to 1.
124125
*/
125-
public static boolean isValidFraction( double value ) {
126-
if ( value > 1.0 || value < 0.0 ) {
126+
public static boolean isValidFraction(double value) {
127+
if (value > 1.0 || value < 0.0) {
127128
return false;
128129
}
129130
return true;
@@ -132,25 +133,25 @@ public static boolean isValidFraction( double value ) {
132133
/**
133134
* calculate the mean of the values above (NOT greater or equal to) a particular index rank of an array. Quantile
134135
* must be a value from 0 to 100.
135-
*
136-
* @see DescriptiveWithMissing#meanAboveQuantile
137-
* @param index the rank of the value we wish to average above.
138-
* @param array Array for which we want to get the quantile.
136+
*
137+
* @param index the rank of the value we wish to average above.
138+
* @param array Array for which we want to get the quantile.
139139
* @param effectiveSize The size of the array, not including NaNs.
140140
* @return double
141+
* @see DescriptiveWithMissing#meanAboveQuantile
141142
*/
142-
public static double meanAboveQuantile( int index, double[] array, int effectiveSize ) {
143+
public static double meanAboveQuantile(int index, double[] array, int effectiveSize) {
143144

144145
double[] temp = new double[effectiveSize];
145146
double median;
146147
double returnvalue = 0.0;
147148
int k = 0;
148149

149150
temp = array;
150-
median = quantile( index, array, effectiveSize );
151+
median = quantile(index, array, effectiveSize);
151152

152-
for ( int i = 0; i < effectiveSize; i++ ) {
153-
if ( temp[i] > median ) {
153+
for (int i = 0; i < effectiveSize; i++) {
154+
if (temp[i] > median) {
154155
returnvalue += temp[i];
155156
k++;
156157
}
@@ -160,77 +161,128 @@ public static double meanAboveQuantile( int index, double[] array, int effective
160161

161162
/**
162163
* Adjust the elements of an array so they total to 1.0.
163-
*
164+
*
164165
* @param x Input array.
165166
* @return Normalized array.
166167
*/
167-
public static DoubleArrayList normalize( DoubleArrayList x ) {
168-
return normalize( x, Descriptive.sum( x ) );
168+
public static DoubleArrayList normalize(DoubleArrayList x) {
169+
return normalize(x, Descriptive.sum(x));
169170
}
170171

171172
/**
172173
* Divide the elements of an array by a given factor.
173-
*
174-
* @param x Input array.
174+
*
175+
* @param x Input array.
175176
* @param normfactor double
176177
* @return Normalized array.
177178
*/
178-
public static DoubleArrayList normalize( DoubleArrayList x, double normfactor ) {
179-
if ( x.size() == 0 ) {
180-
return new DoubleArrayList( 0 );
179+
public static DoubleArrayList normalize(DoubleArrayList x, double normfactor) {
180+
if (x.size() == 0) {
181+
return new DoubleArrayList(0);
181182
}
182183

183184
DoubleArrayList r = new DoubleArrayList();
184185

185-
for ( int i = 0; i < x.size(); i++ ) {
186-
r.add( x.get( i ) / normfactor );
186+
for (int i = 0; i < x.size(); i++) {
187+
r.add(x.get(i) / normfactor);
187188
}
188189
return r;
189190

190191
}
191192

192193
/**
193-
* @param array
194+
* @param array input data
194195
* @param tolerance a small constant
195196
* @return number of distinct values in the array, within tolerance. Double.NaN is counted as a distinct
196-
* value.
197+
* value.
197198
*/
198-
public static Integer numberofDistinctValues( DoubleArrayList array, double tolerance ) {
199+
public static Integer numberofDistinctValues(DoubleArrayList array, double tolerance) {
199200

200201
Set<Double> distinct = new HashSet<>();
201202
int r = 1;
202-
if ( tolerance > 0.0 ) {
203-
r = ( int ) Math.ceil( 1.0 / tolerance );
203+
if (tolerance > 0.0) {
204+
r = (int) Math.ceil(1.0 / tolerance);
204205
}
205-
for ( int i = 0; i < array.size(); i++ ) {
206-
double v = array.get( i );
207-
if ( tolerance > 0 ) {
206+
for (int i = 0; i < array.size(); i++) {
207+
double v = array.get(i);
208+
if (tolerance > 0) {
208209
// this might not be foolproof
209-
distinct.add( ( double ) Math.round( v * r ) / r );
210+
distinct.add((double) Math.round(v * r) / r);
210211
} else {
211-
distinct.add( v );
212+
distinct.add(v);
212213
}
213214
}
214-
return Math.max( 0, distinct.size() );
215+
return Math.max(0, distinct.size());
215216

216217
}
217218

219+
218220
/**
219-
* Given a double array, calculate the quantile requested. Note that no interpolation is done.
220-
*
221-
* @see DescriptiveWithMissing#quantile
222-
* @param index - the rank of the value we wish to get. Thus if we have 200 items in the array, and want the median,
223-
* we should enter 100.
224-
* @param values double[] - array of data we want quantile of
221+
* @param tolerance a small constant
222+
* @return number of distinct values in the array, within tolerance. Double.NaN is ignored entirely
223+
*/
224+
public static Integer numberofDistinctValuesNonNA(DoubleArrayList array, double tolerance) {
225+
226+
Set<Double> distinct = new HashSet<>();
227+
int r = 1;
228+
if (tolerance > 0.0) {
229+
r = (int) Math.ceil(1.0 / tolerance);
230+
}
231+
for (int i = 0; i < array.size(); i++) {
232+
double v = array.get(i);
233+
if (Double.isNaN(v)) {
234+
continue;
235+
}
236+
if (tolerance > 0) {
237+
// this might not be foolproof
238+
distinct.add((double) Math.round(v * r) / r);
239+
} else {
240+
distinct.add(v);
241+
}
242+
}
243+
return Math.max(0, distinct.size());
244+
245+
}
246+
247+
/**
248+
* Compute the fraction of values which are distinct. NaNs are ignored entirely. If the data are all NaN, 0.0 is returned.
249+
*
250+
* @param array input data
251+
* @param tolerance a small constant to define the difference that is "distinct"
252+
* @return
253+
*/
254+
public static Double fractionDistinctValuesNonNA(DoubleArrayList array, double tolerance) {
255+
double numNonNA = (double) numNonMissing(array);
256+
if (numNonNA == 0) return 0.0;
257+
return (double) numberofDistinctValuesNonNA(array, tolerance) / numNonNA;
258+
}
259+
260+
private static Integer numNonMissing(DoubleArrayList array) {
261+
int nm = 0;
262+
for (int i = 0; i < array.size(); i++) {
263+
if (Double.isNaN(array.get(i))) continue;
264+
nm++;
265+
}
266+
return nm;
267+
}
268+
269+
270+
/**
271+
* Given a double array, calculate the quantile requested. Note that no interpolation is done and missing values are ignored.
272+
*
273+
* @param index - the rank of the value we wish to get. Thus if we have 200 items in the array, and want the median,
274+
* we should enter 100.
275+
* @param values double[] - array of data we want quantile of
225276
* @param effectiveSize int the effective size of the array
226277
* @return double the value at the requested quantile
278+
* @see DescriptiveWithMissing#quantile
227279
*/
228-
public static double quantile( int index, double[] values, int effectiveSize ) {
280+
public static double quantile(int index, double[] values, int effectiveSize) {
229281
double pivot = -1.0;
230-
if ( index == 0 ) {
282+
if (index == 0) {
231283
double ans = values[0];
232-
for ( int i = 1; i < effectiveSize; i++ ) {
233-
if ( ans > values[i] ) {
284+
for (int i = 1; i < effectiveSize; i++) {
285+
if (ans > values[i]) {
234286
ans = values[i];
235287
}
236288
}
@@ -239,7 +291,7 @@ public static double quantile( int index, double[] values, int effectiveSize ) {
239291

240292
double[] temp = new double[effectiveSize];
241293

242-
for ( int i = 0; i < effectiveSize; i++ ) {
294+
for (int i = 0; i < effectiveSize; i++) {
243295
temp[i] = values[i];
244296
}
245297

@@ -249,33 +301,33 @@ public static double quantile( int index, double[] values, int effectiveSize ) {
249301
double[] bigger = new double[effectiveSize];
250302
int itrSm = 0;
251303
int itrBg = 0;
252-
for ( int i = 1; i < effectiveSize; i++ ) {
253-
if ( temp[i] <= pivot ) {
304+
for (int i = 1; i < effectiveSize; i++) {
305+
if (temp[i] <= pivot) {
254306
smaller[itrSm] = temp[i];
255307
itrSm++;
256-
} else if ( temp[i] > pivot ) {
308+
} else if (temp[i] > pivot) {
257309
bigger[itrBg] = temp[i];
258310
itrBg++;
259311
}
260312
}
261-
if ( itrSm > index ) { // quantile must be in the 'smaller' array
262-
return quantile( index, smaller, itrSm );
263-
} else if ( itrSm < index ) { // quantile is in the 'bigger' array
264-
return quantile( index - itrSm - 1, bigger, itrBg );
313+
if (itrSm > index) { // quantile must be in the 'smaller' array
314+
return quantile(index, smaller, itrSm);
315+
} else if (itrSm < index) { // quantile is in the 'bigger' array
316+
return quantile(index - itrSm - 1, bigger, itrBg);
265317
} else {
266318
return pivot;
267319
}
268320

269321
}
270322

271323
/**
272-
* Compute the range of an array.
273-
*
324+
* Compute the range of an array. Missing values are ignored.
325+
*
274326
* @param data DoubleArrayList
275327
* @return double
276328
*/
277-
public static double range( DoubleArrayList data ) {
278-
return DescriptiveWithMissing.max( data ) - DescriptiveWithMissing.min( data );
329+
public static double range(DoubleArrayList data) {
330+
return DescriptiveWithMissing.max(data) - DescriptiveWithMissing.min(data);
279331
}
280332

281333
private Stats() { /* block instantiation */

0 commit comments

Comments
 (0)