Mercurial > hg > LGDataverses
comparison src/main/java/edu/harvard/iq/dataverse/util/SumStatCalculator.java @ 10:a50cf11e5178
Rewrite LGDataverse completely upgrading to dataverse4.0
| author | Zoe Hong <zhong@mpiwg-berlin.mpg.de> |
|---|---|
| date | Tue, 08 Sep 2015 17:00:21 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 9:5926d6419569 | 10:a50cf11e5178 |
|---|---|
| 1 /* | |
| 2 Copyright (C) 2005-2012, by the President and Fellows of Harvard College. | |
| 3 | |
| 4 Licensed under the Apache License, Version 2.0 (the "License"); | |
| 5 you may not use this file except in compliance with the License. | |
| 6 You may obtain a copy of the License at | |
| 7 | |
| 8 http://www.apache.org/licenses/LICENSE-2.0 | |
| 9 | |
| 10 Unless required by applicable law or agreed to in writing, software | |
| 11 distributed under the License is distributed on an "AS IS" BASIS, | |
| 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 13 See the License for the specific language governing permissions and | |
| 14 limitations under the License. | |
| 15 | |
| 16 Dataverse Network - A web application to share, preserve and analyze research data. | |
| 17 Developed at the Institute for Quantitative Social Science, Harvard University. | |
| 18 Version 3.0. | |
| 19 */ | |
| 20 | |
| 21 package edu.harvard.iq.dataverse.util; | |
| 22 import java.util.*; | |
| 23 import java.util.logging.Logger; | |
| 24 | |
| 25 import org.apache.commons.lang.*; | |
| 26 import org.apache.commons.math.stat.*; | |
| 27 //import cern.colt.list.*; | |
| 28 //import cern.jet.stat.Descriptive; | |
| 29 | |
| 30 | |
| 31 /** | |
| 32 * | |
| 33 * @author Leonid Andreev | |
| 34 */ | |
| 35 public class SumStatCalculator { | |
| 36 | |
| 37 private static Logger logger = Logger.getLogger(SumStatCalculator.class.getPackage().getName()); | |
| 38 | |
| 39 public static double[] calculateSummaryStatistics(Number[] x){ | |
| 40 logger.fine("entering calculate summary statistics ("+x.length+" Number values);"); | |
| 41 | |
| 42 double[] nx = new double[8]; | |
| 43 //("mean", "medn", "mode", "vald", "invd", "min", "max", "stdev"); | |
| 44 | |
| 45 Float testNanValue = new Float(Float.NaN); | |
| 46 Number testNumberValue = testNanValue; | |
| 47 if (Double.isNaN(testNumberValue.doubleValue())) { | |
| 48 logger.fine("Float test NaN value is still recognized as a Double NaN."); | |
| 49 } | |
| 50 | |
| 51 int invalid = countInvalidValues(x); | |
| 52 nx[4] = invalid; | |
| 53 logger.fine("counted invalid values: "+nx[4]); | |
| 54 nx[3] = x.length - invalid; | |
| 55 logger.fine("counted valid values: "+nx[3]); | |
| 56 | |
| 57 | |
| 58 //double[] newx = prepareForSummaryStats(x); | |
| 59 double[] newx = prepareForSummaryStatsAlternative(x, x.length - invalid); | |
| 60 logger.fine("prepared double vector for summary stats calculation ("+newx.length+" double values);"); | |
| 61 | |
| 62 ////nx[0] = StatUtils.mean(newx); | |
| 63 nx[0] = calculateMean(newx); | |
| 64 logger.fine("calculated mean: "+nx[0]); | |
| 65 ////nx[1] = StatUtils.percentile(newx, 50); | |
| 66 nx[1] = calculateMedian(newx); | |
| 67 logger.fine("calculated medn: "+nx[1]); | |
| 68 nx[2] = 0.0; //getMode(newx); | |
| 69 | |
| 70 nx[5] = StatUtils.min(newx); | |
| 71 logger.fine("calculated min: "+nx[5]); | |
| 72 nx[6] = StatUtils.max(newx); | |
| 73 logger.fine("calculated max: "+nx[6]); | |
| 74 nx[7] = Math.sqrt(StatUtils.variance(newx)); | |
| 75 logger.fine("calculated stdev: "+nx[7]); | |
| 76 return nx; | |
| 77 } | |
| 78 | |
| 79 private static double[] prepareForSummaryStats(Number[] x) { | |
| 80 Double[] z = numberToDouble(x); | |
| 81 return removeInvalidValues(z); | |
| 82 } | |
| 83 | |
| 84 private static double[] prepareForSummaryStatsAlternative(Number[] x, int length) { | |
| 85 double[] retvector = new double[length]; | |
| 86 | |
| 87 int c = 0; | |
| 88 for (int i = 0; i < x.length; i++) { | |
| 89 if (x[i] != null) { | |
| 90 double xvalue = x[i].doubleValue(); | |
| 91 if (!Double.isNaN(xvalue)) { | |
| 92 retvector[c++] = xvalue; | |
| 93 } | |
| 94 } | |
| 95 } | |
| 96 | |
| 97 // Throw exception if c != length in the end? | |
| 98 | |
| 99 return retvector; | |
| 100 } | |
| 101 | |
| 102 /** | |
| 103 * Converts an array of primitive Number types to doubles | |
| 104 * | |
| 105 */ | |
| 106 private static Double[] numberToDouble(Number[] x){ | |
| 107 Double[] z= new Double[x.length]; | |
| 108 for (int i=0; i<x.length;i++){ | |
| 109 z[i] = x[i] != null ? new Double( x[i].doubleValue() ) : null; | |
| 110 } | |
| 111 return z; | |
| 112 } | |
| 113 | |
| 114 /** | |
| 115 * Returns a new double array of nulls and non-Double.NaN values only | |
| 116 * | |
| 117 */ | |
| 118 // TODO: | |
| 119 // implement this in some way that does not require allocating a new | |
| 120 // ArrayList for the values of every vector. -- L.A. Aug. 11 2014 | |
| 121 private static double[] removeInvalidValues(Double[] x){ | |
| 122 List<Double> dl = new ArrayList<Double>(); | |
| 123 for (Double d : x){ | |
| 124 if (d != null && !Double.isNaN(d)){ | |
| 125 dl.add(d); | |
| 126 } | |
| 127 } | |
| 128 return ArrayUtils.toPrimitive( | |
| 129 dl.toArray(new Double[dl.size()])); | |
| 130 } | |
| 131 | |
| 132 /** | |
| 133 * Returns the number of Double.NaNs (or nulls) in a double-type array | |
| 134 * | |
| 135 */ | |
| 136 private static int countInvalidValues(Number[] x){ | |
| 137 int counter=0; | |
| 138 for (int i=0; i<x.length;i++){ | |
| 139 ////if ( x[i] == null || x[i].equals(Double.NaN) ) { | |
| 140 if ( x[i] == null || (Double.isNaN(x[i].doubleValue())) ) { | |
| 141 counter++; | |
| 142 } | |
| 143 } | |
| 144 return counter; | |
| 145 } | |
| 146 | |
| 147 /** | |
| 148 * Returns the number of Double.NaNs in a double-type array | |
| 149 * | |
| 150 * TODO: figure out if this is actually necessary - to count NaNs and | |
| 151 * nulls separately; | |
| 152 * -- L.A. 4.0 alpha 1 | |
| 153 */ | |
| 154 private static int countNaNs(double[] x){ | |
| 155 int NaNcounter=0; | |
| 156 for (int i=0; i<x.length;i++){ | |
| 157 if (Double.isNaN(x[i])){ | |
| 158 NaNcounter++; | |
| 159 } | |
| 160 } | |
| 161 return NaNcounter; | |
| 162 } | |
| 163 | |
| 164 private static double calculateMedian(double[] values) { | |
| 165 double[] sorted = new double[values.length]; | |
| 166 System.arraycopy(values, 0, sorted, 0, values.length); | |
| 167 logger.fine("made an extra copy of the vector;"); | |
| 168 Arrays.sort(sorted); | |
| 169 logger.fine("sorted double vector for median calculations;"); | |
| 170 | |
| 171 if (sorted.length == 0) { | |
| 172 return Double.NaN; | |
| 173 } | |
| 174 if (sorted.length == 1) { | |
| 175 return sorted[0]; // always return single value for n = 1 | |
| 176 } | |
| 177 double n = sorted.length; | |
| 178 double pos = (n + 1) / 2; | |
| 179 double fpos = Math.floor(pos); | |
| 180 int intPos = (int) fpos; | |
| 181 double dif = pos - fpos; | |
| 182 | |
| 183 double lower = sorted[intPos - 1]; | |
| 184 double upper = sorted[intPos]; | |
| 185 | |
| 186 return lower + dif * (upper - lower); | |
| 187 } | |
| 188 | |
| 189 private static double calculateMean(double[] values) { | |
| 190 return calculateMean(values, 0 , values.length); | |
| 191 } | |
| 192 | |
| 193 private static double calculateMean(double[] values, final int begin, final int length) { | |
| 194 | |
| 195 if (values == null || length == 0) { | |
| 196 return Double.NaN; | |
| 197 } | |
| 198 | |
| 199 double sampleSize = length; | |
| 200 | |
| 201 // Compute initial estimate using definitional formula | |
| 202 double xbar = calculateSum(values) / sampleSize; | |
| 203 | |
| 204 // Compute correction factor in second pass | |
| 205 double correction = 0; | |
| 206 for (int i = begin; i < begin + length; i++) { | |
| 207 correction += values[i] - xbar; | |
| 208 } | |
| 209 return xbar + (correction / sampleSize); | |
| 210 } | |
| 211 | |
| 212 | |
| 213 private static double calculateSum(double[] values) { | |
| 214 return calculateSum(values, 0, values.length); | |
| 215 } | |
| 216 | |
| 217 private static double calculateSum(double[] values, final int begin, final int length) { | |
| 218 if (values == null || length == 0) { | |
| 219 return Double.NaN; | |
| 220 } | |
| 221 double sum = 0.0; | |
| 222 for (int i = begin; i < begin + length; i++) { | |
| 223 sum += values[i]; | |
| 224 } | |
| 225 return sum; | |
| 226 } | |
| 227 | |
| 228 | |
| 229 /** | |
| 230 * Returns the mode statistic of a double variable | |
| 231 * | |
| 232 */ | |
| 233 /* | |
| 234 public static double getMode(double[] x){ | |
| 235 double mode = Double.NaN; | |
| 236 | |
| 237 if ((countNaNs(x) == x.length) || (x.length < 1)){ | |
| 238 return mode; | |
| 239 } else { | |
| 240 DoubleArrayList dx = new DoubleArrayList(x); | |
| 241 dx.sort(); | |
| 242 DoubleArrayList freqTable = new DoubleArrayList(1); | |
| 243 IntArrayList countTable = new IntArrayList(1); | |
| 244 Descriptive.frequencies(dx, freqTable, countTable); | |
| 245 //out.println("freqTable="+ | |
| 246 // ReflectionToStringBuilder.toString(freqTable)); | |
| 247 //out.println("freqTable="+ | |
| 248 // ReflectionToStringBuilder.toString(countTable)); | |
| 249 int max_i = 0; | |
| 250 for (int i=1; i< countTable.size();i++ ){ | |
| 251 if (countTable.get(i)> countTable.get(max_i)){ | |
| 252 max_i = i; | |
| 253 } | |
| 254 } | |
| 255 mode = freqTable.get(max_i); | |
| 256 //out.println("position = "+ | |
| 257 //max_i+"\tits value="+freqTable.get(max_i)); | |
| 258 } | |
| 259 return mode; | |
| 260 } | |
| 261 */ | |
| 262 | |
| 263 | |
| 264 } |
