comparison src/main/java/edu/harvard/iq/dataverse/util/SumStatCalculator.java @ 10:a50cf11e5178

Rewrite LGDataverse completely upgrading to dataverse4.0
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Tue, 08 Sep 2015 17:00:21 +0200
parents
children
comparison
equal deleted inserted replaced
9:5926d6419569 10:a50cf11e5178
1 /*
2 Copyright (C) 2005-2012, by the President and Fellows of Harvard College.
3
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15
16 Dataverse Network - A web application to share, preserve and analyze research data.
17 Developed at the Institute for Quantitative Social Science, Harvard University.
18 Version 3.0.
19 */
20
21 package edu.harvard.iq.dataverse.util;
22 import java.util.*;
23 import java.util.logging.Logger;
24
25 import org.apache.commons.lang.*;
26 import org.apache.commons.math.stat.*;
27 //import cern.colt.list.*;
28 //import cern.jet.stat.Descriptive;
29
30
/**
 * Computes basic summary statistics for a vector of numeric values.
 *
 * <p>Null entries and entries whose {@code doubleValue()} is NaN are treated
 * as invalid observations: they are counted, and excluded from every other
 * statistic.
 *
 * @author Leonid Andreev
 */
public class SumStatCalculator {

    private static final Logger logger = Logger.getLogger(loggerName());

    // Positions of the individual statistics in the array returned by
    // calculateSummaryStatistics():
    // ("mean", "medn", "mode", "vald", "invd", "min", "max", "stdev")
    private static final int MEAN = 0;
    private static final int MEDIAN = 1;
    private static final int MODE = 2;
    private static final int VALID = 3;
    private static final int INVALID = 4;
    private static final int MIN = 5;
    private static final int MAX = 6;
    private static final int STDEV = 7;
    private static final int STAT_COUNT = 8;

    /**
     * Returns the logger name: the name of this class's package, or the class
     * name when the class loader created no Package object for it.
     */
    private static String loggerName() {
        Package pkg = SumStatCalculator.class.getPackage();
        return pkg != null ? pkg.getName() : SumStatCalculator.class.getName();
    }

    /**
     * Calculates the summary statistics of a numeric vector.
     *
     * @param x the values; null and NaN entries are counted as invalid and
     *          ignored by all the other statistics
     * @return an array of 8 values, in this order: mean, median, mode
     *         (not calculated, always 0.0), number of valid values, number of
     *         invalid values, minimum, maximum, standard deviation. When
     *         there are no valid values, the mean, median, minimum, maximum
     *         and standard deviation are all NaN.
     * @throws NullPointerException if {@code x} is null
     */
    public static double[] calculateSummaryStatistics(Number[] x){
        logger.fine("entering calculate summary statistics ("+x.length+" Number values);");

        double[] nx = new double[STAT_COUNT];

        int invalid = countInvalidValues(x);
        int valid = x.length - invalid;
        nx[INVALID] = invalid;
        logger.fine("counted invalid values: "+nx[INVALID]);
        nx[VALID] = valid;
        logger.fine("counted valid values: "+nx[VALID]);

        double[] newx = prepareForSummaryStatsAlternative(x, valid);
        logger.fine("prepared double vector for summary stats calculation ("+newx.length+" double values);");

        nx[MEAN] = calculateMean(newx);
        logger.fine("calculated mean: "+nx[MEAN]);
        nx[MEDIAN] = calculateMedian(newx);
        logger.fine("calculated medn: "+nx[MEDIAN]);
        nx[MODE] = 0.0; // the mode is currently not calculated (see getMode below)

        nx[MIN] = calculateMin(newx);
        logger.fine("calculated min: "+nx[MIN]);
        nx[MAX] = calculateMax(newx);
        logger.fine("calculated max: "+nx[MAX]);
        // The mean is already known, so the variance does not recompute it.
        nx[STDEV] = Math.sqrt(calculateVariance(newx, nx[MEAN]));
        logger.fine("calculated stdev: "+nx[STDEV]);
        return nx;
    }

    /**
     * Converts the vector to doubles and drops the invalid (null/NaN) values.
     * Currently unused; prepareForSummaryStatsAlternative() does the same
     * without the intermediate Double[] array.
     */
    private static double[] prepareForSummaryStats(Number[] x) {
        Double[] z = numberToDouble(x);
        return removeInvalidValues(z);
    }

    /**
     * Copies the valid (non-null, non-NaN) values of a vector into a new
     * double array, preserving their order.
     *
     * @param x      the source values
     * @param length the expected number of valid values in {@code x}
     * @return an array holding exactly the valid values of {@code x}
     * @throws IllegalArgumentException if {@code x} contains more than
     *         {@code length} valid values
     */
    private static double[] prepareForSummaryStatsAlternative(Number[] x, int length) {
        double[] retvector = new double[length];

        int c = 0;
        for (Number value : x) {
            if (value != null) {
                double xvalue = value.doubleValue();
                if (!Double.isNaN(xvalue)) {
                    if (c == length) {
                        throw new IllegalArgumentException(
                                "more than the expected " + length + " valid values found");
                    }
                    retvector[c++] = xvalue;
                }
            }
        }

        // Fewer valid values than announced: trim, so that the unused
        // trailing zeros do not leak into the statistics.
        return c == length ? retvector : Arrays.copyOf(retvector, c);
    }

    /**
     * Converts an array of Numbers into an array of Doubles; null entries
     * stay null.
     */
    private static Double[] numberToDouble(Number[] x){
        Double[] z = new Double[x.length];
        for (int i = 0; i < x.length; i++) {
            z[i] = x[i] != null ? Double.valueOf(x[i].doubleValue()) : null;
        }
        return z;
    }

    /**
     * Returns a new double array holding only the valid values of the
     * vector, i.e. the entries that are neither null nor NaN.
     *
     * Counts first and fills second, so that no intermediate list of boxed
     * values has to be allocated.
     */
    private static double[] removeInvalidValues(Double[] x){
        int valid = 0;
        for (Double d : x) {
            if (d != null && !d.isNaN()) {
                valid++;
            }
        }

        double[] result = new double[valid];
        int c = 0;
        for (Double d : x) {
            if (d != null && !d.isNaN()) {
                result[c++] = d.doubleValue();
            }
        }
        return result;
    }

    /**
     * Returns the number of invalid entries (nulls, or values whose
     * doubleValue() is NaN) in a Number array.
     */
    private static int countInvalidValues(Number[] x){
        int counter = 0;
        for (Number value : x) {
            if (value == null || Double.isNaN(value.doubleValue())) {
                counter++;
            }
        }
        return counter;
    }

    /**
     * Returns the number of Double.NaNs in a double-type array
     *
     * TODO: figure out if this is actually necessary - to count NaNs and
     * nulls separately;
     * -- L.A. 4.0 alpha 1
     */
    private static int countNaNs(double[] x){
        int NaNcounter = 0;
        for (double value : x) {
            if (Double.isNaN(value)) {
                NaNcounter++;
            }
        }
        return NaNcounter;
    }

    /**
     * Calculates the median of a vector. The vector itself is left
     * untouched; a sorted copy is made.
     *
     * For an odd number of values this is the middle value of the sorted
     * vector; for an even number, the value halfway between the two middle
     * values.
     *
     * @param values the values; expected to contain no NaNs
     * @return the median, or NaN for an empty vector
     */
    private static double calculateMedian(double[] values) {
        if (values.length == 0) {
            return Double.NaN;
        }
        if (values.length == 1) {
            return values[0]; // always return single value for n = 1
        }

        double[] sorted = Arrays.copyOf(values, values.length);
        logger.fine("made an extra copy of the vector;");
        Arrays.sort(sorted);
        logger.fine("sorted double vector for median calculations;");

        // 1-based position of the median in the sorted vector; it has a
        // fractional part of 0.5 when the number of values is even.
        double n = sorted.length;
        double pos = (n + 1) / 2;
        double fpos = Math.floor(pos);
        int intPos = (int) fpos;
        double dif = pos - fpos;

        double lower = sorted[intPos - 1];
        double upper = sorted[intPos];

        if (Double.isInfinite(lower) || Double.isInfinite(upper)) {
            // The interpolation below turns into NaN on 0 * Infinity and on
            // Infinity - Infinity, so these cases are resolved directly.
            if (dif == 0.0 || lower == upper) {
                return lower;
            }
            // Halfway between an infinite and a finite value is that
            // infinity; between -Infinity and +Infinity it is NaN.
            return lower + upper;
        }

        return lower + dif * (upper - lower);
    }

    /**
     * Calculates the arithmetic mean of a vector.
     *
     * @return the mean, or NaN if the vector is null or empty
     */
    private static double calculateMean(double[] values) {
        if (values == null) {
            return Double.NaN;
        }
        return calculateMean(values, 0, values.length);
    }

    /**
     * Calculates the arithmetic mean of {@code length} values, starting at
     * index {@code begin}. A first estimate (sum / n) is corrected in a
     * second pass by the mean of the deviations from that estimate.
     *
     * @return the mean, or NaN if the vector is null or {@code length} is 0
     */
    private static double calculateMean(double[] values, final int begin, final int length) {

        if (values == null || length == 0) {
            return Double.NaN;
        }

        double sampleSize = length;

        // Compute initial estimate using definitional formula.
        // Only the requested window is summed, so that the sum matches the
        // sample size it is divided by.
        double xbar = calculateSum(values, begin, length) / sampleSize;

        // Compute correction factor in second pass
        double correction = 0;
        for (int i = begin; i < begin + length; i++) {
            correction += values[i] - xbar;
        }
        return xbar + (correction / sampleSize);
    }


    /**
     * Calculates the sum of all the values of a vector.
     *
     * @return the sum, or NaN if the vector is null or empty
     */
    private static double calculateSum(double[] values) {
        if (values == null) {
            return Double.NaN;
        }
        return calculateSum(values, 0, values.length);
    }

    /**
     * Calculates the sum of {@code length} values, starting at index
     * {@code begin}.
     *
     * @return the sum, or NaN if the vector is null or {@code length} is 0
     */
    private static double calculateSum(double[] values, final int begin, final int length) {
        if (values == null || length == 0) {
            return Double.NaN;
        }
        double sum = 0.0;
        for (int i = begin; i < begin + length; i++) {
            sum += values[i];
        }
        return sum;
    }

    /**
     * Returns the smallest non-NaN value of a vector, or NaN if the vector
     * is null or empty.
     *
     * NOTE(review): replaces StatUtils.min(); the comparison is meant to
     * behave the same way - confirm against the commons-math version in use.
     */
    private static double calculateMin(double[] values) {
        if (values == null || values.length == 0) {
            return Double.NaN;
        }
        double min = values[0];
        for (double value : values) {
            if (!Double.isNaN(value)) {
                min = (min < value) ? min : value;
            }
        }
        return min;
    }

    /**
     * Returns the largest non-NaN value of a vector, or NaN if the vector
     * is null or empty.
     *
     * NOTE(review): replaces StatUtils.max(); the comparison is meant to
     * behave the same way - confirm against the commons-math version in use.
     */
    private static double calculateMax(double[] values) {
        if (values == null || values.length == 0) {
            return Double.NaN;
        }
        double max = values[0];
        for (double value : values) {
            if (!Double.isNaN(value)) {
                max = (max > value) ? max : value;
            }
        }
        return max;
    }

    /**
     * Calculates the bias-corrected (divided by n - 1) sample variance of a
     * vector around the supplied mean. The sum of the squared deviations is
     * corrected by the squared sum of the deviations, which compensates for
     * the rounding error in the mean.
     *
     * NOTE(review): replaces StatUtils.variance(), which computes its own
     * mean first; the formula is meant to match it - confirm against the
     * commons-math version in use.
     *
     * @param values the values
     * @param mean   the mean of {@code values}
     * @return the variance; 0.0 for a single value; NaN if the vector is
     *         null or empty
     */
    private static double calculateVariance(double[] values, double mean) {
        if (values == null || values.length == 0) {
            return Double.NaN;
        }
        if (values.length == 1) {
            return 0.0;
        }

        double squaredDeviations = 0.0;
        double deviations = 0.0;
        for (double value : values) {
            double dev = value - mean;
            squaredDeviations += dev * dev;
            deviations += dev;
        }

        double n = values.length;
        return (squaredDeviations - (deviations * deviations / n)) / (n - 1.0);
    }


    /**
     * Returns the mode statistic of a double variable
     *
     */
    /*
    public static double getMode(double[] x){
        double mode = Double.NaN;

        if ((countNaNs(x) == x.length) || (x.length < 1)){
            return mode;
        } else {
            DoubleArrayList dx = new DoubleArrayList(x);
            dx.sort();
            DoubleArrayList freqTable = new DoubleArrayList(1);
            IntArrayList countTable = new IntArrayList(1);
            Descriptive.frequencies(dx, freqTable, countTable);
            //out.println("freqTable="+
            // ReflectionToStringBuilder.toString(freqTable));
            //out.println("freqTable="+
            // ReflectionToStringBuilder.toString(countTable));
            int max_i = 0;
            for (int i=1; i< countTable.size();i++ ){
                if (countTable.get(i)> countTable.get(max_i)){
                    max_i = i;
                }
            }
            mode = freqTable.get(max_i);
            //out.println("position = "+
            //max_i+"\tits value="+freqTable.get(max_i));
        }
        return mode;
    }
    */


}