diff src/main/java/edu/harvard/iq/dataverse/util/SumStatCalculator.java @ 10:a50cf11e5178

Rewrite LGDataverse completely upgrading to dataverse4.0
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Tue, 08 Sep 2015 17:00:21 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/edu/harvard/iq/dataverse/util/SumStatCalculator.java	Tue Sep 08 17:00:21 2015 +0200
@@ -0,0 +1,264 @@
+/*
+   Copyright (C) 2005-2012, by the President and Fellows of Harvard College.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+         http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+   Dataverse Network - A web application to share, preserve and analyze research data.
+   Developed at the Institute for Quantitative Social Science, Harvard University.
+   Version 3.0.
+*/
+
+package edu.harvard.iq.dataverse.util;
+import java.util.*;
+import java.util.logging.Logger;
+
+import org.apache.commons.lang.*;
+import org.apache.commons.math.stat.*;
+//import cern.colt.list.*;
+//import cern.jet.stat.Descriptive;
+
+
+/**
+ * Calculates basic summary statistics for a vector of numeric values.
+ * Entries that are {@code null} or NaN are treated as invalid (missing):
+ * they are counted, and excluded from all the other statistics.
+ *
+ * @author Leonid Andreev
+ */
+public class SumStatCalculator {
+
+    private static final Logger logger = Logger.getLogger(SumStatCalculator.class.getPackage().getName());
+
+    // Positions of the individual statistics in the array returned by
+    // calculateSummaryStatistics():
+    // ("mean", "medn", "mode", "vald", "invd", "min", "max", "stdev")
+    private static final int MEAN = 0;
+    private static final int MEDIAN = 1;
+    private static final int MODE = 2;
+    private static final int VALID = 3;
+    private static final int INVALID = 4;
+    private static final int MIN = 5;
+    private static final int MAX = 6;
+    private static final int STDEV = 7;
+    private static final int STAT_COUNT = 8;
+
+    /**
+     * Calculates the summary statistics of a numeric vector.
+     *
+     * @param x the values; individual entries may be {@code null} or NaN,
+     *          in which case they are counted as invalid and skipped
+     * @return an array of 8 values, in this order: mean, median, mode,
+     *         number of valid values, number of invalid values, min, max,
+     *         standard deviation (square root of {@code StatUtils.variance}).
+     *         The mode is not currently calculated and is always 0.0.
+     * @throws NullPointerException if {@code x} itself is {@code null}
+     */
+    public static double[] calculateSummaryStatistics(Number[] x){
+        logger.fine("entering calculate summary statistics ("+x.length+" Number values);");
+
+        double[] nx = new double[STAT_COUNT];
+
+        int invalid = countInvalidValues(x);
+        nx[INVALID] = invalid;
+        logger.fine("counted invalid values: "+nx[INVALID]);
+        nx[VALID] = x.length - invalid;
+        logger.fine("counted valid values: "+nx[VALID]);
+
+        // Only the valid values are used for the remaining statistics:
+        double[] newx = prepareForSummaryStatsAlternative(x, x.length - invalid);
+        logger.fine("prepared double vector for summary stats calculation ("+newx.length+" double values);");
+
+        nx[MEAN] = calculateMean(newx);
+        logger.fine("calculated mean: "+nx[MEAN]);
+        nx[MEDIAN] = calculateMedian(newx);
+        logger.fine("calculated medn: "+nx[MEDIAN]);
+        nx[MODE] = 0.0; // placeholder; see the disabled getMode() below
+
+        nx[MIN] = StatUtils.min(newx);
+        logger.fine("calculated min: "+nx[MIN]);
+        nx[MAX] = StatUtils.max(newx);
+        logger.fine("calculated max: "+nx[MAX]);
+        nx[STDEV] = Math.sqrt(StatUtils.variance(newx));
+        logger.fine("calculated stdev: "+nx[STDEV]);
+        return nx;
+    }
+
+    /**
+     * Converts the values to doubles and drops the invalid (null or NaN)
+     * ones, going through an intermediate {@code Double[]} array.
+     *
+     * @param x the values, possibly containing nulls and NaNs
+     * @return the valid values, in their original order
+     */
+    private static double[] prepareForSummaryStats(Number[] x) {
+        Double[] z = numberToDouble(x);
+        return removeInvalidValues(z);
+    }
+
+    /**
+     * Copies the valid (non-null, non-NaN) values into a primitive array,
+     * without any intermediate boxing.
+     *
+     * @param x      the values, possibly containing nulls and NaNs
+     * @param length the expected number of valid values; used to size the
+     *               result up front. If it turns out to be wrong, the result
+     *               is resized, so that the returned array always contains
+     *               exactly the valid values and nothing else.
+     * @return the valid values, in their original order
+     */
+    private static double[] prepareForSummaryStatsAlternative(Number[] x, int length) {
+        double[] retvector = new double[Math.max(0, Math.min(length, x.length))];
+
+        int c = 0;
+        for (Number value : x) {
+            if (value == null) {
+                continue;
+            }
+            double xvalue = value.doubleValue();
+            if (Double.isNaN(xvalue)) {
+                continue;
+            }
+            if (c == retvector.length) {
+                // The supplied length was too small; x.length is the upper
+                // bound on the number of valid values.
+                retvector = Arrays.copyOf(retvector, x.length);
+            }
+            retvector[c++] = xvalue;
+        }
+
+        // Trim, so that unused trailing zeros never leak into the statistics:
+        return c == retvector.length ? retvector : Arrays.copyOf(retvector, c);
+    }
+
+    /**
+     * Converts an array of Number objects to an array of Doubles;
+     * {@code null} entries stay {@code null}.
+     *
+     * @param x the values to convert
+     * @return a new array of the same length as {@code x}
+     */
+    private static Double[] numberToDouble(Number[] x){
+        Double[] z = new Double[x.length];
+        for (int i = 0; i < x.length; i++) {
+            z[i] = x[i] != null ? Double.valueOf(x[i].doubleValue()) : null;
+        }
+        return z;
+    }
+
+    /**
+     * Returns a new primitive array containing only the valid values,
+     * i.e. with all the nulls and Double.NaNs removed.
+     *
+     * @param x the values, possibly containing nulls and NaNs
+     * @return the valid values, in their original order
+     */
+    private static double[] removeInvalidValues(Double[] x){
+        // Two passes over the input (count, then copy), so that no
+        // intermediate list of boxed values has to be allocated.
+        int valid = 0;
+        for (Double d : x) {
+            if (d != null && !d.isNaN()) {
+                valid++;
+            }
+        }
+
+        double[] result = new double[valid];
+        int next = 0;
+        for (Double d : x) {
+            if (d != null && !d.isNaN()) {
+                result[next++] = d;
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Returns the number of invalid entries (nulls, or values whose
+     * double representation is NaN) in an array of Numbers.
+     *
+     * @param x the values to check
+     * @return the number of null or NaN entries
+     */
+    private static int countInvalidValues(Number[] x){
+        int counter = 0;
+        for (int i = 0; i < x.length; i++) {
+            if (x[i] == null || Double.isNaN(x[i].doubleValue())) {
+                counter++;
+            }
+        }
+        return counter;
+    }
+
+    /**
+     * Returns the number of Double.NaNs in a double-type array
+     *
+     * TODO: figure out if this is actually necessary - to count NaNs and
+     * nulls separately;
+     *  -- L.A. 4.0 alpha 1
+     */
+    private static int countNaNs(double[] x){
+        int NaNcounter = 0;
+        for (int i = 0; i < x.length; i++) {
+            if (Double.isNaN(x[i])) {
+                NaNcounter++;
+            }
+        }
+        return NaNcounter;
+    }
+
+    /**
+     * Calculates the median of the values. The input array is not modified;
+     * a sorted copy is made.
+     *
+     * @param values the values; expected to contain no NaNs
+     * @return NaN for an empty array; the single value for an array of one;
+     *         the middle value for an odd number of values; the midpoint of
+     *         the two middle values for an even number of values
+     */
+    private static double calculateMedian(double[] values) {
+        if (values.length == 0) {
+            return Double.NaN;
+        }
+        if (values.length == 1) {
+            return values[0]; // always return single value for n = 1
+        }
+
+        double[] sorted = Arrays.copyOf(values, values.length);
+        logger.fine("made an extra copy of the vector;");
+        Arrays.sort(sorted);
+        logger.fine("sorted double vector for median calculations;");
+
+        int mid = sorted.length / 2;
+        if (sorted.length % 2 == 1) {
+            // Return the middle value directly; interpolating with a zero
+            // weight would produce NaN if the next value is infinite.
+            return sorted[mid];
+        }
+
+        double lower = sorted[mid - 1];
+        double upper = sorted[mid];
+        if (lower == upper) {
+            // Also covers two equal infinities, where (upper - lower) is NaN.
+            return lower;
+        }
+        double spread = upper - lower;
+        if (Double.isInfinite(spread)) {
+            // The difference overflowed (or one of the values is infinite);
+            // average the halves instead.
+            return 0.5 * lower + 0.5 * upper;
+        }
+        return lower + 0.5 * spread;
+    }
+
+    /**
+     * Calculates the mean of all the values in the array.
+     *
+     * @param values the values
+     * @return the mean, or NaN if the array is {@code null} or empty
+     */
+    private static double calculateMean(double[] values) {
+        if (values == null) {
+            return Double.NaN;
+        }
+        return calculateMean(values, 0, values.length);
+    }
+
+    /**
+     * Calculates the mean of {@code length} values, starting at index
+     * {@code begin}. The definitional estimate (sum / n) is refined by a
+     * second pass that adds back the mean of the deviations from it.
+     *
+     * @param values the values
+     * @param begin  index of the first value to include
+     * @param length number of values to include
+     * @return the mean, or NaN if the array is {@code null} or
+     *         {@code length} is 0
+     */
+    private static double calculateMean(double[] values, final int begin, final int length) {
+
+        if (values == null || length == 0) {
+            return Double.NaN;
+        }
+
+        double sampleSize = length;
+
+        // Compute initial estimate using definitional formula; only the
+        // requested range may contribute to it.
+        double xbar = calculateSum(values, begin, length) / sampleSize;
+
+        // Compute correction factor in second pass
+        double correction = 0;
+        for (int i = begin; i < begin + length; i++) {
+            correction += values[i] - xbar;
+        }
+        return xbar + (correction / sampleSize);
+    }
+
+    /**
+     * Calculates the sum of all the values in the array.
+     *
+     * @param values the values
+     * @return the sum, or NaN if the array is {@code null} or empty
+     */
+    private static double calculateSum(double[] values) {
+        if (values == null) {
+            return Double.NaN;
+        }
+        return calculateSum(values, 0, values.length);
+    }
+
+    /**
+     * Calculates the sum of {@code length} values, starting at index
+     * {@code begin}.
+     *
+     * @param values the values
+     * @param begin  index of the first value to include
+     * @param length number of values to include
+     * @return the sum, or NaN if the array is {@code null} or
+     *         {@code length} is 0
+     */
+    private static double calculateSum(double[] values, final int begin, final int length) {
+        if (values == null || length == 0) {
+            return Double.NaN;
+        }
+        double sum = 0.0;
+        for (int i = begin; i < begin + length; i++) {
+            sum += values[i];
+        }
+        return sum;
+    }
+
+
+    /**
+     * Returns the mode statistic of a double variable
+     *
+     * (currently disabled; the mode is reported as 0.0)
+     */
+    /*
+    public static double getMode(double[] x){
+        double mode = Double.NaN;
+
+        if ((countNaNs(x) == x.length) || (x.length < 1)){
+            return mode;
+        } else {
+            DoubleArrayList dx = new DoubleArrayList(x);
+            dx.sort();
+            DoubleArrayList freqTable = new DoubleArrayList(1);
+            IntArrayList countTable = new IntArrayList(1);
+            Descriptive.frequencies(dx, freqTable, countTable);
+            //out.println("freqTable="+
+            //    ReflectionToStringBuilder.toString(freqTable));
+            //out.println("freqTable="+
+            //    ReflectionToStringBuilder.toString(countTable));
+            int max_i = 0;
+            for (int i=1; i< countTable.size();i++ ){
+                if (countTable.get(i)> countTable.get(max_i)){
+                    max_i = i;
+                }
+            }
+            mode = freqTable.get(max_i);
+            //out.println("position = "+
+            //max_i+"\tits value="+freqTable.get(max_i));
+        }
+        return mode;
+    }
+    */
+
+
+}