Mercurial > hg > LGDataverses
diff src/main/java/edu/harvard/iq/dataverse/dataaccess/TabularSubsetGenerator.java @ 10:a50cf11e5178
Rewrite LGDataverse completely, upgrading to Dataverse 4.0
| author | Zoe Hong <zhong@mpiwg-berlin.mpg.de> |
|---|---|
| date | Tue, 08 Sep 2015 17:00:21 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/TabularSubsetGenerator.java Tue Sep 08 17:00:21 2015 +0200 @@ -0,0 +1,1293 @@ +/* + Copyright (C) 2005-2012, by the President and Fellows of Harvard College. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Dataverse Network - A web application to share, preserve and analyze research data. + Developed at the Institute for Quantitative Social Science, Harvard University. + Version 3.0. +*/ + +package edu.harvard.iq.dataverse.dataaccess; + +import edu.harvard.iq.dataverse.DataFile; +import edu.harvard.iq.dataverse.datavariable.DataVariable; +import java.util.*; +import java.util.Scanner; +import java.util.logging.*; +import java.io.*; +import java.io.FileNotFoundException; +import java.math.BigDecimal; +import java.math.MathContext; +import java.math.RoundingMode; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; +import java.util.regex.Matcher; + + +import org.apache.commons.lang.*; + + +/** + * + * @author Leonid Andreev + * original author: + * @author a.sone + */ + +public class TabularSubsetGenerator implements SubsetGenerator { + + private static Logger dbgLog = Logger.getLogger(TabularSubsetGenerator.class.getPackage().getName()); + + private static int COLUMN_TYPE_STRING = 1; + private static int COLUMN_TYPE_LONG = 2; + private static int COLUMN_TYPE_DOUBLE = 3; + private static int 
COLUMN_TYPE_FLOAT = 4; + + private static int MAX_COLUMN_BUFFER = 8192; + + private FileChannel fileChannel = null; + + private int varcount; + private int casecount; + private int subsetcount; + + private byte[][] columnEntries = null; + + + private ByteBuffer[] columnByteBuffers; + private int[] columnBufferSizes; + private int[] columnBufferOffsets; + + private long[] columnStartOffsets; + private long[] columnTotalOffsets; + private long[] columnTotalLengths; + + public TabularSubsetGenerator() { + + } + + public TabularSubsetGenerator (DataFile datafile, List<DataVariable> variables) throws IOException { + if (!datafile.isTabularData()) { + throw new IOException("DataFile is not tabular data."); + } + + setVarCount(datafile.getDataTable().getVarQuantity().intValue()); + setCaseCount(datafile.getDataTable().getCaseQuantity().intValue()); + + File tabfile = datafile.getFileSystemLocation().toFile(); + File rotatedImageFile = getRotatedImage(tabfile, getVarCount(), getCaseCount()); + long[] columnEndOffsets = extractColumnOffsets(rotatedImageFile, getVarCount(), getCaseCount()); + + fileChannel = (FileChannel.open(Paths.get(rotatedImageFile.getAbsolutePath()), StandardOpenOption.READ)); + + if (variables == null || variables.size() < 1 || variables.size() > getVarCount()) { + throw new IOException("Illegal number of variables in the subset request"); + } + + subsetcount = variables.size(); + columnTotalOffsets = new long[subsetcount]; + columnTotalLengths = new long[subsetcount]; + columnByteBuffers = new ByteBuffer[subsetcount]; + + + + if (subsetcount == 1) { + if (!datafile.getDataTable().getId().equals(variables.get(0).getDataTable().getId())) { + throw new IOException("Variable in the subset request does not belong to the datafile."); + } + dbgLog.fine("single variable subset; setting fileChannel position to "+extractColumnOffset(columnEndOffsets, variables.get(0).getFileOrder())); + fileChannel.position(extractColumnOffset(columnEndOffsets, 
variables.get(0).getFileOrder())); + columnTotalLengths[0] = extractColumnLength(columnEndOffsets, variables.get(0).getFileOrder()); + columnTotalOffsets[0] = 0; + } else { + columnEntries = new byte[subsetcount][]; + + columnBufferSizes = new int[subsetcount]; + columnBufferOffsets = new int[subsetcount]; + columnStartOffsets = new long[subsetcount]; + + int i = 0; + for (DataVariable var : variables) { + if (!datafile.getDataTable().getId().equals(var.getDataTable().getId())) { + throw new IOException("Variable in the subset request does not belong to the datafile."); + } + columnByteBuffers[i] = ByteBuffer.allocate(MAX_COLUMN_BUFFER); + columnTotalLengths[i] = extractColumnLength(columnEndOffsets, var.getFileOrder()); + columnStartOffsets[i] = extractColumnOffset(columnEndOffsets, var.getFileOrder()); + if (columnTotalLengths[i] < MAX_COLUMN_BUFFER) { + columnByteBuffers[i].limit((int)columnTotalLengths[i]); + } + fileChannel.position(columnStartOffsets[i]); + columnBufferSizes[i] = fileChannel.read(columnByteBuffers[i]); + columnBufferOffsets[i] = 0; + columnTotalOffsets[i] = columnBufferSizes[i]; + i++; + } + } + } + + private int getVarCount() { + return varcount; + } + + private void setVarCount(int varcount) { + this.varcount = varcount; + } + + private int getCaseCount() { + return casecount; + } + + private void setCaseCount(int casecount) { + this.casecount = casecount; + } + + + /* + * Note that this method operates on the *absolute* column number, i.e. + * the number of the physical column in the tabular file. This is stored + * in DataVariable.FileOrder. + * This "column number" should not be confused with the number of column + * in the subset request; a user can request any number of variable + * columns, in an order that doesn't have to follow the physical order + * of the columns in the file. 
+ */ + private long extractColumnOffset(long[] columnEndOffsets, int column) throws IOException { + if (columnEndOffsets == null || columnEndOffsets.length <= column) { + throw new IOException("Offsets table not initialized; or column out of bounds."); + } + long columnOffset; + + if (column > 0) { + columnOffset = columnEndOffsets[column - 1]; + } else { + columnOffset = getVarCount() * 8; + } + return columnOffset; + } + + /* + * See the comment for the method above. + */ + private long extractColumnLength(long[] columnEndOffsets, int column) throws IOException { + if (columnEndOffsets == null || columnEndOffsets.length <= column) { + throw new IOException("Offsets table not initialized; or column out of bounds."); + } + long columnLength; + + if (column > 0) { + columnLength = columnEndOffsets[column] - columnEndOffsets[column - 1]; + } else { + columnLength = columnEndOffsets[0] - varcount * 8; + } + + return columnLength; + } + + + private void bufferMoreColumnBytes(int column) throws IOException { + if (columnTotalOffsets[column] >= columnTotalLengths[column]) { + throw new IOException("attempt to buffer bytes past the column boundary"); + } + fileChannel.position(columnStartOffsets[column] + columnTotalOffsets[column]); + + columnByteBuffers[column].clear(); + if (columnTotalLengths[column] < columnTotalOffsets[column] + MAX_COLUMN_BUFFER) { + dbgLog.fine("Limiting the buffer to "+(columnTotalLengths[column] - columnTotalOffsets[column])+" bytes"); + columnByteBuffers[column].limit((int) (columnTotalLengths[column] - columnTotalOffsets[column])); + } + columnBufferSizes[column] = fileChannel.read(columnByteBuffers[column]); + dbgLog.fine("Read "+columnBufferSizes[column]+" bytes for subset column "+column); + columnBufferOffsets[column] = 0; + columnTotalOffsets[column] += columnBufferSizes[column]; + } + + /* + do not use this method! + there's a high potential for the "UTF8 character split between buffers" error! 
+ public String readColumnEntry(int column) { + String ret = null; + int currentbyte; + + if (columnBufferOffsets[column] >= columnBufferSizes[column]) { + try { + bufferMoreColumnBytes(column); + } catch (IOException ioe) { + return null; + } + } + + currentbyte = columnBufferOffsets[column]; + try { + while (columnByteBuffers[column].array()[currentbyte] != '\n') { + currentbyte++; + if (currentbyte == columnBufferSizes[column]) { + // save the leftover: + if (ret == null) { + ret = new String(columnByteBuffers[column].array(), columnBufferOffsets[column], columnBufferSizes[column] - columnBufferOffsets[column], "UTF8"); + } else { + ret = ret.concat(new String(columnByteBuffers[column].array(), columnBufferOffsets[column], columnBufferSizes[column] - columnBufferOffsets[column], "UTF8")); + } + // read more bytes: + bufferMoreColumnBytes(column); + currentbyte = 0; + } + } + + // presumably, we have found our '\n': + if (ret == null) { + ret = new String(columnByteBuffers[column].array(), columnBufferOffsets[column], currentbyte - columnBufferOffsets[column], "UTF8"); + } else { + ret = ret.concat(new String(columnByteBuffers[column].array(), columnBufferOffsets[column], currentbyte - columnBufferOffsets[column], "UTF8")); + } + + } catch (IOException ioe) { + return null; + } + + columnBufferOffsets[column] += (currentbyte + 1); + + return ret; + } + */ + + public byte[] readColumnEntryBytes(int column) { + return readColumnEntryBytes(column, true); + } + + + public byte[] readColumnEntryBytes(int column, boolean addTabs) { + byte[] leftover = null; + byte[] ret = null; + + if (columnBufferOffsets[column] >= columnBufferSizes[column]) { + try { + bufferMoreColumnBytes(column); + if (columnBufferSizes[column] < 1) { + return null; + } + } catch (IOException ioe) { + return null; + } + } + + int byteindex = columnBufferOffsets[column]; + try { + while (columnByteBuffers[column].array()[byteindex] != '\n') { + byteindex++; + if (byteindex == 
columnBufferSizes[column]) { + // save the leftover: + if (leftover == null) { + leftover = new byte[columnBufferSizes[column] - columnBufferOffsets[column]]; + System.arraycopy(columnByteBuffers[column].array(), columnBufferOffsets[column], leftover, 0, columnBufferSizes[column] - columnBufferOffsets[column]); + } else { + byte[] merged = new byte[leftover.length + columnBufferSizes[column]]; + + System.arraycopy(leftover, 0, merged, 0, leftover.length); + System.arraycopy(columnByteBuffers[column].array(), 0, merged, leftover.length, columnBufferSizes[column]); + leftover = merged; + merged = null; + } + // read more bytes: + bufferMoreColumnBytes(column); + if (columnBufferSizes[column] < 1) { + return null; + } + byteindex = 0; + } + } + + // presumably, we have found our '\n': + if (leftover == null) { + ret = new byte[byteindex - columnBufferOffsets[column] + 1]; + System.arraycopy(columnByteBuffers[column].array(), columnBufferOffsets[column], ret, 0, byteindex - columnBufferOffsets[column] + 1); + } else { + ret = new byte[leftover.length + byteindex + 1]; + System.arraycopy(leftover, 0, ret, 0, leftover.length); + System.arraycopy(columnByteBuffers[column].array(), 0, ret, leftover.length, byteindex + 1); + } + + } catch (IOException ioe) { + return null; + } + + columnBufferOffsets[column] = (byteindex + 1); + + if (column < columnBufferOffsets.length - 1) { + ret[ret.length - 1] = '\t'; + } + return ret; + } + + public int readSingleColumnSubset(byte[] buffer) throws IOException { + if (columnTotalOffsets[0] == columnTotalLengths[0]) { + return -1; + } + + if (columnByteBuffers[0] == null) { + dbgLog.fine("allocating single column subset buffer."); + columnByteBuffers[0] = ByteBuffer.allocate(buffer.length); + } + + int bytesread = fileChannel.read(columnByteBuffers[0]); + dbgLog.fine("single column subset: read "+bytesread+" bytes."); + if (columnTotalOffsets[0] + bytesread > columnTotalLengths[0]) { + bytesread = (int)(columnTotalLengths[0] - 
columnTotalOffsets[0]); + } + System.arraycopy(columnByteBuffers[0].array(), 0, buffer, 0, bytesread); + + columnTotalOffsets[0] += bytesread; + columnByteBuffers[0].clear(); + return bytesread > 0 ? bytesread : -1; + } + + + public byte[] readSubsetLineBytes() throws IOException { + byte[] ret = null; + int total = 0; + + for (int i = 0; i < subsetcount; i++) { + columnEntries[i] = readColumnEntryBytes(i); + if (columnEntries[i] == null) { + throw new IOException("Failed to read subset line entry"); + } + total += columnEntries[i].length; + } + + ret = new byte[total]; + int offset = 0; + for (int i = 0; i < subsetcount; i++) { + System.arraycopy(columnEntries[i], 0, ret, offset, columnEntries[i].length); + offset += columnEntries[i].length; + } + dbgLog.fine("line: "+new String(ret)); + return ret; + } + + + public void close() { + if (fileChannel != null) { + try { + fileChannel.close(); + } catch (IOException ioe) { + // don't care. + } + } + } + + public void subsetFile(String infile, String outfile, Set<Integer> columns, Long numCases) { + subsetFile(infile, outfile, columns, numCases, "\t"); + } + + public void subsetFile(String infile, String outfile, Set<Integer> columns, Long numCases, + String delimiter) { + try { + subsetFile(new FileInputStream(new File(infile)), outfile, columns, numCases, delimiter); + } catch (IOException ex) { + throw new RuntimeException("Could not open file "+infile); + } + } + + + public void subsetFile(InputStream in, String outfile, Set<Integer> columns, Long numCases, + String delimiter) { + try { + Scanner scanner = new Scanner(in); + scanner.useDelimiter("\\n"); + + BufferedWriter out = new BufferedWriter(new FileWriter(outfile)); + for (long caseIndex = 0; caseIndex < numCases; caseIndex++) { + if (scanner.hasNext()) { + String[] line = (scanner.next()).split(delimiter,-1); + List<String> ln = new ArrayList<String>(); + for (Integer i : columns) { + ln.add(line[i]); + } + out.write(StringUtils.join(ln,"\t")+"\n"); + } else 
{ + throw new RuntimeException("Tab file has fewer rows than the determined number of cases."); + } + } + + while (scanner.hasNext()) { + if (!"".equals(scanner.next()) ) { + throw new RuntimeException("Tab file has extra nonempty rows than the determined number of cases."); + + } + } + + scanner.close(); + out.close(); + + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + } + + /* + * Straightforward method for subsetting a column; inefficient on large + * files, OK to use on small files: + */ + + public static Double[] subsetDoubleVector(InputStream in, int column, int numCases) { + Double[] retVector = new Double[numCases]; + Scanner scanner = new Scanner(in); + scanner.useDelimiter("\\n"); + + for (int caseIndex = 0; caseIndex < numCases; caseIndex++) { + if (scanner.hasNext()) { + String[] line = (scanner.next()).split("\t", -1); + try { + retVector[caseIndex] = new Double(line[column]); + } catch (NumberFormatException ex) { + retVector[caseIndex] = null; // missing value + } + } else { + scanner.close(); + throw new RuntimeException("Tab file has fewer rows than the stored number of cases!"); + } + } + + int tailIndex = numCases; + while (scanner.hasNext()) { + String nextLine = scanner.next(); + if (!"".equals(nextLine)) { + scanner.close(); + throw new RuntimeException("Column "+column+": tab file has more nonempty rows than the stored number of cases ("+numCases+")! current index: "+tailIndex+", line: "+nextLine); + } + tailIndex++; + } + + scanner.close(); + return retVector; + + } + + /* + * Straightforward method for subsetting a tab-delimited data file, extracting + * all the columns representing continuous variables and returning them as + * a 2-dimensional array of Doubles; + * Inefficient on large files, OK to use on small ones. 
+ */ + public static Double[][] subsetDoubleVectors(InputStream in, Set<Integer> columns, int numCases) throws IOException { + Double[][] retVector = new Double[columns.size()][numCases]; + Scanner scanner = new Scanner(in); + scanner.useDelimiter("\\n"); + + for (int caseIndex = 0; caseIndex < numCases; caseIndex++) { + if (scanner.hasNext()) { + String[] line = (scanner.next()).split("\t", -1); + int j = 0; + for (Integer i : columns) { + try { + // TODO: verify that NaN and +-Inf are going to be + // handled correctly here! -- L.A. + // NO, "+-Inf" is not handled correctly; see the + // comment further down below. + retVector[j][caseIndex] = new Double(line[i]); + } catch (NumberFormatException ex) { + retVector[j][caseIndex] = null; // missing value + } + j++; + } + } else { + scanner.close(); + throw new IOException("Tab file has fewer rows than the stored number of cases!"); + } + } + + int tailIndex = numCases; + while (scanner.hasNext()) { + String nextLine = scanner.next(); + if (!"".equals(nextLine)) { + scanner.close(); + throw new IOException("Tab file has more nonempty rows than the stored number of cases ("+numCases+")! current index: "+tailIndex+", line: "+nextLine); + } + tailIndex++; + } + + scanner.close(); + return retVector; + + } + + public String[] subsetStringVector(DataFile datafile, int column) throws IOException { + return (String[])subsetObjectVector(datafile, column, COLUMN_TYPE_STRING); + } + + public Double[] subsetDoubleVector(DataFile datafile, int column) throws IOException { + return (Double[])subsetObjectVector(datafile, column, COLUMN_TYPE_DOUBLE); + } + + public Long[] subsetLongVector(DataFile datafile, int column) throws IOException { + return (Long[])subsetObjectVector(datafile, column, COLUMN_TYPE_LONG); + } + + // Float methods are temporary; + // In normal operations we'll be treating all the floating point types as + // doubles. I need to be able to handle floats for some 4.0 vs 3.* ingest + // tests. -- L.A. 
+ + public Float[] subsetFloatVector(DataFile datafile, int column) throws IOException { + return (Float[])subsetObjectVector(datafile, column, COLUMN_TYPE_FLOAT); + } + + public String[] subsetStringVector(File tabfile, int column, int varcount, int casecount) throws IOException { + return (String[])subsetObjectVector(tabfile, column, varcount, casecount, COLUMN_TYPE_STRING); + } + + public Double[] subsetDoubleVector(File tabfile, int column, int varcount, int casecount) throws IOException { + return (Double[])subsetObjectVector(tabfile, column, varcount, casecount, COLUMN_TYPE_DOUBLE); + } + + public Long[] subsetLongVector(File tabfile, int column, int varcount, int casecount) throws IOException { + return (Long[])subsetObjectVector(tabfile, column, varcount, casecount, COLUMN_TYPE_LONG); + } + + public Float[] subsetFloatVector(File tabfile, int column, int varcount, int casecount) throws IOException { + return (Float[])subsetObjectVector(tabfile, column, varcount, casecount, COLUMN_TYPE_FLOAT); + } + + public Object[] subsetObjectVector(DataFile dataFile, int column, int columntype) throws IOException { + if (!dataFile.isTabularData()) { + throw new IOException("DataFile is not tabular data."); + } + + int varcount = dataFile.getDataTable().getVarQuantity().intValue(); + int casecount = dataFile.getDataTable().getCaseQuantity().intValue(); + + if (column >= varcount) { + throw new IOException("Column "+column+" is out of bounds."); + } + + File tabfile = dataFile.getFileSystemLocation().toFile(); + + if (columntype == COLUMN_TYPE_STRING) { + String filename = dataFile.getFileMetadata().getLabel(); + if (filename != null) { + filename = filename.replaceFirst("^_", ""); + Integer fnumvalue = null; + try { + fnumvalue = new Integer(filename); + } catch (Exception ex){ + fnumvalue = null; + } + if (fnumvalue != null) { + //if ((fnumvalue.intValue() < 112497)) { // && (fnumvalue.intValue() > 60015)) { + if ((fnumvalue.intValue() < 111931)) { // && 
(fnumvalue.intValue() > 60015)) { + if (!(fnumvalue.intValue() == 60007 + || fnumvalue.intValue() == 59997 + || fnumvalue.intValue() == 60015 + || fnumvalue.intValue() == 59948 + || fnumvalue.intValue() == 60012 + || fnumvalue.intValue() == 52585 + || fnumvalue.intValue() == 60005 + || fnumvalue.intValue() == 60002 + || fnumvalue.intValue() == 59954 + || fnumvalue.intValue() == 60008 + || fnumvalue.intValue() == 54972 + || fnumvalue.intValue() == 55010 + || fnumvalue.intValue() == 54996 + || fnumvalue.intValue() == 53527 + || fnumvalue.intValue() == 53546 + || fnumvalue.intValue() == 55002 + || fnumvalue.intValue() == 55006 + || fnumvalue.intValue() == 54998 + || fnumvalue.intValue() == 52552 + // SPSS/SAV cases with similar issue - compat mode must be disabled + //|| fnumvalue.intValue() == 101826 // temporary - tricky file with accents and v. 16... + || fnumvalue.intValue() == 54618 // another SAV file, with long strings... + || fnumvalue.intValue() == 54619 // [same] + || fnumvalue.intValue() == 57983 + || fnumvalue.intValue() == 58262 + || fnumvalue.intValue() == 58288 + || fnumvalue.intValue() == 58656 + || fnumvalue.intValue() == 59144 + // || fnumvalue.intValue() == 69626 [nope!] 
+ )) { + dbgLog.info("\"Old\" file name detected; using \"compatibility mode\" for a character vector subset;"); + return subsetObjectVector(tabfile, column, varcount, casecount, columntype, true); + } + } + } + } + } + + return subsetObjectVector(tabfile, column, varcount, casecount, columntype); + } + + public Object[] subsetObjectVector(File tabfile, int column, int varcount, int casecount, int columntype) throws IOException { + return subsetObjectVector(tabfile, column, varcount, casecount, columntype, false); + } + + + + public Object[] subsetObjectVector(File tabfile, int column, int varcount, int casecount, int columntype, boolean compatmode) throws IOException { + + Object[] retVector = null; + + boolean isString = false; + boolean isDouble = false; + boolean isLong = false; + boolean isFloat = false; + + //Locale loc = new Locale("en", "US"); + + if (columntype == COLUMN_TYPE_STRING) { + isString = true; + retVector = new String[casecount]; + } else if (columntype == COLUMN_TYPE_DOUBLE) { + isDouble = true; + retVector = new Double[casecount]; + } else if (columntype == COLUMN_TYPE_LONG) { + isLong = true; + retVector = new Long[casecount]; + } else if (columntype == COLUMN_TYPE_FLOAT){ + isFloat = true; + retVector = new Float[casecount]; + } else { + throw new IOException("Unsupported column type: "+columntype); + } + + File rotatedImageFile = getRotatedImage(tabfile, varcount, casecount); + long[] columnEndOffsets = extractColumnOffsets(rotatedImageFile, varcount, casecount); + long columnOffset = 0; + long columnLength = 0; + + if (column > 0) { + columnOffset = columnEndOffsets[column - 1]; + columnLength = columnEndOffsets[column] - columnEndOffsets[column - 1]; + } else { + columnOffset = varcount * 8; + columnLength = columnEndOffsets[0] - varcount * 8; + } + + FileChannel fc = (FileChannel.open(Paths.get(rotatedImageFile.getAbsolutePath()), StandardOpenOption.READ)); + fc.position(columnOffset); + int MAX_COLUMN_BUFFER = 8192; + + ByteBuffer in = 
ByteBuffer.allocate(MAX_COLUMN_BUFFER); + + if (columnLength < MAX_COLUMN_BUFFER) { + in.limit((int)(columnLength)); + } + + long bytesRead = 0; + long bytesReadTotal = 0; + int caseindex = 0; + int byteoffset = 0; + byte[] leftover = null; + + while (bytesReadTotal < columnLength) { + bytesRead = fc.read(in); + byte[] columnBytes = in.array(); + int bytecount = 0; + + + while (bytecount < bytesRead) { + if (columnBytes[bytecount] == '\n') { + /* + String token = new String(columnBytes, byteoffset, bytecount-byteoffset, "UTF8"); + + if (leftover != null) { + String leftoverString = new String (leftover, "UTF8"); + token = leftoverString + token; + leftover = null; + } + */ + /* + * Note that the way I was doing it at first - above - + * was not quite the correct way - because I was creating UTF8 + * strings from the leftover bytes, and the bytes in the + * current buffer *separately*; which means, if a multi-byte + * UTF8 character got split in the middle between one buffer + * and the next, both chunks of it would become junk + * characters, on each side! + * The correct way of doing it, of course, is to create a + * merged byte buffer, and then turn it into a UTF8 string. + * -- L.A. 4.0 + */ + String token = null; + + if (leftover == null) { + token = new String(columnBytes, byteoffset, bytecount-byteoffset, "UTF8"); + } else { + byte[] merged = new byte[leftover.length + bytecount-byteoffset]; + + System.arraycopy(leftover, 0, merged, 0, leftover.length); + System.arraycopy(columnBytes, byteoffset, merged, leftover.length, bytecount-byteoffset); + token = new String (merged, "UTF8"); + leftover = null; + merged = null; + } + + if (isString) { + if ("".equals(token)) { + // An empty string is a string missing value! + // An empty string in quotes is an empty string! 
+ retVector[caseindex] = null; + } else { + // Strip the outer quotes: + token = token.replaceFirst("^\\\"", ""); + token = token.replaceFirst("\\\"$", ""); + + // We need to restore the special characters that + // are stored in tab files escaped - quotes, new lines + // and tabs. Before we do that however, we need to + // take care of any escaped backslashes stored in + // the tab file. I.e., "foo\t" should be transformed + // to "foo<TAB>"; but "foo\\t" should be transformed + // to "foo\t". This way new lines and tabs that were + // already escaped in the original data are not + // going to be transformed to unescaped tab and + // new line characters! + + String[] splitTokens = token.split(Matcher.quoteReplacement("\\\\"), -2); + + // (note that it's important to use the 2-argument version + // of String.split(), and set the limit argument to a + // negative value; otherwise any trailing backslashes + // are lost.) + + for (int i = 0; i < splitTokens.length; i++) { + splitTokens[i] = splitTokens[i].replaceAll(Matcher.quoteReplacement("\\\""), "\""); + splitTokens[i] = splitTokens[i].replaceAll(Matcher.quoteReplacement("\\t"), "\t"); + splitTokens[i] = splitTokens[i].replaceAll(Matcher.quoteReplacement("\\n"), "\n"); + splitTokens[i] = splitTokens[i].replaceAll(Matcher.quoteReplacement("\\r"), "\r"); + } + // TODO: + // Make (some of?) the above optional; for ex., we + // do need to restore the newlines when calculating UNFs; + // But if we are subsetting these vectors in order to + // create a new tab-delimited file, they will + // actually break things! -- L.A. Jul. 28 2014 + + token = StringUtils.join(splitTokens, '\\'); + + // "compatibility mode" - a hack, to be able to produce + // unfs identical to those produced by the "early" + // unf5 jar; will be removed in production 4.0. + // -- L.A. (TODO: ...) + if (compatmode && !"".equals(token)) { + if (token.length() > 128) { + if ("".equals(token.trim())) { + // don't ask... 
+ token = token.substring(0, 129); + } else { + token = token.substring(0, 128); + //token = String.format(loc, "%.128s", token); + token = token.trim(); + //dbgLog.info("formatted and trimmed: "+token); + } + } else { + if ("".equals(token.trim())) { + // again, don't ask; + // - this replicates some bugginness + // that happens inside unf5; + token = "null"; + } else { + token = token.trim(); + } + } + } + + retVector[caseindex] = token; + } + } else if (isDouble) { + try { + // TODO: verify that NaN and +-Inf are + // handled correctly here! -- L.A. + // Verified: new Double("nan") works correctly, + // resulting in Double.NaN; + // Double("[+-]Inf") doesn't work however; + // (the constructor appears to be expecting it + // to be spelled as "Infinity", "-Infinity", etc. + if ("inf".equalsIgnoreCase(token) || "+inf".equalsIgnoreCase(token)) { + retVector[caseindex] = java.lang.Double.POSITIVE_INFINITY; + } else if ("-inf".equalsIgnoreCase(token)) { + retVector[caseindex] = java.lang.Double.NEGATIVE_INFINITY; + } else if (token == null || token.equals("")) { + // missing value: + retVector[caseindex] = null; + } else { + retVector[caseindex] = new Double(token); + } + } catch (NumberFormatException ex) { + dbgLog.warning("NumberFormatException thrown for "+token+" as Double"); + + retVector[caseindex] = null; // missing value + // TODO: ? 
+ } + } else if (isLong) { + try { + retVector[caseindex] = new Long(token); + } catch (NumberFormatException ex) { + retVector[caseindex] = null; // assume missing value + } + } else if (isFloat) { + try { + if ("inf".equalsIgnoreCase(token) || "+inf".equalsIgnoreCase(token)) { + retVector[caseindex] = java.lang.Float.POSITIVE_INFINITY; + } else if ("-inf".equalsIgnoreCase(token)) { + retVector[caseindex] = java.lang.Float.NEGATIVE_INFINITY; + } else if (token == null || token.equals("")) { + // missing value: + retVector[caseindex] = null; + } else { + retVector[caseindex] = new Float(token); + } + } catch (NumberFormatException ex) { + dbgLog.warning("NumberFormatException thrown for "+token+" as Float"); + retVector[caseindex] = null; // assume missing value (TODO: ?) + } + } + caseindex++; + + if (bytecount == bytesRead - 1) { + byteoffset = 0; + } else { + byteoffset = bytecount + 1; + } + } else { + if (bytecount == bytesRead - 1) { + // We've reached the end of the buffer; + // This means we'll save whatever unused bytes left in + // it - i.e., the bytes between the last new line + // encountered and the end - in the leftover buffer. + + // *EXCEPT*, there may be a case of a very long String + // that is actually longer than MAX_COLUMN_BUFFER, in + // which case it is possible that we've read through + // an entire buffer of bytes without finding any + // new lines... in this case we may need to add this + // entire byte buffer to an already existing leftover + // buffer! 
+ if (leftover == null) { + leftover = new byte[(int)bytesRead - byteoffset]; + System.arraycopy(columnBytes, byteoffset, leftover, 0, (int)bytesRead - byteoffset); + } else { + if (byteoffset != 0) { + throw new IOException("Reached the end of the byte buffer, with some leftover left from the last read; yet the offset is not zero!"); + } + byte[] merged = new byte[leftover.length + (int)bytesRead]; + + System.arraycopy(leftover, 0, merged, 0, leftover.length); + System.arraycopy(columnBytes, byteoffset, merged, leftover.length, (int)bytesRead); + //leftover = null; + leftover = merged; + merged = null; + } + byteoffset = 0; + + } + } + bytecount++; + } + + bytesReadTotal += bytesRead; + in.clear(); + if (columnLength - bytesReadTotal < MAX_COLUMN_BUFFER) { + in.limit((int)(columnLength - bytesReadTotal)); + } + } + + fc.close(); + + if (caseindex != casecount) { + throw new IOException("Faile to read "+casecount+" tokens for column "+column); + //System.out.println("read "+caseindex+" tokens instead of expected "+casecount+"."); + } + + return retVector; + } + + private long[] extractColumnOffsets (File rotatedImageFile, int varcount, int casecount) throws IOException { + BufferedInputStream rotfileStream = new BufferedInputStream(new FileInputStream(rotatedImageFile)); + + byte[] offsetHeader = new byte[varcount * 8]; + long[] byteOffsets = new long[varcount]; + + + int readlen = rotfileStream.read(offsetHeader); + + if (readlen != varcount * 8) { + throw new IOException ("Could not read "+varcount*8+" header bytes from the rotated file."); + } + + for (int varindex = 0; varindex < varcount; varindex++) { + byte[] offsetBytes = new byte[8]; + System.arraycopy(offsetHeader, varindex*8, offsetBytes, 0, 8); + + ByteBuffer offsetByteBuffer = ByteBuffer.wrap(offsetBytes); + byteOffsets[varindex] = offsetByteBuffer.getLong(); + + //System.out.println(byteOffsets[varindex]); + } + + rotfileStream.close(); + + return byteOffsets; + } + + private File getRotatedImage(File 
tabfile, int varcount, int casecount) throws IOException {
        // (continues the signature of getRotatedImage(File tabfile, ...) opened above)
        // Returns the cached rotated (column-major) image of the tab file if one
        // already exists next to it (same path + ".90d"); otherwise generates it.
        String fileName = tabfile.getAbsolutePath();
        String rotatedImageFileName = fileName + ".90d";
        File rotatedImageFile = new File(rotatedImageFileName);
        if (rotatedImageFile.exists()) {
            //System.out.println("Image already exists!");
            return rotatedImageFile;
        }

        return generateRotatedImage(tabfile, varcount, casecount);

    }

    // Builds the ".90d" rotated image: reads the tab file row by row, buffering
    // each column's bytes (spilling to a per-column temp file once a buffer
    // exceeds MAX_COLUMN_BUFFER), for later assembly into the column-major file.
    // NOTE(review): method continues past the visible end of this chunk.
    private File generateRotatedImage (File tabfile, int varcount, int casecount) throws IOException {
        // TODO: throw exceptions if bad file, zero varcount, etc. ...

        String fileName = tabfile.getAbsolutePath();
        String rotatedImageFileName = fileName + ".90d";

        int MAX_OUTPUT_STREAMS = 32;
        int MAX_BUFFERED_BYTES = 10 * 1024 * 1024; // 10 MB - for now?
        int MAX_COLUMN_BUFFER = 8 * 1024;

        // offsetHeader will contain the byte offsets of the individual column
        // vectors in the final rotated image file
        byte[] offsetHeader = new byte[varcount * 8];
        int[] bufferedSizes = new int[varcount];      // bytes currently buffered in memory, per column
        long[] cachedfileSizes = new long[varcount];  // bytes already spilled to the temp file, per column
        File[] columnTempFiles = new File[varcount];  // spill file per column (created lazily)

        for (int i = 0; i < varcount; i++) {
            bufferedSizes[i] = 0;
            cachedfileSizes[i] = 0;
        }

        // TODO: adjust MAX_COLUMN_BUFFER here, so that the total size is
        // no more than MAX_BUFFERED_BYTES (but no less than 1024 maybe?)

        byte[][] bufferedColumns = new byte [varcount][MAX_COLUMN_BUFFER];

        // read the tab-delimited file:

        FileInputStream tabfileStream = new FileInputStream(tabfile);

        Scanner scanner = new Scanner(tabfileStream);
        scanner.useDelimiter("\\n");

        for (int caseindex = 0; caseindex < casecount; caseindex++) {
            if (scanner.hasNext()) {
                String[] line = (scanner.next()).split("\t", -1);
                // TODO: throw an exception if there are fewer tab-delimited
                // tokens than the number of variables specified.
+ String token = ""; + int tokensize = 0; + for (int varindex = 0; varindex < varcount; varindex++) { + // TODO: figure out the safest way to convert strings to + // bytes here. Is it going to be safer to use getBytes("UTF8")? + // we are already making the assumption that the values + // in the tab file are in UTF8. -- L.A. + token = line[varindex] + "\n"; + tokensize = token.getBytes().length; + if (bufferedSizes[varindex]+tokensize > MAX_COLUMN_BUFFER) { + // fill the buffer and dump its contents into the temp file: + // (do note that there may be *several* MAX_COLUMN_BUFFERs + // worth of bytes in the token!) + + int tokenoffset = 0; + + if (bufferedSizes[varindex] != MAX_COLUMN_BUFFER) { + tokenoffset = MAX_COLUMN_BUFFER-bufferedSizes[varindex]; + System.arraycopy(token.getBytes(), 0, bufferedColumns[varindex], bufferedSizes[varindex], tokenoffset); + } // (otherwise the buffer is already full, and we should + // simply dump it into the temp file, without adding any + // extra bytes to it) + + File bufferTempFile = columnTempFiles[varindex]; + if (bufferTempFile == null) { + bufferTempFile = File.createTempFile("columnBufferFile", "bytes"); + columnTempFiles[varindex] = bufferTempFile; + } + + // *append* the contents of the buffer to the end of the + // temp file, if already exists: + BufferedOutputStream outputStream = new BufferedOutputStream(new FileOutputStream (bufferTempFile, true)); + outputStream.write(bufferedColumns[varindex], 0, MAX_COLUMN_BUFFER); + cachedfileSizes[varindex] += MAX_COLUMN_BUFFER; + + // keep writing MAX_COLUMN_BUFFER-size chunks of bytes into + // the temp file, for as long as there's more than MAX_COLUMN_BUFFER + // bytes left in the token: + + while (tokensize - tokenoffset > MAX_COLUMN_BUFFER) { + outputStream.write(token.getBytes(), tokenoffset, MAX_COLUMN_BUFFER); + cachedfileSizes[varindex] += MAX_COLUMN_BUFFER; + tokenoffset += MAX_COLUMN_BUFFER; + } + + outputStream.close(); + + // buffer the remaining bytes and reset the 
buffered + // byte counter: + + System.arraycopy(token.getBytes(), + tokenoffset, + bufferedColumns[varindex], + 0, + tokensize - tokenoffset); + + bufferedSizes[varindex] = tokensize - tokenoffset; + + } else { + // continue buffering + System.arraycopy(token.getBytes(), 0, bufferedColumns[varindex], bufferedSizes[varindex], tokensize); + bufferedSizes[varindex] += tokensize; + } + } + } else { + scanner.close(); + throw new IOException("Tab file has fewer rows than the stored number of cases!"); + } + + } + + // OK, we've created the individual byte vectors of the tab file columns; + // they may be partially saved in temp files and/or in memory. + // We now need to go through all these buffers and create the final + // rotated image file. + + BufferedOutputStream finalOut = new BufferedOutputStream(new FileOutputStream (new File(rotatedImageFileName))); + + // but first we should create the offset header and write it out into + // the final file; because it should be at the head, doh! + + long columnOffset = varcount * 8; + // (this is the offset of the first column vector; it is equal to the + // size of the offset header, i.e. 
varcount * 8 bytes) + + for (int varindex = 0; varindex < varcount; varindex++) { + long totalColumnBytes = cachedfileSizes[varindex] + bufferedSizes[varindex]; + columnOffset+=totalColumnBytes; + //totalColumnBytes; + byte[] columnOffsetByteArray = ByteBuffer.allocate(8).putLong(columnOffset).array(); + System.arraycopy(columnOffsetByteArray, 0, offsetHeader, varindex * 8, 8); + } + + finalOut.write(offsetHeader, 0, varcount * 8); + + for (int varindex = 0; varindex < varcount; varindex++) { + long cachedBytesRead = 0; + + // check if there is a cached temp file: + + File cachedTempFile = columnTempFiles[varindex]; + if (cachedTempFile != null) { + byte[] cachedBytes = new byte[MAX_COLUMN_BUFFER]; + BufferedInputStream cachedIn = new BufferedInputStream(new FileInputStream(cachedTempFile)); + int readlen = 0; + while ((readlen = cachedIn.read(cachedBytes)) > -1) { + finalOut.write(cachedBytes, 0, readlen); + cachedBytesRead += readlen; + } + cachedIn.close(); + // delete the temp file: + cachedTempFile.delete(); + + } + + if (cachedBytesRead != cachedfileSizes[varindex]) { + finalOut.close(); + throw new IOException("Could not read the correct number of bytes cached for column "+varindex+"; "+ + cachedfileSizes[varindex] + " bytes expected, "+cachedBytesRead+" read."); + } + + // then check if there are any bytes buffered for this column: + + if (bufferedSizes[varindex] > 0) { + finalOut.write(bufferedColumns[varindex], 0, bufferedSizes[varindex]); + } + + } + + finalOut.close(); + return new File(rotatedImageFileName); + + } + + /* + * Test method for taking a "rotated" image, and reversing it, reassembling + * all the columns in the original order. Which should result in a file + * byte-for-byte identical file to the original tab-delimited version. + * + * (do note that this method is not efficiently implemented; it's only + * being used for experiments so far, to confirm the accuracy of the + * accuracy of generateRotatedImage(). 
It should not be used for any + * practical means in the application!) + */ + private void reverseRotatedImage (File rotfile, int varcount, int casecount) throws IOException { + // open the file, read in the offset header: + BufferedInputStream rotfileStream = new BufferedInputStream(new FileInputStream(rotfile)); + + byte[] offsetHeader = new byte[varcount * 8]; + long[] byteOffsets = new long[varcount]; + + int readlen = rotfileStream.read(offsetHeader); + + if (readlen != varcount * 8) { + throw new IOException ("Could not read "+varcount*8+" header bytes from the rotated file."); + } + + for (int varindex = 0; varindex < varcount; varindex++) { + byte[] offsetBytes = new byte[8]; + System.arraycopy(offsetHeader, varindex*8, offsetBytes, 0, 8); + + ByteBuffer offsetByteBuffer = ByteBuffer.wrap(offsetBytes); + byteOffsets[varindex] = offsetByteBuffer.getLong(); + + //System.out.println(byteOffsets[varindex]); + } + + String [][] reversedMatrix = new String[casecount][varcount]; + + long offset = varcount * 8; + byte[] columnBytes; + + for (int varindex = 0; varindex < varcount; varindex++) { + long columnLength = byteOffsets[varindex] - offset; + + + + columnBytes = new byte[(int)columnLength]; + readlen = rotfileStream.read(columnBytes); + + if (readlen != columnLength) { + throw new IOException ("Could not read "+columnBytes+" bytes for column "+varindex); + } + /* + String columnString = new String(columnBytes); + //System.out.print(columnString); + String[] values = columnString.split("\n", -1); + + if (values.length < casecount) { + throw new IOException("count mismatch: "+values.length+" tokens found for column "+varindex); + } + + for (int caseindex = 0; caseindex < casecount; caseindex++) { + reversedMatrix[caseindex][varindex] = values[caseindex]; + }*/ + + int bytecount = 0; + int byteoffset = 0; + int caseindex = 0; + //System.out.println("generating value vector for column "+varindex); + while (bytecount < columnLength) { + if (columnBytes[bytecount] 
== '\n') { + String token = new String(columnBytes, byteoffset, bytecount-byteoffset); + reversedMatrix[caseindex++][varindex] = token; + byteoffset = bytecount + 1; + } + bytecount++; + } + + if (caseindex != casecount) { + throw new IOException("count mismatch: "+caseindex+" tokens found for column "+varindex); + } + offset = byteOffsets[varindex]; + } + + for (int caseindex = 0; caseindex < casecount; caseindex++) { + for (int varindex = 0; varindex < varcount; varindex++) { + System.out.print(reversedMatrix[caseindex][varindex]); + if (varindex < varcount-1) { + System.out.print("\t"); + } else { + System.out.print("\n"); + } + } + } + + rotfileStream.close(); + + + } + + /** + * main() method, for testing + * usage: java edu.harvard.iq.dataverse.dataaccess.TabularSubsetGenerator testfile.tab varcount casecount column type + * make sure the CLASSPATH contains ... + * + */ + + public static void main(String[] args) { + + String tabFileName = args[0]; + int varcount = new Integer(args[1]).intValue(); + int casecount = new Integer(args[2]).intValue(); + int column = new Integer(args[3]).intValue(); + String type = args[4]; + + File tabFile = new File(tabFileName); + File rotatedImageFile = null; + + TabularSubsetGenerator subsetGenerator = new TabularSubsetGenerator(); + + /* + try { + rotatedImageFile = subsetGenerator.getRotatedImage(tabFile, varcount, casecount); + } catch (IOException ex) { + System.out.println(ex.getMessage()); + } + */ + + //System.out.println("\nFinished generating \"rotated\" column image file."); + + //System.out.println("\nOffsets:"); + + MathContext doubleMathContext = new MathContext(15, RoundingMode.HALF_EVEN); + String FORMAT_IEEE754 = "%+#.15e"; + + try { + //subsetGenerator.reverseRotatedImage(rotatedImageFile, varcount, casecount); + //String[] columns = subsetGenerator.subsetStringVector(tabFile, column, varcount, casecount); + if ("string".equals(type)) { + String[] columns = subsetGenerator.subsetStringVector(tabFile, column, 
varcount, casecount); + for (int i = 0; i < casecount; i++) { + System.out.println(columns[i]); + } + } else { + + Double[] columns = subsetGenerator.subsetDoubleVector(tabFile, column, varcount, casecount); + for (int i = 0; i < casecount; i++) { + if (columns[i] != null) { + BigDecimal outBigDecimal = new BigDecimal(columns[i], doubleMathContext); + System.out.println(String.format(FORMAT_IEEE754, outBigDecimal)); + } else { + System.out.println("NA"); + } + //System.out.println(columns[i]); + } + } + } catch (IOException ex) { + System.out.println(ex.getMessage()); + } + } +} + +
