Mercurial > hg > LGDataverses
comparison src/main/java/edu/harvard/iq/dataverse/dataaccess/TabularSubsetGenerator.java @ 10:a50cf11e5178
Rewrite LGDataverse completely, upgrading to Dataverse 4.0
| author | Zoe Hong <zhong@mpiwg-berlin.mpg.de> |
|---|---|
| date | Tue, 08 Sep 2015 17:00:21 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 9:5926d6419569 | 10:a50cf11e5178 |
|---|---|
| 1 /* | |
| 2 Copyright (C) 2005-2012, by the President and Fellows of Harvard College. | |
| 3 | |
| 4 Licensed under the Apache License, Version 2.0 (the "License"); | |
| 5 you may not use this file except in compliance with the License. | |
| 6 You may obtain a copy of the License at | |
| 7 | |
| 8 http://www.apache.org/licenses/LICENSE-2.0 | |
| 9 | |
| 10 Unless required by applicable law or agreed to in writing, software | |
| 11 distributed under the License is distributed on an "AS IS" BASIS, | |
| 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 13 See the License for the specific language governing permissions and | |
| 14 limitations under the License. | |
| 15 | |
| 16 Dataverse Network - A web application to share, preserve and analyze research data. | |
| 17 Developed at the Institute for Quantitative Social Science, Harvard University. | |
| 18 Version 3.0. | |
| 19 */ | |
| 20 | |
| 21 package edu.harvard.iq.dataverse.dataaccess; | |
| 22 | |
| 23 import edu.harvard.iq.dataverse.DataFile; | |
| 24 import edu.harvard.iq.dataverse.datavariable.DataVariable; | |
| 25 import java.util.*; | |
| 26 import java.util.Scanner; | |
| 27 import java.util.logging.*; | |
| 28 import java.io.*; | |
| 29 import java.io.FileNotFoundException; | |
| 30 import java.math.BigDecimal; | |
| 31 import java.math.MathContext; | |
| 32 import java.math.RoundingMode; | |
| 33 import java.nio.ByteBuffer; | |
| 34 import java.nio.channels.FileChannel; | |
| 35 import java.nio.file.Paths; | |
| 36 import java.nio.file.StandardOpenOption; | |
| 37 import java.util.regex.Matcher; | |
| 38 | |
| 39 | |
| 40 import org.apache.commons.lang.*; | |
| 41 | |
| 42 | |
| 43 /** | |
| 44 * | |
| 45 * @author Leonid Andreev | |
| 46 * original author: | |
| 47 * @author a.sone | |
| 48 */ | |
| 49 | |
public class TabularSubsetGenerator implements SubsetGenerator {

    // Shared logger for the dataaccess package.
    private static Logger dbgLog = Logger.getLogger(TabularSubsetGenerator.class.getPackage().getName());

    // Column type codes used by the subsetObjectVector() family of methods:
    private static int COLUMN_TYPE_STRING = 1;
    private static int COLUMN_TYPE_LONG = 2;
    private static int COLUMN_TYPE_DOUBLE = 3;
    private static int COLUMN_TYPE_FLOAT = 4;

    // Size, in bytes, of the per-column read buffer:
    private static int MAX_COLUMN_BUFFER = 8192;

    // Channel on the "rotated" (column-major) image of the tab file:
    private FileChannel fileChannel = null;

    private int varcount;    // number of variables (columns) in the tab file
    private int casecount;   // number of cases (rows) in the tab file
    private int subsetcount; // number of columns requested in this subset

    // Most recently read entry (one case value) per subset column:
    private byte[][] columnEntries = null;


    // Per-subset-column read state over the rotated image:
    private ByteBuffer[] columnByteBuffers;
    private int[] columnBufferSizes;   // bytes currently held in each buffer
    private int[] columnBufferOffsets; // current read position within each buffer

    private long[] columnStartOffsets; // absolute start offset of each column
    private long[] columnTotalOffsets; // bytes of each column consumed so far
    private long[] columnTotalLengths; // total byte length of each column

    /**
     * No-op constructor; sufficient when only the static/stateless
     * subsetting methods are used.
     */
    public TabularSubsetGenerator() {

    }
| 81 | |
    /**
     * Creates a subset generator for the given tabular DataFile and the
     * requested list of variables, opening the "rotated" (column-major)
     * image of the tab file and initializing per-column read state.
     *
     * @param datafile  a tabular DataFile with a populated DataTable
     * @param variables the variables (columns) requested for the subset;
     *                  must be non-empty and no larger than the column count
     * @throws IOException if the file is not tabular, the request is
     *                     invalid, or the rotated image cannot be read
     */
    public TabularSubsetGenerator (DataFile datafile, List<DataVariable> variables) throws IOException {
        if (!datafile.isTabularData()) {
            throw new IOException("DataFile is not tabular data.");
        }

        setVarCount(datafile.getDataTable().getVarQuantity().intValue());
        setCaseCount(datafile.getDataTable().getCaseQuantity().intValue());

        // The rotated image stores data column-by-column, so a column subset
        // becomes a sequential read. (Presumably built/cached by
        // getRotatedImage(), which is defined elsewhere — not visible here.)
        File tabfile = datafile.getFileSystemLocation().toFile();
        File rotatedImageFile = getRotatedImage(tabfile, getVarCount(), getCaseCount());
        long[] columnEndOffsets = extractColumnOffsets(rotatedImageFile, getVarCount(), getCaseCount());

        fileChannel = (FileChannel.open(Paths.get(rotatedImageFile.getAbsolutePath()), StandardOpenOption.READ));

        if (variables == null || variables.size() < 1 || variables.size() > getVarCount()) {
            throw new IOException("Illegal number of variables in the subset request");
        }

        subsetcount = variables.size();
        columnTotalOffsets = new long[subsetcount];
        columnTotalLengths = new long[subsetcount];
        columnByteBuffers = new ByteBuffer[subsetcount];



        if (subsetcount == 1) {
            // Single-column subset: no per-column buffering needed, just
            // position the shared channel at the start of that column.
            if (!datafile.getDataTable().getId().equals(variables.get(0).getDataTable().getId())) {
                throw new IOException("Variable in the subset request does not belong to the datafile.");
            }
            dbgLog.fine("single variable subset; setting fileChannel position to "+extractColumnOffset(columnEndOffsets, variables.get(0).getFileOrder()));
            fileChannel.position(extractColumnOffset(columnEndOffsets, variables.get(0).getFileOrder()));
            columnTotalLengths[0] = extractColumnLength(columnEndOffsets, variables.get(0).getFileOrder());
            columnTotalOffsets[0] = 0;
        } else {
            // Multi-column subset: allocate a buffer per requested column and
            // pre-fill each one with the first chunk of its column.
            columnEntries = new byte[subsetcount][];

            columnBufferSizes = new int[subsetcount];
            columnBufferOffsets = new int[subsetcount];
            columnStartOffsets = new long[subsetcount];

            int i = 0;
            for (DataVariable var : variables) {
                if (!datafile.getDataTable().getId().equals(var.getDataTable().getId())) {
                    throw new IOException("Variable in the subset request does not belong to the datafile.");
                }
                columnByteBuffers[i] = ByteBuffer.allocate(MAX_COLUMN_BUFFER);
                columnTotalLengths[i] = extractColumnLength(columnEndOffsets, var.getFileOrder());
                columnStartOffsets[i] = extractColumnOffset(columnEndOffsets, var.getFileOrder());
                // Cap the buffer so we never read into the next column:
                if (columnTotalLengths[i] < MAX_COLUMN_BUFFER) {
                    columnByteBuffers[i].limit((int)columnTotalLengths[i]);
                }
                fileChannel.position(columnStartOffsets[i]);
                columnBufferSizes[i] = fileChannel.read(columnByteBuffers[i]);
                columnBufferOffsets[i] = 0;
                columnTotalOffsets[i] = columnBufferSizes[i];
                i++;
            }
        }
    }
| 141 | |
| 142 private int getVarCount() { | |
| 143 return varcount; | |
| 144 } | |
| 145 | |
| 146 private void setVarCount(int varcount) { | |
| 147 this.varcount = varcount; | |
| 148 } | |
| 149 | |
| 150 private int getCaseCount() { | |
| 151 return casecount; | |
| 152 } | |
| 153 | |
| 154 private void setCaseCount(int casecount) { | |
| 155 this.casecount = casecount; | |
| 156 } | |
| 157 | |
| 158 | |
| 159 /* | |
| 160 * Note that this method operates on the *absolute* column number, i.e. | |
| 161 * the number of the physical column in the tabular file. This is stored | |
| 162 * in DataVariable.FileOrder. | |
| 163 * This "column number" should not be confused with the number of column | |
| 164 * in the subset request; a user can request any number of variable | |
| 165 * columns, in an order that doesn't have to follow the physical order | |
| 166 * of the columns in the file. | |
| 167 */ | |
| 168 private long extractColumnOffset(long[] columnEndOffsets, int column) throws IOException { | |
| 169 if (columnEndOffsets == null || columnEndOffsets.length <= column) { | |
| 170 throw new IOException("Offsets table not initialized; or column out of bounds."); | |
| 171 } | |
| 172 long columnOffset; | |
| 173 | |
| 174 if (column > 0) { | |
| 175 columnOffset = columnEndOffsets[column - 1]; | |
| 176 } else { | |
| 177 columnOffset = getVarCount() * 8; | |
| 178 } | |
| 179 return columnOffset; | |
| 180 } | |
| 181 | |
| 182 /* | |
| 183 * See the comment for the method above. | |
| 184 */ | |
| 185 private long extractColumnLength(long[] columnEndOffsets, int column) throws IOException { | |
| 186 if (columnEndOffsets == null || columnEndOffsets.length <= column) { | |
| 187 throw new IOException("Offsets table not initialized; or column out of bounds."); | |
| 188 } | |
| 189 long columnLength; | |
| 190 | |
| 191 if (column > 0) { | |
| 192 columnLength = columnEndOffsets[column] - columnEndOffsets[column - 1]; | |
| 193 } else { | |
| 194 columnLength = columnEndOffsets[0] - varcount * 8; | |
| 195 } | |
| 196 | |
| 197 return columnLength; | |
| 198 } | |
| 199 | |
| 200 | |
    /*
     * Refills the read buffer for the given subset column from the rotated
     * image file and advances that column's total offset. Assumes the
     * previously buffered bytes have been fully consumed.
     * NOTE(review): fileChannel.read() can return -1 at EOF, which would be
     * added to columnTotalOffsets here; callers guard against this by
     * checking columnBufferSizes[column] < 1 after calling — confirm.
     */
    private void bufferMoreColumnBytes(int column) throws IOException {
        if (columnTotalOffsets[column] >= columnTotalLengths[column]) {
            throw new IOException("attempt to buffer bytes past the column boundary");
        }
        // Position the shared channel at the next unread byte of this column:
        fileChannel.position(columnStartOffsets[column] + columnTotalOffsets[column]);

        columnByteBuffers[column].clear();
        // Near the end of the column, shrink the buffer limit so we never
        // read bytes that belong to the next column:
        if (columnTotalLengths[column] < columnTotalOffsets[column] + MAX_COLUMN_BUFFER) {
            dbgLog.fine("Limiting the buffer to "+(columnTotalLengths[column] - columnTotalOffsets[column])+" bytes");
            columnByteBuffers[column].limit((int) (columnTotalLengths[column] - columnTotalOffsets[column]));
        }
        columnBufferSizes[column] = fileChannel.read(columnByteBuffers[column]);
        dbgLog.fine("Read "+columnBufferSizes[column]+" bytes for subset column "+column);
        columnBufferOffsets[column] = 0;
        columnTotalOffsets[column] += columnBufferSizes[column];
    }
| 217 | |
| 218 /* | |
| 219 do not use this method! | |
| 220 there's a high potential for the "UTF8 character split between buffers" error! | |
| 221 public String readColumnEntry(int column) { | |
| 222 String ret = null; | |
| 223 int currentbyte; | |
| 224 | |
| 225 if (columnBufferOffsets[column] >= columnBufferSizes[column]) { | |
| 226 try { | |
| 227 bufferMoreColumnBytes(column); | |
| 228 } catch (IOException ioe) { | |
| 229 return null; | |
| 230 } | |
| 231 } | |
| 232 | |
| 233 currentbyte = columnBufferOffsets[column]; | |
| 234 try { | |
| 235 while (columnByteBuffers[column].array()[currentbyte] != '\n') { | |
| 236 currentbyte++; | |
| 237 if (currentbyte == columnBufferSizes[column]) { | |
| 238 // save the leftover: | |
| 239 if (ret == null) { | |
| 240 ret = new String(columnByteBuffers[column].array(), columnBufferOffsets[column], columnBufferSizes[column] - columnBufferOffsets[column], "UTF8"); | |
| 241 } else { | |
| 242 ret = ret.concat(new String(columnByteBuffers[column].array(), columnBufferOffsets[column], columnBufferSizes[column] - columnBufferOffsets[column], "UTF8")); | |
| 243 } | |
| 244 // read more bytes: | |
| 245 bufferMoreColumnBytes(column); | |
| 246 currentbyte = 0; | |
| 247 } | |
| 248 } | |
| 249 | |
| 250 // presumably, we have found our '\n': | |
| 251 if (ret == null) { | |
| 252 ret = new String(columnByteBuffers[column].array(), columnBufferOffsets[column], currentbyte - columnBufferOffsets[column], "UTF8"); | |
| 253 } else { | |
| 254 ret = ret.concat(new String(columnByteBuffers[column].array(), columnBufferOffsets[column], currentbyte - columnBufferOffsets[column], "UTF8")); | |
| 255 } | |
| 256 | |
| 257 } catch (IOException ioe) { | |
| 258 return null; | |
| 259 } | |
| 260 | |
| 261 columnBufferOffsets[column] += (currentbyte + 1); | |
| 262 | |
| 263 return ret; | |
| 264 } | |
| 265 */ | |
| 266 | |
    /**
     * Reads the next entry (one newline-terminated case value) of the given
     * subset column, with the terminating '\n' replaced by '\t' for all but
     * the last subset column (suitable for assembling tab-delimited lines).
     *
     * @return the entry bytes, or null on error or end of column
     */
    public byte[] readColumnEntryBytes(int column) {
        return readColumnEntryBytes(column, true);
    }
| 270 | |
| 271 | |
| 272 public byte[] readColumnEntryBytes(int column, boolean addTabs) { | |
| 273 byte[] leftover = null; | |
| 274 byte[] ret = null; | |
| 275 | |
| 276 if (columnBufferOffsets[column] >= columnBufferSizes[column]) { | |
| 277 try { | |
| 278 bufferMoreColumnBytes(column); | |
| 279 if (columnBufferSizes[column] < 1) { | |
| 280 return null; | |
| 281 } | |
| 282 } catch (IOException ioe) { | |
| 283 return null; | |
| 284 } | |
| 285 } | |
| 286 | |
| 287 int byteindex = columnBufferOffsets[column]; | |
| 288 try { | |
| 289 while (columnByteBuffers[column].array()[byteindex] != '\n') { | |
| 290 byteindex++; | |
| 291 if (byteindex == columnBufferSizes[column]) { | |
| 292 // save the leftover: | |
| 293 if (leftover == null) { | |
| 294 leftover = new byte[columnBufferSizes[column] - columnBufferOffsets[column]]; | |
| 295 System.arraycopy(columnByteBuffers[column].array(), columnBufferOffsets[column], leftover, 0, columnBufferSizes[column] - columnBufferOffsets[column]); | |
| 296 } else { | |
| 297 byte[] merged = new byte[leftover.length + columnBufferSizes[column]]; | |
| 298 | |
| 299 System.arraycopy(leftover, 0, merged, 0, leftover.length); | |
| 300 System.arraycopy(columnByteBuffers[column].array(), 0, merged, leftover.length, columnBufferSizes[column]); | |
| 301 leftover = merged; | |
| 302 merged = null; | |
| 303 } | |
| 304 // read more bytes: | |
| 305 bufferMoreColumnBytes(column); | |
| 306 if (columnBufferSizes[column] < 1) { | |
| 307 return null; | |
| 308 } | |
| 309 byteindex = 0; | |
| 310 } | |
| 311 } | |
| 312 | |
| 313 // presumably, we have found our '\n': | |
| 314 if (leftover == null) { | |
| 315 ret = new byte[byteindex - columnBufferOffsets[column] + 1]; | |
| 316 System.arraycopy(columnByteBuffers[column].array(), columnBufferOffsets[column], ret, 0, byteindex - columnBufferOffsets[column] + 1); | |
| 317 } else { | |
| 318 ret = new byte[leftover.length + byteindex + 1]; | |
| 319 System.arraycopy(leftover, 0, ret, 0, leftover.length); | |
| 320 System.arraycopy(columnByteBuffers[column].array(), 0, ret, leftover.length, byteindex + 1); | |
| 321 } | |
| 322 | |
| 323 } catch (IOException ioe) { | |
| 324 return null; | |
| 325 } | |
| 326 | |
| 327 columnBufferOffsets[column] = (byteindex + 1); | |
| 328 | |
| 329 if (column < columnBufferOffsets.length - 1) { | |
| 330 ret[ret.length - 1] = '\t'; | |
| 331 } | |
| 332 return ret; | |
| 333 } | |
| 334 | |
| 335 public int readSingleColumnSubset(byte[] buffer) throws IOException { | |
| 336 if (columnTotalOffsets[0] == columnTotalLengths[0]) { | |
| 337 return -1; | |
| 338 } | |
| 339 | |
| 340 if (columnByteBuffers[0] == null) { | |
| 341 dbgLog.fine("allocating single column subset buffer."); | |
| 342 columnByteBuffers[0] = ByteBuffer.allocate(buffer.length); | |
| 343 } | |
| 344 | |
| 345 int bytesread = fileChannel.read(columnByteBuffers[0]); | |
| 346 dbgLog.fine("single column subset: read "+bytesread+" bytes."); | |
| 347 if (columnTotalOffsets[0] + bytesread > columnTotalLengths[0]) { | |
| 348 bytesread = (int)(columnTotalLengths[0] - columnTotalOffsets[0]); | |
| 349 } | |
| 350 System.arraycopy(columnByteBuffers[0].array(), 0, buffer, 0, bytesread); | |
| 351 | |
| 352 columnTotalOffsets[0] += bytesread; | |
| 353 columnByteBuffers[0].clear(); | |
| 354 return bytesread > 0 ? bytesread : -1; | |
| 355 } | |
| 356 | |
| 357 | |
| 358 public byte[] readSubsetLineBytes() throws IOException { | |
| 359 byte[] ret = null; | |
| 360 int total = 0; | |
| 361 | |
| 362 for (int i = 0; i < subsetcount; i++) { | |
| 363 columnEntries[i] = readColumnEntryBytes(i); | |
| 364 if (columnEntries[i] == null) { | |
| 365 throw new IOException("Failed to read subset line entry"); | |
| 366 } | |
| 367 total += columnEntries[i].length; | |
| 368 } | |
| 369 | |
| 370 ret = new byte[total]; | |
| 371 int offset = 0; | |
| 372 for (int i = 0; i < subsetcount; i++) { | |
| 373 System.arraycopy(columnEntries[i], 0, ret, offset, columnEntries[i].length); | |
| 374 offset += columnEntries[i].length; | |
| 375 } | |
| 376 dbgLog.fine("line: "+new String(ret)); | |
| 377 return ret; | |
| 378 } | |
| 379 | |
| 380 | |
| 381 public void close() { | |
| 382 if (fileChannel != null) { | |
| 383 try { | |
| 384 fileChannel.close(); | |
| 385 } catch (IOException ioe) { | |
| 386 // don't care. | |
| 387 } | |
| 388 } | |
| 389 } | |
| 390 | |
    /**
     * Convenience overload: subsets the named tab file using the default
     * tab ("\t") delimiter.
     */
    public void subsetFile(String infile, String outfile, Set<Integer> columns, Long numCases) {
        subsetFile(infile, outfile, columns, numCases, "\t");
    }
| 394 | |
| 395 public void subsetFile(String infile, String outfile, Set<Integer> columns, Long numCases, | |
| 396 String delimiter) { | |
| 397 try { | |
| 398 subsetFile(new FileInputStream(new File(infile)), outfile, columns, numCases, delimiter); | |
| 399 } catch (IOException ex) { | |
| 400 throw new RuntimeException("Could not open file "+infile); | |
| 401 } | |
| 402 } | |
| 403 | |
| 404 | |
| 405 public void subsetFile(InputStream in, String outfile, Set<Integer> columns, Long numCases, | |
| 406 String delimiter) { | |
| 407 try { | |
| 408 Scanner scanner = new Scanner(in); | |
| 409 scanner.useDelimiter("\\n"); | |
| 410 | |
| 411 BufferedWriter out = new BufferedWriter(new FileWriter(outfile)); | |
| 412 for (long caseIndex = 0; caseIndex < numCases; caseIndex++) { | |
| 413 if (scanner.hasNext()) { | |
| 414 String[] line = (scanner.next()).split(delimiter,-1); | |
| 415 List<String> ln = new ArrayList<String>(); | |
| 416 for (Integer i : columns) { | |
| 417 ln.add(line[i]); | |
| 418 } | |
| 419 out.write(StringUtils.join(ln,"\t")+"\n"); | |
| 420 } else { | |
| 421 throw new RuntimeException("Tab file has fewer rows than the determined number of cases."); | |
| 422 } | |
| 423 } | |
| 424 | |
| 425 while (scanner.hasNext()) { | |
| 426 if (!"".equals(scanner.next()) ) { | |
| 427 throw new RuntimeException("Tab file has extra nonempty rows than the determined number of cases."); | |
| 428 | |
| 429 } | |
| 430 } | |
| 431 | |
| 432 scanner.close(); | |
| 433 out.close(); | |
| 434 | |
| 435 } catch (FileNotFoundException e) { | |
| 436 e.printStackTrace(); | |
| 437 } catch (IOException e) { | |
| 438 e.printStackTrace(); | |
| 439 } | |
| 440 | |
| 441 } | |
| 442 | |
| 443 /* | |
| 444 * Straightforward method for subsetting a column; inefficient on large | |
| 445 * files, OK to use on small files: | |
| 446 */ | |
| 447 | |
| 448 public static Double[] subsetDoubleVector(InputStream in, int column, int numCases) { | |
| 449 Double[] retVector = new Double[numCases]; | |
| 450 Scanner scanner = new Scanner(in); | |
| 451 scanner.useDelimiter("\\n"); | |
| 452 | |
| 453 for (int caseIndex = 0; caseIndex < numCases; caseIndex++) { | |
| 454 if (scanner.hasNext()) { | |
| 455 String[] line = (scanner.next()).split("\t", -1); | |
| 456 try { | |
| 457 retVector[caseIndex] = new Double(line[column]); | |
| 458 } catch (NumberFormatException ex) { | |
| 459 retVector[caseIndex] = null; // missing value | |
| 460 } | |
| 461 } else { | |
| 462 scanner.close(); | |
| 463 throw new RuntimeException("Tab file has fewer rows than the stored number of cases!"); | |
| 464 } | |
| 465 } | |
| 466 | |
| 467 int tailIndex = numCases; | |
| 468 while (scanner.hasNext()) { | |
| 469 String nextLine = scanner.next(); | |
| 470 if (!"".equals(nextLine)) { | |
| 471 scanner.close(); | |
| 472 throw new RuntimeException("Column "+column+": tab file has more nonempty rows than the stored number of cases ("+numCases+")! current index: "+tailIndex+", line: "+nextLine); | |
| 473 } | |
| 474 tailIndex++; | |
| 475 } | |
| 476 | |
| 477 scanner.close(); | |
| 478 return retVector; | |
| 479 | |
| 480 } | |
| 481 | |
| 482 /* | |
| 483 * Straightforward method for subsetting a tab-delimited data file, extracting | |
| 484 * all the columns representing continuous variables and returning them as | |
| 485 * a 2-dimensional array of Doubles; | |
| 486 * Inefficient on large files, OK to use on small ones. | |
| 487 */ | |
| 488 public static Double[][] subsetDoubleVectors(InputStream in, Set<Integer> columns, int numCases) throws IOException { | |
| 489 Double[][] retVector = new Double[columns.size()][numCases]; | |
| 490 Scanner scanner = new Scanner(in); | |
| 491 scanner.useDelimiter("\\n"); | |
| 492 | |
| 493 for (int caseIndex = 0; caseIndex < numCases; caseIndex++) { | |
| 494 if (scanner.hasNext()) { | |
| 495 String[] line = (scanner.next()).split("\t", -1); | |
| 496 int j = 0; | |
| 497 for (Integer i : columns) { | |
| 498 try { | |
| 499 // TODO: verify that NaN and +-Inf are going to be | |
| 500 // handled correctly here! -- L.A. | |
| 501 // NO, "+-Inf" is not handled correctly; see the | |
| 502 // comment further down below. | |
| 503 retVector[j][caseIndex] = new Double(line[i]); | |
| 504 } catch (NumberFormatException ex) { | |
| 505 retVector[j][caseIndex] = null; // missing value | |
| 506 } | |
| 507 j++; | |
| 508 } | |
| 509 } else { | |
| 510 scanner.close(); | |
| 511 throw new IOException("Tab file has fewer rows than the stored number of cases!"); | |
| 512 } | |
| 513 } | |
| 514 | |
| 515 int tailIndex = numCases; | |
| 516 while (scanner.hasNext()) { | |
| 517 String nextLine = scanner.next(); | |
| 518 if (!"".equals(nextLine)) { | |
| 519 scanner.close(); | |
| 520 throw new IOException("Tab file has more nonempty rows than the stored number of cases ("+numCases+")! current index: "+tailIndex+", line: "+nextLine); | |
| 521 } | |
| 522 tailIndex++; | |
| 523 } | |
| 524 | |
| 525 scanner.close(); | |
| 526 return retVector; | |
| 527 | |
| 528 } | |
| 529 | |
    /** Subsets a character variable (column) from a tabular DataFile. */
    public String[] subsetStringVector(DataFile datafile, int column) throws IOException {
        return (String[])subsetObjectVector(datafile, column, COLUMN_TYPE_STRING);
    }

    /** Subsets a double-precision numeric variable from a tabular DataFile. */
    public Double[] subsetDoubleVector(DataFile datafile, int column) throws IOException {
        return (Double[])subsetObjectVector(datafile, column, COLUMN_TYPE_DOUBLE);
    }

    /** Subsets an integer variable from a tabular DataFile. */
    public Long[] subsetLongVector(DataFile datafile, int column) throws IOException {
        return (Long[])subsetObjectVector(datafile, column, COLUMN_TYPE_LONG);
    }

    // Float methods are temporary;
    // In normal operations we'll be treating all the floating point types as
    // doubles. I need to be able to handle floats for some 4.0 vs 3.* ingest
    // tests. -- L.A.

    /** Subsets a single-precision float variable (temporary; see note above). */
    public Float[] subsetFloatVector(DataFile datafile, int column) throws IOException {
        return (Float[])subsetObjectVector(datafile, column, COLUMN_TYPE_FLOAT);
    }
| 550 | |
    /** Subsets a character column directly from a tab file on disk. */
    public String[] subsetStringVector(File tabfile, int column, int varcount, int casecount) throws IOException {
        return (String[])subsetObjectVector(tabfile, column, varcount, casecount, COLUMN_TYPE_STRING);
    }

    /** Subsets a double-precision numeric column directly from a tab file. */
    public Double[] subsetDoubleVector(File tabfile, int column, int varcount, int casecount) throws IOException {
        return (Double[])subsetObjectVector(tabfile, column, varcount, casecount, COLUMN_TYPE_DOUBLE);
    }

    /** Subsets an integer column directly from a tab file. */
    public Long[] subsetLongVector(File tabfile, int column, int varcount, int casecount) throws IOException {
        return (Long[])subsetObjectVector(tabfile, column, varcount, casecount, COLUMN_TYPE_LONG);
    }

    /** Subsets a single-precision float column directly from a tab file. */
    public Float[] subsetFloatVector(File tabfile, int column, int varcount, int casecount) throws IOException {
        return (Float[])subsetObjectVector(tabfile, column, varcount, casecount, COLUMN_TYPE_FLOAT);
    }
| 566 | |
| 567 public Object[] subsetObjectVector(DataFile dataFile, int column, int columntype) throws IOException { | |
| 568 if (!dataFile.isTabularData()) { | |
| 569 throw new IOException("DataFile is not tabular data."); | |
| 570 } | |
| 571 | |
| 572 int varcount = dataFile.getDataTable().getVarQuantity().intValue(); | |
| 573 int casecount = dataFile.getDataTable().getCaseQuantity().intValue(); | |
| 574 | |
| 575 if (column >= varcount) { | |
| 576 throw new IOException("Column "+column+" is out of bounds."); | |
| 577 } | |
| 578 | |
| 579 File tabfile = dataFile.getFileSystemLocation().toFile(); | |
| 580 | |
| 581 if (columntype == COLUMN_TYPE_STRING) { | |
| 582 String filename = dataFile.getFileMetadata().getLabel(); | |
| 583 if (filename != null) { | |
| 584 filename = filename.replaceFirst("^_", ""); | |
| 585 Integer fnumvalue = null; | |
| 586 try { | |
| 587 fnumvalue = new Integer(filename); | |
| 588 } catch (Exception ex){ | |
| 589 fnumvalue = null; | |
| 590 } | |
| 591 if (fnumvalue != null) { | |
| 592 //if ((fnumvalue.intValue() < 112497)) { // && (fnumvalue.intValue() > 60015)) { | |
| 593 if ((fnumvalue.intValue() < 111931)) { // && (fnumvalue.intValue() > 60015)) { | |
| 594 if (!(fnumvalue.intValue() == 60007 | |
| 595 || fnumvalue.intValue() == 59997 | |
| 596 || fnumvalue.intValue() == 60015 | |
| 597 || fnumvalue.intValue() == 59948 | |
| 598 || fnumvalue.intValue() == 60012 | |
| 599 || fnumvalue.intValue() == 52585 | |
| 600 || fnumvalue.intValue() == 60005 | |
| 601 || fnumvalue.intValue() == 60002 | |
| 602 || fnumvalue.intValue() == 59954 | |
| 603 || fnumvalue.intValue() == 60008 | |
| 604 || fnumvalue.intValue() == 54972 | |
| 605 || fnumvalue.intValue() == 55010 | |
| 606 || fnumvalue.intValue() == 54996 | |
| 607 || fnumvalue.intValue() == 53527 | |
| 608 || fnumvalue.intValue() == 53546 | |
| 609 || fnumvalue.intValue() == 55002 | |
| 610 || fnumvalue.intValue() == 55006 | |
| 611 || fnumvalue.intValue() == 54998 | |
| 612 || fnumvalue.intValue() == 52552 | |
| 613 // SPSS/SAV cases with similar issue - compat mode must be disabled | |
| 614 //|| fnumvalue.intValue() == 101826 // temporary - tricky file with accents and v. 16... | |
| 615 || fnumvalue.intValue() == 54618 // another SAV file, with long strings... | |
| 616 || fnumvalue.intValue() == 54619 // [same] | |
| 617 || fnumvalue.intValue() == 57983 | |
| 618 || fnumvalue.intValue() == 58262 | |
| 619 || fnumvalue.intValue() == 58288 | |
| 620 || fnumvalue.intValue() == 58656 | |
| 621 || fnumvalue.intValue() == 59144 | |
| 622 // || fnumvalue.intValue() == 69626 [nope!] | |
| 623 )) { | |
| 624 dbgLog.info("\"Old\" file name detected; using \"compatibility mode\" for a character vector subset;"); | |
| 625 return subsetObjectVector(tabfile, column, varcount, casecount, columntype, true); | |
| 626 } | |
| 627 } | |
| 628 } | |
| 629 } | |
| 630 } | |
| 631 | |
| 632 return subsetObjectVector(tabfile, column, varcount, casecount, columntype); | |
| 633 } | |
| 634 | |
    /**
     * Delegates to the full version with "compatibility mode" turned off.
     */
    public Object[] subsetObjectVector(File tabfile, int column, int varcount, int casecount, int columntype) throws IOException {
        return subsetObjectVector(tabfile, column, varcount, casecount, columntype, false);
    }
| 638 | |
| 639 | |
| 640 | |
| 641 public Object[] subsetObjectVector(File tabfile, int column, int varcount, int casecount, int columntype, boolean compatmode) throws IOException { | |
| 642 | |
| 643 Object[] retVector = null; | |
| 644 | |
| 645 boolean isString = false; | |
| 646 boolean isDouble = false; | |
| 647 boolean isLong = false; | |
| 648 boolean isFloat = false; | |
| 649 | |
| 650 //Locale loc = new Locale("en", "US"); | |
| 651 | |
| 652 if (columntype == COLUMN_TYPE_STRING) { | |
| 653 isString = true; | |
| 654 retVector = new String[casecount]; | |
| 655 } else if (columntype == COLUMN_TYPE_DOUBLE) { | |
| 656 isDouble = true; | |
| 657 retVector = new Double[casecount]; | |
| 658 } else if (columntype == COLUMN_TYPE_LONG) { | |
| 659 isLong = true; | |
| 660 retVector = new Long[casecount]; | |
| 661 } else if (columntype == COLUMN_TYPE_FLOAT){ | |
| 662 isFloat = true; | |
| 663 retVector = new Float[casecount]; | |
| 664 } else { | |
| 665 throw new IOException("Unsupported column type: "+columntype); | |
| 666 } | |
| 667 | |
| 668 File rotatedImageFile = getRotatedImage(tabfile, varcount, casecount); | |
| 669 long[] columnEndOffsets = extractColumnOffsets(rotatedImageFile, varcount, casecount); | |
| 670 long columnOffset = 0; | |
| 671 long columnLength = 0; | |
| 672 | |
| 673 if (column > 0) { | |
| 674 columnOffset = columnEndOffsets[column - 1]; | |
| 675 columnLength = columnEndOffsets[column] - columnEndOffsets[column - 1]; | |
| 676 } else { | |
| 677 columnOffset = varcount * 8; | |
| 678 columnLength = columnEndOffsets[0] - varcount * 8; | |
| 679 } | |
| 680 | |
| 681 FileChannel fc = (FileChannel.open(Paths.get(rotatedImageFile.getAbsolutePath()), StandardOpenOption.READ)); | |
| 682 fc.position(columnOffset); | |
| 683 int MAX_COLUMN_BUFFER = 8192; | |
| 684 | |
| 685 ByteBuffer in = ByteBuffer.allocate(MAX_COLUMN_BUFFER); | |
| 686 | |
| 687 if (columnLength < MAX_COLUMN_BUFFER) { | |
| 688 in.limit((int)(columnLength)); | |
| 689 } | |
| 690 | |
| 691 long bytesRead = 0; | |
| 692 long bytesReadTotal = 0; | |
| 693 int caseindex = 0; | |
| 694 int byteoffset = 0; | |
| 695 byte[] leftover = null; | |
| 696 | |
| 697 while (bytesReadTotal < columnLength) { | |
| 698 bytesRead = fc.read(in); | |
| 699 byte[] columnBytes = in.array(); | |
| 700 int bytecount = 0; | |
| 701 | |
| 702 | |
| 703 while (bytecount < bytesRead) { | |
| 704 if (columnBytes[bytecount] == '\n') { | |
| 705 /* | |
| 706 String token = new String(columnBytes, byteoffset, bytecount-byteoffset, "UTF8"); | |
| 707 | |
| 708 if (leftover != null) { | |
| 709 String leftoverString = new String (leftover, "UTF8"); | |
| 710 token = leftoverString + token; | |
| 711 leftover = null; | |
| 712 } | |
| 713 */ | |
| 714 /* | |
| 715 * Note that the way I was doing it at first - above - | |
| 716 * was not quite the correct way - because I was creating UTF8 | |
| 717 * strings from the leftover bytes, and the bytes in the | |
| 718 * current buffer *separately*; which means, if a multi-byte | |
| 719 * UTF8 character got split in the middle between one buffer | |
| 720 * and the next, both chunks of it would become junk | |
| 721 * characters, on each side! | |
| 722 * The correct way of doing it, of course, is to create a | |
| 723 * merged byte buffer, and then turn it into a UTF8 string. | |
| 724 * -- L.A. 4.0 | |
| 725 */ | |
| 726 String token = null; | |
| 727 | |
| 728 if (leftover == null) { | |
| 729 token = new String(columnBytes, byteoffset, bytecount-byteoffset, "UTF8"); | |
| 730 } else { | |
| 731 byte[] merged = new byte[leftover.length + bytecount-byteoffset]; | |
| 732 | |
| 733 System.arraycopy(leftover, 0, merged, 0, leftover.length); | |
| 734 System.arraycopy(columnBytes, byteoffset, merged, leftover.length, bytecount-byteoffset); | |
| 735 token = new String (merged, "UTF8"); | |
| 736 leftover = null; | |
| 737 merged = null; | |
| 738 } | |
| 739 | |
| 740 if (isString) { | |
| 741 if ("".equals(token)) { | |
| 742 // An empty string is a string missing value! | |
| 743 // An empty string in quotes is an empty string! | |
| 744 retVector[caseindex] = null; | |
| 745 } else { | |
| 746 // Strip the outer quotes: | |
| 747 token = token.replaceFirst("^\\\"", ""); | |
| 748 token = token.replaceFirst("\\\"$", ""); | |
| 749 | |
| 750 // We need to restore the special characters that | |
| 751 // are stored in tab files escaped - quotes, new lines | |
| 752 // and tabs. Before we do that however, we need to | |
| 753 // take care of any escaped backslashes stored in | |
| 754 // the tab file. I.e., "foo\t" should be transformed | |
| 755 // to "foo<TAB>"; but "foo\\t" should be transformed | |
| 756 // to "foo\t". This way new lines and tabs that were | |
| 757 // already escaped in the original data are not | |
| 758 // going to be transformed to unescaped tab and | |
| 759 // new line characters! | |
| 760 | |
| 761 String[] splitTokens = token.split(Matcher.quoteReplacement("\\\\"), -2); | |
| 762 | |
| 763 // (note that it's important to use the 2-argument version | |
| 764 // of String.split(), and set the limit argument to a | |
| 765 // negative value; otherwise any trailing backslashes | |
| 766 // are lost.) | |
| 767 | |
| 768 for (int i = 0; i < splitTokens.length; i++) { | |
| 769 splitTokens[i] = splitTokens[i].replaceAll(Matcher.quoteReplacement("\\\""), "\""); | |
| 770 splitTokens[i] = splitTokens[i].replaceAll(Matcher.quoteReplacement("\\t"), "\t"); | |
| 771 splitTokens[i] = splitTokens[i].replaceAll(Matcher.quoteReplacement("\\n"), "\n"); | |
| 772 splitTokens[i] = splitTokens[i].replaceAll(Matcher.quoteReplacement("\\r"), "\r"); | |
| 773 } | |
| 774 // TODO: | |
| 775 // Make (some of?) the above optional; for ex., we | |
| 776 // do need to restore the newlines when calculating UNFs; | |
| 777 // But if we are subsetting these vectors in order to | |
| 778 // create a new tab-delimited file, they will | |
| 779 // actually break things! -- L.A. Jul. 28 2014 | |
| 780 | |
| 781 token = StringUtils.join(splitTokens, '\\'); | |
| 782 | |
| 783 // "compatibility mode" - a hack, to be able to produce | |
| 784 // unfs identical to those produced by the "early" | |
| 785 // unf5 jar; will be removed in production 4.0. | |
| 786 // -- L.A. (TODO: ...) | |
| 787 if (compatmode && !"".equals(token)) { | |
| 788 if (token.length() > 128) { | |
| 789 if ("".equals(token.trim())) { | |
| 790 // don't ask... | |
| 791 token = token.substring(0, 129); | |
| 792 } else { | |
| 793 token = token.substring(0, 128); | |
| 794 //token = String.format(loc, "%.128s", token); | |
| 795 token = token.trim(); | |
| 796 //dbgLog.info("formatted and trimmed: "+token); | |
| 797 } | |
| 798 } else { | |
| 799 if ("".equals(token.trim())) { | |
| 800 // again, don't ask; | |
| 801 // - this replicates some bugginness | |
| 802 // that happens inside unf5; | |
| 803 token = "null"; | |
| 804 } else { | |
| 805 token = token.trim(); | |
| 806 } | |
| 807 } | |
| 808 } | |
| 809 | |
| 810 retVector[caseindex] = token; | |
| 811 } | |
| 812 } else if (isDouble) { | |
| 813 try { | |
| 814 // TODO: verify that NaN and +-Inf are | |
| 815 // handled correctly here! -- L.A. | |
| 816 // Verified: new Double("nan") works correctly, | |
| 817 // resulting in Double.NaN; | |
| 818 // Double("[+-]Inf") doesn't work however; | |
| 819 // (the constructor appears to be expecting it | |
| 820 // to be spelled as "Infinity", "-Infinity", etc. | |
| 821 if ("inf".equalsIgnoreCase(token) || "+inf".equalsIgnoreCase(token)) { | |
| 822 retVector[caseindex] = java.lang.Double.POSITIVE_INFINITY; | |
| 823 } else if ("-inf".equalsIgnoreCase(token)) { | |
| 824 retVector[caseindex] = java.lang.Double.NEGATIVE_INFINITY; | |
| 825 } else if (token == null || token.equals("")) { | |
| 826 // missing value: | |
| 827 retVector[caseindex] = null; | |
| 828 } else { | |
| 829 retVector[caseindex] = new Double(token); | |
| 830 } | |
| 831 } catch (NumberFormatException ex) { | |
| 832 dbgLog.warning("NumberFormatException thrown for "+token+" as Double"); | |
| 833 | |
| 834 retVector[caseindex] = null; // missing value | |
| 835 // TODO: ? | |
| 836 } | |
| 837 } else if (isLong) { | |
| 838 try { | |
| 839 retVector[caseindex] = new Long(token); | |
| 840 } catch (NumberFormatException ex) { | |
| 841 retVector[caseindex] = null; // assume missing value | |
| 842 } | |
| 843 } else if (isFloat) { | |
| 844 try { | |
| 845 if ("inf".equalsIgnoreCase(token) || "+inf".equalsIgnoreCase(token)) { | |
| 846 retVector[caseindex] = java.lang.Float.POSITIVE_INFINITY; | |
| 847 } else if ("-inf".equalsIgnoreCase(token)) { | |
| 848 retVector[caseindex] = java.lang.Float.NEGATIVE_INFINITY; | |
| 849 } else if (token == null || token.equals("")) { | |
| 850 // missing value: | |
| 851 retVector[caseindex] = null; | |
| 852 } else { | |
| 853 retVector[caseindex] = new Float(token); | |
| 854 } | |
| 855 } catch (NumberFormatException ex) { | |
| 856 dbgLog.warning("NumberFormatException thrown for "+token+" as Float"); | |
| 857 retVector[caseindex] = null; // assume missing value (TODO: ?) | |
| 858 } | |
| 859 } | |
| 860 caseindex++; | |
| 861 | |
| 862 if (bytecount == bytesRead - 1) { | |
| 863 byteoffset = 0; | |
| 864 } else { | |
| 865 byteoffset = bytecount + 1; | |
| 866 } | |
| 867 } else { | |
| 868 if (bytecount == bytesRead - 1) { | |
| 869 // We've reached the end of the buffer; | |
| 870 // This means we'll save whatever unused bytes left in | |
| 871 // it - i.e., the bytes between the last new line | |
| 872 // encountered and the end - in the leftover buffer. | |
| 873 | |
| 874 // *EXCEPT*, there may be a case of a very long String | |
| 875 // that is actually longer than MAX_COLUMN_BUFFER, in | |
| 876 // which case it is possible that we've read through | |
| 877 // an entire buffer of bytes without finding any | |
| 878 // new lines... in this case we may need to add this | |
| 879 // entire byte buffer to an already existing leftover | |
| 880 // buffer! | |
| 881 if (leftover == null) { | |
| 882 leftover = new byte[(int)bytesRead - byteoffset]; | |
| 883 System.arraycopy(columnBytes, byteoffset, leftover, 0, (int)bytesRead - byteoffset); | |
| 884 } else { | |
| 885 if (byteoffset != 0) { | |
| 886 throw new IOException("Reached the end of the byte buffer, with some leftover left from the last read; yet the offset is not zero!"); | |
| 887 } | |
| 888 byte[] merged = new byte[leftover.length + (int)bytesRead]; | |
| 889 | |
| 890 System.arraycopy(leftover, 0, merged, 0, leftover.length); | |
| 891 System.arraycopy(columnBytes, byteoffset, merged, leftover.length, (int)bytesRead); | |
| 892 //leftover = null; | |
| 893 leftover = merged; | |
| 894 merged = null; | |
| 895 } | |
| 896 byteoffset = 0; | |
| 897 | |
| 898 } | |
| 899 } | |
| 900 bytecount++; | |
| 901 } | |
| 902 | |
| 903 bytesReadTotal += bytesRead; | |
| 904 in.clear(); | |
| 905 if (columnLength - bytesReadTotal < MAX_COLUMN_BUFFER) { | |
| 906 in.limit((int)(columnLength - bytesReadTotal)); | |
| 907 } | |
| 908 } | |
| 909 | |
| 910 fc.close(); | |
| 911 | |
| 912 if (caseindex != casecount) { | |
| 913 throw new IOException("Faile to read "+casecount+" tokens for column "+column); | |
| 914 //System.out.println("read "+caseindex+" tokens instead of expected "+casecount+"."); | |
| 915 } | |
| 916 | |
| 917 return retVector; | |
| 918 } | |
| 919 | |
| 920 private long[] extractColumnOffsets (File rotatedImageFile, int varcount, int casecount) throws IOException { | |
| 921 BufferedInputStream rotfileStream = new BufferedInputStream(new FileInputStream(rotatedImageFile)); | |
| 922 | |
| 923 byte[] offsetHeader = new byte[varcount * 8]; | |
| 924 long[] byteOffsets = new long[varcount]; | |
| 925 | |
| 926 | |
| 927 int readlen = rotfileStream.read(offsetHeader); | |
| 928 | |
| 929 if (readlen != varcount * 8) { | |
| 930 throw new IOException ("Could not read "+varcount*8+" header bytes from the rotated file."); | |
| 931 } | |
| 932 | |
| 933 for (int varindex = 0; varindex < varcount; varindex++) { | |
| 934 byte[] offsetBytes = new byte[8]; | |
| 935 System.arraycopy(offsetHeader, varindex*8, offsetBytes, 0, 8); | |
| 936 | |
| 937 ByteBuffer offsetByteBuffer = ByteBuffer.wrap(offsetBytes); | |
| 938 byteOffsets[varindex] = offsetByteBuffer.getLong(); | |
| 939 | |
| 940 //System.out.println(byteOffsets[varindex]); | |
| 941 } | |
| 942 | |
| 943 rotfileStream.close(); | |
| 944 | |
| 945 return byteOffsets; | |
| 946 } | |
| 947 | |
| 948 private File getRotatedImage(File tabfile, int varcount, int casecount) throws IOException { | |
| 949 String fileName = tabfile.getAbsolutePath(); | |
| 950 String rotatedImageFileName = fileName + ".90d"; | |
| 951 File rotatedImageFile = new File(rotatedImageFileName); | |
| 952 if (rotatedImageFile.exists()) { | |
| 953 //System.out.println("Image already exists!"); | |
| 954 return rotatedImageFile; | |
| 955 } | |
| 956 | |
| 957 return generateRotatedImage(tabfile, varcount, casecount); | |
| 958 | |
| 959 } | |
| 960 | |
| 961 private File generateRotatedImage (File tabfile, int varcount, int casecount) throws IOException { | |
| 962 // TODO: throw exceptions if bad file, zero varcount, etc. ... | |
| 963 | |
| 964 String fileName = tabfile.getAbsolutePath(); | |
| 965 String rotatedImageFileName = fileName + ".90d"; | |
| 966 | |
| 967 int MAX_OUTPUT_STREAMS = 32; | |
| 968 int MAX_BUFFERED_BYTES = 10 * 1024 * 1024; // 10 MB - for now? | |
| 969 int MAX_COLUMN_BUFFER = 8 * 1024; | |
| 970 | |
| 971 // offsetHeader will contain the byte offsets of the individual column | |
| 972 // vectors in the final rotated image file | |
| 973 byte[] offsetHeader = new byte[varcount * 8]; | |
| 974 int[] bufferedSizes = new int[varcount]; | |
| 975 long[] cachedfileSizes = new long[varcount]; | |
| 976 File[] columnTempFiles = new File[varcount]; | |
| 977 | |
| 978 for (int i = 0; i < varcount; i++) { | |
| 979 bufferedSizes[i] = 0; | |
| 980 cachedfileSizes[i] = 0; | |
| 981 } | |
| 982 | |
| 983 // TODO: adjust MAX_COLUMN_BUFFER here, so that the total size is | |
| 984 // no more than MAX_BUFFERED_BYTES (but no less than 1024 maybe?) | |
| 985 | |
| 986 byte[][] bufferedColumns = new byte [varcount][MAX_COLUMN_BUFFER]; | |
| 987 | |
| 988 // read the tab-delimited file: | |
| 989 | |
| 990 FileInputStream tabfileStream = new FileInputStream(tabfile); | |
| 991 | |
| 992 Scanner scanner = new Scanner(tabfileStream); | |
| 993 scanner.useDelimiter("\\n"); | |
| 994 | |
| 995 for (int caseindex = 0; caseindex < casecount; caseindex++) { | |
| 996 if (scanner.hasNext()) { | |
| 997 String[] line = (scanner.next()).split("\t", -1); | |
| 998 // TODO: throw an exception if there are fewer tab-delimited | |
| 999 // tokens than the number of variables specified. | |
| 1000 String token = ""; | |
| 1001 int tokensize = 0; | |
| 1002 for (int varindex = 0; varindex < varcount; varindex++) { | |
| 1003 // TODO: figure out the safest way to convert strings to | |
| 1004 // bytes here. Is it going to be safer to use getBytes("UTF8")? | |
| 1005 // we are already making the assumption that the values | |
| 1006 // in the tab file are in UTF8. -- L.A. | |
| 1007 token = line[varindex] + "\n"; | |
| 1008 tokensize = token.getBytes().length; | |
| 1009 if (bufferedSizes[varindex]+tokensize > MAX_COLUMN_BUFFER) { | |
| 1010 // fill the buffer and dump its contents into the temp file: | |
| 1011 // (do note that there may be *several* MAX_COLUMN_BUFFERs | |
| 1012 // worth of bytes in the token!) | |
| 1013 | |
| 1014 int tokenoffset = 0; | |
| 1015 | |
| 1016 if (bufferedSizes[varindex] != MAX_COLUMN_BUFFER) { | |
| 1017 tokenoffset = MAX_COLUMN_BUFFER-bufferedSizes[varindex]; | |
| 1018 System.arraycopy(token.getBytes(), 0, bufferedColumns[varindex], bufferedSizes[varindex], tokenoffset); | |
| 1019 } // (otherwise the buffer is already full, and we should | |
| 1020 // simply dump it into the temp file, without adding any | |
| 1021 // extra bytes to it) | |
| 1022 | |
| 1023 File bufferTempFile = columnTempFiles[varindex]; | |
| 1024 if (bufferTempFile == null) { | |
| 1025 bufferTempFile = File.createTempFile("columnBufferFile", "bytes"); | |
| 1026 columnTempFiles[varindex] = bufferTempFile; | |
| 1027 } | |
| 1028 | |
| 1029 // *append* the contents of the buffer to the end of the | |
| 1030 // temp file, if already exists: | |
| 1031 BufferedOutputStream outputStream = new BufferedOutputStream(new FileOutputStream (bufferTempFile, true)); | |
| 1032 outputStream.write(bufferedColumns[varindex], 0, MAX_COLUMN_BUFFER); | |
| 1033 cachedfileSizes[varindex] += MAX_COLUMN_BUFFER; | |
| 1034 | |
| 1035 // keep writing MAX_COLUMN_BUFFER-size chunks of bytes into | |
| 1036 // the temp file, for as long as there's more than MAX_COLUMN_BUFFER | |
| 1037 // bytes left in the token: | |
| 1038 | |
| 1039 while (tokensize - tokenoffset > MAX_COLUMN_BUFFER) { | |
| 1040 outputStream.write(token.getBytes(), tokenoffset, MAX_COLUMN_BUFFER); | |
| 1041 cachedfileSizes[varindex] += MAX_COLUMN_BUFFER; | |
| 1042 tokenoffset += MAX_COLUMN_BUFFER; | |
| 1043 } | |
| 1044 | |
| 1045 outputStream.close(); | |
| 1046 | |
| 1047 // buffer the remaining bytes and reset the buffered | |
| 1048 // byte counter: | |
| 1049 | |
| 1050 System.arraycopy(token.getBytes(), | |
| 1051 tokenoffset, | |
| 1052 bufferedColumns[varindex], | |
| 1053 0, | |
| 1054 tokensize - tokenoffset); | |
| 1055 | |
| 1056 bufferedSizes[varindex] = tokensize - tokenoffset; | |
| 1057 | |
| 1058 } else { | |
| 1059 // continue buffering | |
| 1060 System.arraycopy(token.getBytes(), 0, bufferedColumns[varindex], bufferedSizes[varindex], tokensize); | |
| 1061 bufferedSizes[varindex] += tokensize; | |
| 1062 } | |
| 1063 } | |
| 1064 } else { | |
| 1065 scanner.close(); | |
| 1066 throw new IOException("Tab file has fewer rows than the stored number of cases!"); | |
| 1067 } | |
| 1068 | |
| 1069 } | |
| 1070 | |
| 1071 // OK, we've created the individual byte vectors of the tab file columns; | |
| 1072 // they may be partially saved in temp files and/or in memory. | |
| 1073 // We now need to go through all these buffers and create the final | |
| 1074 // rotated image file. | |
| 1075 | |
| 1076 BufferedOutputStream finalOut = new BufferedOutputStream(new FileOutputStream (new File(rotatedImageFileName))); | |
| 1077 | |
| 1078 // but first we should create the offset header and write it out into | |
| 1079 // the final file; because it should be at the head, doh! | |
| 1080 | |
| 1081 long columnOffset = varcount * 8; | |
| 1082 // (this is the offset of the first column vector; it is equal to the | |
| 1083 // size of the offset header, i.e. varcount * 8 bytes) | |
| 1084 | |
| 1085 for (int varindex = 0; varindex < varcount; varindex++) { | |
| 1086 long totalColumnBytes = cachedfileSizes[varindex] + bufferedSizes[varindex]; | |
| 1087 columnOffset+=totalColumnBytes; | |
| 1088 //totalColumnBytes; | |
| 1089 byte[] columnOffsetByteArray = ByteBuffer.allocate(8).putLong(columnOffset).array(); | |
| 1090 System.arraycopy(columnOffsetByteArray, 0, offsetHeader, varindex * 8, 8); | |
| 1091 } | |
| 1092 | |
| 1093 finalOut.write(offsetHeader, 0, varcount * 8); | |
| 1094 | |
| 1095 for (int varindex = 0; varindex < varcount; varindex++) { | |
| 1096 long cachedBytesRead = 0; | |
| 1097 | |
| 1098 // check if there is a cached temp file: | |
| 1099 | |
| 1100 File cachedTempFile = columnTempFiles[varindex]; | |
| 1101 if (cachedTempFile != null) { | |
| 1102 byte[] cachedBytes = new byte[MAX_COLUMN_BUFFER]; | |
| 1103 BufferedInputStream cachedIn = new BufferedInputStream(new FileInputStream(cachedTempFile)); | |
| 1104 int readlen = 0; | |
| 1105 while ((readlen = cachedIn.read(cachedBytes)) > -1) { | |
| 1106 finalOut.write(cachedBytes, 0, readlen); | |
| 1107 cachedBytesRead += readlen; | |
| 1108 } | |
| 1109 cachedIn.close(); | |
| 1110 // delete the temp file: | |
| 1111 cachedTempFile.delete(); | |
| 1112 | |
| 1113 } | |
| 1114 | |
| 1115 if (cachedBytesRead != cachedfileSizes[varindex]) { | |
| 1116 finalOut.close(); | |
| 1117 throw new IOException("Could not read the correct number of bytes cached for column "+varindex+"; "+ | |
| 1118 cachedfileSizes[varindex] + " bytes expected, "+cachedBytesRead+" read."); | |
| 1119 } | |
| 1120 | |
| 1121 // then check if there are any bytes buffered for this column: | |
| 1122 | |
| 1123 if (bufferedSizes[varindex] > 0) { | |
| 1124 finalOut.write(bufferedColumns[varindex], 0, bufferedSizes[varindex]); | |
| 1125 } | |
| 1126 | |
| 1127 } | |
| 1128 | |
| 1129 finalOut.close(); | |
| 1130 return new File(rotatedImageFileName); | |
| 1131 | |
| 1132 } | |
| 1133 | |
| 1134 /* | |
| 1135 * Test method for taking a "rotated" image, and reversing it, reassembling | |
| 1136 * all the columns in the original order. Which should result in a file | |
| 1137 * byte-for-byte identical file to the original tab-delimited version. | |
| 1138 * | |
| 1139 * (do note that this method is not efficiently implemented; it's only | |
| 1140 * being used for experiments so far, to confirm the accuracy of the | |
| 1141 * accuracy of generateRotatedImage(). It should not be used for any | |
| 1142 * practical means in the application!) | |
| 1143 */ | |
| 1144 private void reverseRotatedImage (File rotfile, int varcount, int casecount) throws IOException { | |
| 1145 // open the file, read in the offset header: | |
| 1146 BufferedInputStream rotfileStream = new BufferedInputStream(new FileInputStream(rotfile)); | |
| 1147 | |
| 1148 byte[] offsetHeader = new byte[varcount * 8]; | |
| 1149 long[] byteOffsets = new long[varcount]; | |
| 1150 | |
| 1151 int readlen = rotfileStream.read(offsetHeader); | |
| 1152 | |
| 1153 if (readlen != varcount * 8) { | |
| 1154 throw new IOException ("Could not read "+varcount*8+" header bytes from the rotated file."); | |
| 1155 } | |
| 1156 | |
| 1157 for (int varindex = 0; varindex < varcount; varindex++) { | |
| 1158 byte[] offsetBytes = new byte[8]; | |
| 1159 System.arraycopy(offsetHeader, varindex*8, offsetBytes, 0, 8); | |
| 1160 | |
| 1161 ByteBuffer offsetByteBuffer = ByteBuffer.wrap(offsetBytes); | |
| 1162 byteOffsets[varindex] = offsetByteBuffer.getLong(); | |
| 1163 | |
| 1164 //System.out.println(byteOffsets[varindex]); | |
| 1165 } | |
| 1166 | |
| 1167 String [][] reversedMatrix = new String[casecount][varcount]; | |
| 1168 | |
| 1169 long offset = varcount * 8; | |
| 1170 byte[] columnBytes; | |
| 1171 | |
| 1172 for (int varindex = 0; varindex < varcount; varindex++) { | |
| 1173 long columnLength = byteOffsets[varindex] - offset; | |
| 1174 | |
| 1175 | |
| 1176 | |
| 1177 columnBytes = new byte[(int)columnLength]; | |
| 1178 readlen = rotfileStream.read(columnBytes); | |
| 1179 | |
| 1180 if (readlen != columnLength) { | |
| 1181 throw new IOException ("Could not read "+columnBytes+" bytes for column "+varindex); | |
| 1182 } | |
| 1183 /* | |
| 1184 String columnString = new String(columnBytes); | |
| 1185 //System.out.print(columnString); | |
| 1186 String[] values = columnString.split("\n", -1); | |
| 1187 | |
| 1188 if (values.length < casecount) { | |
| 1189 throw new IOException("count mismatch: "+values.length+" tokens found for column "+varindex); | |
| 1190 } | |
| 1191 | |
| 1192 for (int caseindex = 0; caseindex < casecount; caseindex++) { | |
| 1193 reversedMatrix[caseindex][varindex] = values[caseindex]; | |
| 1194 }*/ | |
| 1195 | |
| 1196 int bytecount = 0; | |
| 1197 int byteoffset = 0; | |
| 1198 int caseindex = 0; | |
| 1199 //System.out.println("generating value vector for column "+varindex); | |
| 1200 while (bytecount < columnLength) { | |
| 1201 if (columnBytes[bytecount] == '\n') { | |
| 1202 String token = new String(columnBytes, byteoffset, bytecount-byteoffset); | |
| 1203 reversedMatrix[caseindex++][varindex] = token; | |
| 1204 byteoffset = bytecount + 1; | |
| 1205 } | |
| 1206 bytecount++; | |
| 1207 } | |
| 1208 | |
| 1209 if (caseindex != casecount) { | |
| 1210 throw new IOException("count mismatch: "+caseindex+" tokens found for column "+varindex); | |
| 1211 } | |
| 1212 offset = byteOffsets[varindex]; | |
| 1213 } | |
| 1214 | |
| 1215 for (int caseindex = 0; caseindex < casecount; caseindex++) { | |
| 1216 for (int varindex = 0; varindex < varcount; varindex++) { | |
| 1217 System.out.print(reversedMatrix[caseindex][varindex]); | |
| 1218 if (varindex < varcount-1) { | |
| 1219 System.out.print("\t"); | |
| 1220 } else { | |
| 1221 System.out.print("\n"); | |
| 1222 } | |
| 1223 } | |
| 1224 } | |
| 1225 | |
| 1226 rotfileStream.close(); | |
| 1227 | |
| 1228 | |
| 1229 } | |
| 1230 | |
| 1231 /** | |
| 1232 * main() method, for testing | |
| 1233 * usage: java edu.harvard.iq.dataverse.dataaccess.TabularSubsetGenerator testfile.tab varcount casecount column type | |
| 1234 * make sure the CLASSPATH contains ... | |
| 1235 * | |
| 1236 */ | |
| 1237 | |
| 1238 public static void main(String[] args) { | |
| 1239 | |
| 1240 String tabFileName = args[0]; | |
| 1241 int varcount = new Integer(args[1]).intValue(); | |
| 1242 int casecount = new Integer(args[2]).intValue(); | |
| 1243 int column = new Integer(args[3]).intValue(); | |
| 1244 String type = args[4]; | |
| 1245 | |
| 1246 File tabFile = new File(tabFileName); | |
| 1247 File rotatedImageFile = null; | |
| 1248 | |
| 1249 TabularSubsetGenerator subsetGenerator = new TabularSubsetGenerator(); | |
| 1250 | |
| 1251 /* | |
| 1252 try { | |
| 1253 rotatedImageFile = subsetGenerator.getRotatedImage(tabFile, varcount, casecount); | |
| 1254 } catch (IOException ex) { | |
| 1255 System.out.println(ex.getMessage()); | |
| 1256 } | |
| 1257 */ | |
| 1258 | |
| 1259 //System.out.println("\nFinished generating \"rotated\" column image file."); | |
| 1260 | |
| 1261 //System.out.println("\nOffsets:"); | |
| 1262 | |
| 1263 MathContext doubleMathContext = new MathContext(15, RoundingMode.HALF_EVEN); | |
| 1264 String FORMAT_IEEE754 = "%+#.15e"; | |
| 1265 | |
| 1266 try { | |
| 1267 //subsetGenerator.reverseRotatedImage(rotatedImageFile, varcount, casecount); | |
| 1268 //String[] columns = subsetGenerator.subsetStringVector(tabFile, column, varcount, casecount); | |
| 1269 if ("string".equals(type)) { | |
| 1270 String[] columns = subsetGenerator.subsetStringVector(tabFile, column, varcount, casecount); | |
| 1271 for (int i = 0; i < casecount; i++) { | |
| 1272 System.out.println(columns[i]); | |
| 1273 } | |
| 1274 } else { | |
| 1275 | |
| 1276 Double[] columns = subsetGenerator.subsetDoubleVector(tabFile, column, varcount, casecount); | |
| 1277 for (int i = 0; i < casecount; i++) { | |
| 1278 if (columns[i] != null) { | |
| 1279 BigDecimal outBigDecimal = new BigDecimal(columns[i], doubleMathContext); | |
| 1280 System.out.println(String.format(FORMAT_IEEE754, outBigDecimal)); | |
| 1281 } else { | |
| 1282 System.out.println("NA"); | |
| 1283 } | |
| 1284 //System.out.println(columns[i]); | |
| 1285 } | |
| 1286 } | |
| 1287 } catch (IOException ex) { | |
| 1288 System.out.println(ex.getMessage()); | |
| 1289 } | |
| 1290 } | |
| 1291 } | |
| 1292 | |
| 1293 |
