Mercurial > hg > LGDataverses
comparison src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @ 10:a50cf11e5178
Rewrite LGDataverse completely upgrading to dataverse4.0
| author | Zoe Hong <zhong@mpiwg-berlin.mpg.de> |
|---|---|
| date | Tue, 08 Sep 2015 17:00:21 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 9:5926d6419569 | 10:a50cf11e5178 |
|---|---|
| 1 /* | |
| 2 Copyright (C) 2005-2012, by the President and Fellows of Harvard College. | |
| 3 | |
| 4 Licensed under the Apache License, Version 2.0 (the "License"); | |
| 5 you may not use this file except in compliance with the License. | |
| 6 You may obtain a copy of the License at | |
| 7 | |
| 8 http://www.apache.org/licenses/LICENSE-2.0 | |
| 9 | |
| 10 Unless required by applicable law or agreed to in writing, software | |
| 11 distributed under the License is distributed on an "AS IS" BASIS, | |
| 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 13 See the License for the specific language governing permissions and | |
| 14 limitations under the License. | |
| 15 | |
| 16 Dataverse Network - A web application to share, preserve and analyze research data. | |
| 17 Developed at the Institute for Quantitative Social Science, Harvard University. | |
| 18 Version 3.0. | |
| 19 */ | |
| 20 | |
| 21 package edu.harvard.iq.dataverse.util; | |
| 22 | |
| 23 import edu.harvard.iq.dataverse.DataFile; | |
| 24 import edu.harvard.iq.dataverse.ingest.IngestableDataChecker; | |
| 25 import java.io.BufferedInputStream; | |
| 26 import java.io.File; | |
| 27 import java.io.FileInputStream; | |
| 28 import java.io.FileOutputStream; | |
| 29 import java.io.FileReader; | |
| 30 import java.io.IOException; | |
| 31 import java.io.InputStream; | |
| 32 import java.util.ResourceBundle; | |
| 33 import java.util.MissingResourceException; | |
| 34 import java.nio.channels.FileChannel; | |
| 35 import java.nio.channels.WritableByteChannel; | |
| 36 import java.util.HashMap; | |
| 37 import java.util.Map; | |
| 38 import java.util.logging.Logger; | |
| 39 import javax.activation.MimetypesFileTypeMap; | |
| 40 import javax.ejb.EJBException; | |
| 41 import javax.xml.stream.XMLStreamConstants; | |
| 42 import javax.xml.stream.XMLStreamException; | |
| 43 import javax.xml.stream.XMLStreamReader; | |
| 44 import java.util.zip.GZIPInputStream; | |
| 45 | |
| 46 /** | |
| 47 * a 4.0 implementation of the DVN FileUtil; | |
| 48 * it provides some of the functionality from the 3.6 implementation, | |
| 49 * but the old code is ported creatively on the method-by-method basis. | |
| 50 * | |
| 51 * @author Leonid Andreev | |
| 52 */ | |
| 53 public class FileUtil implements java.io.Serializable { | |
private static final Logger logger = Logger.getLogger(FileUtil.class.getCanonicalName());

// Format names passed to IngestableDataChecker when probing a file for tabular data.
private static final String[] TABULAR_DATA_FORMAT_SET = {"POR", "SAV", "DTA", "RDA"};

/*
 * Lower-case file extension -> mime type for Stata, SAS and SPSS
 * syntax/control cards.
 * These are recognized as text files (because they are!), so every
 * uploaded "text/plain" file is checked against these extensions and
 * re-typed when one matches.
 * Note that these types are only used in the metadata displayed on the
 * dataset page. We don't support ingest on control cards.
 * -- L.A. 4.0 Oct. 2014
 *
 * The reference is final: the map is filled once in the static
 * initializer below and is only read afterwards.
 */
private static final Map<String, String> STATISTICAL_SYNTAX_FILE_EXTENSION = new HashMap<String, String>();

static {
    STATISTICAL_SYNTAX_FILE_EXTENSION.put("do", "application/x-stata-syntax");
    STATISTICAL_SYNTAX_FILE_EXTENSION.put("sas", "application/x-sas-syntax");
    STATISTICAL_SYNTAX_FILE_EXTENSION.put("sps", "application/x-spss-syntax");
}

// Extension-based mime type lookup used by determineFileType(String).
private static final MimetypesFileTypeMap MIME_TYPE_MAP = new MimetypesFileTypeMap();

/**
 * Public no-argument constructor; every method of this class is static,
 * so an instance carries no state of its own.
 */
public FileUtil() {
}
| 80 | |
| 81 public static void copyFile(File inputFile, File outputFile) throws IOException { | |
| 82 FileChannel in = null; | |
| 83 WritableByteChannel out = null; | |
| 84 | |
| 85 try { | |
| 86 in = new FileInputStream(inputFile).getChannel(); | |
| 87 out = new FileOutputStream(outputFile).getChannel(); | |
| 88 long bytesPerIteration = 50000; | |
| 89 long start = 0; | |
| 90 while ( start < in.size() ) { | |
| 91 in.transferTo(start, bytesPerIteration, out); | |
| 92 start += bytesPerIteration; | |
| 93 } | |
| 94 | |
| 95 } finally { | |
| 96 if (in != null) { in.close(); } | |
| 97 if (out != null) { out.close(); } | |
| 98 } | |
| 99 } | |
| 100 | |
| 101 | |
| 102 public static String getFileExtension(String fileName){ | |
| 103 String ext = null; | |
| 104 if ( fileName.lastIndexOf(".") != -1){ | |
| 105 ext = (fileName.substring( fileName.lastIndexOf(".") + 1 )).toLowerCase(); | |
| 106 } | |
| 107 return ext; | |
| 108 } | |
| 109 | |
| 110 public static String replaceExtension(String originalName) { | |
| 111 return replaceExtension(originalName, "tab"); | |
| 112 } | |
| 113 | |
| 114 public static String replaceExtension(String originalName, String newExtension) { | |
| 115 int extensionIndex = originalName.lastIndexOf("."); | |
| 116 if (extensionIndex != -1 ) { | |
| 117 return originalName.substring(0, extensionIndex) + "."+newExtension ; | |
| 118 } else { | |
| 119 return originalName +"."+newExtension ; | |
| 120 } | |
| 121 } | |
| 122 | |
| 123 public static String getUserFriendlyFileType(DataFile dataFile) { | |
| 124 String fileType = dataFile.getContentType(); | |
| 125 | |
| 126 if (fileType != null) { | |
| 127 if (fileType.equalsIgnoreCase(ShapefileHandler.SHAPEFILE_FILE_TYPE)){ | |
| 128 return ShapefileHandler.SHAPEFILE_FILE_TYPE_FRIENDLY_NAME; | |
| 129 } | |
| 130 if (fileType.indexOf(";") != -1) { | |
| 131 fileType = fileType.substring(0, fileType.indexOf(";")); | |
| 132 } | |
| 133 try { | |
| 134 return ResourceBundle.getBundle("MimeTypeDisplay").getString(fileType); | |
| 135 } catch (MissingResourceException e) { | |
| 136 return fileType; | |
| 137 } | |
| 138 } | |
| 139 | |
| 140 return fileType; | |
| 141 } | |
| 142 | |
| 143 public static String getFacetFileType(DataFile dataFile) { | |
| 144 String fileType = dataFile.getContentType(); | |
| 145 | |
| 146 if (fileType != null) { | |
| 147 if (fileType.indexOf(";") != -1) { | |
| 148 fileType = fileType.substring(0, fileType.indexOf(";")); | |
| 149 } | |
| 150 | |
| 151 try { | |
| 152 return ResourceBundle.getBundle("MimeTypeFacets").getString(fileType); | |
| 153 } catch (MissingResourceException e) { | |
| 154 // if there's no defined "facet-friendly" form of this mime type | |
| 155 // we'll truncate the available type by "/", e.g., all the | |
| 156 // unknown image/* types will become "image"; many other, quite | |
| 157 // different types will all become "application" this way - | |
| 158 // but it is probably still better than to tag them all as | |
| 159 // "uknown". | |
| 160 // -- L.A. 4.0 alpha 1 | |
| 161 return fileType.split("/")[0]; | |
| 162 } | |
| 163 } | |
| 164 | |
| 165 return "unknown"; | |
| 166 } | |
| 167 | |
| 168 public static String getUserFriendlyOriginalType(DataFile dataFile) { | |
| 169 String fileType = dataFile.getOriginalFileFormat(); | |
| 170 | |
| 171 if (fileType != null && !fileType.equals("")) { | |
| 172 if (fileType.indexOf(";") != -1) { | |
| 173 fileType = fileType.substring(0, fileType.indexOf(";")); | |
| 174 } | |
| 175 try { | |
| 176 return ResourceBundle.getBundle("MimeTypeDisplay").getString(fileType); | |
| 177 } catch (MissingResourceException e) { | |
| 178 return fileType; | |
| 179 } | |
| 180 } | |
| 181 | |
| 182 return "UNKNOWN"; | |
| 183 } | |
| 184 | |
| 185 public static String determineFileType(File f, String fileName) throws IOException{ | |
| 186 String fileType = null; | |
| 187 String fileExtension = getFileExtension(fileName); | |
| 188 | |
| 189 | |
| 190 | |
| 191 // step 1: | |
| 192 // Apply our custom methods to try and recognize data files that can be | |
| 193 // converted to tabular data, or can be parsed for extra metadata | |
| 194 // (such as FITS). | |
| 195 logger.fine("Attempting to identify potential tabular data files;"); | |
| 196 IngestableDataChecker tabChk = new IngestableDataChecker(TABULAR_DATA_FORMAT_SET); | |
| 197 | |
| 198 fileType = tabChk.detectTabularDataFormat(f); | |
| 199 | |
| 200 logger.fine("determineFileType: tabular data checker found "+fileType); | |
| 201 | |
| 202 // step 2: If not found, check if graphml or FITS | |
| 203 if (fileType==null) { | |
| 204 if (isGraphMLFile(f)) { | |
| 205 fileType = "text/xml-graphml"; | |
| 206 } else // Check for FITS: | |
| 207 // our check is fairly weak (it appears to be hard to really | |
| 208 // really recognize a FITS file without reading the entire | |
| 209 // stream...), so in version 3.* we used to nsist on *both* | |
| 210 // the ".fits" extension and the header check; | |
| 211 // in 4.0, we'll accept either the extension, or the valid | |
| 212 // magic header: | |
| 213 if (isFITSFile(f) || (fileExtension != null | |
| 214 && fileExtension.equalsIgnoreCase("fits"))) { | |
| 215 fileType = "application/fits"; | |
| 216 } | |
| 217 } | |
| 218 | |
| 219 // step 3: check the mime type of this file with Jhove | |
| 220 if (fileType == null){ | |
| 221 JhoveFileType jw = new JhoveFileType(); | |
| 222 fileType = jw.getFileMimeType(f); | |
| 223 } | |
| 224 | |
| 225 // step 4: | |
| 226 // Additional processing; if we haven't gotten much useful information | |
| 227 // back from Jhove, we'll try and make an educated guess based on | |
| 228 // the file extension: | |
| 229 | |
| 230 if ( fileExtension != null) { | |
| 231 logger.fine("fileExtension="+fileExtension); | |
| 232 | |
| 233 if (fileType != null && fileType.startsWith("text/plain")){ | |
| 234 if (( fileExtension != null) && (STATISTICAL_SYNTAX_FILE_EXTENSION.containsKey(fileExtension))) { | |
| 235 // replace the mime type with the value of the HashMap | |
| 236 fileType = STATISTICAL_SYNTAX_FILE_EXTENSION.get(fileExtension); | |
| 237 } | |
| 238 } else if ("application/octet-stream".equals(fileType)) { | |
| 239 fileType = determineFileType(fileName); | |
| 240 logger.fine("mime type recognized by extension: "+fileType); | |
| 241 } | |
| 242 } else { | |
| 243 logger.fine("fileExtension is null"); | |
| 244 } | |
| 245 | |
| 246 // step 5: | |
| 247 // if this is a compressed file - zip or gzip - we'll check the | |
| 248 // file(s) inside the compressed stream and see if it's one of our | |
| 249 // recognized formats that we want to support compressed: | |
| 250 | |
| 251 if ("application/x-gzip".equals(fileType)) { | |
| 252 logger.fine("we'll run additional checks on this gzipped file."); | |
| 253 // We want to be able to support gzipped FITS files, same way as | |
| 254 // if they were just regular FITS files: | |
| 255 FileInputStream gzippedIn = new FileInputStream(f); | |
| 256 // (new FileInputStream() can throw a "filen not found" exception; | |
| 257 // however, if we've made it this far, it really means that the | |
| 258 // file does exist and can be opened) | |
| 259 InputStream uncompressedIn = null; | |
| 260 try { | |
| 261 uncompressedIn = new GZIPInputStream(gzippedIn); | |
| 262 if (isFITSFile(uncompressedIn)) { | |
| 263 fileType = "application/fits-gzipped"; | |
| 264 } | |
| 265 } catch (IOException ioex) { | |
| 266 if (uncompressedIn != null) { | |
| 267 try {uncompressedIn.close();} catch (IOException e) {} | |
| 268 } | |
| 269 } | |
| 270 } | |
| 271 if ("application/zip".equals(fileType)) { | |
| 272 | |
| 273 // Is this a zipped Shapefile? | |
| 274 // Check for shapefile extensions as described here: http://en.wikipedia.org/wiki/Shapefile | |
| 275 //logger.info("Checking for shapefile"); | |
| 276 | |
| 277 ShapefileHandler shp_handler = new ShapefileHandler(new FileInputStream(f)); | |
| 278 if (shp_handler.containsShapefile()){ | |
| 279 // logger.info("------- shapefile FOUND ----------"); | |
| 280 fileType = ShapefileHandler.SHAPEFILE_FILE_TYPE; //"application/zipped-shapefile"; | |
| 281 } | |
| 282 } | |
| 283 | |
| 284 logger.fine("returning fileType "+fileType); | |
| 285 return fileType; | |
| 286 } | |
| 287 | |
| 288 public static String determineFileType(String fileName) { | |
| 289 return MIME_TYPE_MAP.getContentType(fileName); | |
| 290 } | |
| 291 | |
| 292 | |
| 293 /* | |
| 294 * Custom method for identifying FITS files: | |
| 295 * TODO: | |
| 296 * the existing check for the "magic header" is very weak (see below); | |
| 297 * it should probably be replaced by attempting to parse and read at | |
| 298 * least the primary HDU, using the NOM fits parser. | |
| 299 * -- L.A. 4.0 alpha | |
| 300 */ | |
| 301 private static boolean isFITSFile(File file) { | |
| 302 BufferedInputStream ins = null; | |
| 303 | |
| 304 try { | |
| 305 ins = new BufferedInputStream(new FileInputStream(file)); | |
| 306 return isFITSFile(ins); | |
| 307 } catch (IOException ex) { | |
| 308 } | |
| 309 | |
| 310 return false; | |
| 311 } | |
| 312 | |
| 313 private static boolean isFITSFile(InputStream ins) { | |
| 314 boolean isFITS = false; | |
| 315 | |
| 316 // number of header bytes read for identification: | |
| 317 int magicWordLength = 6; | |
| 318 String magicWord = "SIMPLE"; | |
| 319 | |
| 320 try { | |
| 321 byte[] b = new byte[magicWordLength]; | |
| 322 logger.fine("attempting to read "+magicWordLength+" bytes from the FITS format candidate stream."); | |
| 323 if (ins.read(b, 0, magicWordLength) != magicWordLength) { | |
| 324 throw new IOException(); | |
| 325 } | |
| 326 | |
| 327 if (magicWord.equals(new String(b))) { | |
| 328 logger.fine("yes, this is FITS file!"); | |
| 329 isFITS = true; | |
| 330 } | |
| 331 } catch (IOException ex) { | |
| 332 isFITS = false; | |
| 333 } finally { | |
| 334 if (ins != null) { | |
| 335 try { | |
| 336 ins.close(); | |
| 337 } catch (Exception e) { | |
| 338 } | |
| 339 } | |
| 340 } | |
| 341 | |
| 342 return isFITS; | |
| 343 } | |
| 344 | |
| 345 private static boolean isGraphMLFile(File file) { | |
| 346 boolean isGraphML = false; | |
| 347 logger.fine("begin isGraphMLFile()"); | |
| 348 try{ | |
| 349 FileReader fileReader = new FileReader(file); | |
| 350 javax.xml.stream.XMLInputFactory xmlif = javax.xml.stream.XMLInputFactory.newInstance(); | |
| 351 xmlif.setProperty("javax.xml.stream.isCoalescing", java.lang.Boolean.TRUE); | |
| 352 | |
| 353 XMLStreamReader xmlr = xmlif.createXMLStreamReader(fileReader); | |
| 354 for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) { | |
| 355 if (event == XMLStreamConstants.START_ELEMENT) { | |
| 356 if (xmlr.getLocalName().equals("graphml")) { | |
| 357 String schema = xmlr.getAttributeValue("http://www.w3.org/2001/XMLSchema-instance", "schemaLocation"); | |
| 358 logger.fine("schema = "+schema); | |
| 359 if (schema!=null && schema.indexOf("http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd")!=-1){ | |
| 360 logger.fine("graphML is true"); | |
| 361 isGraphML = true; | |
| 362 } | |
| 363 } | |
| 364 break; | |
| 365 } | |
| 366 } | |
| 367 } catch(XMLStreamException e) { | |
| 368 logger.fine("XML error - this is not a valid graphML file."); | |
| 369 isGraphML = false; | |
| 370 } catch(IOException e) { | |
| 371 throw new EJBException(e); | |
| 372 } | |
| 373 logger.fine("end isGraphML()"); | |
| 374 return isGraphML; | |
| 375 } | |
| 376 | |
| 377 /** | |
| 378 * The number of bytes in a kilobyte, megabyte and gigabyte: | |
| 379 */ | |
| 380 public static final long ONE_KB = 1024; | |
| 381 public static final long ONE_MB = ONE_KB * ONE_KB; | |
| 382 public static final long ONE_GB = ONE_KB * ONE_MB; | |
| 383 | |
| 384 public static String getFriendlySize(Long filesize) { | |
| 385 if (filesize == null || filesize.longValue() < 0) { | |
| 386 return "unknown"; | |
| 387 } | |
| 388 | |
| 389 long bytesize = filesize.longValue(); | |
| 390 String displaySize; | |
| 391 | |
| 392 if (bytesize / ONE_GB > 0) { | |
| 393 displaySize = String.valueOf(bytesize / ONE_GB) + "." + String.valueOf((bytesize % ONE_GB) / (100 * ONE_MB)) + " GB"; | |
| 394 } else if (bytesize / ONE_MB > 0) { | |
| 395 displaySize = String.valueOf(bytesize / ONE_MB) + "." + String.valueOf((bytesize % ONE_MB) / (100 * ONE_KB)) + " MB"; | |
| 396 } else if (bytesize / ONE_KB > 0) { | |
| 397 displaySize = String.valueOf(bytesize / ONE_KB) + "." + String.valueOf((bytesize % ONE_KB) / 100) + " KB"; | |
| 398 } else { | |
| 399 displaySize = String.valueOf(bytesize) + " bytes"; | |
| 400 } | |
| 401 return displaySize; | |
| 402 | |
| 403 } | |
| 404 } |
