comparison src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @ 10:a50cf11e5178

Rewrite LGDataverse completely upgrading to dataverse4.0
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Tue, 08 Sep 2015 17:00:21 +0200
parents
children
comparison
equal deleted inserted replaced
9:5926d6419569 10:a50cf11e5178
1 /*
2 Copyright (C) 2005-2012, by the President and Fellows of Harvard College.
3
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15
16 Dataverse Network - A web application to share, preserve and analyze research data.
17 Developed at the Institute for Quantitative Social Science, Harvard University.
18 Version 3.0.
19 */
20
21 package edu.harvard.iq.dataverse.util;
22
23 import edu.harvard.iq.dataverse.DataFile;
24 import edu.harvard.iq.dataverse.ingest.IngestableDataChecker;
25 import java.io.BufferedInputStream;
26 import java.io.File;
27 import java.io.FileInputStream;
28 import java.io.FileOutputStream;
29 import java.io.FileReader;
30 import java.io.IOException;
31 import java.io.InputStream;
32 import java.util.ResourceBundle;
33 import java.util.MissingResourceException;
34 import java.nio.channels.FileChannel;
35 import java.nio.channels.WritableByteChannel;
36 import java.util.HashMap;
37 import java.util.Map;
38 import java.util.logging.Logger;
39 import javax.activation.MimetypesFileTypeMap;
40 import javax.ejb.EJBException;
41 import javax.xml.stream.XMLStreamConstants;
42 import javax.xml.stream.XMLStreamException;
43 import javax.xml.stream.XMLStreamReader;
44 import java.util.zip.GZIPInputStream;
45
46 /**
47 * a 4.0 implementation of the DVN FileUtil;
48 * it provides some of the functionality from the 3.6 implementation,
49 * but the old code is ported creatively on the method-by-method basis.
50 *
51 * @author Leonid Andreev
52 */
53 public class FileUtil implements java.io.Serializable {
/** Logger shared by all static helpers in this class. */
private static final Logger logger = Logger.getLogger(FileUtil.class.getCanonicalName());

/** Format identifiers passed to IngestableDataChecker when probing for tabular data. */
private static final String[] TABULAR_DATA_FORMAT_SET = {"POR", "SAV", "DTA", "RDA"};

/*
 * The following are Stata, SAS and SPSS syntax/control cards:
 * These are recognized as text files (because they are!) so
 * we check all the uploaded "text/plain" files for these extensions, and
 * assign the following types when they are matched;
 * Note that these types are only used in the metadata displayed on the
 * dataset page. We don't support ingest on control cards.
 * -- L.A. 4.0 Oct. 2014
 *
 * Keys are lower-case file extensions (without the dot); values are the
 * mime types assigned to matching files. The reference is final so that
 * the lookup table cannot be swapped out after class initialization.
 */
private static final Map<String, String> STATISTICAL_SYNTAX_FILE_EXTENSION = new HashMap<String, String>();

static {
    STATISTICAL_SYNTAX_FILE_EXTENSION.put("do", "application/x-stata-syntax");
    STATISTICAL_SYNTAX_FILE_EXTENSION.put("sas", "application/x-sas-syntax");
    STATISTICAL_SYNTAX_FILE_EXTENSION.put("sps", "application/x-spss-syntax");
}

/** Extension-to-mime-type lookup used when content-based detection is inconclusive. */
private static final MimetypesFileTypeMap MIME_TYPE_MAP = new MimetypesFileTypeMap();
77
78 public FileUtil() {
79 }
80
81 public static void copyFile(File inputFile, File outputFile) throws IOException {
82 FileChannel in = null;
83 WritableByteChannel out = null;
84
85 try {
86 in = new FileInputStream(inputFile).getChannel();
87 out = new FileOutputStream(outputFile).getChannel();
88 long bytesPerIteration = 50000;
89 long start = 0;
90 while ( start < in.size() ) {
91 in.transferTo(start, bytesPerIteration, out);
92 start += bytesPerIteration;
93 }
94
95 } finally {
96 if (in != null) { in.close(); }
97 if (out != null) { out.close(); }
98 }
99 }
100
101
102 public static String getFileExtension(String fileName){
103 String ext = null;
104 if ( fileName.lastIndexOf(".") != -1){
105 ext = (fileName.substring( fileName.lastIndexOf(".") + 1 )).toLowerCase();
106 }
107 return ext;
108 }
109
110 public static String replaceExtension(String originalName) {
111 return replaceExtension(originalName, "tab");
112 }
113
114 public static String replaceExtension(String originalName, String newExtension) {
115 int extensionIndex = originalName.lastIndexOf(".");
116 if (extensionIndex != -1 ) {
117 return originalName.substring(0, extensionIndex) + "."+newExtension ;
118 } else {
119 return originalName +"."+newExtension ;
120 }
121 }
122
123 public static String getUserFriendlyFileType(DataFile dataFile) {
124 String fileType = dataFile.getContentType();
125
126 if (fileType != null) {
127 if (fileType.equalsIgnoreCase(ShapefileHandler.SHAPEFILE_FILE_TYPE)){
128 return ShapefileHandler.SHAPEFILE_FILE_TYPE_FRIENDLY_NAME;
129 }
130 if (fileType.indexOf(";") != -1) {
131 fileType = fileType.substring(0, fileType.indexOf(";"));
132 }
133 try {
134 return ResourceBundle.getBundle("MimeTypeDisplay").getString(fileType);
135 } catch (MissingResourceException e) {
136 return fileType;
137 }
138 }
139
140 return fileType;
141 }
142
143 public static String getFacetFileType(DataFile dataFile) {
144 String fileType = dataFile.getContentType();
145
146 if (fileType != null) {
147 if (fileType.indexOf(";") != -1) {
148 fileType = fileType.substring(0, fileType.indexOf(";"));
149 }
150
151 try {
152 return ResourceBundle.getBundle("MimeTypeFacets").getString(fileType);
153 } catch (MissingResourceException e) {
154 // if there's no defined "facet-friendly" form of this mime type
155 // we'll truncate the available type by "/", e.g., all the
156 // unknown image/* types will become "image"; many other, quite
157 // different types will all become "application" this way -
158 // but it is probably still better than to tag them all as
159 // "uknown".
160 // -- L.A. 4.0 alpha 1
161 return fileType.split("/")[0];
162 }
163 }
164
165 return "unknown";
166 }
167
168 public static String getUserFriendlyOriginalType(DataFile dataFile) {
169 String fileType = dataFile.getOriginalFileFormat();
170
171 if (fileType != null && !fileType.equals("")) {
172 if (fileType.indexOf(";") != -1) {
173 fileType = fileType.substring(0, fileType.indexOf(";"));
174 }
175 try {
176 return ResourceBundle.getBundle("MimeTypeDisplay").getString(fileType);
177 } catch (MissingResourceException e) {
178 return fileType;
179 }
180 }
181
182 return "UNKNOWN";
183 }
184
185 public static String determineFileType(File f, String fileName) throws IOException{
186 String fileType = null;
187 String fileExtension = getFileExtension(fileName);
188
189
190
191 // step 1:
192 // Apply our custom methods to try and recognize data files that can be
193 // converted to tabular data, or can be parsed for extra metadata
194 // (such as FITS).
195 logger.fine("Attempting to identify potential tabular data files;");
196 IngestableDataChecker tabChk = new IngestableDataChecker(TABULAR_DATA_FORMAT_SET);
197
198 fileType = tabChk.detectTabularDataFormat(f);
199
200 logger.fine("determineFileType: tabular data checker found "+fileType);
201
202 // step 2: If not found, check if graphml or FITS
203 if (fileType==null) {
204 if (isGraphMLFile(f)) {
205 fileType = "text/xml-graphml";
206 } else // Check for FITS:
207 // our check is fairly weak (it appears to be hard to really
208 // really recognize a FITS file without reading the entire
209 // stream...), so in version 3.* we used to nsist on *both*
210 // the ".fits" extension and the header check;
211 // in 4.0, we'll accept either the extension, or the valid
212 // magic header:
213 if (isFITSFile(f) || (fileExtension != null
214 && fileExtension.equalsIgnoreCase("fits"))) {
215 fileType = "application/fits";
216 }
217 }
218
219 // step 3: check the mime type of this file with Jhove
220 if (fileType == null){
221 JhoveFileType jw = new JhoveFileType();
222 fileType = jw.getFileMimeType(f);
223 }
224
225 // step 4:
226 // Additional processing; if we haven't gotten much useful information
227 // back from Jhove, we'll try and make an educated guess based on
228 // the file extension:
229
230 if ( fileExtension != null) {
231 logger.fine("fileExtension="+fileExtension);
232
233 if (fileType != null && fileType.startsWith("text/plain")){
234 if (( fileExtension != null) && (STATISTICAL_SYNTAX_FILE_EXTENSION.containsKey(fileExtension))) {
235 // replace the mime type with the value of the HashMap
236 fileType = STATISTICAL_SYNTAX_FILE_EXTENSION.get(fileExtension);
237 }
238 } else if ("application/octet-stream".equals(fileType)) {
239 fileType = determineFileType(fileName);
240 logger.fine("mime type recognized by extension: "+fileType);
241 }
242 } else {
243 logger.fine("fileExtension is null");
244 }
245
246 // step 5:
247 // if this is a compressed file - zip or gzip - we'll check the
248 // file(s) inside the compressed stream and see if it's one of our
249 // recognized formats that we want to support compressed:
250
251 if ("application/x-gzip".equals(fileType)) {
252 logger.fine("we'll run additional checks on this gzipped file.");
253 // We want to be able to support gzipped FITS files, same way as
254 // if they were just regular FITS files:
255 FileInputStream gzippedIn = new FileInputStream(f);
256 // (new FileInputStream() can throw a "filen not found" exception;
257 // however, if we've made it this far, it really means that the
258 // file does exist and can be opened)
259 InputStream uncompressedIn = null;
260 try {
261 uncompressedIn = new GZIPInputStream(gzippedIn);
262 if (isFITSFile(uncompressedIn)) {
263 fileType = "application/fits-gzipped";
264 }
265 } catch (IOException ioex) {
266 if (uncompressedIn != null) {
267 try {uncompressedIn.close();} catch (IOException e) {}
268 }
269 }
270 }
271 if ("application/zip".equals(fileType)) {
272
273 // Is this a zipped Shapefile?
274 // Check for shapefile extensions as described here: http://en.wikipedia.org/wiki/Shapefile
275 //logger.info("Checking for shapefile");
276
277 ShapefileHandler shp_handler = new ShapefileHandler(new FileInputStream(f));
278 if (shp_handler.containsShapefile()){
279 // logger.info("------- shapefile FOUND ----------");
280 fileType = ShapefileHandler.SHAPEFILE_FILE_TYPE; //"application/zipped-shapefile";
281 }
282 }
283
284 logger.fine("returning fileType "+fileType);
285 return fileType;
286 }
287
288 public static String determineFileType(String fileName) {
289 return MIME_TYPE_MAP.getContentType(fileName);
290 }
291
292
293 /*
294 * Custom method for identifying FITS files:
295 * TODO:
296 * the existing check for the "magic header" is very weak (see below);
297 * it should probably be replaced by attempting to parse and read at
298 * least the primary HDU, using the NOM fits parser.
299 * -- L.A. 4.0 alpha
300 */
301 private static boolean isFITSFile(File file) {
302 BufferedInputStream ins = null;
303
304 try {
305 ins = new BufferedInputStream(new FileInputStream(file));
306 return isFITSFile(ins);
307 } catch (IOException ex) {
308 }
309
310 return false;
311 }
312
313 private static boolean isFITSFile(InputStream ins) {
314 boolean isFITS = false;
315
316 // number of header bytes read for identification:
317 int magicWordLength = 6;
318 String magicWord = "SIMPLE";
319
320 try {
321 byte[] b = new byte[magicWordLength];
322 logger.fine("attempting to read "+magicWordLength+" bytes from the FITS format candidate stream.");
323 if (ins.read(b, 0, magicWordLength) != magicWordLength) {
324 throw new IOException();
325 }
326
327 if (magicWord.equals(new String(b))) {
328 logger.fine("yes, this is FITS file!");
329 isFITS = true;
330 }
331 } catch (IOException ex) {
332 isFITS = false;
333 } finally {
334 if (ins != null) {
335 try {
336 ins.close();
337 } catch (Exception e) {
338 }
339 }
340 }
341
342 return isFITS;
343 }
344
345 private static boolean isGraphMLFile(File file) {
346 boolean isGraphML = false;
347 logger.fine("begin isGraphMLFile()");
348 try{
349 FileReader fileReader = new FileReader(file);
350 javax.xml.stream.XMLInputFactory xmlif = javax.xml.stream.XMLInputFactory.newInstance();
351 xmlif.setProperty("javax.xml.stream.isCoalescing", java.lang.Boolean.TRUE);
352
353 XMLStreamReader xmlr = xmlif.createXMLStreamReader(fileReader);
354 for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
355 if (event == XMLStreamConstants.START_ELEMENT) {
356 if (xmlr.getLocalName().equals("graphml")) {
357 String schema = xmlr.getAttributeValue("http://www.w3.org/2001/XMLSchema-instance", "schemaLocation");
358 logger.fine("schema = "+schema);
359 if (schema!=null && schema.indexOf("http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd")!=-1){
360 logger.fine("graphML is true");
361 isGraphML = true;
362 }
363 }
364 break;
365 }
366 }
367 } catch(XMLStreamException e) {
368 logger.fine("XML error - this is not a valid graphML file.");
369 isGraphML = false;
370 } catch(IOException e) {
371 throw new EJBException(e);
372 }
373 logger.fine("end isGraphML()");
374 return isGraphML;
375 }
376
377 /**
378 * The number of bytes in a kilobyte, megabyte and gigabyte:
379 */
380 public static final long ONE_KB = 1024;
381 public static final long ONE_MB = ONE_KB * ONE_KB;
382 public static final long ONE_GB = ONE_KB * ONE_MB;
383
384 public static String getFriendlySize(Long filesize) {
385 if (filesize == null || filesize.longValue() < 0) {
386 return "unknown";
387 }
388
389 long bytesize = filesize.longValue();
390 String displaySize;
391
392 if (bytesize / ONE_GB > 0) {
393 displaySize = String.valueOf(bytesize / ONE_GB) + "." + String.valueOf((bytesize % ONE_GB) / (100 * ONE_MB)) + " GB";
394 } else if (bytesize / ONE_MB > 0) {
395 displaySize = String.valueOf(bytesize / ONE_MB) + "." + String.valueOf((bytesize % ONE_MB) / (100 * ONE_KB)) + " MB";
396 } else if (bytesize / ONE_KB > 0) {
397 displaySize = String.valueOf(bytesize / ONE_KB) + "." + String.valueOf((bytesize % ONE_KB) / 100) + " KB";
398 } else {
399 displaySize = String.valueOf(bytesize) + " bytes";
400 }
401 return displaySize;
402
403 }
404 }