comparison src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java @ 10:a50cf11e5178

Rewrite LGDataverse completely upgrading to dataverse4.0
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Tue, 08 Sep 2015 17:00:21 +0200
parents
children
comparison
equal deleted inserted replaced
9:5926d6419569 10:a50cf11e5178
1 /*
2 * To change this license header, choose License Headers in Project Properties.
3 * To change this template file, choose Tools | Templates
4 * and open the template in the editor.
5 */
6
7 package edu.harvard.iq.dataverse;
8
9 import edu.harvard.iq.dataverse.authorization.Permission;
10 import edu.harvard.iq.dataverse.authorization.users.User;
11 import edu.harvard.iq.dataverse.dataaccess.ImageThumbConverter;
12 import java.util.Collections;
13 import java.util.Date;
14 import java.util.List;
15 import java.util.UUID;
16 import java.util.logging.Level;
17 import java.util.logging.Logger;
18 import javax.ejb.EJB;
19 import javax.ejb.Stateless;
20 import javax.inject.Named;
21 import javax.persistence.EntityManager;
22 import javax.persistence.PersistenceContext;
23 import javax.persistence.Query;
24
25 /**
26 *
27 * @author Leonid Andreev
28 *
29 * Basic skeleton of the new DataFile service for DVN 4.0
30 *
31 */
32
33 @Stateless
34 @Named
35 public class DataFileServiceBean implements java.io.Serializable {
36
37 private static final Logger logger = Logger.getLogger(DataFileServiceBean.class.getCanonicalName());
38 @EJB
39 DatasetServiceBean datasetService;
40 @EJB
41 PermissionServiceBean permissionService;
42
43 @PersistenceContext(unitName = "VDCNet-ejbPU")
44 private EntityManager em;
45
46 // File type "classes" tags:
47
48 private static final String FILE_CLASS_AUDIO = "audio";
49 private static final String FILE_CLASS_CODE = "code";
50 private static final String FILE_CLASS_DOCUMENT = "document";
51 private static final String FILE_CLASS_ASTRO = "astro";
52 private static final String FILE_CLASS_IMAGE = "image";
53 private static final String FILE_CLASS_NETWORK = "network";
54 private static final String FILE_CLASS_GEO = "geodata";
55 private static final String FILE_CLASS_TABULAR = "tabular";
56 private static final String FILE_CLASS_VIDEO = "video";
57 private static final String FILE_CLASS_OTHER = "other";
58
59 // Assorted useful mime types:
60
61 // 3rd-party and/or proprietary tabular data formats that we know
62 // how to ingest:
63
64 private static final String MIME_TYPE_STATA = "application/x-stata";
65 private static final String MIME_TYPE_STATA13 = "application/x-stata-13";
66 private static final String MIME_TYPE_RDATA = "application/x-rlang-transport";
67 private static final String MIME_TYPE_CSV = "text/csv";
68 private static final String MIME_TYPE_CSV_ALT = "text/comma-separated-values";
69 private static final String MIME_TYPE_XLSX = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
70 private static final String MIME_TYPE_SPSS_SAV = "application/x-spss-sav";
71 private static final String MIME_TYPE_SPSS_POR = "application/x-spss-por";
72
73 // Tabular data formats we don't know how to ingest, but still recognize
74 // as "tabular data":
75 // TODO: - add more to this list? -- L.A. 4.0 beta13
76
77 private static final String MIME_TYPE_TAB = "text/tab-separated-values";
78 private static final String MIME_TYPE_FIXED_FIELD = "text/x-fixed-field";
79 private static final String MIME_TYPE_SAS_TRANSPORT = "application/x-sas-transport";
80 private static final String MIME_TYPE_SAS_SYSTEM = "application/x-sas-system";
81
82 // The following are the "control card/syntax" formats that we recognize
83 // as "code":
84
85 private static final String MIME_TYPE_R_SYNTAX = "application/x-r-syntax";
86 private static final String MIME_TYPE_STATA_SYNTAX = "text/x-stata-syntax";
87 private static final String MIME_TYPE_SPSS_CCARD = "text/x-spss-syntax";
88 private static final String MIME_TYPE_SAS_SYNTAX = "text/x-sas-syntax";
89
90 // The types recognized as "documents":
91 // TODO: there has to be more! -- L.A. 4.0 beta13
92
93 private static final String MIME_TYPE_PLAIN_TEXT = "text/plain";
94 private static final String MIME_TYPE_DOCUMENT_PDF = "application/pdf";
95 private static final String MIME_TYPE_DOCUMENT_MSWORD = "application/msword";
96 private static final String MIME_TYPE_DOCUMENT_MSEXCEL = "application/vnd.ms-excel";
97 private static final String MIME_TYPE_DOCUMENT_MSWORD_OPENXML = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
98
99 // Supported Astrophysics formats:
100 // (only FITS at this point)
101
102 private static final String MIME_TYPE_FITS = "application/fits";
103 private static final String MIME_TYPE_FITSIMAGE = "image/fits";
104
105 // Network Data files:
106 // (only GRAPHML at this point):
107
108 private static final String MIME_TYPE_NETWORK_GRAPHML = "text/xml-graphml";
109
110 // SHAPE file type:
111 // this is the only supported file type in the GEO DATA class:
112
113 private static final String MIME_TYPE_GEO_SHAPE = "application/zipped-shapefile";
114
115 private static final String MIME_TYPE_ZIP = "application/zip";
116
117 private static final String MIME_TYPE_UNDETERMINED_DEFAULT = "application/octet-stream";
118 private static final String MIME_TYPE_UNDETERMINED_BINARY = "application/binary";
119
120 public DataFile find(Object pk) {
121 return (DataFile) em.find(DataFile.class, pk);
122 }
123
124 /*public DataFile findByMD5(String md5Value){
125 if (md5Value == null){
126 return null;
127 }
128 Query query = em.createQuery("select object(o) from DataFile as o where o.md5 =:md5Value order by o.id");
129 query.setParameter("md5Value", md5Value);
130 return (DataFile)query.getSingleResult();
131
132 }*/
133
134 public List<DataFile> findByDatasetId(Long studyId) {
135 /*
136 Sure, we don't have *studies* any more, in 4.0; it's a tribute
137 to the past. -- L.A.
138 */
139 Query query = em.createQuery("select o from DataFile o where o.owner.id = :studyId order by o.id");
140 query.setParameter("studyId", studyId);
141 return query.getResultList();
142 }
143
144 public List<DataFile> findIngestsInProgress() {
145 if ( em.isOpen() ) {
146 Query query = em.createQuery("select object(o) from DataFile as o where o.ingestStatus =:scheduledStatusCode or o.ingestStatus =:progressStatusCode order by o.id");
147 query.setParameter("scheduledStatusCode", DataFile.INGEST_STATUS_SCHEDULED);
148 query.setParameter("progressStatusCode", DataFile.INGEST_STATUS_INPROGRESS);
149 return query.getResultList();
150 } else {
151 return Collections.emptyList();
152 }
153 }
154
155
156 public DataTable findDataTableByFileId(Long fileId) {
157 Query query = em.createQuery("select object(o) from DataTable as o where o.dataFile.id =:fileId order by o.id");
158 query.setParameter("fileId", fileId);
159 return (DataTable)query.getSingleResult();
160 }
161
162 public List<DataFile> findAll() {
163 return em.createQuery("select object(o) from DataFile as o order by o.id").getResultList();
164 }
165
166 public DataFile save(DataFile dataFile) {
167
168 DataFile savedDataFile = em.merge(dataFile);
169 return savedDataFile;
170 }
171
172 public Boolean isPreviouslyPublished(Long fileId){
173 Query query = em.createQuery("select object(o) from FileMetadata as o where o.dataFile.id =:fileId");
174 query.setParameter("fileId", fileId);
175 List retList = query.getResultList();
176 return (retList.size() > 1);
177 }
178
179 public void deleteFromVersion( DatasetVersion d, DataFile f ) {
180 em.createNamedQuery("DataFile.removeFromDatasetVersion")
181 .setParameter("versionId", d.getId()).setParameter("fileId", f.getId())
182 .executeUpdate();
183 }
184
185 public void generateStorageIdentifier(DataFile dataFile) {
186 dataFile.setFileSystemName(generateStorageIdentifier());
187 }
188
189 public String generateStorageIdentifier() {
190
191 UUID uid = UUID.randomUUID();
192
193 logger.log(Level.FINE, "UUID value: {0}", uid.toString());
194
195 // last 6 bytes, of the random UUID, in hex:
196
197 String hexRandom = uid.toString().substring(24);
198
199 logger.log(Level.FINE, "UUID (last 6 bytes, 12 hex digits): {0}", hexRandom);
200
201 String hexTimestamp = Long.toHexString(new Date().getTime());
202
203 logger.log(Level.FINE, "(not UUID) timestamp in hex: {0}", hexTimestamp);
204
205 String storageIdentifier = hexTimestamp + "-" + hexRandom;
206
207 logger.log(Level.FINE, "timestamp/UUID hybrid: {0}", storageIdentifier);
208 return storageIdentifier;
209 }
210
211 public boolean isSpssPorFile (DataFile file) {
212 return MIME_TYPE_SPSS_POR.equalsIgnoreCase(file.getContentType());
213 }
214
215 public boolean isSpssSavFile (DataFile file) {
216 return MIME_TYPE_SPSS_SAV.equalsIgnoreCase(file.getContentType());
217 }
218
219 /*
220 public boolean isSpssPorFile (FileMetadata fileMetadata) {
221 if (fileMetadata != null && fileMetadata.getDataFile() != null) {
222 return isSpssPorFile(fileMetadata.getDataFile());
223 }
224 return false;
225 }
226 */
227
228
229 /*
230 * This method tells you if thumbnail generation is *supported*
231 * on this type of file. i.e., if true, it does not guarantee that a thumbnail
232 * can/will be generated; but it means that we can try.
233 */
234 public boolean thumbnailSupported (DataFile file) {
235 if (file == null) {
236 return false;
237 }
238
239 String contentType = file.getContentType();
240
241 // Some browsers (Chrome?) seem to identify FITS files as mime
242 // type "image/fits" on upload; this is both incorrect (the official
243 // mime type for FITS is "application/fits", and problematic: then
244 // the file is identified as an image, and the page will attempt to
245 // generate a preview - which of course is going to fail...
246 if (MIME_TYPE_FITSIMAGE.equalsIgnoreCase(contentType)) {
247 return false;
248 }
249 // besides most image/* types, we can generate thumbnails for
250 // pdf and "world map" files:
251
252 return (contentType != null &&
253 (contentType.startsWith("image/") ||
254 contentType.equalsIgnoreCase("application/pdf") ||
255 contentType.equalsIgnoreCase(MIME_TYPE_GEO_SHAPE)));
256 }
257
258 /*
259 * This method will return true if the thumbnail is *actually available* and
260 * ready to be downloaded. (it will try to generate a thumbnail for supported
261 * file types, if not yet available)
262 */
263 public boolean isThumbnailAvailable (DataFile file, User user) {
264 if (file == null) {
265 return false;
266 }
267
268 // If thumbnails are not even supported for this class of files,
269 // there's notthing to talk about:
270
271 if (!thumbnailSupported(file)) {
272 return false;
273 }
274
275 // Also, thumbnails are only shown to users who have permission to see
276 // the full-size image file. So before we do anything else, let's
277 // do some authentication and authorization:
278 if (!permissionService.userOn(user, file).has(Permission.DownloadFile)) {
279 logger.fine("No permission to download the file.");
280 return false;
281 }
282
283
284
285 return ImageThumbConverter.isThumbnailAvailable(file);
286 }
287
288
289 // TODO:
290 // Document this.
291 // -- L.A. 4.0 beta14
292
293 public boolean isTemporaryPreviewAvailable(String fileSystemId, String mimeType) {
294
295 String filesRootDirectory = System.getProperty("dataverse.files.directory");
296 if (filesRootDirectory == null || filesRootDirectory.equals("")) {
297 filesRootDirectory = "/tmp/files";
298 }
299
300 String fileSystemName = filesRootDirectory + "/temp/" + fileSystemId;
301
302 String imageThumbFileName = null;
303
304 if ("application/pdf".equals(mimeType)) {
305 imageThumbFileName = ImageThumbConverter.generatePDFThumb(fileSystemName);
306 } else if (mimeType != null && mimeType.startsWith("image/")) {
307 imageThumbFileName = ImageThumbConverter.generateImageThumb(fileSystemName);
308 }
309
310 if (imageThumbFileName != null) {
311 return true;
312 }
313
314 return false;
315 }
316
317 /*
318 * TODO:
319 * similar method, but for non-default thumbnail sizes:
320 */
321
322 public boolean isThumbnailAvailableForSize (DataFile file) {
323 return false;
324 }
325
326 public boolean ingestableAsTabular(DataFile dataFile) {
327 /*
328 * In the final 4.0 we'll be doing real-time checks, going through the
329 * available plugins and verifying the lists of mime types that they
330 * can handle. In 4.0 beta, the ingest plugins are still built into the
331 * main code base, so we can just go through a hard-coded list of mime
332 * types. -- L.A.
333 */
334
335 String mimeType = dataFile.getContentType();
336
337 if (mimeType == null) {
338 return false;
339 }
340
341 if (mimeType.equals(MIME_TYPE_STATA)) {
342 return true;
343 } else if (mimeType.equals(MIME_TYPE_STATA13)) {
344 return true;
345 } else if (mimeType.equals(MIME_TYPE_RDATA)) {
346 return true;
347 } else if (mimeType.equals(MIME_TYPE_CSV) || mimeType.equals(MIME_TYPE_CSV_ALT)) {
348 return true;
349 } else if (mimeType.equals(MIME_TYPE_XLSX)) {
350 return true;
351 } else if (mimeType.equals(MIME_TYPE_SPSS_SAV)) {
352 return true;
353 } else if (mimeType.equals(MIME_TYPE_SPSS_POR)) {
354 return true;
355 }
356
357 return false;
358 }
359
360 /*
361 * Methods for identifying "classes" (groupings) of files by type:
362 */
363
364 public String getFileClassById (Long fileId) {
365 DataFile file = find(fileId);
366
367 if (file == null) {
368 return null;
369 }
370
371 return getFileClass(file);
372 }
373
374 public String getFileClass (DataFile file) {
375 if (isFileClassImage(file)) {
376 return FILE_CLASS_IMAGE;
377 }
378
379 if (isFileClassVideo(file)) {
380 return FILE_CLASS_VIDEO;
381 }
382
383 if (isFileClassAudio(file)) {
384 return FILE_CLASS_AUDIO;
385 }
386
387 if (isFileClassCode(file)) {
388 return FILE_CLASS_CODE;
389 }
390
391 if (isFileClassDocument(file)) {
392 return FILE_CLASS_DOCUMENT;
393 }
394
395 if (isFileClassAstro(file)) {
396 return FILE_CLASS_ASTRO;
397 }
398
399 if (isFileClassNetwork(file)) {
400 return FILE_CLASS_NETWORK;
401 }
402
403 if (isFileClassGeo(file)) {
404 return FILE_CLASS_GEO;
405 }
406
407 if (isFileClassTabularData(file)) {
408 return FILE_CLASS_TABULAR;
409 }
410
411
412 return FILE_CLASS_OTHER;
413 }
414
415
416
417 public boolean isFileClassImage (DataFile file) {
418 if (file == null) {
419 return false;
420 }
421
422 String contentType = file.getContentType();
423
424 // Some browsers (Chrome?) seem to identify FITS files as mime
425 // type "image/fits" on upload; this is both incorrect (the official
426 // mime type for FITS is "application/fits", and problematic: then
427 // the file is identified as an image, and the page will attempt to
428 // generate a preview - which of course is going to fail...
429
430 if (MIME_TYPE_FITSIMAGE.equalsIgnoreCase(contentType)) {
431 return false;
432 }
433 // besides most image/* types, we can generate thumbnails for
434 // pdf and "world map" files:
435
436 return (contentType != null && (contentType.toLowerCase().startsWith("image/")));
437 }
438
439 public boolean isFileClassAudio (DataFile file) {
440 if (file == null) {
441 return false;
442 }
443
444 String contentType = file.getContentType();
445
446 // TODO:
447 // verify that there are no audio types that don't start with "audio/" -
448 // some exotic mp[34]... ?
449
450 return (contentType != null && (contentType.toLowerCase().startsWith("audio/")));
451 }
452
453 public boolean isFileClassCode (DataFile file) {
454 if (file == null) {
455 return false;
456 }
457
458 String contentType = file.getContentType();
459
460 // The following are the "control card/syntax" formats that we recognize
461 // as "code":
462
463 return (MIME_TYPE_R_SYNTAX.equalsIgnoreCase(contentType)
464 || MIME_TYPE_STATA_SYNTAX.equalsIgnoreCase(contentType)
465 || MIME_TYPE_SAS_SYNTAX.equalsIgnoreCase(contentType)
466 || MIME_TYPE_SPSS_CCARD.equalsIgnoreCase(contentType));
467
468 }
469
470 public boolean isFileClassDocument (DataFile file) {
471 if (file == null) {
472 return false;
473 }
474
475 // "Documents": PDF, assorted MS docs, etc.
476
477 String contentType = file.getContentType();
478 int scIndex = 0;
479 if (contentType != null && (scIndex = contentType.indexOf(';')) > 0) {
480 contentType = contentType.substring(0, scIndex);
481 }
482
483 return (MIME_TYPE_PLAIN_TEXT.equalsIgnoreCase(contentType)
484 || MIME_TYPE_DOCUMENT_PDF.equalsIgnoreCase(contentType)
485 || MIME_TYPE_DOCUMENT_MSWORD.equalsIgnoreCase(contentType)
486 || MIME_TYPE_DOCUMENT_MSEXCEL.equalsIgnoreCase(contentType)
487 || MIME_TYPE_DOCUMENT_MSWORD_OPENXML.equalsIgnoreCase(contentType));
488
489 }
490
491 public boolean isFileClassAstro (DataFile file) {
492 if (file == null) {
493 return false;
494 }
495
496 String contentType = file.getContentType();
497
498 // The only known/supported "Astro" file type is FITS,
499 // so far:
500
501 return (MIME_TYPE_FITS.equalsIgnoreCase(contentType) || MIME_TYPE_FITSIMAGE.equalsIgnoreCase(contentType));
502
503 }
504
505 public boolean isFileClassNetwork (DataFile file) {
506 if (file == null) {
507 return false;
508 }
509
510 String contentType = file.getContentType();
511
512 // The only known/supported Network Data type is GRAPHML,
513 // so far:
514
515 return MIME_TYPE_NETWORK_GRAPHML.equalsIgnoreCase(contentType);
516
517 }
518
519 /*
520 * we don't really need a method for "other" -
521 * it's "other" if it fails to identify as any specific class...
522 * (or do we?)
523 public boolean isFileClassOther (DataFile file) {
524 if (file == null) {
525 return false;
526 }
527
528 }
529 */
530
531 public boolean isFileClassGeo (DataFile file) {
532 if (file == null) {
533 return false;
534 }
535
536 String contentType = file.getContentType();
537
538 // The only known/supported Geo Data type is SHAPE,
539 // so far:
540
541 return MIME_TYPE_GEO_SHAPE.equalsIgnoreCase(contentType);
542 }
543
544 public boolean isFileClassTabularData (DataFile file) {
545 if (file == null) {
546 return false;
547 }
548
549 // "Tabular data" is EITHER an INGESTED tabular data file, i.e.
550 // a file with a DataTable and DataVariables; or a DataFile
551 // of one of the many known tabular data formats - SPSS, Stata, etc.
552 // that for one reason or another didn't get ingested:
553
554 if (file.isTabularData()) {
555 return true;
556 }
557
558 // The formats we know how to ingest:
559 if (ingestableAsTabular(file)) {
560 return true;
561 }
562
563 String contentType = file.getContentType();
564
565 // And these are the formats we DON'T know how to ingest,
566 // but nevertheless recognize as "tabular data":
567
568 return (MIME_TYPE_TAB.equalsIgnoreCase(contentType)
569 || MIME_TYPE_FIXED_FIELD.equalsIgnoreCase(contentType)
570 || MIME_TYPE_SAS_TRANSPORT.equalsIgnoreCase(contentType)
571 || MIME_TYPE_SAS_SYSTEM.equalsIgnoreCase(contentType));
572
573 }
574
575 public boolean isFileClassVideo (DataFile file) {
576 if (file == null) {
577 return false;
578 }
579
580 String contentType = file.getContentType();
581
582 // TODO:
583 // check if there are video types that don't start with "audio/" -
584 // some exotic application/... formats ?
585
586 return (contentType != null && (contentType.toLowerCase().startsWith("video/")));
587
588 }
589
590 }