diff src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java @ 10:a50cf11e5178

Rewrite LGDataverse completely upgrading to dataverse4.0
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Tue, 08 Sep 2015 17:00:21 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java	Tue Sep 08 17:00:21 2015 +0200
@@ -0,0 +1,590 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package edu.harvard.iq.dataverse;
+
+import edu.harvard.iq.dataverse.authorization.Permission;
+import edu.harvard.iq.dataverse.authorization.users.User;
+import edu.harvard.iq.dataverse.dataaccess.ImageThumbConverter;
+import java.util.Collections;
+import java.util.Date;
+import java.util.List;
+import java.util.UUID;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import javax.ejb.EJB;
+import javax.ejb.Stateless;
+import javax.inject.Named;
+import javax.persistence.EntityManager;
+import javax.persistence.PersistenceContext;
+import javax.persistence.Query;
+
+/**
+ *
+ * @author Leonid Andreev
+ * 
+ * Basic skeleton of the new DataFile service for DVN 4.0
+ * 
+ */
+
+@Stateless
+@Named
+public class DataFileServiceBean implements java.io.Serializable {
+    
+    // Class-wide java.util.logging logger, named after this class.
+    private static final Logger logger = Logger.getLogger(DataFileServiceBean.class.getCanonicalName());
+    // Container-injected dataset service; not referenced by any method in this class.
+    @EJB
+    DatasetServiceBean datasetService;
+    // Container-injected permission service; used by isThumbnailAvailable() 
+    // to check the DownloadFile permission.
+    @EJB
+    PermissionServiceBean permissionService;
+
+    // JPA entity manager for the "VDCNet-ejbPU" persistence unit; all the 
+    // queries in this bean go through it.
+    @PersistenceContext(unitName = "VDCNet-ejbPU")
+    private EntityManager em;
+    
+    // File type "classes" tags:
+    // (these are the values returned by getFileClass())
+    
+    private static final String FILE_CLASS_AUDIO = "audio";
+    private static final String FILE_CLASS_CODE = "code";
+    private static final String FILE_CLASS_DOCUMENT = "document";
+    private static final String FILE_CLASS_ASTRO = "astro";
+    private static final String FILE_CLASS_IMAGE = "image";
+    private static final String FILE_CLASS_NETWORK = "network";
+    private static final String FILE_CLASS_GEO = "geodata";
+    private static final String FILE_CLASS_TABULAR = "tabular";
+    private static final String FILE_CLASS_VIDEO = "video";
+    private static final String FILE_CLASS_OTHER = "other";
+
+    // Assorted useful mime types:
+    
+    // 3rd-party and/or proprietary tabular data formats that we know
+    // how to ingest: 
+    
+    private static final String MIME_TYPE_STATA = "application/x-stata";
+    private static final String MIME_TYPE_STATA13 = "application/x-stata-13";
+    private static final String MIME_TYPE_RDATA = "application/x-rlang-transport";
+    private static final String MIME_TYPE_CSV   = "text/csv";
+    private static final String MIME_TYPE_CSV_ALT = "text/comma-separated-values";
+    private static final String MIME_TYPE_XLSX  = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
+    private static final String MIME_TYPE_SPSS_SAV = "application/x-spss-sav";
+    private static final String MIME_TYPE_SPSS_POR = "application/x-spss-por";
+    
+    // Tabular data formats we don't know how to ingest, but still recognize
+    // as "tabular data":
+    // TODO: - add more to this list? -- L.A. 4.0 beta13
+    
+    private static final String MIME_TYPE_TAB   = "text/tab-separated-values";
+    private static final String MIME_TYPE_FIXED_FIELD = "text/x-fixed-field";
+    private static final String MIME_TYPE_SAS_TRANSPORT = "application/x-sas-transport";
+    private static final String MIME_TYPE_SAS_SYSTEM = "application/x-sas-system";
+    
+    // The following are the "control card/syntax" formats that we recognize 
+    // as "code":
+    
+    private static final String MIME_TYPE_R_SYNTAX = "application/x-r-syntax";
+    private static final String MIME_TYPE_STATA_SYNTAX = "text/x-stata-syntax";
+    private static final String MIME_TYPE_SPSS_CCARD = "text/x-spss-syntax";
+    private static final String MIME_TYPE_SAS_SYNTAX = "text/x-sas-syntax";
+
+    // The types recognized as "documents":
+    // TODO: there has to be more! -- L.A. 4.0 beta13
+    
+    private static final String MIME_TYPE_PLAIN_TEXT = "text/plain";
+    private static final String MIME_TYPE_DOCUMENT_PDF = "application/pdf";
+    private static final String MIME_TYPE_DOCUMENT_MSWORD = "application/msword";
+    private static final String MIME_TYPE_DOCUMENT_MSEXCEL = "application/vnd.ms-excel";
+    private static final String MIME_TYPE_DOCUMENT_MSWORD_OPENXML = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
+    
+    // Supported Astrophysics formats: 
+    // (only FITS at this point)
+    
+    private static final String MIME_TYPE_FITS  = "application/fits";
+    private static final String MIME_TYPE_FITSIMAGE = "image/fits";
+   
+    // Network Data files: 
+    // (only GRAPHML at this point): 
+    
+    private static final String MIME_TYPE_NETWORK_GRAPHML = "text/xml-graphml";
+   
+    // SHAPE file type: 
+    // this is the only supported file type in the GEO DATA class:
+    
+    private static final String MIME_TYPE_GEO_SHAPE = "application/zipped-shapefile";
+    
+    // NOTE: the three constants below are declared but not referenced by 
+    // any method in this class.
+    
+    private static final String MIME_TYPE_ZIP   = "application/zip";
+    
+    private static final String MIME_TYPE_UNDETERMINED_DEFAULT = "application/octet-stream";
+    private static final String MIME_TYPE_UNDETERMINED_BINARY = "application/binary";
+    
+    public DataFile find(Object pk) {
+        return (DataFile) em.find(DataFile.class, pk);
+    }   
+    
+    /*public DataFile findByMD5(String md5Value){
+        if (md5Value == null){
+            return null;
+        }
+        Query query = em.createQuery("select object(o) from DataFile as o where o.md5 =:md5Value order by o.id");
+        query.setParameter("md5Value", md5Value);
+        return (DataFile)query.getSingleResult();
+        
+    }*/
+    
+    /**
+     * Lists the DataFiles whose owner has the given id, ordered by file id.
+     *
+     * @param studyId id of the owning dataset. (There are no *studies* in 4.0 
+     *                any more; the parameter name is a tribute to the past. -- L.A.)
+     * @return the matching DataFiles in ascending id order
+     */
+    public List<DataFile> findByDatasetId(Long studyId) {
+        return em.createQuery("select o from DataFile o where o.owner.id = :studyId order by o.id")
+                .setParameter("studyId", studyId)
+                .getResultList();
+    }
+
+    /**
+     * Finds the DataFiles whose ingest status is either "scheduled" or 
+     * "in progress", ordered by id.
+     *
+     * @return the matching files, or an empty list when the entity manager 
+     *         is not open
+     */
+    public List<DataFile> findIngestsInProgress() {
+        // Nothing can be queried through a closed entity manager.
+        if (!em.isOpen()) {
+            return Collections.emptyList();
+        }
+
+        Query pendingIngests = em.createQuery("select object(o) from DataFile as o where o.ingestStatus =:scheduledStatusCode or o.ingestStatus =:progressStatusCode order by o.id");
+        pendingIngests.setParameter("scheduledStatusCode", DataFile.INGEST_STATUS_SCHEDULED);
+        pendingIngests.setParameter("progressStatusCode", DataFile.INGEST_STATUS_INPROGRESS);
+        return pendingIngests.getResultList();
+    }
+    
+    
+    /**
+     * Fetches the DataTable that belongs to the given DataFile.
+     *
+     * NOTE: this relies on {@code Query.getSingleResult()}, so - per the JPA 
+     * contract - a file with no DataTable (or with more than one) surfaces 
+     * as a persistence exception rather than as a null return value.
+     *
+     * @param fileId id of the DataFile
+     * @return the single DataTable attached to that file
+     */
+    public DataTable findDataTableByFileId(Long fileId) {
+        Object onlyMatch = em.createQuery("select object(o) from DataTable as o where o.dataFile.id =:fileId order by o.id")
+                .setParameter("fileId", fileId)
+                .getSingleResult();
+        return (DataTable) onlyMatch;
+    }
+    
+    public List<DataFile> findAll() {
+        return em.createQuery("select object(o) from DataFile as o order by o.id").getResultList();
+    }
+    
+    /**
+     * Merges the given DataFile into the persistence context.
+     *
+     * @param dataFile the file to merge
+     * @return the instance handed back by {@code EntityManager.merge}; use 
+     *         this one, not the argument, for any further work
+     */
+    public DataFile save(DataFile dataFile) {
+        return em.merge(dataFile);
+    }
+    
+    /**
+     * Reports whether more than one FileMetadata record references the file.
+     * (Presumably each dataset version carries its own FileMetadata, so more 
+     * than one means the file appeared in an earlier version - TODO confirm.)
+     *
+     * @param fileId id of the DataFile
+     * @return true when two or more FileMetadata records point at the file
+     */
+    public Boolean isPreviouslyPublished(Long fileId){
+        List metadataRecords = em.createQuery("select object(o) from FileMetadata as o where o.dataFile.id =:fileId")
+                .setParameter("fileId", fileId)
+                .getResultList();
+        return metadataRecords.size() > 1;
+    }
+    
+    public void deleteFromVersion( DatasetVersion d, DataFile f ) {
+		em.createNamedQuery("DataFile.removeFromDatasetVersion")
+			.setParameter("versionId", d.getId()).setParameter("fileId", f.getId())
+				.executeUpdate();
+    }
+
+    public void generateStorageIdentifier(DataFile dataFile) {
+        dataFile.setFileSystemName(generateStorageIdentifier());
+    }
+    
+    /**
+     * Builds a storage identifier of the form 
+     * {@code <current time in millis, hex>-<last 12 hex digits of a random UUID>}.
+     * Every intermediate value is logged at FINE level.
+     *
+     * @return the newly generated identifier
+     */
+    public String generateStorageIdentifier() {
+        String uuidText = UUID.randomUUID().toString();
+        logger.log(Level.FINE, "UUID value: {0}", uuidText);
+
+        // A UUID prints as 36 characters; everything from index 24 on is 
+        // its final group: 12 hex digits, i.e. the last 6 bytes.
+        String randomTail = uuidText.substring(24);
+        logger.log(Level.FINE, "UUID (last 6 bytes, 12 hex digits): {0}", randomTail);
+
+        String timestampHex = Long.toHexString(new Date().getTime());
+        logger.log(Level.FINE, "(not UUID) timestamp in hex: {0}", timestampHex);
+
+        String hybridIdentifier = timestampHex + "-" + randomTail;
+        logger.log(Level.FINE, "timestamp/UUID hybrid: {0}", hybridIdentifier);
+
+        return hybridIdentifier;
+    }
+    
+    /**
+     * Tells whether the file's content type is SPSS Portable 
+     * ("application/x-spss-por"), ignoring case.
+     *
+     * @param file the file to test; may be null
+     * @return true for an SPSS POR file; false otherwise, including when 
+     *         the file (or its content type) is null
+     */
+    public boolean isSpssPorFile (DataFile file) {
+        // Same convention as the isFileClass*() methods: a null file is 
+        // simply "not this type", not an error.
+        if (file == null) {
+            return false;
+        }
+        return MIME_TYPE_SPSS_POR.equalsIgnoreCase(file.getContentType());
+    }
+    
+    /**
+     * Tells whether the file's content type is SPSS SAV 
+     * ("application/x-spss-sav"), ignoring case.
+     *
+     * @param file the file to test; may be null
+     * @return true for an SPSS SAV file; false otherwise, including when 
+     *         the file (or its content type) is null
+     */
+    public boolean isSpssSavFile (DataFile file) {
+        // Same convention as the isFileClass*() methods: a null file is 
+        // simply "not this type", not an error.
+        if (file == null) {
+            return false;
+        }
+        return MIME_TYPE_SPSS_SAV.equalsIgnoreCase(file.getContentType());
+    }
+    
+    /*
+    public boolean isSpssPorFile (FileMetadata fileMetadata) {
+        if (fileMetadata != null && fileMetadata.getDataFile() != null) {
+            return isSpssPorFile(fileMetadata.getDataFile());
+        }
+        return false; 
+    }
+    */
+    
+    
+    /**
+     * This method tells you if thumbnail generation is *supported* 
+     * on this type of file. i.e., if true, it does not guarantee that a thumbnail 
+     * can/will be generated; but it means that we can try. 
+     *
+     * All the content type comparisons are case-insensitive.
+     *
+     * @param file the file to test; may be null
+     * @return true for image/* types (except "image/fits"), for PDF and for 
+     *         zipped shape files; false otherwise, including when the file 
+     *         or its content type is null
+     */
+    public boolean thumbnailSupported (DataFile file) {
+        if (file == null) {
+            return false;
+        }
+
+        String contentType = file.getContentType();
+        if (contentType == null) {
+            return false;
+        }
+
+        // Some browsers (Chrome?) seem to identify FITS files as mime
+        // type "image/fits" on upload; this is both incorrect (the official
+        // mime type for FITS is "application/fits"), and problematic: then
+        // the file is identified as an image, and the page will attempt to 
+        // generate a preview - which of course is going to fail...
+        if (MIME_TYPE_FITSIMAGE.equalsIgnoreCase(contentType)) {
+            return false;
+        }
+
+        // besides most image/* types, we can generate thumbnails for 
+        // pdf and "world map" files.
+        // The "image/" prefix is matched with regionMatches(ignoreCase = true), 
+        // so that it is case-insensitive like the other comparisons here 
+        // (and like isFileClassImage()), without depending on the default locale.
+        String imagePrefix = "image/";
+        return contentType.regionMatches(true, 0, imagePrefix, 0, imagePrefix.length())
+                || contentType.equalsIgnoreCase("application/pdf")
+                || contentType.equalsIgnoreCase(MIME_TYPE_GEO_SHAPE);
+    }
+    
+    /* 
+     * This method will return true if the thumbnail is *actually available* and
+     * ready to be downloaded. (it will try to generate a thumbnail for supported
+     * file types, if not yet available)
+    */
+    public boolean isThumbnailAvailable (DataFile file, User user) {
+        if (file == null) {
+            return false; 
+        } 
+        
+        // If thumbnails are not even supported for this class of files, 
+        // there's notthing to talk about: 
+        
+        if (!thumbnailSupported(file)) {
+            return false;
+        }
+        
+        // Also, thumbnails are only shown to users who have permission to see 
+        // the full-size image file. So before we do anything else, let's
+        // do some authentication and authorization:        
+        if (!permissionService.userOn(user, file).has(Permission.DownloadFile)) { 
+            logger.fine("No permission to download the file.");
+            return false; 
+        }
+        
+        
+        
+       return ImageThumbConverter.isThumbnailAvailable(file);      
+    }
+
+    
+    /**
+     * Tries to produce a thumbnail for a file sitting in the temporary 
+     * upload area, and reports whether that worked.
+     *
+     * The file is looked for at {@code <root>/temp/<fileSystemId>}, where 
+     * {@code <root>} is the "dataverse.files.directory" system property, or 
+     * "/tmp/files" when that property is unset or empty. PDFs 
+     * ("application/pdf", exact match) go to 
+     * {@code ImageThumbConverter.generatePDFThumb}; types starting with 
+     * "image/" go to {@code ImageThumbConverter.generateImageThumb}; for 
+     * anything else no generation is attempted.
+     *
+     * @param fileSystemId name of the file inside the temp directory
+     * @param mimeType     content type of the file; may be null
+     * @return true if the converter returned a (non-null) thumbnail file name
+     */
+    public boolean isTemporaryPreviewAvailable(String fileSystemId, String mimeType) {
+        String rootDirectory = System.getProperty("dataverse.files.directory");
+        if (rootDirectory == null || rootDirectory.isEmpty()) {
+            rootDirectory = "/tmp/files";
+        }
+        String temporaryFilePath = rootDirectory + "/temp/" + fileSystemId;
+
+        String thumbFileName;
+        if ("application/pdf".equals(mimeType)) {
+            thumbFileName = ImageThumbConverter.generatePDFThumb(temporaryFilePath);
+        } else if (mimeType != null && mimeType.startsWith("image/")) {
+            thumbFileName = ImageThumbConverter.generateImageThumb(temporaryFilePath);
+        } else {
+            thumbFileName = null;
+        }
+
+        return thumbFileName != null;
+    }
+    
+    /* 
+     * TODO: 
+     * Intended counterpart of isThumbnailAvailable(), but for non-default 
+     * thumbnail sizes. Not implemented yet: the file argument is ignored 
+     * and the method always returns false.
+    */
+    
+    public boolean isThumbnailAvailableForSize (DataFile file) {
+        return false; 
+    }
+    
+    /**
+     * Tells whether the file is of one of the tabular data formats that we 
+     * know how to ingest: Stata, Stata 13, RData, CSV (both mime type 
+     * spellings), XLSX, SPSS SAV and SPSS POR.
+     *
+     * In the final 4.0 we'll be doing real-time checks, going through the 
+     * available plugins and verifying the lists of mime types that they 
+     * can handle. In 4.0 beta, the ingest plugins are still built into the 
+     * main code base, so we can just go through a hard-coded list of mime 
+     * types. -- L.A. 
+     *
+     * @param dataFile the file to test; may be null
+     * @return true if the content type is one of the ingestable types; 
+     *         false otherwise, including when the file or its content type 
+     *         is null
+     */
+    public boolean ingestableAsTabular(DataFile dataFile) {
+        // Same convention as the isFileClass*() methods: a null file is 
+        // simply "not ingestable", not an error.
+        if (dataFile == null) {
+            return false;
+        }
+
+        String mimeType = dataFile.getContentType();
+        if (mimeType == null) {
+            return false;
+        }
+
+        // NOTE(review): unlike most checks in this class, the comparison 
+        // here is exact (case-sensitive). It is kept that way on purpose, 
+        // since whatever consumes this answer may match just as strictly - 
+        // confirm before relaxing it.
+        return mimeType.equals(MIME_TYPE_STATA)
+                || mimeType.equals(MIME_TYPE_STATA13)
+                || mimeType.equals(MIME_TYPE_RDATA)
+                || mimeType.equals(MIME_TYPE_CSV)
+                || mimeType.equals(MIME_TYPE_CSV_ALT)
+                || mimeType.equals(MIME_TYPE_XLSX)
+                || mimeType.equals(MIME_TYPE_SPSS_SAV)
+                || mimeType.equals(MIME_TYPE_SPSS_POR);
+    }
+    
+    /* 
+     * Methods for identifying "classes" (groupings) of files by type:
+    */
+    
+    /**
+     * Looks the file up by id and returns its file class tag.
+     *
+     * @param fileId id of the DataFile
+     * @return the tag produced by getFileClass(), or null if no file with 
+     *         that id was found
+     */
+    public String getFileClassById (Long fileId) {
+        DataFile file = find(fileId);
+        return (file == null) ? null : getFileClass(file);
+    }
+    
+    public String getFileClass (DataFile file) {
+        if (isFileClassImage(file)) {
+            return FILE_CLASS_IMAGE;
+        }
+        
+        if (isFileClassVideo(file)) {
+            return FILE_CLASS_VIDEO;
+        }
+        
+        if (isFileClassAudio(file)) {
+            return FILE_CLASS_AUDIO;
+        }
+        
+        if (isFileClassCode(file)) {
+            return FILE_CLASS_CODE;
+        }
+        
+        if (isFileClassDocument(file)) {
+            return FILE_CLASS_DOCUMENT;
+        }
+        
+        if (isFileClassAstro(file)) {
+            return FILE_CLASS_ASTRO;
+        }
+        
+        if (isFileClassNetwork(file)) {
+            return FILE_CLASS_NETWORK;
+        }
+        
+        if (isFileClassGeo(file)) {
+            return FILE_CLASS_GEO;
+        }
+        
+        if (isFileClassTabularData(file)) {
+            return FILE_CLASS_TABULAR;
+        }
+        
+        
+        return FILE_CLASS_OTHER;
+    }
+    
+    
+    
+    /**
+     * Tells whether the file belongs to the "image" class, i.e. whether its 
+     * content type starts with "image/" (case-insensitive), with the 
+     * exception of "image/fits".
+     *
+     * @param file the file to test; may be null
+     * @return true for image/* types other than "image/fits"; false 
+     *         otherwise, including when the file or its content type is null
+     */
+    public boolean isFileClassImage (DataFile file) {
+        if (file == null) {
+            return false;
+        }
+
+        String contentType = file.getContentType();
+        if (contentType == null) {
+            return false;
+        }
+
+        // Some browsers (Chrome?) seem to identify FITS files as mime
+        // type "image/fits" on upload; this is both incorrect (the official
+        // mime type for FITS is "application/fits"), and problematic: then
+        // the file is identified as an image, and the page will attempt to 
+        // generate a preview - which of course is going to fail...
+        if (MIME_TYPE_FITSIMAGE.equalsIgnoreCase(contentType)) {
+            return false;
+        }
+
+        // regionMatches(ignoreCase = true) compares character by character 
+        // and does not depend on the default locale; toLowerCase() without 
+        // a locale does (e.g. "IMAGE/" lower-cases to a dotless i under a 
+        // Turkish locale, and the prefix check would then fail).
+        String imagePrefix = "image/";
+        return contentType.regionMatches(true, 0, imagePrefix, 0, imagePrefix.length());
+    }
+    
+    /**
+     * Tells whether the file belongs to the "audio" class, i.e. whether its 
+     * content type starts with "audio/" (case-insensitive).
+     *
+     * @param file the file to test; may be null
+     * @return true for audio/* types; false otherwise, including when the 
+     *         file or its content type is null
+     */
+    public boolean isFileClassAudio (DataFile file) {
+        if (file == null) {
+            return false;
+        }
+
+        String contentType = file.getContentType();
+        if (contentType == null) {
+            return false;
+        }
+
+        // TODO: 
+        // verify that there are no audio types that don't start with "audio/" - 
+        //  some exotic mp[34]... ?
+
+        // regionMatches(ignoreCase = true) is used instead of 
+        // toLowerCase().startsWith(), because the latter depends on the 
+        // default locale.
+        String audioPrefix = "audio/";
+        return contentType.regionMatches(true, 0, audioPrefix, 0, audioPrefix.length());
+    }
+    
+    public boolean isFileClassCode (DataFile file) {
+        if (file == null) {
+            return false;
+        }
+     
+        String contentType = file.getContentType();
+        
+        // The following are the "control card/syntax" formats that we recognize 
+        // as "code":
+    
+        return (MIME_TYPE_R_SYNTAX.equalsIgnoreCase(contentType)
+            || MIME_TYPE_STATA_SYNTAX.equalsIgnoreCase(contentType) 
+            || MIME_TYPE_SAS_SYNTAX.equalsIgnoreCase(contentType)
+            || MIME_TYPE_SPSS_CCARD.equalsIgnoreCase(contentType));
+        
+    }
+    
+    /**
+     * Tells whether the file belongs to the "document" class: plain text, 
+     * PDF, MS Word (both the old and the OpenXML flavor) and MS Excel. 
+     * Anything following a ';' in the content type (e.g. a charset 
+     * parameter) is ignored; comparison ignores case.
+     *
+     * @param file the file to test; may be null
+     * @return true if the content type is one of the document types; false 
+     *         otherwise, including when the file or its content type is null
+     */
+    public boolean isFileClassDocument (DataFile file) {
+        if (file == null) {
+            return false;
+        }
+
+        String contentType = file.getContentType();
+        if (contentType != null) {
+            // Keep only the part in front of the first ';' (if there is 
+            // one, and it is not the very first character).
+            int semicolonAt = contentType.indexOf(';');
+            if (semicolonAt > 0) {
+                contentType = contentType.substring(0, semicolonAt);
+            }
+        }
+
+        String[] documentTypes = {
+            MIME_TYPE_PLAIN_TEXT,
+            MIME_TYPE_DOCUMENT_PDF,
+            MIME_TYPE_DOCUMENT_MSWORD,
+            MIME_TYPE_DOCUMENT_MSEXCEL,
+            MIME_TYPE_DOCUMENT_MSWORD_OPENXML
+        };
+        for (String documentType : documentTypes) {
+            if (documentType.equalsIgnoreCase(contentType)) {
+                return true;
+            }
+        }
+        return false;
+    }
+    
+    /**
+     * Tells whether the file belongs to the "astro" class. FITS is the only 
+     * such format so far; both "application/fits" and "image/fits" count. 
+     * Comparison ignores case.
+     *
+     * @param file the file to test; may be null
+     * @return true for a FITS file; false otherwise, including when the 
+     *         file or its content type is null
+     */
+    public boolean isFileClassAstro (DataFile file) {
+        if (file == null) {
+            return false;
+        }
+
+        String contentType = file.getContentType();
+
+        if (MIME_TYPE_FITS.equalsIgnoreCase(contentType)) {
+            return true;
+        }
+        return MIME_TYPE_FITSIMAGE.equalsIgnoreCase(contentType);
+    }
+    
+    /**
+     * Tells whether the file belongs to the "network" class. GRAPHML 
+     * ("text/xml-graphml") is the only such format so far. Comparison 
+     * ignores case.
+     *
+     * @param file the file to test; may be null
+     * @return true for a GRAPHML file; false otherwise, including when the 
+     *         file or its content type is null
+     */
+    public boolean isFileClassNetwork (DataFile file) {
+        return file != null
+                && MIME_TYPE_NETWORK_GRAPHML.equalsIgnoreCase(file.getContentType());
+    }
+    
+    /* 
+     * we don't really need a method for "other" - 
+     * it's "other" if it fails to identify as any specific class... 
+     * (or do we?)
+    public boolean isFileClassOther (DataFile file) {
+        if (file == null) {
+            return false;
+        }
+        
+    }
+    */
+    
+    /**
+     * Tells whether the file belongs to the "geodata" class. The zipped 
+     * SHAPE file ("application/zipped-shapefile") is the only such format 
+     * so far. Comparison ignores case.
+     *
+     * @param file the file to test; may be null
+     * @return true for a zipped shape file; false otherwise, including when 
+     *         the file or its content type is null
+     */
+    public boolean isFileClassGeo (DataFile file) {
+        return file != null
+                && MIME_TYPE_GEO_SHAPE.equalsIgnoreCase(file.getContentType());
+    }
+    
+    public boolean isFileClassTabularData (DataFile file) {
+        if (file == null) {
+            return false;
+        }
+        
+        // "Tabular data" is EITHER an INGESTED tabular data file, i.e.
+        // a file with a DataTable and DataVariables; or a DataFile 
+        // of one of the many known tabular data formats - SPSS, Stata, etc.
+        // that for one reason or another didn't get ingested: 
+        
+        if (file.isTabularData()) {
+            return true; 
+        }
+        
+        // The formats we know how to ingest: 
+        if (ingestableAsTabular(file)) {
+            return true;
+        }
+        
+        String contentType = file.getContentType();
+        
+        // And these are the formats we DON'T know how to ingest, 
+        // but nevertheless recognize as "tabular data":
+        
+        return (MIME_TYPE_TAB.equalsIgnoreCase(contentType)
+            || MIME_TYPE_FIXED_FIELD.equalsIgnoreCase(contentType) 
+            || MIME_TYPE_SAS_TRANSPORT.equalsIgnoreCase(contentType)
+            || MIME_TYPE_SAS_SYSTEM.equalsIgnoreCase(contentType));
+        
+    }
+    
+    /**
+     * Tells whether the file belongs to the "video" class, i.e. whether its 
+     * content type starts with "video/" (case-insensitive).
+     *
+     * @param file the file to test; may be null
+     * @return true for video/* types; false otherwise, including when the 
+     *         file or its content type is null
+     */
+    public boolean isFileClassVideo (DataFile file) {
+        if (file == null) {
+            return false;
+        }
+
+        String contentType = file.getContentType();
+        if (contentType == null) {
+            return false;
+        }
+
+        // TODO: 
+        // check if there are video types that don't start with "video/" - 
+        // some exotic application/... formats ?
+
+        // regionMatches(ignoreCase = true) is used instead of 
+        // toLowerCase().startsWith(), because the latter depends on the 
+        // default locale.
+        String videoPrefix = "video/";
+        return contentType.regionMatches(true, 0, videoPrefix, 0, videoPrefix.length());
+    }
+        
+}