Mercurial > hg > LGDataverses
diff src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java @ 10:a50cf11e5178
Rewrite LGDataverse completely upgrading to dataverse4.0
| author | Zoe Hong <zhong@mpiwg-berlin.mpg.de> |
|---|---|
| date | Tue, 08 Sep 2015 17:00:21 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java Tue Sep 08 17:00:21 2015 +0200 @@ -0,0 +1,776 @@ +package edu.harvard.iq.dataverse.util; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Date; +import java.util.ArrayList; +import java.util.List; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; +import java.util.zip.ZipException; +import java.util.HashMap; +import java.util.*; + +import java.nio.file.Files; +import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.apache.commons.io.FileUtils; + +/** + * Used to identify, "repackage", and extract data from Shapefiles in .zip format + * + * (1) Identify if a .zip contains a shapefile: + * boolean containsShapefile(FileInputStream zipStream) or boolean containsShapefile(FileInputStream zip_filename) + * + * + * + * (2) Unpack/"Repackage" .zip: + * (a) All files extracted + * (b) Each group of files that make up a shapefile are made into individual .zip files + * (c) Non shapefile-related files left on their own + * + * If the original .zip contains: "shape1.shp", "shape1.shx", "shape1.dbf", "shape1.prj", "shape1.ain", "shape1.aih", + * "shape2.shp", "shape2.shx", "shape2.dbf", "shape2.prj", + * "shape1.pdf", "README.md", "shape_notes.txt" + * The repackaging results in a folder containing: + * "shape1.zip", + * "shape2.zip", + * "shape1.pdf", "README.md", "shape_notes.txt" + * + * Code Example: + * FileInputStream shp_file_input_stream = new FileInputStream(new File("zipped_shapefile.zip")) + * ShapefileHandler shp_handler = new ShapefileHandler(shp_file_input_stream); + * if (shp_handler.containsShapefile()){ + * File rezip_folder = new File("~/folder_for_rezipping"); + * boolean rezip_success = shp_handler.rezipShapefileSets(shp_file_input_stream, rezip_folder ); + * if (!rezip_success){ + * // rezip failed, should be an error message (String) available + System.out.println(shp_handler.error_message); + * } + * }else{ + * if (shp_handler.errorFound){ + * System.out.println("Error message: " + shp_handler.error_message; + * } + * } + * + * + * @author raprasad + * + * + */ +public class ShapefileHandler{ + + private static final Logger logger = Logger.getLogger(ShapefileHandler.class.getCanonicalName()); + + // Reference for these extensions: http://en.wikipedia.org/wiki/Shapefile + public final static String SHAPEFILE_FILE_TYPE = "application/zipped-shapefile"; + public final static String SHAPEFILE_FILE_TYPE_FRIENDLY_NAME = "Shapefile as ZIP Archive"; + public final static List<String> SHAPEFILE_MANDATORY_EXTENSIONS = Arrays.asList("shp", "shx", "dbf", "prj"); + public final static String SHP_XML_EXTENSION = "shp.xml"; + public final static String BLANK_EXTENSION = "__PLACEHOLDER-FOR-BLANK-EXTENSION__"; + public final static List<String> SHAPEFILE_ALL_EXTENSIONS = Arrays.asList("shp", "shx", "dbf", "prj", "sbn", "sbx", "fbn", "fbx", "ain", "aih", "ixs", "mxs", "atx", ".cpg", SHP_XML_EXTENSION); + + public boolean DEBUG = false; + + private boolean zipFileProcessed = false; + public boolean errorFound = false; + public String errorMessage = new String(); + + // List of files in .zip archive + private List<String> filesListInDir = new ArrayList<>(); + + // Hash of file names and byte sizes { "file name" : bytes } example: { "water.shp" : 541234 } + private HashMap<String, Long> filesizeHash = new HashMap<>(); + + // Hash of file basenames and a list of extensions. + /* e.g. { "subway_shapefile" : [ ".dbf", ".prj", ".sbn", ".sbx", ".shp", ".shx"] + , "shapefile_info" : [".docx"] + , "README" : ["md"] + , "Notes" : [""] + } + */ + private Map<String, List<String>> fileGroups = new HashMap<>(); + + private List<File> finalRezippedFiles = new ArrayList<>(); + + private String outputFolder = "unzipped"; + private String rezippedFolder = "rezipped"; + + // Debug helper + private void msg(String s){ + //logger.info(s); + if (DEBUG){ + System.out.println(s); + } + } + + private void msgt(String s){ + msg("-------------------------------"); + msg(s); + msg("-------------------------------"); + } + + /* + Constructor, start with filename + */ + public ShapefileHandler(String filename){ + + if (filename==null){ + this.addErrorMessage("The filename was null"); + return; + } + + FileInputStream zip_file_stream; + try { + zip_file_stream = new FileInputStream(new File(filename)); + } catch (FileNotFoundException ex) { + this.addErrorMessage("The file was not found"); + return; + } + + this.examineZipfile(zip_file_stream); + + } + + + /* + Constructor, start with FileInputStream + */ + public ShapefileHandler(FileInputStream zip_file_stream){ + + if (zip_file_stream==null){ + this.addErrorMessage("The zip_file_stream was null"); + return; + } + this.examineZipfile(zip_file_stream); + } + + public List<File> getFinalRezippedFiles(){ + return this.finalRezippedFiles; + } + + private void addFinalRezippedFile(String targetFileFullpath){ + if (targetFileFullpath==null){ + logger.warning("addFinalRezippedFile. targetFileFullpath is null"); + return; + } + File finalFile = new File(targetFileFullpath); + if (!(finalFile.isFile())){ + logger.warning("addFinalRezippedFile. Not a file: " + targetFileFullpath); + return; + } + this.finalRezippedFiles.add(finalFile); + }; + + + private void addErrorMessage(String m){ + if (m == null){ + return; + } + logger.severe("ShapeFileHandler Error: " + m); + this.errorFound = true; + this.errorMessage = m; + } + /* + Create a directory, if one doesn"t exist + */ + private boolean createDirectory(String fname){ + if (fname == null){ + return false; + } + File folder_obj = new File(fname); + msg("ShapefileHandler. Folder created: " + folder_obj.getAbsolutePath()); + return createDirectory(folder_obj); + + } // createDirectory + + private boolean createDirectory(File folder){ + if (folder == null){ + return false; + } + try{ + if(!folder.exists()){ + msg("Creating folder: " + folder.getName()); + folder.mkdirs(); + }else{ + msg("Folder exists: " + folder.getName()); + } + }catch(SecurityException ex){ + this.addErrorMessage("Tried to create directory but resulted in SecurityException"); + return false; + }catch(NullPointerException ex){ + this.addErrorMessage("Tried to create directory but resulted in NullPointerException"); + + return false; + } + return true; + } // createDirectory + + + /* + Print out the key/value pairs of the Hash of filenames and sizes + */ + private void showFileNamesSizes(){ + msgt("Hash: file names + sizes"); + Iterator<String> keySetIterator = this.filesizeHash.keySet().iterator(); + + while(keySetIterator.hasNext()){ + String key = keySetIterator.next(); + msg("key: [" + key + "] value: [" + this.filesizeHash.get(key)+"]"); + + } + } // end showFileNamesSizes + + + public Map getFileGroups(){ + return this.fileGroups; + } + + /* + Iterate through Hash of file base names and extensions + */ + public void showFileGroups(){ + + msgt("Hash: file base names + extensions"); + + for (Map.Entry<String, List<String>> entry : fileGroups.entrySet()){ + msg("\nKey: [" + entry.getKey() + "] Ext List: " + entry.getValue()); + if (doesListContainShapefileExtensions(entry.getValue())){ + msg(" >>>> YES, This is a shapefile!"); + }else{ + msg(" >>>> Not a shapefile"); + } + } + + } // end showFileGroups + + /* + Return a count of shapefile sets in this .zip + */ + public int getShapefileCount(){ + int shp_cnt = 0; + + for (Map.Entry<String, List<String>> entry : fileGroups.entrySet()){ + if (doesListContainShapefileExtensions(entry.getValue())){ + shp_cnt+=1; + } + } + return shp_cnt; + } + + + private boolean deleteDirectory(String dirname){ + + if (dirname==null){ + return false; + } + File dir_obj = new File(dirname); + if (!(dir_obj.exists())){ + return true; + } + File[] entries = dir_obj.listFiles(); + msgt("deleteDirectory"); + if (entries==null){ + return true; + } + for(File f: entries){ + f.delete(); + } + dir_obj.delete(); + return true; + + } + + private String getFileBasename(String fileName){ + if (fileName==null){ + return null; + } + String unzipFileName = new File(fileName).getName(); + if (unzipFileName.equals("")){ + logger.info("getFileBasename. fileName is an empty string: " + fileName); + return null; + } + return unzipFileName; + } + /* + Unzip the files to the directory, FLATTENING the directory structure + + Any colliding names will result in overwrites + + */ + private boolean unzipFilesToDirectory(FileInputStream zipfile_input_stream, File target_directory){ + //logger.info("unzipFilesToDirectory: " + target_directory.getAbsolutePath() ); + + if (zipfile_input_stream== null){ + this.addErrorMessage("unzipFilesToDirectory. The zipfile_input_stream is null."); + return false; + } + if (!target_directory.isDirectory()){ + this.addErrorMessage("This directory does not exist: " + target_directory.getAbsolutePath()); + return false; + } + + List<String> unzippedFileNames = new ArrayList<>(); + + ZipInputStream zipStream = new ZipInputStream(zipfile_input_stream); + + ZipEntry origEntry; + byte[] buffer = new byte[2048]; + try { + while((origEntry = zipStream.getNextEntry())!=null){ + + String zentryFileName = origEntry.getName(); + //logger.info("\nOriginal entry name: " + origEntry); + + if (this.isFileToSkip(zentryFileName)){ + logger.fine("Skip file"); + continue; + } + + // Create sub directory, if needed + if (origEntry.isDirectory()) { + //logger.info("Subdirectory found!"); + logger.fine("Skip directory"); + //String dirpath = target_directory.getAbsolutePath() + "/" + zentryFileName; + //createDirectory(dirpath); + continue; // Continue to next Entry + } + logger.fine("file found!"); + + // Write the file + String unzipFileName = this.getFileBasename(zentryFileName); + if (unzipFileName==null){ + logger.warning("Zip Entry Basename is an empty string: " + zentryFileName); + continue; + } + + String outpath = target_directory.getAbsolutePath() + "/" + unzipFileName; + if (unzippedFileNames.contains(outpath)){ + logger.info("Potential name collision. Avoiding duplicate files in 'collapsed' zip directories. Skipping file: " + zentryFileName); + continue; + }else{ + unzippedFileNames.add(outpath); + } + logger.fine("Write zip file: " + outpath); + FileOutputStream fileOutputStream; + long fsize = 0; + fileOutputStream = new FileOutputStream(outpath); + int len;// = 0; + while ((len = zipStream.read(buffer)) > 0){ + fileOutputStream.write(buffer, 0, len); + fsize+=len; + } // end while + fileOutputStream.close(); + } // end outer while + } catch (IOException ex) { + for (StackTraceElement el : ex.getStackTrace()){ + logger.severe(el.toString()); + } + this.addErrorMessage("Failed to open ZipInputStream entry" + ex.getMessage()); + return false; + } + + try { + zipStream.close(); + } catch (IOException ex) { + Logger.getLogger(ShapefileHandler.class.getName()).log(Level.SEVERE, null, ex); + } + return true; + } + /* + Rezip the shapefile(s) into a given directory + Assumes that the zipfile_input_stream has already been checked! + */ + public boolean rezipShapefileSets(FileInputStream zipfile_input_stream, File rezippedFolder) throws IOException{ + logger.fine("rezipShapefileSets"); + //msgt("rezipShapefileSets"); + if (!this.zipFileProcessed){ + this.addErrorMessage("First use 'examineZipFile' (called in the constructor)"); + return false; + } + if (!this.containsShapefile()){ + this.addErrorMessage("There are no shapefiles here!"); + return false; + } + if (zipfile_input_stream== null){ + this.addErrorMessage("The zipfile_input_stream is null."); + return false; + } + if (rezippedFolder == null){ + this.addErrorMessage("The rezippedFolder is null."); + return false; + } + + if (!rezippedFolder.isDirectory()){ + this.addErrorMessage("The rezippedFolder does not exist: " + rezippedFolder.getAbsolutePath()); + return false; + } + if (!containsShapefile()){ + msgt("There are no shapefiles to re-zip"); + return false; + } + + // Create target directory for unzipping files + String dirname_for_unzipping; + File dir_for_unzipping; + + dirname_for_unzipping = rezippedFolder.getAbsolutePath() + "/" + "scratch-for-unzip-12345"; + dir_for_unzipping = new File(dirname_for_unzipping); + logger.fine("Try to create directory: " + dirname_for_unzipping ); + + if (!this.createDirectory(dir_for_unzipping)){ + this.addErrorMessage("Failed to make directory: " + dirname_for_unzipping); + return false; + } + + + // Unzip files! + if (!this.unzipFilesToDirectory(zipfile_input_stream, dir_for_unzipping)){ + this.addErrorMessage("Failed to unzip files."); + return false; + } + // Redistribute files! + String target_dirname = rezippedFolder.getAbsolutePath(); + boolean redistribute_success = this.redistributeFilesFromZip(dirname_for_unzipping, target_dirname); + + //logger.fine("About to delete: " + dir_for_unzipping); + // Delete unzipped files in scratch directory + //FileUtils.deleteDirectory(dir_for_unzipping); + + logger.fine("Post redistribute:)"); + for (File f : new File(target_dirname).listFiles()){ + logger.fine("File exists: " + f.getAbsolutePath()); + } + + return redistribute_success; + + } + + private String getRedistributeFilePath(String dirname, String file_basename, String file_ext){ + + if (dirname==null){ + this.addErrorMessage("getRedistributeFilePath. dirname is null"); + return null; + } + if (file_basename==null){ + this.addErrorMessage("getRedistributeFilePath. file_basename is null"); + return null; + } + if (file_ext==null){ + this.addErrorMessage("getRedistributeFilePath. file_ext is null"); + return null; + } + if (file_ext.equals(BLANK_EXTENSION)){ + return dirname + "/" + file_basename; + } + return dirname + "/" + file_basename + "." + file_ext; + } + + /* + Create new zipped shapefile + + + */ + private boolean redistributeFilesFromZip(String source_dirname, String target_dirname){ + + logger.fine("redistributeFilesFromZip. source: '" + source_dirname + "' target: '" + target_dirname + "'"); + + int cnt =0; + /* START: Redistribute files by iterating through the Map of basenames + extensions + + example key: "shape1" + example ext_list: ["shp", "shx", "dbf", "prj"] + */ + for (Map.Entry<String, List<String>> entry : fileGroups.entrySet()){ + cnt++; + String key = entry.getKey(); + List<String> ext_list = entry.getValue(); + + msg("\n(" + cnt + ") Basename: " + key); + msg("Extensions: " + Arrays.toString(ext_list.toArray())); + + // Is this a shapefile? If so, rezip it + if (doesListContainShapefileExtensions(ext_list)){ + + List<String> namesToZip = new ArrayList<>(); + + for (String ext_name : ext_list) { + if (!this.isShapefileExtension(ext_name)){ + // Another file with similar basename as shapefile. + // e.g. if shapefile basename is "census", this might be "census.xls", "census.pdf", or another non-shapefile extension + String source_file_fullpath = this.getRedistributeFilePath(source_dirname, key, ext_name); + String targetFileFullpath = this.getRedistributeFilePath(target_dirname, key, ext_name); + this.straightFileCopy(source_file_fullpath, targetFileFullpath); + this.addFinalRezippedFile(targetFileFullpath); + }else{ + namesToZip.add(key + "." + ext_name); + + } + } + + String target_zipfile_name = target_dirname + "/" + key + ".zip"; + //this.msg("target_zipfile_name: "+ target_zipfile_name); + //this.msg("source_dirname: "+ source_dirname); + + //msgt("create zipped shapefile"); + ZipMaker zip_maker = new ZipMaker(namesToZip, source_dirname, target_zipfile_name); + this.addFinalRezippedFile(target_zipfile_name); + + // rezip it + + }else{ + // Non-shapefiles + for (String ext_name : ext_list) { + String source_file_fullpath = this.getRedistributeFilePath(source_dirname, key, ext_name); + String targetFileFullpath = this.getRedistributeFilePath(target_dirname, key, ext_name); + this.straightFileCopy(source_file_fullpath, targetFileFullpath); + this.addFinalRezippedFile(targetFileFullpath); + + } + } + } + + // END: Redistribute files + + return true; + } // end: redistributeFilesFromZip + + + private boolean straightFileCopy(String sourceFileName, String targetFileName){ + + //msg("Copy [" + sourceFileName + " to [" + targetFileName + "]"); + if ((sourceFileName == null)||(targetFileName==null)){ + this.addErrorMessage("The source or target file was null.\nSource: " + sourceFileName +"\nTarget: " + targetFileName); + return false; + } + + File source_file = new File(sourceFileName); + File target_file = new File(targetFileName); + try { + Files.copy(source_file.toPath(), target_file.toPath(), REPLACE_EXISTING); + } catch (IOException ex) { + this.addErrorMessage("Failed to copy file. IOException\nSource: " + sourceFileName +"\nTarget: " + targetFileName); + return false; + } + + return true; + + } + + public boolean containsOnlySingleShapefile(){ + if (containsShapefile()){ + if (fileGroups.size()==filesizeHash.size()){ + return true; + } + } + return false; + } + + /* + Does this zip file contain a shapefile set? + */ + public boolean containsShapefile(){ + for (Map.Entry<String, List<String>> entry : fileGroups.entrySet()){ + String key = entry.getKey(); + List<String> ext_list = entry.getValue(); + if (doesListContainShapefileExtensions(ext_list)){ + return true; + } + } + + return false; + } + + private boolean isShapefileExtension(String ext_name){ + if (ext_name == null){ + return false; + } + return SHAPEFILE_ALL_EXTENSIONS.contains(ext_name); + } + /* + Does a list of file extensions match those required for a shapefile set? + */ + private boolean doesListContainShapefileExtensions(List<String> ext_list){ + if (ext_list == null){ + return false; + } + return ext_list.containsAll(SHAPEFILE_MANDATORY_EXTENSIONS); + } + + + private void addToFileGroupHash(String basename, String ext){ + if ((basename==null)||(ext==null)){ + return; + } + List<String> extension_list = fileGroups.get(basename); + if (extension_list==null) { + extension_list = new ArrayList<>(); + } + if (!(extension_list.contains(ext))){ + extension_list.add(ext); + fileGroups.put(basename, extension_list); + } + } // end addToFileGroupHash + + /** + * Update the fileGroup hash which contains a { base_filename : [ext1, ext2, etc ]} + * This is used to determine whether a .zip contains a shapefile set + # + * @param fname filename in String format + */ + private void updateFileGroupHash(String fname){ + if (fname == null){ + return; + } + + // Split filename into basename and extension. No extension yields only basename + // + if (fname.toLowerCase().endsWith(SHP_XML_EXTENSION)){ + int idx = fname.toLowerCase().indexOf("." + SHP_XML_EXTENSION); + if (idx >= 1){ // if idx==0, then the file name is ".shp.xml"" + String basename = fname.substring(0, idx); + String ext = fname.substring(idx+1); + addToFileGroupHash(basename, ext); + return; + } + } + + String[] tokens = fname.split("\\.(?=[^\\.]+$)"); + if (tokens.length==1){ + addToFileGroupHash(tokens[0], BLANK_EXTENSION); // file basename, no extension + + }else if (tokens.length==2){ + addToFileGroupHash(tokens[0], tokens[1]); // file basename, extension + } + } // end updateFileGroupHash + + private boolean isFileToSkip(String fname){ + if ((fname==null)||(fname.equals(""))){ + return true; + } + + if (fname.startsWith("__")){ + return true; + } + + if (fname.startsWith("._")){ + return true; + } + + File fnameFile = new File(fname); + if (fnameFile.getName().endsWith(".DS_Store")){ + return true; + } + return false; + } + + /************************************** + * Iterate through the zip file contents. + * Does it contain any shapefiles? + * + * @param FileInputStream zip_file_stream + */ + private boolean examineZipfile(FileInputStream zip_file_stream){ + // msgt("examineZipfile"); + + if (zip_file_stream==null){ + this.addErrorMessage("The zip file stream was null"); + return false; + } + + // Clear out file lists + this.filesListInDir.clear(); + this.filesizeHash.clear(); + this.fileGroups.clear(); + + try{ + ZipInputStream zipStream = new ZipInputStream(zip_file_stream); + ZipEntry entry; + + while((entry = zipStream.getNextEntry())!=null){ + + String zentryFileName = entry.getName(); + //msg("zip entry: " + entry.getName()); + // Skip files or folders starting with __ + if (this.isFileToSkip(zentryFileName)){ + continue; + } + + if (entry.isDirectory()) { + //String dirpath = outputFolder + "/" + zentryFileName; + //createDirectory(dirpath); + continue; + } + + String unzipFileName = this.getFileBasename(zentryFileName); + if (unzipFileName==null){ + logger.warning("Zip Entry Basename is an empty string: " + zentryFileName); + continue; + } + + + String s = String.format("Entry: %s len %d added %TD", + unzipFileName, entry.getSize(), + new Date(entry.getTime())); + + if (!this.filesListInDir.contains(s)){ + this.filesListInDir.add(s); + updateFileGroupHash(unzipFileName); + this.filesizeHash.put(unzipFileName, entry.getSize()); + } + } // end while + + zipStream.close(); + + if (this.filesListInDir.isEmpty()){ + errorMessage = "No files in zipStream"; + return false; + } + + this.zipFileProcessed = true; + return true; + + }catch(ZipException ex){ + this.addErrorMessage("ZipException"); + msgt("ZipException"); + return false; + + }catch(IOException ex){ + //ex.printStackTrace(); + this.addErrorMessage("IOException File name"); + msgt("IOException"); + return false; + }catch(IllegalArgumentException ex){ + this.addErrorMessage("IllegalArgumentException when parsing zipfile"); + msgt("IllegalArgumentException when parsing zipfile"); + return false; + + }finally{ + + } + + } // end examineFile + + public static void main(String[] args){ + + // Example usage + if (args.length == 0){ + + + }else if(args.length > 1){ + System.out.println( "Please only give one file name!"); + }else{ + /* + String zip_name = args[0]; + System.out.println( "Process File: " + zip_name); + System.out.println( "Process File: " + zip_name); + ShapefileHandler zpt = new ShapefileHandler(zip_name); + */ + } + } // end main + +} // end ShapefileHandler \ No newline at end of file
