package edu.harvard.iq.dataverse.util; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.Date; import java.util.ArrayList; import java.util.List; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; import java.util.zip.ZipException; import java.util.HashMap; import java.util.*; import java.nio.file.Files; import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.io.FileUtils; /** * Used to identify, "repackage", and extract data from Shapefiles in .zip format * * (1) Identify if a .zip contains a shapefile: * boolean containsShapefile(FileInputStream zipStream) or boolean containsShapefile(FileInputStream zip_filename) * * * * (2) Unpack/"Repackage" .zip: * (a) All files extracted * (b) Each group of files that make up a shapefile are made into individual .zip files * (c) Non shapefile-related files left on their own * * If the original .zip contains: "shape1.shp", "shape1.shx", "shape1.dbf", "shape1.prj", "shape1.ain", "shape1.aih", * "shape2.shp", "shape2.shx", "shape2.dbf", "shape2.prj", * "shape1.pdf", "README.md", "shape_notes.txt" * The repackaging results in a folder containing: * "shape1.zip", * "shape2.zip", * "shape1.pdf", "README.md", "shape_notes.txt" * * Code Example: * FileInputStream shp_file_input_stream = new FileInputStream(new File("zipped_shapefile.zip")) * ShapefileHandler shp_handler = new ShapefileHandler(shp_file_input_stream); * if (shp_handler.containsShapefile()){ * File rezip_folder = new File("~/folder_for_rezipping"); * boolean rezip_success = shp_handler.rezipShapefileSets(shp_file_input_stream, rezip_folder ); * if (!rezip_success){ * // rezip failed, should be an error message (String) available System.out.println(shp_handler.error_message); * } * }else{ * if (shp_handler.errorFound){ * System.out.println("Error message: " + shp_handler.error_message; * } * } * * * @author raprasad * * */ public class ShapefileHandler{ private static final Logger logger = Logger.getLogger(ShapefileHandler.class.getCanonicalName()); // Reference for these extensions: http://en.wikipedia.org/wiki/Shapefile public final static String SHAPEFILE_FILE_TYPE = "application/zipped-shapefile"; public final static String SHAPEFILE_FILE_TYPE_FRIENDLY_NAME = "Shapefile as ZIP Archive"; public final static List<String> SHAPEFILE_MANDATORY_EXTENSIONS = Arrays.asList("shp", "shx", "dbf", "prj"); public final static String SHP_XML_EXTENSION = "shp.xml"; public final static String BLANK_EXTENSION = "__PLACEHOLDER-FOR-BLANK-EXTENSION__"; public final static List<String> SHAPEFILE_ALL_EXTENSIONS = Arrays.asList("shp", "shx", "dbf", "prj", "sbn", "sbx", "fbn", "fbx", "ain", "aih", "ixs", "mxs", "atx", ".cpg", SHP_XML_EXTENSION); public boolean DEBUG = false; private boolean zipFileProcessed = false; public boolean errorFound = false; public String errorMessage = new String(); // List of files in .zip archive private List<String> filesListInDir = new ArrayList<>(); // Hash of file names and byte sizes { "file name" : bytes } example: { "water.shp" : 541234 } private HashMap<String, Long> filesizeHash = new HashMap<>(); // Hash of file basenames and a list of extensions. /* e.g. { "subway_shapefile" : [ ".dbf", ".prj", ".sbn", ".sbx", ".shp", ".shx"] , "shapefile_info" : [".docx"] , "README" : ["md"] , "Notes" : [""] } */ private Map<String, List<String>> fileGroups = new HashMap<>(); private List<File> finalRezippedFiles = new ArrayList<>(); private String outputFolder = "unzipped"; private String rezippedFolder = "rezipped"; // Debug helper private void msg(String s){ //logger.info(s); if (DEBUG){ System.out.println(s); } } private void msgt(String s){ msg("-------------------------------"); msg(s); msg("-------------------------------"); } /* Constructor, start with filename */ public ShapefileHandler(String filename){ if (filename==null){ this.addErrorMessage("The filename was null"); return; } FileInputStream zip_file_stream; try { zip_file_stream = new FileInputStream(new File(filename)); } catch (FileNotFoundException ex) { this.addErrorMessage("The file was not found"); return; } this.examineZipfile(zip_file_stream); } /* Constructor, start with FileInputStream */ public ShapefileHandler(FileInputStream zip_file_stream){ if (zip_file_stream==null){ this.addErrorMessage("The zip_file_stream was null"); return; } this.examineZipfile(zip_file_stream); } public List<File> getFinalRezippedFiles(){ return this.finalRezippedFiles; } private void addFinalRezippedFile(String targetFileFullpath){ if (targetFileFullpath==null){ logger.warning("addFinalRezippedFile. targetFileFullpath is null"); return; } File finalFile = new File(targetFileFullpath); if (!(finalFile.isFile())){ logger.warning("addFinalRezippedFile. Not a file: " + targetFileFullpath); return; } this.finalRezippedFiles.add(finalFile); }; private void addErrorMessage(String m){ if (m == null){ return; } logger.severe("ShapeFileHandler Error: " + m); this.errorFound = true; this.errorMessage = m; } /* Create a directory, if one doesn"t exist */ private boolean createDirectory(String fname){ if (fname == null){ return false; } File folder_obj = new File(fname); msg("ShapefileHandler. Folder created: " + folder_obj.getAbsolutePath()); return createDirectory(folder_obj); } // createDirectory private boolean createDirectory(File folder){ if (folder == null){ return false; } try{ if(!folder.exists()){ msg("Creating folder: " + folder.getName()); folder.mkdirs(); }else{ msg("Folder exists: " + folder.getName()); } }catch(SecurityException ex){ this.addErrorMessage("Tried to create directory but resulted in SecurityException"); return false; }catch(NullPointerException ex){ this.addErrorMessage("Tried to create directory but resulted in NullPointerException"); return false; } return true; } // createDirectory /* Print out the key/value pairs of the Hash of filenames and sizes */ private void showFileNamesSizes(){ msgt("Hash: file names + sizes"); Iterator<String> keySetIterator = this.filesizeHash.keySet().iterator(); while(keySetIterator.hasNext()){ String key = keySetIterator.next(); msg("key: [" + key + "] value: [" + this.filesizeHash.get(key)+"]"); } } // end showFileNamesSizes public Map<String, List<String>> getFileGroups(){ return this.fileGroups; } /* Iterate through Hash of file base names and extensions */ public void showFileGroups(){ msgt("Hash: file base names + extensions"); for (Map.Entry<String, List<String>> entry : fileGroups.entrySet()){ msg("\nKey: [" + entry.getKey() + "] Ext List: " + entry.getValue()); if (doesListContainShapefileExtensions(entry.getValue())){ msg(" >>>> YES, This is a shapefile!"); }else{ msg(" >>>> Not a shapefile"); } } } // end showFileGroups /* Return a count of shapefile sets in this .zip */ public int getShapefileCount(){ int shp_cnt = 0; for (Map.Entry<String, List<String>> entry : fileGroups.entrySet()){ if (doesListContainShapefileExtensions(entry.getValue())){ shp_cnt+=1; } } return shp_cnt; } private boolean deleteDirectory(String dirname){ if (dirname==null){ return false; } File dir_obj = new File(dirname); if (!(dir_obj.exists())){ return true; } File[] entries = dir_obj.listFiles(); msgt("deleteDirectory"); if (entries==null){ return true; } for(File f: entries){ f.delete(); } dir_obj.delete(); return true; } private String getFileBasename(String fileName){ if (fileName==null){ return null; } String unzipFileName = new File(fileName).getName(); if (unzipFileName.equals("")){ logger.info("getFileBasename. fileName is an empty string: " + fileName); return null; } return unzipFileName; } /* Unzip the files to the directory, FLATTENING the directory structure Any colliding names will result in overwrites */ private boolean unzipFilesToDirectory(FileInputStream zipfile_input_stream, File target_directory){ //logger.info("unzipFilesToDirectory: " + target_directory.getAbsolutePath() ); if (zipfile_input_stream== null){ this.addErrorMessage("unzipFilesToDirectory. The zipfile_input_stream is null."); return false; } if (!target_directory.isDirectory()){ this.addErrorMessage("This directory does not exist: " + target_directory.getAbsolutePath()); return false; } List<String> unzippedFileNames = new ArrayList<>(); ZipInputStream zipStream = new ZipInputStream(zipfile_input_stream); ZipEntry origEntry; byte[] buffer = new byte[2048]; try { while((origEntry = zipStream.getNextEntry())!=null){ String zentryFileName = origEntry.getName(); //logger.info("\nOriginal entry name: " + origEntry); if (this.isFileToSkip(zentryFileName)){ logger.fine("Skip file"); continue; } // Create sub directory, if needed if (origEntry.isDirectory()) { //logger.info("Subdirectory found!"); logger.fine("Skip directory"); //String dirpath = target_directory.getAbsolutePath() + "/" + zentryFileName; //createDirectory(dirpath); continue; // Continue to next Entry } logger.fine("file found!"); // Write the file String unzipFileName = this.getFileBasename(zentryFileName); if (unzipFileName==null){ logger.warning("Zip Entry Basename is an empty string: " + zentryFileName); continue; } String outpath = target_directory.getAbsolutePath() + "/" + unzipFileName; if (unzippedFileNames.contains(outpath)){ logger.info("Potential name collision. Avoiding duplicate files in 'collapsed' zip directories. Skipping file: " + zentryFileName); continue; }else{ unzippedFileNames.add(outpath); } logger.fine("Write zip file: " + outpath); FileOutputStream fileOutputStream; long fsize = 0; fileOutputStream = new FileOutputStream(outpath); int len;// = 0; while ((len = zipStream.read(buffer)) > 0){ fileOutputStream.write(buffer, 0, len); fsize+=len; } // end while fileOutputStream.close(); } // end outer while } catch (IOException ex) { for (StackTraceElement el : ex.getStackTrace()){ logger.severe(el.toString()); } this.addErrorMessage("Failed to open ZipInputStream entry" + ex.getMessage()); return false; } try { zipStream.close(); } catch (IOException ex) { Logger.getLogger(ShapefileHandler.class.getName()).log(Level.SEVERE, null, ex); } return true; } /* Rezip the shapefile(s) into a given directory Assumes that the zipfile_input_stream has already been checked! */ public boolean rezipShapefileSets(FileInputStream zipfile_input_stream, File rezippedFolder) throws IOException{ logger.fine("rezipShapefileSets"); //msgt("rezipShapefileSets"); if (!this.zipFileProcessed){ this.addErrorMessage("First use 'examineZipFile' (called in the constructor)"); return false; } if (!this.containsShapefile()){ this.addErrorMessage("There are no shapefiles here!"); return false; } if (zipfile_input_stream== null){ this.addErrorMessage("The zipfile_input_stream is null."); return false; } if (rezippedFolder == null){ this.addErrorMessage("The rezippedFolder is null."); return false; } if (!rezippedFolder.isDirectory()){ this.addErrorMessage("The rezippedFolder does not exist: " + rezippedFolder.getAbsolutePath()); return false; } if (!containsShapefile()){ msgt("There are no shapefiles to re-zip"); return false; } // Create target directory for unzipping files String dirname_for_unzipping; File dir_for_unzipping; dirname_for_unzipping = rezippedFolder.getAbsolutePath() + "/" + "scratch-for-unzip-12345"; dir_for_unzipping = new File(dirname_for_unzipping); logger.fine("Try to create directory: " + dirname_for_unzipping ); if (!this.createDirectory(dir_for_unzipping)){ this.addErrorMessage("Failed to make directory: " + dirname_for_unzipping); return false; } // Unzip files! if (!this.unzipFilesToDirectory(zipfile_input_stream, dir_for_unzipping)){ this.addErrorMessage("Failed to unzip files."); return false; } // Redistribute files! String target_dirname = rezippedFolder.getAbsolutePath(); boolean redistribute_success = this.redistributeFilesFromZip(dirname_for_unzipping, target_dirname); //logger.fine("About to delete: " + dir_for_unzipping); // Delete unzipped files in scratch directory //FileUtils.deleteDirectory(dir_for_unzipping); logger.fine("Post redistribute:)"); for (File f : new File(target_dirname).listFiles()){ logger.fine("File exists: " + f.getAbsolutePath()); } return redistribute_success; } private String getRedistributeFilePath(String dirname, String file_basename, String file_ext){ if (dirname==null){ this.addErrorMessage("getRedistributeFilePath. dirname is null"); return null; } if (file_basename==null){ this.addErrorMessage("getRedistributeFilePath. file_basename is null"); return null; } if (file_ext==null){ this.addErrorMessage("getRedistributeFilePath. file_ext is null"); return null; } if (file_ext.equals(BLANK_EXTENSION)){ return dirname + "/" + file_basename; } return dirname + "/" + file_basename + "." + file_ext; } /* Create new zipped shapefile */ private boolean redistributeFilesFromZip(String source_dirname, String target_dirname){ logger.fine("redistributeFilesFromZip. source: '" + source_dirname + "' target: '" + target_dirname + "'"); int cnt =0; /* START: Redistribute files by iterating through the Map of basenames + extensions example key: "shape1" example ext_list: ["shp", "shx", "dbf", "prj"] */ for (Map.Entry<String, List<String>> entry : fileGroups.entrySet()){ cnt++; String key = entry.getKey(); List<String> ext_list = entry.getValue(); msg("\n(" + cnt + ") Basename: " + key); msg("Extensions: " + Arrays.toString(ext_list.toArray())); // Is this a shapefile? If so, rezip it if (doesListContainShapefileExtensions(ext_list)){ List<String> namesToZip = new ArrayList<>(); for (String ext_name : ext_list) { if (!this.isShapefileExtension(ext_name)){ // Another file with similar basename as shapefile. // e.g. if shapefile basename is "census", this might be "census.xls", "census.pdf", or another non-shapefile extension String source_file_fullpath = this.getRedistributeFilePath(source_dirname, key, ext_name); String targetFileFullpath = this.getRedistributeFilePath(target_dirname, key, ext_name); this.straightFileCopy(source_file_fullpath, targetFileFullpath); this.addFinalRezippedFile(targetFileFullpath); }else{ namesToZip.add(key + "." + ext_name); } } String target_zipfile_name = target_dirname + "/" + key + ".zip"; //this.msg("target_zipfile_name: "+ target_zipfile_name); //this.msg("source_dirname: "+ source_dirname); //msgt("create zipped shapefile"); ZipMaker zip_maker = new ZipMaker(namesToZip, source_dirname, target_zipfile_name); this.addFinalRezippedFile(target_zipfile_name); // rezip it }else{ // Non-shapefiles for (String ext_name : ext_list) { String source_file_fullpath = this.getRedistributeFilePath(source_dirname, key, ext_name); String targetFileFullpath = this.getRedistributeFilePath(target_dirname, key, ext_name); this.straightFileCopy(source_file_fullpath, targetFileFullpath); this.addFinalRezippedFile(targetFileFullpath); } } } // END: Redistribute files return true; } // end: redistributeFilesFromZip private boolean straightFileCopy(String sourceFileName, String targetFileName){ //msg("Copy [" + sourceFileName + " to [" + targetFileName + "]"); if ((sourceFileName == null)||(targetFileName==null)){ this.addErrorMessage("The source or target file was null.\nSource: " + sourceFileName +"\nTarget: " + targetFileName); return false; } File source_file = new File(sourceFileName); File target_file = new File(targetFileName); try { Files.copy(source_file.toPath(), target_file.toPath(), REPLACE_EXISTING); } catch (IOException ex) { this.addErrorMessage("Failed to copy file. IOException\nSource: " + sourceFileName +"\nTarget: " + targetFileName); return false; } return true; } public boolean containsOnlySingleShapefile(){ if (containsShapefile()){ if (fileGroups.size()==filesizeHash.size()){ return true; } } return false; } /* Does this zip file contain a shapefile set? */ public boolean containsShapefile(){ for (Map.Entry<String, List<String>> entry : fileGroups.entrySet()){ String key = entry.getKey(); List<String> ext_list = entry.getValue(); if (doesListContainShapefileExtensions(ext_list)){ return true; } } return false; } private boolean isShapefileExtension(String ext_name){ if (ext_name == null){ return false; } return SHAPEFILE_ALL_EXTENSIONS.contains(ext_name); } /* Does a list of file extensions match those required for a shapefile set? */ private boolean doesListContainShapefileExtensions(List<String> ext_list){ if (ext_list == null){ return false; } return ext_list.containsAll(SHAPEFILE_MANDATORY_EXTENSIONS); } private void addToFileGroupHash(String basename, String ext){ if ((basename==null)||(ext==null)){ return; } List<String> extension_list = fileGroups.get(basename); if (extension_list==null) { extension_list = new ArrayList<>(); } if (!(extension_list.contains(ext))){ extension_list.add(ext); fileGroups.put(basename, extension_list); } } // end addToFileGroupHash /** * Update the fileGroup hash which contains a { base_filename : [ext1, ext2, etc ]} * This is used to determine whether a .zip contains a shapefile set # * @param fname filename in String format */ private void updateFileGroupHash(String fname){ if (fname == null){ return; } // Split filename into basename and extension. No extension yields only basename // if (fname.toLowerCase().endsWith(SHP_XML_EXTENSION)){ int idx = fname.toLowerCase().indexOf("." + SHP_XML_EXTENSION); if (idx >= 1){ // if idx==0, then the file name is ".shp.xml"" String basename = fname.substring(0, idx); String ext = fname.substring(idx+1); addToFileGroupHash(basename, ext); return; } } String[] tokens = fname.split("\\.(?=[^\\.]+$)"); if (tokens.length==1){ addToFileGroupHash(tokens[0], BLANK_EXTENSION); // file basename, no extension }else if (tokens.length==2){ addToFileGroupHash(tokens[0], tokens[1]); // file basename, extension } } // end updateFileGroupHash private boolean isFileToSkip(String fname){ if ((fname==null)||(fname.equals(""))){ return true; } if (fname.startsWith("__")){ return true; } if (fname.startsWith("._")){ return true; } File fnameFile = new File(fname); if (fnameFile.getName().endsWith(".DS_Store")){ return true; } return false; } /************************************** * Iterate through the zip file contents. * Does it contain any shapefiles? * * @param FileInputStream zip_file_stream */ private boolean examineZipfile(FileInputStream zip_file_stream){ // msgt("examineZipfile"); if (zip_file_stream==null){ this.addErrorMessage("The zip file stream was null"); return false; } // Clear out file lists this.filesListInDir.clear(); this.filesizeHash.clear(); this.fileGroups.clear(); try{ ZipInputStream zipStream = new ZipInputStream(zip_file_stream); ZipEntry entry; while((entry = zipStream.getNextEntry())!=null){ String zentryFileName = entry.getName(); //msg("zip entry: " + entry.getName()); // Skip files or folders starting with __ if (this.isFileToSkip(zentryFileName)){ continue; } if (entry.isDirectory()) { //String dirpath = outputFolder + "/" + zentryFileName; //createDirectory(dirpath); continue; } String unzipFileName = this.getFileBasename(zentryFileName); if (unzipFileName==null){ logger.warning("Zip Entry Basename is an empty string: " + zentryFileName); continue; } String s = String.format("Entry: %s len %d added %TD", unzipFileName, entry.getSize(), new Date(entry.getTime())); if (!this.filesListInDir.contains(s)){ this.filesListInDir.add(s); updateFileGroupHash(unzipFileName); this.filesizeHash.put(unzipFileName, entry.getSize()); } } // end while zipStream.close(); if (this.filesListInDir.isEmpty()){ errorMessage = "No files in zipStream"; return false; } this.zipFileProcessed = true; return true; }catch(ZipException ex){ this.addErrorMessage("ZipException"); msgt("ZipException"); return false; }catch(IOException ex){ //ex.printStackTrace(); this.addErrorMessage("IOException File name"); msgt("IOException"); return false; }catch(IllegalArgumentException ex){ this.addErrorMessage("IllegalArgumentException when parsing zipfile"); msgt("IllegalArgumentException when parsing zipfile"); return false; }finally{ } } // end examineFile public static void main(String[] args){ // Example usage if (args.length == 0){ }else if(args.length > 1){ System.out.println( "Please only give one file name!"); }else{ /* String zip_name = args[0]; System.out.println( "Process File: " + zip_name); System.out.println( "Process File: " + zip_name); ShapefileHandler zpt = new ShapefileHandler(zip_name); */ } } // end main } // end ShapefileHandler