package water.parser; import water.DKV; import water.Iced; import water.Key; import water.exceptions.H2OIllegalArgumentException; import water.fvec.ByteVec; import water.fvec.FileVec; import water.fvec.Frame; import water.util.Log; import water.util.UnsafeUtils; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Enumeration; import java.util.zip.GZIPInputStream; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; import java.util.zip.ZipInputStream; import static water.fvec.FileVec.getPathForKey; abstract class ZipUtil { public enum Compression { NONE, ZIP, GZIP } /** * This method will attempt to read the few bytes off a file which will in turn be used * to guess what kind of parsers we should use to parse the file. * * @param bv * @return */ static byte [] getFirstUnzippedBytes( ByteVec bv ) { try { byte[] bits = bv.getFirstBytes(); return unzipBytes(bits, guessCompressionMethod(bits), FileVec.DFLT_CHUNK_SIZE); } catch(Exception e) { Log.debug("Cannot get unzipped bytes from ByteVec!", e); return null; } } /** * This method check if the input argument is a zip directory containing files. * * @param key * @return true if bv is a zip directory containing files, false otherwise. */ static boolean isZipDirectory(Key key) { Iced ice = DKV.getGet(key); if (ice == null) throw new H2OIllegalArgumentException("Missing data", "Did not find any data under " + "key " + key); ByteVec bv = (ByteVec) (ice instanceof ByteVec ? ice : ((Frame) ice).vecs()[0]); return isZipDirectory(bv); } static boolean isZipDirectory(ByteVec bv) { byte[] bits = bv.getFirstBytes(); ZipUtil.Compression compressionMethod = guessCompressionMethod(bits); try { if (compressionMethod == Compression.ZIP) { ByteArrayInputStream bais = new ByteArrayInputStream(bits); ZipInputStream zis = new ZipInputStream(bais); ZipEntry ze = zis.getNextEntry(); // Get the *FIRST* entry boolean isDir = ze.isDirectory(); zis.close(); // There is at least one entry in zip file and it is not a directory. return isDir; } } catch (IOException e) { e.printStackTrace(); } return false; } static ArrayList<String> getFileNames(ByteVec bv) { ArrayList<String> fileList = new ArrayList<String>(); if (bv instanceof FileVec) { String strPath = getPathForKey(((FileVec) bv)._key); try { ZipFile zipFile = new ZipFile(strPath); Enumeration<? extends ZipEntry> entries = zipFile.entries(); while (entries.hasMoreElements()) { ZipEntry entry = entries.nextElement(); if (!entry.isDirectory()) {// add file to list to parse if not a directory. fileList.add(entry.getName()); } } zipFile.close(); } catch (IOException e) { e.printStackTrace(); } } return fileList; } /** * When a file is a zip file that contains multiple files, this method will return the decompression ratio. * * @param bv * @return */ static float getDecompressionRatio(ByteVec bv) { long totalSize = 0L; long totalCompSize = 0L; if (bv instanceof FileVec) { String strPath = getPathForKey(((FileVec) bv)._key); try { ZipFile zipFile = new ZipFile(strPath); Enumeration<? extends ZipEntry> entries = zipFile.entries(); while (entries.hasMoreElements()) { ZipEntry entry = entries.nextElement(); if (!entry.isDirectory()) {// add file to list to parse if not a directory. totalSize = totalSize + entry.getSize(); totalCompSize = totalCompSize + entry.getCompressedSize(); } } zipFile.close(); } catch (IOException e) { e.printStackTrace(); } } if (totalCompSize == 0) // something is wrong. Return no compression. return 1; else return totalSize/totalCompSize; } static Compression guessCompressionMethod(byte [] bits) { // Look for ZIP magic if( bits.length > ZipFile.LOCHDR && UnsafeUtils.get4(bits, 0) == ZipFile.LOCSIG ) return Compression.ZIP; if( bits.length > 2 && (UnsafeUtils.get2(bits,0)&0xffff) == GZIPInputStream.GZIP_MAGIC ) return Compression.GZIP; return Compression.NONE; } static float decompressionRatio(ByteVec bv) { byte[] zips = bv.getFirstBytes(); ZipUtil.Compression cpr = ZipUtil.guessCompressionMethod(zips); if (cpr == Compression.NONE ) return 1; // no compression else if (cpr == Compression.ZIP) { ByteArrayInputStream bais = new ByteArrayInputStream(zips); ZipInputStream zis = new ZipInputStream(bais); ZipEntry ze = null; // Get the *FIRST* entry try { ze = zis.getNextEntry(); boolean isDir = ze.isDirectory(); if (isDir) { return getDecompressionRatio(bv); } else { byte[] bits = ZipUtil.unzipBytes(zips, cpr, FileVec.DFLT_CHUNK_SIZE); return bits.length / zips.length; } } catch (IOException e) { e.printStackTrace(); } } else { byte[] bits = ZipUtil.unzipBytes(zips, cpr, FileVec.DFLT_CHUNK_SIZE); return bits.length / zips.length; } return 1; } static byte[] unzipBytes( byte[] bs, Compression cmp, int chkSize ) { if( cmp == Compression.NONE ) return bs; // No compression // Wrap the bytes in a stream ByteArrayInputStream bais = new ByteArrayInputStream(bs); InputStream is = null; try { if( cmp == Compression.ZIP ) { ZipInputStream zis = new ZipInputStream(bais); ZipEntry ze = zis.getNextEntry(); // Get the *FIRST* entry // There is at least one entry in zip file and it is not a directory. if( ze == null || ze.isDirectory() ) zis.getNextEntry(); // read the next entry which should be a file is = zis; } else { assert cmp == Compression.GZIP; is = new GZIPInputStream(bais); } // If reading from a compressed stream, estimate we can read 2x uncompressed bs = new byte[bs.length * 2]; // Now read from the compressed stream int off = 0; while( off < bs.length ) { int len = is.read(bs, off, bs.length - off); if( len < 0 ) break; off += len; if( off == bs.length ) { // Dataset is uncompressing alot! Need more space... if( bs.length >= chkSize ) break; // Already got enough bs = Arrays.copyOf(bs, bs.length * 2); } } } catch( IOException ioe ) { throw Log.throwErr(ioe); } finally { try { if( is != null ) is.close(); } catch( IOException ignore ) { } } return bs; } /** * This method will read a compressed zip file and return the uncompressed bits so that we can * check the beginning of the file and make sure it does not contain the column names. * * @param bs * @param chkSize * @return */ static byte[] unzipForHeader( byte[] bs, int chkSize ) { ByteArrayInputStream bais = new ByteArrayInputStream(bs); ZipInputStream zis = new ZipInputStream(bais); InputStream is = zis; // Now read from the compressed stream int off = 0; try { while( off < bs.length ) { int len = 0; len = is.read(bs, off, bs.length - off); if( len < 0 ) break; off += len; if( off == bs.length ) { // Dataset is uncompressing alot! Need more space... if( bs.length >= chkSize ) break; // Already got enough bs = Arrays.copyOf(bs, bs.length * 2); } } } catch (IOException e) { e.printStackTrace(); } try { is.close(); } catch (IOException e) { e.printStackTrace(); } return bs; } }