// ZipUtil.java example
//
// Explorer
// h2o-3-master
package water.parser;

import water.DKV;
import water.Iced;
import water.Key;
import water.exceptions.H2OIllegalArgumentException;
import water.fvec.ByteVec;
import water.fvec.FileVec;
import water.fvec.Frame;
import water.util.Log;
import water.util.UnsafeUtils;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;

import static water.fvec.FileVec.getPathForKey;

abstract class ZipUtil {

  /**
   * Compression formats this utility distinguishes.
   * <ul>
   *   <li>{@code NONE} - returned by {@code guessCompressionMethod} when neither magic number
   *       matches; {@code unzipBytes} returns such input untouched.</li>
   *   <li>{@code ZIP} - the bytes start with the zip local-file-header signature.</li>
   *   <li>{@code GZIP} - the bytes start with the gzip magic number.</li>
   * </ul>
   */
  public enum Compression { NONE, ZIP, GZIP }

  /**
   * Fetches the leading bytes of a vec and, when they look compressed, inflates them so that
   * the result can be used to guess which parser should handle the file.
   *
   * @param bv vec whose first bytes are inspected
   * @return the leading bytes, decompressed when a ZIP or GZIP magic number is detected
   *         (the compression guess and {@code FileVec.DFLT_CHUNK_SIZE} are handed to
   *         {@code unzipBytes}); {@code null} if anything goes wrong, in which case the
   *         failure is logged at debug level
   */
  static byte [] getFirstUnzippedBytes( ByteVec bv ) {
    byte[] unzipped;
    try {
      final byte[] head = bv.getFirstBytes();
      final Compression method = guessCompressionMethod(head);
      unzipped = unzipBytes(head, method, FileVec.DFLT_CHUNK_SIZE);
    } catch(Exception e) {
      // Best effort only: callers get null instead of an exception.
      Log.debug("Cannot get unzipped bytes from ByteVec!", e);
      unzipped = null;
    }
    return unzipped;
  }

  /**
   * Checks whether the data stored under {@code key} is a zip archive whose first entry
   * is a directory.
   *
   * @param key key of a {@code ByteVec}, or of a {@code Frame} whose first vec is a
   *            {@code ByteVec} (anything else fails the casts below)
   * @return the result of {@link #isZipDirectory(ByteVec)} for the resolved vec
   * @throws H2OIllegalArgumentException if nothing is stored under {@code key}
   */
  static boolean isZipDirectory(Key key) {
    final Iced stored = DKV.getGet(key);
    if (stored == null) {
      throw new H2OIllegalArgumentException("Missing data", "Did not find any data under " +
              "key " + key);
    }

    // Either the value itself is the byte vec, or it is a frame and we take its first vec.
    final ByteVec firstVec;
    if (stored instanceof ByteVec) {
      firstVec = (ByteVec) stored;
    } else {
      firstVec = (ByteVec) ((Frame) stored).vecs()[0];
    }

    return isZipDirectory(firstVec);
  }

  /**
   * Checks whether the leading bytes of {@code bv} are a zip archive whose <em>first</em>
   * entry is a directory.
   *
   * @param bv vec whose first bytes are inspected
   * @return {@code true} only when the bytes carry the zip magic number and the first entry
   *         read from them is a directory; {@code false} for non-zip data, for an archive
   *         with no readable first entry, or when reading the entry fails
   */
  static boolean isZipDirectory(ByteVec bv) {
    byte[] bits = bv.getFirstBytes();
    if (guessCompressionMethod(bits) != Compression.ZIP)
      return false;

    ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(bits));
    try {
      ZipEntry ze = zis.getNextEntry(); // Get the *FIRST* entry
      // getNextEntry() returns null when there is no entry; treat that as "not a directory".
      return ze != null && ze.isDirectory();
    } catch (IOException e) {
      e.printStackTrace();
      return false;
    } finally {
      // Always release the stream (and its inflater), even when reading the entry failed.
      try { zis.close(); } catch (IOException ignore) { }
    }
  }

  /**
   * Lists the names of all non-directory entries of the zip archive behind {@code bv}.
   *
   * @param bv vec to inspect; only a {@code FileVec} is examined, using the path that
   *           {@code getPathForKey} returns for its key
   * @return entry names of the files in the archive, in enumeration order; empty when
   *         {@code bv} is not a {@code FileVec} or the archive cannot be opened (the
   *         failure is printed, not thrown)
   */
  static ArrayList<String> getFileNames(ByteVec bv) {
    ArrayList<String> fileList = new ArrayList<String>();
    if (!(bv instanceof FileVec))
      return fileList;

    String strPath = getPathForKey(((FileVec) bv)._key);
    ZipFile zipFile = null;
    try {
      zipFile = new ZipFile(strPath);
      Enumeration<? extends ZipEntry> entries = zipFile.entries();
      while (entries.hasMoreElements()) {
        ZipEntry entry = entries.nextElement();
        if (!entry.isDirectory()) // add file to list to parse if not a directory.
          fileList.add(entry.getName());
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      // Close in finally so the file handle is released even if enumeration throws.
      if (zipFile != null) {
        try {
          zipFile.close();
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }
    return fileList;
  }


  /**
   * Computes the overall decompression ratio (uncompressed size divided by compressed size)
   * of a zip archive that contains multiple files.
   *
   * @param bv vec to inspect; only a {@code FileVec} is examined, using the path that
   *           {@code getPathForKey} returns for its key
   * @return total uncompressed size over total compressed size of all non-directory entries
   *         whose sizes are known; {@code 1} (no compression) when {@code bv} is not a
   *         {@code FileVec}, the archive cannot be read, or no compressed bytes were counted
   */
  static float getDecompressionRatio(ByteVec bv) {
    long totalSize = 0L;
    long totalCompSize = 0L;

    if (bv instanceof FileVec) {
      String strPath = getPathForKey(((FileVec) bv)._key);
      ZipFile zipFile = null;
      try {
        zipFile = new ZipFile(strPath);
        Enumeration<? extends ZipEntry> entries = zipFile.entries();
        while (entries.hasMoreElements()) {
          ZipEntry entry = entries.nextElement();
          if (entry.isDirectory())
            continue; // directories carry no data
          long size = entry.getSize();
          long compSize = entry.getCompressedSize();
          // Both getters return -1 when the size is unknown; do not let that skew the totals.
          if (size < 0 || compSize < 0)
            continue;
          totalSize += size;
          totalCompSize += compSize;
        }
      } catch (IOException e) {
        e.printStackTrace();
      } finally {
        // Close in finally so the file handle is released even if enumeration throws.
        if (zipFile != null) {
          try {
            zipFile.close();
          } catch (IOException e) {
            e.printStackTrace();
          }
        }
      }
    }

    if (totalCompSize <= 0) // something is wrong.  Return no compression.
      return 1;
    // Divide in floating point: long/long would truncate the ratio (e.g. 2.7 -> 2).
    return (float) totalSize / totalCompSize;
  }

  /**
   * Guesses the compression format of a byte array from its leading magic number.
   *
   * @param bits leading bytes of the data; must not be {@code null}
   * @return {@code ZIP} when there are more than {@code ZipFile.LOCHDR} bytes and the 4-byte
   *         value at offset 0 equals {@code ZipFile.LOCSIG}; otherwise {@code GZIP} when there
   *         are more than 2 bytes and the unsigned 2-byte value at offset 0 equals
   *         {@code GZIPInputStream.GZIP_MAGIC}; otherwise {@code NONE}
   */
  static Compression guessCompressionMethod(byte [] bits) {
    final int n = bits.length;

    // ZIP: need more than a full local file header before trusting the signature.
    if( n > ZipFile.LOCHDR ) {
      if( UnsafeUtils.get4(bits, 0) == ZipFile.LOCSIG )
        return Compression.ZIP;
    }

    // GZIP: compare the first two bytes, masked to an unsigned value, with the magic.
    if( n > 2 ) {
      final int magic = UnsafeUtils.get2(bits,0) & 0xffff;
      if( magic == GZIPInputStream.GZIP_MAGIC )
        return Compression.GZIP;
    }

    return Compression.NONE;
  }

  /**
   * Estimates how much the data behind {@code bv} expands when decompressed.
   *
   * <p>For a zip archive whose first entry is a directory the ratio is taken from
   * {@code getDecompressionRatio}. Otherwise the leading bytes are inflated with
   * {@code unzipBytes} and the ratio is the length of the returned buffer divided by the
   * length of the compressed leading bytes.
   *
   * @param bv vec whose first bytes are inspected
   * @return the estimated ratio; {@code 1} when the data is not compressed, when the zip
   *         archive has no readable first entry, or when reading that entry fails
   */
  static float decompressionRatio(ByteVec bv) {
    byte[] zips = bv.getFirstBytes();
    ZipUtil.Compression cpr = ZipUtil.guessCompressionMethod(zips);
    if (cpr == Compression.NONE)
      return 1; // no compression

    if (cpr == Compression.ZIP) {
      ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(zips));
      try {
        ZipEntry ze = zis.getNextEntry(); // Get the *FIRST* entry
        if (ze == null)
          return 1; // no entry to measure; report no compression instead of failing
        if (ze.isDirectory())
          return getDecompressionRatio(bv);
      } catch (IOException e) {
        e.printStackTrace();
        return 1;
      } finally {
        // The probe stream is only needed to look at the first entry; always release it.
        try { zis.close(); } catch (IOException ignore) { }
      }
    }

    // Single compressed payload (GZIP, or ZIP starting with a file entry).
    // cpr != NONE implies zips has more than 2 bytes, so the divisor is never zero.
    byte[] bits = ZipUtil.unzipBytes(zips, cpr, FileVec.DFLT_CHUNK_SIZE);
    return (float) bits.length / zips.length;
  }


  /**
   * Inflates the beginning of a compressed byte array.
   *
   * <p>For {@code ZIP} input the first non-directory entry is read; for {@code GZIP} input
   * the gzip stream is read. The output buffer starts at twice the input length and is
   * doubled each time it fills up, until it is at least {@code chkSize} bytes long.
   *
   * @param bs      the (possibly compressed) bytes
   * @param cmp     compression format of {@code bs}
   * @param chkSize buffer size at which reading stops growing the buffer
   * @return {@code bs} itself when {@code cmp} is {@code NONE}; otherwise the output buffer.
   *         The buffer is returned at its full allocated length, so bytes beyond the
   *         decompressed data are zero.
   * @throws RuntimeException whatever {@code Log.throwErr} produces when reading fails
   */
  static byte[] unzipBytes( byte[] bs, Compression cmp, int chkSize ) {
    if( cmp == Compression.NONE ) return bs; // No compression
    // Wrap the bytes in a stream
    ByteArrayInputStream bais = new ByteArrayInputStream(bs);
    InputStream is = null;
    try {
      if( cmp == Compression.ZIP ) {
        ZipInputStream zis = new ZipInputStream(bais);
        // Register the stream before touching it so the finally block closes it even if
        // getNextEntry() throws.
        is = zis;
        ZipEntry ze = zis.getNextEntry(); // Get the *FIRST* entry
        // Skip every leading directory entry; the stream must end up on a file entry
        // (or on no entry at all, in which case nothing is read below).
        while( ze != null && ze.isDirectory() )
          ze = zis.getNextEntry();
      } else {
        assert cmp == Compression.GZIP;
        is = new GZIPInputStream(bais);
      }

      // If reading from a compressed stream, estimate we can read 2x uncompressed
      bs = new byte[bs.length * 2];
      // Now read from the compressed stream
      int off = 0;
      while( off < bs.length ) {
        int len = is.read(bs, off, bs.length - off);
        if( len < 0 )
          break;
        off += len;
        if( off == bs.length ) { // Dataset is uncompressing alot! Need more space...
          if( bs.length >= chkSize )
            break; // Already got enough
          bs = Arrays.copyOf(bs, bs.length * 2);
        }
      }
    } catch( IOException ioe ) {
      throw Log.throwErr(ioe);
    } finally {
      try { if( is != null ) is.close(); } catch( IOException ignore ) { }
    }

    return bs;
  }

  /**
   * Inflates the beginning of the first file entry of a zip archive so that the start of the
   * file can be inspected (the original author's note: to check it for column names).
   *
   * <p>The decompressed data is written to a separate buffer; {@code bs} is never modified.
   * The buffer starts at twice the input length and is doubled each time it fills up, until
   * it is at least {@code chkSize} bytes long.
   *
   * @param bs      bytes that may start with a zip archive
   * @param chkSize buffer size at which reading stops growing the buffer
   * @return the decompressed bytes of the first non-directory entry, trimmed to the number
   *         of bytes actually read; or {@code bs} itself (same array) when no zip entry can
   *         be read from it or when reading fails
   */
  static byte[] unzipForHeader( byte[] bs, int chkSize ) {
    ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(bs));
    try {
      // A ZipInputStream yields no data until it is positioned on an entry.
      ZipEntry ze = zis.getNextEntry();
      while( ze != null && ze.isDirectory() )
        ze = zis.getNextEntry();
      if( ze == null )
        return bs; // Not a zip (or no file entry): hand the input back untouched

      // Read into a fresh buffer; the input array backs the stream being read.
      byte[] out = new byte[bs.length * 2];
      int off = 0;
      while( off < out.length ) {
        int len = zis.read(out, off, out.length - off);
        if( len < 0 )
          break;
        off += len;
        if( off == out.length ) { // Dataset is uncompressing alot! Need more space...
          if( out.length >= chkSize )
            break; // Already got enough
          out = Arrays.copyOf(out, out.length * 2);
        }
      }
      return Arrays.copyOf(out, off);
    } catch (IOException e) {
      e.printStackTrace();
      return bs;
    } finally {
      try { zis.close(); } catch (IOException ignore) { }
    }
  }
}