/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.tap.hadoop;

import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.LineRecordReader;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

/**
 * Class ZipInputFormat is an {@link InputFormat} for zip files. Each file within a zip file is broken
 * into lines. Either line-feed or carriage-return are used to signal end of
 * line. Keys are the position in the file, and values are the line of text.
 * <p/>
 * If the underlying {@link FileSystem} is HDFS or FILE, each {@link ZipEntry} is returned
 * as a unique split. Otherwise this input format returns false for isSplitable, and will
 * subsequently iterate over each ZipEntry and treat all internal files as the 'same' file.
 */
public class ZipInputFormat extends FileInputFormat<LongWritable, Text> implements JobConfigurable
  {
  /** No configuration state is required; present to satisfy {@link JobConfigurable}. */
  public void configure( JobConf conf )
    {
    }

  /**
   * Return true only if the file is in ZIP format.
   *
   * @param fs   the file system that the file is on
   * @param file the path that represents this file
   * @return is this file splitable?
   */
  protected boolean isSplitable( FileSystem fs, Path file )
    {
    // splitting is only supported on file systems where entries can be re-opened by offset
    if( !isAllowSplits( fs ) )
      return false;

    if( LOG.isDebugEnabled() )
      LOG.debug( "verifying ZIP format for file: " + file.toString() );

    boolean splitable = true;
    ZipInputStream zipInputStream = null;

    try
      {
      zipInputStream = new ZipInputStream( fs.open( file ) );

      // reading the first entry both validates the ZIP magic and rejects empty archives
      ZipEntry zipEntry = zipInputStream.getNextEntry();

      if( zipEntry == null )
        throw new IOException( "no entries found, empty zip file" );

      if( LOG.isDebugEnabled() )
        LOG.debug( "ZIP format verification successful" );
      }
    catch( IOException exception )
      {
      LOG.error( "exception encountered while trying to open and read ZIP input stream", exception );
      splitable = false;
      }
    finally
      {
      safeClose( zipInputStream );
      }

    return splitable;
    }

  /**
   * Returns the configured input paths, verifying each one is a plain file.
   *
   * @param jobConf the current job configuration
   * @return the input paths for this job
   * @throws IOException if no paths are configured, or a path is a directory
   */
  protected Path[] listPathsInternal( JobConf jobConf ) throws IOException
    {
    Path[] dirs = FileInputFormat.getInputPaths( jobConf );

    if( dirs.length == 0 )
      throw new IOException( "no input paths specified in job" );

    for( Path dir : dirs )
      {
      FileSystem fs = dir.getFileSystem( jobConf );

      // zip archives must be individual files; directory expansion is not supported
      if( !fs.isFile( dir ) )
        throw new IOException( "does not support directories: " + dir );
      }

    return dirs;
    }

  @Override
  protected FileStatus[] listStatus( JobConf jobConf ) throws IOException
    {
    Path[] paths = listPathsInternal( jobConf );

    FileStatus[] statuses = new FileStatus[paths.length];

    for( int i = 0; i < paths.length; i++ )
      {
      Path path = paths[ i ];

      statuses[ i ] = path.getFileSystem( jobConf ).getFileStatus( path );
      }

    return statuses;
    }

  /**
   * Splits files returned by {@link #listPathsInternal(JobConf)}. Each file is
   * expected to be in zip format and each split corresponds to
   * {@link ZipEntry}.
   *
   * @param job       the JobConf data structure, see {@link JobConf}
   * @param numSplits the number of splits required. Ignored here
   * @throws IOException if input files are not in zip format
   */
  public InputSplit[] getSplits( JobConf job, int numSplits ) throws IOException
    {
    if( LOG.isDebugEnabled() )
      LOG.debug( "start splitting input ZIP files" );

    Path[] files = listPathsInternal( job );

    for( int i = 0; i < files.length; i++ ) // check we have valid files
      {
      Path file = files[ i ];
      FileSystem fs = file.getFileSystem( job );

      if( !fs.isFile( file ) || !fs.exists( file ) )
        throw new IOException( "not a file: " + files[ i ] );
      }

    // generate splits
    ArrayList<ZipSplit> splits = new ArrayList<ZipSplit>( numSplits );

    for( int i = 0; i < files.length; i++ )
      {
      Path file = files[ i ];
      FileSystem fs = file.getFileSystem( job );

      if( LOG.isDebugEnabled() )
        LOG.debug( "opening zip file: " + file.toString() );

      // one split per entry when the file system allows it, otherwise one split for the whole archive
      if( isAllowSplits( fs ) )
        makeSplits( job, splits, fs, file );
      else
        makeSplit( job, splits, file );
      }

    if( LOG.isDebugEnabled() )
      LOG.debug( "end splitting input ZIP files" );

    return splits.toArray( new ZipSplit[splits.size()] );
    }

  /** Adds a single split covering the whole archive, used when per-entry splits are not allowed. */
  private void makeSplit( JobConf job, ArrayList<ZipSplit> splits, Path file ) throws IOException
    {
    if( LOG.isDebugEnabled() )
      LOG.debug( "creating split for zip: " + file );

    // unknown uncompressed size. if set to compressed size, data will be truncated
    splits.add( new ZipSplit( file, -1 ) );
    }

  /** Adds one split per {@link ZipEntry} found in the given archive. */
  private void makeSplits( JobConf job, ArrayList<ZipSplit> splits, FileSystem fs, Path file ) throws IOException
    {
    ZipInputStream zipInputStream = new ZipInputStream( fs.open( file ) );

    try
      {
      ZipEntry zipEntry;

      while( ( zipEntry = zipInputStream.getNextEntry() ) != null )
        {
        ZipSplit zipSplit = new ZipSplit( file, zipEntry.getName(), zipEntry.getSize() );

        if( LOG.isDebugEnabled() )
          LOG.debug( String.format( "creating split for zip entry: %s size: %d method: %s compressed size: %d", zipEntry.getName(), zipEntry.getSize(),
            ZipEntry.DEFLATED == zipEntry.getMethod() ? "DEFLATED" : "STORED", zipEntry.getCompressedSize() ) );

        splits.add( zipSplit );
        }
      }
    finally
      {
      safeClose( zipInputStream );
      }
    }

  public RecordReader<LongWritable, Text> getRecordReader( InputSplit genericSplit, JobConf job, Reporter reporter ) throws IOException
    {
    reporter.setStatus( genericSplit.toString() );

    ZipSplit split = (ZipSplit) genericSplit;
    Path file = split.getPath();
    long length = split.getLength();

    // Set it max value if length is unknown.
    // Setting length to Max value does not have
    // a side effect as Record reader would not be
    // able to read past the actual size of
    // current entry.
    length = length == -1 ? Long.MAX_VALUE - 1 : length;

    FileSystem fs = file.getFileSystem( job );
    FSDataInputStream inputStream = fs.open( file );

    if( isAllowSplits( fs ) )
      return getReaderForEntry( inputStream, split, length );
    else
      return getReaderForAll( inputStream );
    }

  /**
   * Returns a reader that concatenates every non-directory entry of the archive into a single
   * logical stream of lines, tracking bytes read vs declared entry sizes for progress reporting.
   */
  private RecordReader<LongWritable, Text> getReaderForAll( final FSDataInputStream inputStream ) throws IOException
    {
    // single-element arrays act as mutable holders shared between the enumeration and the reader
    final long bytesSize[] = new long[]{0};
    final long bytesRead[] = new long[]{0};

    Enumeration<InputStream> enumeration = new Enumeration<InputStream>()
      {
      boolean returnCurrent = false;
      ZipEntry nextEntry;
      ZipInputStream zipInputStream = new ZipInputStream( inputStream );
      InputStream closeableInputStream = makeInputStream( zipInputStream );

      public boolean hasMoreElements()
        {
        if( returnCurrent )
          return nextEntry != null;

        getNext();

        return nextEntry != null;
        }

      public InputStream nextElement()
        {
        if( returnCurrent )
          {
          returnCurrent = false;
          return closeableInputStream;
          }

        getNext();

        if( nextEntry == null )
          throw new IllegalStateException( "no more zip entries in zip input stream" );

        return closeableInputStream;
        }

      private void getNext()
        {
        try
          {
          nextEntry = zipInputStream.getNextEntry();

          // directories carry no line data, skip them
          while( nextEntry != null && nextEntry.isDirectory() )
            nextEntry = zipInputStream.getNextEntry();

          // getSize() returns -1 when the entry size is unknown; adding it would corrupt the progress denominator
          if( nextEntry != null && nextEntry.getSize() != -1 )
            bytesSize[ 0 ] += nextEntry.getSize();

          returnCurrent = true;
          }
        catch( IOException exception )
          {
          throw new RuntimeException( "could not get next zip entry", exception );
          }
        finally
          {
          // i think, better than sending across a fake input stream that closes the zip
          if( nextEntry == null )
            safeClose( zipInputStream );
          }
        }

      private InputStream makeInputStream( ZipInputStream zipInputStream )
        {
        // wrapper counts bytes consumed for progress and ignores close(), since the same
        // underlying ZipInputStream is re-used across all entries
        return new FilterInputStream( zipInputStream )
          {
          @Override
          public int read() throws IOException
            {
            int result = super.read();

            // -1 signals end of the current entry, not a consumed byte
            if( result != -1 )
              bytesRead[ 0 ]++;

            return result;
            }

          @Override
          public int read( byte[] bytes ) throws IOException
            {
            int result = super.read( bytes );

            // only count bytes actually read; result is -1 at end of the current entry
            if( result > 0 )
              bytesRead[ 0 ] += result;

            return result;
            }

          @Override
          public int read( byte[] bytes, int i, int i1 ) throws IOException
            {
            int result = super.read( bytes, i, i1 );

            if( result > 0 )
              bytesRead[ 0 ] += result;

            return result;
            }

          @Override
          public long skip( long l ) throws IOException
            {
            long result = super.skip( l );

            bytesRead[ 0 ] += result;

            return result;
            }

          @Override
          public void close() throws IOException
            {
            // do nothing
            }
          };
        }
      };

    return new LineRecordReader( new SequenceInputStream( enumeration ), 0, Long.MAX_VALUE, Integer.MAX_VALUE )
      {
      @Override
      public float getProgress()
        {
        if( 0 == bytesSize[ 0 ] )
          return 0.0f;
        else
          return Math.min( 1.0f, bytesRead[ 0 ] / (float) bytesSize[ 0 ] );
        }
      };
    }

  /**
   * Returns a reader positioned at the single {@link ZipEntry} named by the split, by scanning
   * forward through the archive until the entry path matches.
   */
  private RecordReader<LongWritable, Text> getReaderForEntry( FSDataInputStream inputStream, ZipSplit split, long length ) throws IOException
    {
    ZipInputStream zipInputStream = new ZipInputStream( inputStream );
    String entryPath = split.getEntryPath();

    ZipEntry zipEntry = zipInputStream.getNextEntry();

    while( zipEntry != null && !zipEntry.getName().equals( entryPath ) )
      zipEntry = zipInputStream.getNextEntry();

    return new LineRecordReader( zipInputStream, 0, length, Integer.MAX_VALUE );
    }

  /** Returns true when the given file system supports per-entry splits (local or dfs only). */
  protected boolean isAllowSplits( FileSystem fs )
    {
    // only allow if fs is local or dfs
    URI uri = fs.getUri();
    String scheme = uri.getScheme();

    // constant-first comparison is null-safe; a scheme-less URI simply disables splitting
    return "hdfs".equalsIgnoreCase( scheme ) || "file".equalsIgnoreCase( scheme );
    }

  /** Closes the given stream, logging instead of propagating any failure. */
  private void safeClose( ZipInputStream zipInputStream )
    {
    try
      {
      if( zipInputStream != null )
        zipInputStream.close();
      }
    catch( IOException exception )
      {
      LOG.error( "exception while trying to close ZIP input stream", exception );
      }
    }
  }