/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading.tap.hadoop;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.LineRecordReader;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
/**
 * Class ZipInputFormat is an {@link InputFormat} for zip files. Each file within a zip file is broken
 * into lines. Either a line-feed or a carriage-return is used to signal the end of a
 * line. Keys are the position in the file, and values are the line of text.
 * <p/>
 * If the underlying {@link FileSystem} is HDFS or FILE, each non-directory {@link ZipEntry} is returned
 * as a unique split. Otherwise this input format returns false for isSplitable, and will
 * subsequently iterate over each ZipEntry and treat all internal files as the 'same' file.
 */
public class ZipInputFormat extends FileInputFormat<LongWritable, Text> implements JobConfigurable
{
  /**
   * Does nothing, this input format reads no configuration values.
   *
   * @param conf the current job configuration, unused
   */
  public void configure( JobConf conf )
  {
  }

  /**
   * Return true only if the file is in ZIP format.
   * <p/>
   * The file is considered to be in ZIP format if at least one entry can be read from it. Files
   * on a file system rejected by {@link #isAllowSplits(FileSystem)} are never splitable.
   *
   * @param fs   the file system that the file is on
   * @param file the path that represents this file
   * @return is this file splitable?
   */
  protected boolean isSplitable( FileSystem fs, Path file )
  {
    if( !isAllowSplits( fs ) )
      return false;

    if( LOG.isDebugEnabled() )
      LOG.debug( "verifying ZIP format for file: " + file.toString() );

    boolean splitable = true;
    ZipInputStream zipInputStream = null;

    try
    {
      zipInputStream = new ZipInputStream( fs.open( file ) );
      ZipEntry zipEntry = zipInputStream.getNextEntry();

      // raised here so an empty archive is logged and handled like any other unreadable file
      if( zipEntry == null )
        throw new IOException( "no entries found, empty zip file" );

      if( LOG.isDebugEnabled() )
        LOG.debug( "ZIP format verification successful" );
    }
    catch( IOException exception )
    {
      LOG.error( "exception encountered while trying to open and read ZIP input stream", exception );
      splitable = false;
    }
    finally
    {
      safeClose( zipInputStream );
    }

    return splitable;
  }

  /**
   * Returns the input paths configured on the job, after verifying each one is a file.
   *
   * @param jobConf the job configuration holding the input paths
   * @return the configured input paths
   * @throws IOException if no input paths are configured, or if any path is not a file
   */
  protected Path[] listPathsInternal( JobConf jobConf ) throws IOException
  {
    Path[] dirs = FileInputFormat.getInputPaths( jobConf );

    if( dirs.length == 0 )
      throw new IOException( "no input paths specified in job" );

    for( Path dir : dirs )
    {
      FileSystem fs = dir.getFileSystem( jobConf );

      if( !fs.isFile( dir ) )
        throw new IOException( "does not support directories: " + dir );
    }

    return dirs;
  }

  /**
   * Returns the {@link FileStatus} of every path returned by {@link #listPathsInternal(JobConf)}.
   *
   * @param jobConf the job configuration holding the input paths
   * @return one status per input path, in the same order
   * @throws IOException if the input paths are invalid or a status cannot be retrieved
   */
  @Override
  protected FileStatus[] listStatus( JobConf jobConf ) throws IOException
  {
    Path[] paths = listPathsInternal( jobConf );
    FileStatus[] statuses = new FileStatus[paths.length];

    for( int i = 0; i < paths.length; i++ )
    {
      Path path = paths[ i ];
      statuses[ i ] = path.getFileSystem( jobConf ).getFileStatus( path );
    }

    return statuses;
  }

  /**
   * Splits files returned by {@link #listPathsInternal(JobConf)}. Each file is
   * expected to be in zip format. When the file system allows splits, each split corresponds to
   * a non-directory {@link ZipEntry}, otherwise a single split is created per file.
   *
   * @param job       the JobConf data structure, see {@link JobConf}
   * @param numSplits the number of splits required. Ignored here
   * @return the splits for all input files
   * @throws IOException if input files are not in zip format
   */
  public InputSplit[] getSplits( JobConf job, int numSplits ) throws IOException
  {
    if( LOG.isDebugEnabled() )
      LOG.debug( "start splitting input ZIP files" );

    Path[] files = listPathsInternal( job );

    // check we have valid files, before any of them is opened
    for( Path file : files )
    {
      FileSystem fs = file.getFileSystem( job );

      if( !fs.isFile( file ) || !fs.exists( file ) )
        throw new IOException( "not a file: " + file );
    }

    // generate splits. numSplits is deliberately not used as the initial capacity, it is
    // unrelated to the number of zip entries and a negative value would be rejected by ArrayList
    ArrayList<ZipSplit> splits = new ArrayList<ZipSplit>();

    for( Path file : files )
    {
      FileSystem fs = file.getFileSystem( job );

      if( LOG.isDebugEnabled() )
        LOG.debug( "opening zip file: " + file.toString() );

      if( isAllowSplits( fs ) )
        makeSplits( splits, fs, file );
      else
        makeSplit( splits, file );
    }

    if( LOG.isDebugEnabled() )
      LOG.debug( "end splitting input ZIP files" );

    return splits.toArray( new ZipSplit[splits.size()] );
  }

  /**
   * Adds a single split covering the whole zip file.
   *
   * @param splits the list the new split is appended to
   * @param file   the zip file to create the split for
   */
  private void makeSplit( ArrayList<ZipSplit> splits, Path file )
  {
    if( LOG.isDebugEnabled() )
      LOG.debug( "creating split for zip: " + file );

    // unknown uncompressed size. if set to compressed size, data will be truncated
    splits.add( new ZipSplit( file, -1 ) );
  }

  /**
   * Adds one split per non-directory entry found in the given zip file.
   * <p/>
   * The split length is the uncompressed size reported by the entry, which is -1 when the size
   * is not known at the time the entry header is read.
   *
   * @param splits the list the new splits are appended to
   * @param fs     the file system the zip file is on
   * @param file   the zip file to enumerate
   * @throws IOException if the file cannot be opened or read as a zip file
   */
  private void makeSplits( ArrayList<ZipSplit> splits, FileSystem fs, Path file ) throws IOException
  {
    ZipInputStream zipInputStream = new ZipInputStream( fs.open( file ) );

    try
    {
      ZipEntry zipEntry;

      while( ( zipEntry = zipInputStream.getNextEntry() ) != null )
      {
        // directories hold no lines, a split for one would only schedule an empty read
        if( zipEntry.isDirectory() )
          continue;

        ZipSplit zipSplit = new ZipSplit( file, zipEntry.getName(), zipEntry.getSize() );

        if( LOG.isDebugEnabled() )
          LOG.debug( String.format( "creating split for zip entry: %s size: %d method: %s compressed size: %d", zipEntry.getName(), zipEntry.getSize(), ZipEntry.DEFLATED == zipEntry.getMethod() ? "DEFLATED" : "STORED", zipEntry.getCompressedSize() ) );

        splits.add( zipSplit );
      }
    }
    finally
    {
      safeClose( zipInputStream );
    }
  }

  /**
   * Returns a line oriented reader for the given split.
   * <p/>
   * When the file system allows splits the reader is limited to the zip entry named by the split,
   * otherwise it reads all entries of the zip file one after the other.
   *
   * @param genericSplit the split to read, must be a {@link ZipSplit}
   * @param job          the current job configuration
   * @param reporter     receives the split description as its status
   * @return a reader returning the lines of the split
   * @throws IOException if the file cannot be opened, or the entry named by the split is missing
   */
  public RecordReader<LongWritable, Text> getRecordReader( InputSplit genericSplit, JobConf job, Reporter reporter ) throws IOException
  {
    reporter.setStatus( genericSplit.toString() );

    ZipSplit split = (ZipSplit) genericSplit;
    Path file = split.getPath();
    long length = split.getLength();

    // Set it to max value if length is unknown.
    // Setting length to max value does not have
    // a side effect as the record reader would not be
    // able to read past the actual size of
    // the current entry.
    if( length == -1 )
      length = Long.MAX_VALUE - 1;

    FileSystem fs = file.getFileSystem( job );

    // decided before the file is opened, so a failure here cannot leak the stream
    boolean allowSplits = isAllowSplits( fs );
    FSDataInputStream inputStream = fs.open( file );

    if( allowSplits )
      return getReaderForEntry( inputStream, split, length );

    return getReaderForAll( inputStream );
  }

  /**
   * Returns a reader that treats all non-directory entries of the zip stream as one continuous file.
   * <p/>
   * Progress is the number of uncompressed bytes read so far relative to the summed sizes of
   * the entries seen so far. Entries that do not report a size do not contribute to the total.
   *
   * @param inputStream the opened zip file, closed once the last entry has been passed
   * @return a reader over the lines of all entries
   * @throws IOException if the reader cannot be created
   */
  private RecordReader<LongWritable, Text> getReaderForAll( final FSDataInputStream inputStream ) throws IOException
  {
    // single element arrays so the anonymous classes below can update the counters
    final long[] bytesSize = new long[]{0};
    final long[] bytesRead = new long[]{0};

    Enumeration<InputStream> enumeration = new Enumeration<InputStream>()
    {
      // true when nextEntry holds the outcome of a look ahead that was not handed out yet
      boolean prefetched = false;
      ZipEntry nextEntry;
      ZipInputStream zipInputStream = new ZipInputStream( inputStream );
      // the very same stream is handed out for every entry, only the zip stream position moves
      InputStream entryStream = makeInputStream( zipInputStream );

      public boolean hasMoreElements()
      {
        if( !prefetched )
          advance();

        return nextEntry != null;
      }

      public InputStream nextElement()
      {
        if( !hasMoreElements() )
          throw new IllegalStateException( "no more zip entries in zip input stream" );

        prefetched = false;

        return entryStream;
      }

      /** Positions the zip stream on the next non-directory entry, closing it when none is left. */
      private void advance()
      {
        try
        {
          nextEntry = zipInputStream.getNextEntry();

          while( nextEntry != null && nextEntry.isDirectory() )
            nextEntry = zipInputStream.getNextEntry();

          // getSize() is -1 when the size is unknown, which must not shrink the total
          if( nextEntry != null && nextEntry.getSize() > 0 )
            bytesSize[ 0 ] += nextEntry.getSize();

          prefetched = true;
        }
        catch( IOException exception )
        {
          // the archive cannot be read any further, do not leave the underlying stream open
          safeClose( zipInputStream );

          throw new RuntimeException( "could not get next zip entry", exception );
        }
        finally
        {
          // i think, better than sending across a fake input stream that closes the zip
          if( nextEntry == null )
            safeClose( zipInputStream );
        }
      }

      /** Wraps the zip stream so reads are counted and close requests are ignored. */
      private InputStream makeInputStream( ZipInputStream zipInputStream )
      {
        return new FilterInputStream( zipInputStream )
        {
          @Override
          public int read() throws IOException
          {
            int result = super.read();

            // -1 signals the end of the entry, no byte was read
            if( result != -1 )
              bytesRead[ 0 ]++;

            return result;
          }

          // read( byte[] ) is intentionally not overridden, FilterInputStream implements it by
          // calling read( byte[], int, int ), counting in both would count every byte twice

          @Override
          public int read( byte[] bytes, int offset, int length ) throws IOException
          {
            int result = super.read( bytes, offset, length );

            if( result > 0 )
              bytesRead[ 0 ] += result;

            return result;
          }

          @Override
          public long skip( long count ) throws IOException
          {
            long result = super.skip( count );

            if( result > 0 )
              bytesRead[ 0 ] += result;

            return result;
          }

          @Override
          public void close() throws IOException
          {
            // do nothing, the zip stream must stay open for the remaining entries
          }
        };
      }
    };

    return new LineRecordReader( new SequenceInputStream( enumeration ), 0, Long.MAX_VALUE, Integer.MAX_VALUE )
    {
      @Override
      public float getProgress()
      {
        if( 0 == bytesSize[ 0 ] )
          return 0.0f;

        return Math.min( 1.0f, bytesRead[ 0 ] / (float) bytesSize[ 0 ] );
      }
    };
  }

  /**
   * Returns a reader limited to the zip entry named by the given split.
   *
   * @param inputStream the opened zip file
   * @param split       the split naming the entry to read
   * @param length      the number of bytes the reader may return
   * @return a reader over the lines of the entry
   * @throws IOException if the zip file cannot be read, or does not contain the entry
   */
  private RecordReader<LongWritable, Text> getReaderForEntry( FSDataInputStream inputStream, ZipSplit split, long length ) throws IOException
  {
    ZipInputStream zipInputStream = new ZipInputStream( inputStream );
    boolean success = false;

    try
    {
      String entryPath = split.getEntryPath();
      ZipEntry zipEntry = zipInputStream.getNextEntry();

      while( zipEntry != null && !zipEntry.getName().equals( entryPath ) )
        zipEntry = zipInputStream.getNextEntry();

      // reading on from an exhausted stream would silently return no records at all
      if( zipEntry == null )
        throw new IOException( "zip entry not found: " + entryPath + ", in zip file: " + split.getPath() );

      RecordReader<LongWritable, Text> reader = new LineRecordReader( zipInputStream, 0, length, Integer.MAX_VALUE );

      success = true;

      return reader;
    }
    finally
    {
      // on success the stream is owned by the returned reader
      if( !success )
        safeClose( zipInputStream );
    }
  }

  /**
   * Returns true if zip files on the given file system may be split into one split per entry.
   *
   * @param fs the file system to test
   * @return true if the URI scheme of the file system is 'hdfs' or 'file', ignoring case
   */
  protected boolean isAllowSplits( FileSystem fs )
  {
    // only allow if fs is local or dfs
    URI uri = fs.getUri();
    String scheme = uri.getScheme();

    // literal first, so a missing scheme disables splits instead of failing
    return "hdfs".equalsIgnoreCase( scheme ) || "file".equalsIgnoreCase( scheme );
  }

  /**
   * Closes the given stream, logging instead of propagating any failure.
   *
   * @param zipInputStream the stream to close, may be null
   */
  private void safeClose( ZipInputStream zipInputStream )
  {
    try
    {
      if( zipInputStream != null )
        zipInputStream.close();
    }
    catch( IOException exception )
    {
      LOG.error( "exception while trying to close ZIP input stream", exception );
    }
  }
}