/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.tap.hadoop;

import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.LineRecordReader;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

/**
 * Class ZipInputFormat is an {@link InputFormat} for zip files. Each file within a zip file is broken
 * into lines. Either line-feed or carriage-return are used to signal end of
 * line. Keys are the position in the file, and values are the line of text.
 * <p/>
 * If the underlying {@link FileSystem} is HDFS or FILE, each {@link ZipEntry} is returned
 * as a unique split. Otherwise this input format returns false for isSplitable, and will
 * subsequently iterate over each ZipEntry and treat all internal files as the 'same' file.
 */
public class ZipInputFormat extends FileInputFormat<LongWritable, Text> implements JobConfigurable
  {
  /** No configuration state is required; present to satisfy {@link JobConfigurable}. */
  public void configure( JobConf conf )
    {
    }

  /**
   * Return true only if the file is in ZIP format.
   *
   * @param fs   the file system that the file is on
   * @param file the path that represents this file
   * @return is this file splitable?
   */
  protected boolean isSplitable( FileSystem fs, Path file )
    {
    // splitting is only supported on file systems where entries can be re-opened by offset
    if( !isAllowSplits( fs ) )
      return false;

    if( LOG.isDebugEnabled() )
      LOG.debug( "verifying ZIP format for file: " + file.toString() );

    boolean splitable = true;
    ZipInputStream zipInputStream = null;

    try
      {
      zipInputStream = new ZipInputStream( fs.open( file ) );

      // reading the first entry both validates the ZIP magic and rejects empty archives
      ZipEntry zipEntry = zipInputStream.getNextEntry();

      if( zipEntry == null )
        throw new IOException( "no entries found, empty zip file" );

      if( LOG.isDebugEnabled() )
        LOG.debug( "ZIP format verification successful" );
      }
    catch( IOException exception )
      {
      LOG.error( "exception encountered while trying to open and read ZIP input stream", exception );
      splitable = false;
      }
    finally
      {
      safeClose( zipInputStream );
      }

    return splitable;
    }

  /**
   * Returns the configured input paths, verifying each one is a plain file.
   *
   * @param jobConf the current job configuration
   * @return the input paths for this job
   * @throws IOException if no paths are configured, or a path is a directory
   */
  protected Path[] listPathsInternal( JobConf jobConf ) throws IOException
    {
    Path[] dirs = FileInputFormat.getInputPaths( jobConf );

    if( dirs.length == 0 )
      throw new IOException( "no input paths specified in job" );

    for( Path dir : dirs )
      {
      FileSystem fs = dir.getFileSystem( jobConf );

      // zip archives must be individual files; directory expansion is not supported
      if( !fs.isFile( dir ) )
        throw new IOException( "does not support directories: " + dir );
      }

    return dirs;
    }

  @Override
  protected FileStatus[] listStatus( JobConf jobConf ) throws IOException
    {
    Path[] paths = listPathsInternal( jobConf );

    FileStatus[] statuses = new FileStatus[paths.length];

    for( int i = 0; i < paths.length; i++ )
      {
      Path path = paths[ i ];

      statuses[ i ] = path.getFileSystem( jobConf ).getFileStatus( path );
      }

    return statuses;
    }

  /**
   * Splits files returned by {@link #listPathsInternal(JobConf)}. Each file is
   * expected to be in zip format and each split corresponds to
   * {@link ZipEntry}.
   *
   * @param job       the JobConf data structure, see {@link JobConf}
   * @param numSplits the number of splits required. Ignored here
   * @throws IOException if input files are not in zip format
   */
  public InputSplit[] getSplits( JobConf job, int numSplits ) throws IOException
    {
    if( LOG.isDebugEnabled() )
      LOG.debug( "start splitting input ZIP files" );

    Path[] files = listPathsInternal( job );

    for( int i = 0; i < files.length; i++ ) // check we have valid files
      {
      Path file = files[ i ];
      FileSystem fs = file.getFileSystem( job );

      if( !fs.isFile( file ) || !fs.exists( file ) )
        throw new IOException( "not a file: " + files[ i ] );
      }

    // generate splits
    ArrayList<ZipSplit> splits = new ArrayList<ZipSplit>( numSplits );

    for( int i = 0; i < files.length; i++ )
      {
      Path file = files[ i ];
      FileSystem fs = file.getFileSystem( job );

      if( LOG.isDebugEnabled() )
        LOG.debug( "opening zip file: " + file.toString() );

      // one split per entry when the file system allows it, otherwise one split for the whole archive
      if( isAllowSplits( fs ) )
        makeSplits( job, splits, fs, file );
      else
        makeSplit( job, splits, file );
      }

    if( LOG.isDebugEnabled() )
      LOG.debug( "end splitting input ZIP files" );

    return splits.toArray( new ZipSplit[splits.size()] );
    }

  /** Adds a single split covering the whole archive, used when per-entry splits are not allowed. */
  private void makeSplit( JobConf job, ArrayList<ZipSplit> splits, Path file ) throws IOException
    {
    if( LOG.isDebugEnabled() )
      LOG.debug( "creating split for zip: " + file );

    // unknown uncompressed size. if set to compressed size, data will be truncated
    splits.add( new ZipSplit( file, -1 ) );
    }

  /** Adds one split per {@link ZipEntry} found in the given archive. */
  private void makeSplits( JobConf job, ArrayList<ZipSplit> splits, FileSystem fs, Path file ) throws IOException
    {
    ZipInputStream zipInputStream = new ZipInputStream( fs.open( file ) );

    try
      {
      ZipEntry zipEntry;

      while( ( zipEntry = zipInputStream.getNextEntry() ) != null )
        {
        ZipSplit zipSplit = new ZipSplit( file, zipEntry.getName(), zipEntry.getSize() );

        if( LOG.isDebugEnabled() )
          LOG.debug( String.format( "creating split for zip entry: %s size: %d method: %s compressed size: %d", zipEntry.getName(), zipEntry.getSize(),
            ZipEntry.DEFLATED == zipEntry.getMethod() ? "DEFLATED" : "STORED", zipEntry.getCompressedSize() ) );

        splits.add( zipSplit );
        }
      }
    finally
      {
      safeClose( zipInputStream );
      }
    }

  public RecordReader<LongWritable, Text> getRecordReader( InputSplit genericSplit, JobConf job, Reporter reporter ) throws IOException
    {
    reporter.setStatus( genericSplit.toString() );

    ZipSplit split = (ZipSplit) genericSplit;
    Path file = split.getPath();
    long length = split.getLength();

    // Set it max value if length is unknown.
    // Setting length to Max value does not have
    // a side effect as Record reader would not be
    // able to read past the actual size of
    // current entry.
    length = length == -1 ? Long.MAX_VALUE - 1 : length;

    FileSystem fs = file.getFileSystem( job );
    FSDataInputStream inputStream = fs.open( file );

    if( isAllowSplits( fs ) )
      return getReaderForEntry( inputStream, split, length );
    else
      return getReaderForAll( inputStream );
    }

  /**
   * Returns a reader that concatenates every non-directory entry of the archive into a single
   * logical stream of lines, tracking bytes read vs declared entry sizes for progress reporting.
   */
  private RecordReader<LongWritable, Text> getReaderForAll( final FSDataInputStream inputStream ) throws IOException
    {
    // single-element arrays act as mutable holders shared between the enumeration and the reader
    final long bytesSize[] = new long[]{0};
    final long bytesRead[] = new long[]{0};

    Enumeration<InputStream> enumeration = new Enumeration<InputStream>()
      {
      boolean returnCurrent = false;
      ZipEntry nextEntry;
      ZipInputStream zipInputStream = new ZipInputStream( inputStream );
      InputStream closeableInputStream = makeInputStream( zipInputStream );

      public boolean hasMoreElements()
        {
        if( returnCurrent )
          return nextEntry != null;

        getNext();

        return nextEntry != null;
        }

      public InputStream nextElement()
        {
        if( returnCurrent )
          {
          returnCurrent = false;
          return closeableInputStream;
          }

        getNext();

        if( nextEntry == null )
          throw new IllegalStateException( "no more zip entries in zip input stream" );

        return closeableInputStream;
        }

      private void getNext()
        {
        try
          {
          nextEntry = zipInputStream.getNextEntry();

          // directories carry no line data, skip them
          while( nextEntry != null && nextEntry.isDirectory() )
            nextEntry = zipInputStream.getNextEntry();

          // getSize() returns -1 when the entry size is unknown; adding it would corrupt the progress denominator
          if( nextEntry != null && nextEntry.getSize() != -1 )
            bytesSize[ 0 ] += nextEntry.getSize();

          returnCurrent = true;
          }
        catch( IOException exception )
          {
          throw new RuntimeException( "could not get next zip entry", exception );
          }
        finally
          {
          // i think, better than sending across a fake input stream that closes the zip
          if( nextEntry == null )
            safeClose( zipInputStream );
          }
        }

      private InputStream makeInputStream( ZipInputStream zipInputStream )
        {
        // wrapper counts bytes consumed for progress and ignores close(), since the same
        // underlying ZipInputStream is re-used across all entries
        return new FilterInputStream( zipInputStream )
          {
          @Override
          public int read() throws IOException
            {
            int result = super.read();

            // -1 signals end of the current entry, not a consumed byte
            if( result != -1 )
              bytesRead[ 0 ]++;

            return result;
            }

          @Override
          public int read( byte[] bytes ) throws IOException
            {
            int result = super.read( bytes );

            // only count bytes actually read; result is -1 at end of the current entry
            if( result > 0 )
              bytesRead[ 0 ] += result;

            return result;
            }

          @Override
          public int read( byte[] bytes, int i, int i1 ) throws IOException
            {
            int result = super.read( bytes, i, i1 );

            if( result > 0 )
              bytesRead[ 0 ] += result;

            return result;
            }

          @Override
          public long skip( long l ) throws IOException
            {
            long result = super.skip( l );

            bytesRead[ 0 ] += result;

            return result;
            }

          @Override
          public void close() throws IOException
            {
            // do nothing
            }
          };
        }
      };

    return new LineRecordReader( new SequenceInputStream( enumeration ), 0, Long.MAX_VALUE, Integer.MAX_VALUE )
      {
      @Override
      public float getProgress()
        {
        if( 0 == bytesSize[ 0 ] )
          return 0.0f;
        else
          return Math.min( 1.0f, bytesRead[ 0 ] / (float) bytesSize[ 0 ] );
        }
      };
    }

  /**
   * Returns a reader positioned at the single {@link ZipEntry} named by the split, by scanning
   * forward through the archive until the entry path matches.
   */
  private RecordReader<LongWritable, Text> getReaderForEntry( FSDataInputStream inputStream, ZipSplit split, long length ) throws IOException
    {
    ZipInputStream zipInputStream = new ZipInputStream( inputStream );
    String entryPath = split.getEntryPath();

    ZipEntry zipEntry = zipInputStream.getNextEntry();

    while( zipEntry != null && !zipEntry.getName().equals( entryPath ) )
      zipEntry = zipInputStream.getNextEntry();

    return new LineRecordReader( zipInputStream, 0, length, Integer.MAX_VALUE );
    }

  /** Returns true when the given file system supports per-entry splits (local or dfs only). */
  protected boolean isAllowSplits( FileSystem fs )
    {
    // only allow if fs is local or dfs
    URI uri = fs.getUri();
    String scheme = uri.getScheme();

    // constant-first comparison is null-safe; a scheme-less URI simply disables splitting
    return "hdfs".equalsIgnoreCase( scheme ) || "file".equalsIgnoreCase( scheme );
    }

  /** Closes the given stream, logging instead of propagating any failure. */
  private void safeClose( ZipInputStream zipInputStream )
    {
    try
      {
      if( zipInputStream != null )
        zipInputStream.close();
      }
    catch( IOException exception )
      {
      LOG.error( "exception while trying to close ZIP input stream", exception );
      }
    }
  }