package edu.isi.karma.mapreduce.inputformat; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @InterfaceAudience.Public @InterfaceStability.Stable public class ZIPRecordReader extends RecordReader<Writable, Writable> { private static Logger LOG = LoggerFactory.getLogger(ZIPRecordReader.class); private Text key = new Text(); private BytesWritable value = new BytesWritable(); private long start; private long bytesReadEstimate = 0; private long end; private ZipInputStream is; private ZipEntry entry; public ZIPRecordReader() throws IOException { //TODO } public void initialize(InputSplit rawSplit, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit split = (FileSplit) rawSplit; Configuration job = context.getConfiguration(); start = split.getStart(); end = start + split.getLength(); final Path file = split.getPath(); // open the file and seek to the start of the split final FileSystem fs = file.getFileSystem(job); is = new ZipInputStream(fs.open(file)); } @Override public Text getCurrentKey() throws IOException, InterruptedException { if(entry != null) { key.set(entry.getName()); } else { key.set(""); } return key; } @Override public Writable getCurrentValue() throws IOException, InterruptedException { try { if(entry != null) { int size = (int)entry.getSize(); size = size== -1? 1000: size; ByteArrayOutputStream baos = new ByteArrayOutputStream(size); while(true){ int b = is.read(); if(b == -1) break; baos.write(b); } byte[] filebytes = baos.toByteArray(); value = new BytesWritable(filebytes); bytesReadEstimate += filebytes.length; } } catch(Exception e) { LOG.error("Unable to process: " + key.toString(), e); value = new BytesWritable(); } return value; } public synchronized boolean nextKeyValue() throws IOException, InterruptedException { try { while((entry = is.getNextEntry())!=null && entry.isDirectory()); return entry != null; } catch(Exception e) { LOG.error("Unable to process next key/value", e); return false; } } public float getProgress() throws IOException, InterruptedException { return bytesReadEstimate/end; } public synchronized void close() throws IOException { is.close(); } }