package org.archive.hadoop.mapreduce;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.zip.GZIPInputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.archive.hadoop.cdx.HDFSBlockLoader;

/**
 * RecordReader that dereferences input lines of the form
 *
 *   PATH OFFSET LENGTH
 *
 * by reading LENGTH bytes at OFFSET from the HDFS file PATH, decompressing
 * them as a GZIP member, and emitting each decompressed line. Keys are
 * "PATH:OFFSET:lineNumber"; values are the lines themselves. Unreadable
 * GZIP ranges can optionally be logged and skipped instead of failing the
 * task.
 */
public class GZIPRangeLineDereferencingRecordReader
        extends LineDereferencingRecordReader {

    private static final String SKIP_BAD_GZ_RANGES = "gzip.range.skipbad";

    public static void setSkipBadGZIPRanges(Configuration conf, boolean skip) {
        conf.setBoolean(SKIP_BAD_GZ_RANGES, skip);
    }

    public static boolean getSkipBadGZIPRanges(Configuration conf) {
        return conf.getBoolean(SKIP_BAD_GZ_RANGES, false);
    }

    String curInputLine = null;
    FSDataInputStream fsdis = null;
    long curStart = 0;
    byte[] buffer = null;
    private boolean skipBad = false;
    HDFSBlockLoader loader = null;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        FileSplit fileSplit = (FileSplit) split;
        loader = new HDFSBlockLoader(fileSplit.getPath().getFileSystem(conf));
        skipBad = getSkipBadGZIPRanges(conf);
        super.initialize(split, context);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (key == null) {
            key = new Text();
        }
        if (value == null) {
            value = new Text();
        }
        while (true) {
            if (curReader == null) {
                // No open range: are there more input lines?
                if (!internal.nextKeyValue()) {
                    // all done:
                    return false;
                }
                progress = internal.getProgress();
                curInputLine = internal.getCurrentValue().toString();
                String[] parts = curInputLine.split("\\s");
                if (parts.length != 3) {
                    throw new IOException("Bad format line(" + curInputLine + ")");
                }
                String newFile = parts[0];
                // Reopen only when the target file changes; otherwise keep
                // issuing positioned reads against the already-open stream:
                if (fsdis == null || !newFile.equals(curFile)) {
                    if (fsdis != null) {
                        fsdis.close();
                    }
                    curFile = newFile;
                    fsdis = fileSystem.open(new Path(curFile));
                }
                curStart = Long.parseLong(parts[1]);
                int length = Integer.parseInt(parts[2]);
                // Grow the reusable buffer if this range is larger than any
                // seen so far:
                if (buffer == null || buffer.length < length) {
                    buffer = new byte[length];
                }
                InputStream is = null;
                try {
                    // Read the whole compressed chunk into buffer, then wrap
                    // it for decompression:
                    fsdis.readFully(curStart, buffer, 0, length);
                    is = new GZIPInputStream(
                            new ByteArrayInputStream(buffer, 0, length));
                } catch (IOException e) {
                    if (skipBad) {
                        System.err.format("GZIP-BLOCK-ERROR\t%s\t%d\t%s\t%s\n",
                                curFile, curStart,
                                e.getClass().toString(), e.getMessage());
                        curReader = null;
                        continue; // while(true) loop
                    } else {
                        throw new IOException(String.format("%s:%d - (%s) %s",
                                curFile, curStart,
                                e.getClass().toString(), e.getMessage()));
                    }
                }
                curReader = new BufferedReader(new InputStreamReader(is, UTF8));
                curLine = 0;
            }
            // Try to read another line from the current range:
            String nextLine = null;
            try {
                nextLine = curReader.readLine();
            } catch (IOException e) {
                if (skipBad) {
                    System.err.format("GZIP-BLOCK-ERROR\t%s\t%d\t%s\t%s\n",
                            curFile, curStart,
                            e.getClass().toString(), e.getMessage());
                    nextLine = null;
                } else {
                    throw new IOException(String.format("%s:%d - (%s) %s",
                            curFile, curStart,
                            e.getClass().toString(), e.getMessage()));
                }
            }
            if (nextLine != null) {
                key.set(curFile + ":" + curStart + ":" + curLine);
                value.set(nextLine);
                curLine++;
                return true;
            }
            // Range exhausted; close and move on to the next input line:
            curReader.close();
            curReader = null;
        }
    }
}
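/*
 * Usage sketch (illustrative, not part of the original source): a driver can
 * opt in to skipping unreadable GZIP ranges rather than failing the task.
 * The Job setup below assumes the standard org.apache.hadoop.mapreduce.Job
 * API; the job name and the rest of the job wiring are hypothetical.
 *
 *   Configuration conf = new Configuration();
 *   GZIPRangeLineDereferencingRecordReader.setSkipBadGZIPRanges(conf, true);
 *   Job job = Job.getInstance(conf, "gzip-range-dereference");
 *   // ... set the InputFormat that produces "PATH OFFSET LENGTH" lines,
 *   // the mapper, and input/output paths, then job.waitForCompletion(true)
 */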