package org.archive.hadoop.pig; import java.io.IOException; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; import org.archive.format.gzip.zipnum.ZipNumCluster; import org.archive.format.gzip.zipnum.ZipNumParams; import org.archive.util.iterator.CloseableIterator; public class ZipNumRecordReader extends RecordReader<Text, Text> { protected ZipNumCluster cluster = null; protected Text nextCdxLine; protected Text key; protected CloseableIterator<String> cdxReader; protected LineRecordReader inner; protected ZipNumParams params; @Override public Text getCurrentKey() { return key; } @Override public float getProgress() { return inner.getProgress(); } @Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException { FileSplit fileSplit = (FileSplit)split; inner = new LineRecordReader(); inner.initialize(split, context); Path summaryPath = fileSplit.getPath(); String summaryFile = summaryPath.toString(); if (summaryFile.startsWith("file:/")) { summaryFile = summaryFile.substring(5); } cluster = new ZipNumCluster(); cluster.setSummaryFile(summaryFile); cluster.init(); key = new Text(""); nextCdxLine = new Text(""); params = new ZipNumParams(); params.setMaxAggregateBlocks(0); params.setMaxBlocks(0); cdxReader = cluster.getCDXIterator(new RecordReaderValueIterator(inner), params); //cdxReader = cluster.getCDXIterator(clusterSplit.createSummaryIterator()); } @Override public boolean nextKeyValue() throws IOException { if (cdxReader != null && cdxReader.hasNext()) { String cdxLine = cdxReader.next(); int spaceIndex = cdxLine.indexOf(' '); if (spaceIndex >= 0) { key.set(cdxLine.substring(0, spaceIndex)); } else { key.set(cdxLine); } nextCdxLine.set(cdxLine); return true; } else { return false; } } @Override public Text getCurrentValue() { return nextCdxLine; } @Override public synchronized void close() throws IOException { if (cdxReader != null) { cdxReader.close(); cdxReader = null; } inner.close(); } public void seekNear(String key) { try { if (cdxReader != null) { cdxReader.close(); cdxReader = null; } cdxReader = cluster.getCDXIterator(key, null); } catch (IOException e) { e.printStackTrace(); } } }