package org.archive.hadoop.streaming; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.LineRecordReader; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.lib.CombineFileSplit; import org.archive.format.gzip.zipnum.ZipNumIndex; import org.archive.format.gzip.zipnum.ZipNumParams; import org.archive.util.iterator.CloseableIterator; public class ZipNumRecordReader implements RecordReader<Text, Text> { protected ZipNumIndex cluster = null; protected CloseableIterator<String> cdxReader; protected LineRecordReader inner; protected ZipNumParams params; public ZipNumRecordReader(CombineFileSplit combineSplit, Configuration conf, Reporter reporter, Integer index) throws IOException { Path path = combineSplit.getPath(index); long start = combineSplit.getOffset(index); long length = combineSplit.getLength(index); String[] locs = combineSplit.getLocations(); init(conf, new FileSplit(path, start, length, locs)); } public ZipNumRecordReader(JobConf job, FileSplit fileSplit) throws IOException { init(job, fileSplit); } protected void init(Configuration conf, FileSplit fileSplit) throws IOException { inner = new LineRecordReader(conf, fileSplit); String summaryFile = conf.get("conf.zipnum.locationPath", null); if (summaryFile == null) { Path summaryPath = fileSplit.getPath(); summaryFile = summaryPath.toString(); } if (!summaryFile.contains(":/")) { summaryFile = conf.get(FileSystem.FS_DEFAULT_NAME_KEY, "") + summaryFile; } cluster = ZipNumIndex.createIndexWithSummaryPath(summaryFile); params = new ZipNumParams(); params.setMaxAggregateBlocks(conf.getInt("conf.zipnum.maxAggBlocks", 3000)); params.setMaxBlocks(0); cdxReader = cluster.getCDXIterator(new RecordReaderValueIterator(inner), params); } @Override public float getProgress() { return inner.getProgress(); } @Override public synchronized boolean next(Text key, Text value) throws IOException { if (cdxReader != null && cdxReader.hasNext()) { String cdxLine = cdxReader.next(); key.set(cdxLine); value.set(""); return true; } else { return false; } } @Override public synchronized void close() throws IOException { if (cdxReader != null) { cdxReader.close(); cdxReader = null; } inner.close(); } public void seekNear(String key) { try { if (cdxReader != null) { cdxReader.close(); cdxReader = null; } cdxReader = cluster.getCDXIterator(key, null); } catch (IOException e) { e.printStackTrace(); } } @Override public Text createKey() { return new Text(); } @Override public Text createValue() { return inner.createValue(); } @Override public long getPos() throws IOException { return inner.getPos(); } }