package org.archive.hadoop.pig; import java.io.IOException; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.archive.format.gzip.zipnum.ZipNumCluster; import org.archive.format.gzip.zipnum.ZipNumParams; import org.archive.util.iterator.CloseableIterator; public class HttpZipNumDerefLineRecordReader extends RecordReader<LongWritable, Text> { protected ZipNumCluster cluster; protected ZipNumParams params; protected String clusterUri; protected String start, end; protected Text nextCdxLine; protected HttpInputLineRecordReader inner; protected CloseableIterator<String> cdxReader; public HttpZipNumDerefLineRecordReader(String clusterUri, String summaryQueryUrl, int split, int maxAggregateBlocks) throws IOException { this.inner = new HttpInputLineRecordReader(summaryQueryUrl, split); this.clusterUri = clusterUri; this.nextCdxLine = new Text(""); this.params = new ZipNumParams(); this.params.setMaxAggregateBlocks(maxAggregateBlocks); } protected String getParam(String query, String key) { int index = query.indexOf(key); if (index < 0) { return null; } int endIndex = query.indexOf('&', index + 1); if (endIndex < 0) { return query.substring(index + key.length()); } else { return query.substring(index + key.length(), endIndex); } } @Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { inner.initialize(split, context); cluster = new ZipNumCluster(clusterUri); cluster.init(); String theUrl = inner.getUrl(); String query = theUrl.substring(theUrl.indexOf('?') + 1); start = getParam(query, "start="); end = getParam(query, "end="); HttpClusterInputSplit hcis = (HttpClusterInputSplit)split; cdxReader = cluster.getCDXIterator(new RecordReaderValueIterator(inner), start, end, hcis.getSplit(), hcis.getNumSplits()); } @Override public boolean nextKeyValue() throws IOException { if (cdxReader != null && cdxReader.hasNext()) { nextCdxLine.set(cdxReader.next()); inner.incCounters(nextCdxLine.getLength() + 2); return true; } else { return false; } } @Override public Text getCurrentValue() { return nextCdxLine; } @Override public synchronized void close() throws IOException { if (cdxReader != null) { cdxReader.close(); cdxReader = null; } inner.close(); } @Override public LongWritable getCurrentKey() throws IOException, InterruptedException { // TODO Auto-generated method stub return null; } @Override public float getProgress() throws IOException, InterruptedException { // TODO Auto-generated method stub return 0; } }