package org.archive.hadoop.mapreduce;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.archive.hadoop.cdx.HDFSBlockLoader;
import org.archive.hadoop.cdx.ZipNumBlockIterator;
public class GZIPMembersLineRecordReader extends RecordReader<Text, Text> {
LineRecordReader internal = null;
HDFSBlockLoader loader = null;
Configuration conf = null;
Text key;
Text value;
String context;
Iterator<String> activeBlock = null;
int lineNumber = 0;
@Override
public void close() throws IOException {
}
@Override
public Text getCurrentKey() throws IOException, InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return internal.getProgress();
}
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
internal = new LineRecordReader();
internal.initialize(split, context);
conf = context.getConfiguration();
FileSplit fileSplit = (FileSplit) split;
loader = new HDFSBlockLoader(fileSplit.getPath().getFileSystem(conf));
key = new Text();
value = new Text();
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
while(true) {
if(activeBlock == null) {
if(!internal.nextKeyValue()) {
return false;
}
String v = internal.getCurrentValue().toString();
String parts[] = v.split("\\s");
if(parts.length != 3) {
throw new IOException("Bad line:" + v);
}
long offset = 0;
int len = 0;
try {
offset = Long.parseLong(parts[1]);
len = Integer.parseInt(parts[2]);
} catch(NumberFormatException e) {
throw new IOException("Bad line:" + v);
}
context = String.format("%s:%d",parts[0],offset);
byte[] compressed = loader.readBlock(parts[0], offset, len);
activeBlock = new ZipNumBlockIterator(compressed).iterator();
}
if(activeBlock != null) {
try {
if(activeBlock.hasNext()) {
lineNumber++;
key.set(context+":"+lineNumber);
value.set(activeBlock.next());
return true;
} else {
activeBlock = null;
}
} catch(Exception e) {
throw new IOException(context,e);
}
}
}
}
}