package edu.umd.cloud9.collection.trecweb;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import edu.umd.cloud9.collection.IndexableFileInputFormat;
import edu.umd.cloud9.collection.WebDocument;
import edu.umd.cloud9.collection.XMLInputFormatOld;
import edu.umd.cloud9.collection.XMLInputFormat.XMLRecordReader;

/**
 * Hadoop {@code InputFormat} for TREC web collections. Keys are byte offsets into the input
 * split; values are {@link WebDocument}s (concretely, {@link TrecWebDocument}s) delimited by
 * the collection's document start and end tags.
 */
public class TrecWebDocumentInputFormat extends IndexableFileInputFormat<LongWritable, WebDocument> {

  @Override
  public RecordReader<LongWritable, WebDocument> createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException, InterruptedException {
    return new TrecWebDocumentRecordReader();
  }

  /**
   * Record reader that delegates split handling to {@link XMLRecordReader} and parses each raw
   * XML block into a reusable {@link TrecWebDocument} instance.
   */
  public static class TrecWebDocumentRecordReader extends RecordReader<LongWritable, WebDocument> {
    private final XMLRecordReader reader = new XMLRecordReader();
    private final TrecWebDocument doc = new TrecWebDocument();

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
      // Tell the underlying XML reader which tags delimit a document before initializing it.
      Configuration conf = context.getConfiguration();
      conf.set(XMLInputFormatOld.START_TAG_KEY, TrecWebDocument.XML_START_TAG);
      conf.set(XMLInputFormatOld.END_TAG_KEY, TrecWebDocument.XML_END_TAG);
      reader.initialize(split, context);
    }

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
      return reader.getCurrentKey();
    }

    @Override
    public WebDocument getCurrentValue() throws IOException, InterruptedException {
      // Parse the raw XML of the current record into the reusable document object.
      TrecWebDocument.readDocument(doc, reader.getCurrentValue().toString());
      return doc;
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
      return reader.nextKeyValue();
    }

    @Override
    public void close() throws IOException {
      reader.close();
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
      return reader.getProgress();
    }
  }
}
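
// Usage sketch (illustrative only, not part of this class): a driver might wire this input
// format into a job roughly as follows. The driver class, mapper class, and input path below
// are hypothetical placeholders.
//
//   Configuration conf = new Configuration();
//   Job job = Job.getInstance(conf, "process TREC web documents");
//   job.setJarByClass(MyTrecWebDriver.class);                         // hypothetical driver
//   job.setInputFormatClass(TrecWebDocumentInputFormat.class);
//   FileInputFormat.addInputPath(job, new Path("/path/to/trecweb"));  // hypothetical path
//   job.setMapperClass(MyTrecWebMapper.class);   // a Mapper<LongWritable, WebDocument, ?, ?>
//   job.waitForCompletion(true);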