package org.commoncrawl.mapred.ec2.parser;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapRunner;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.ParseOutput;
/**
 * A {@link MapRunner} specialization used by the parser job. It drives the
 * record loop itself so that it can:
 * <ul>
 *   <li>hand the reader's current progress and position to the
 *       {@link ParserMapper} before each record is mapped,</li>
 *   <li>stop consuming input as soon as the mapper reports that it was
 *       terminated early, and</li>
 *   <li>call {@code commitTask} once the loop ends without an exception, which
 *       is how a successful (possibly only partially completed) map task is
 *       reported to the Task Data Client.</li>
 * </ul>
 *
 * @author rana
 *
 */
public class ParserMapRunner extends MapRunner<Text,CrawlURL,Text,ParseOutput>{

  /**
   * Returns the mapper owned by this runner, viewed as a {@link ParserMapper}.
   * The downcast is performed on every call, exactly where it is needed.
   *
   * @return the runner's mapper as a ParserMapper
   * @throws ClassCastException if the mapper is not a ParserMapper
   */
  private ParserMapper parserMapper() {
    return (ParserMapper) getMapper();
  }

  /**
   * Feeds every record from {@code input} to the mapper until the input is
   * exhausted or the mapper signals early termination, then commits the task.
   * The mapper is always closed on the way out, whether or not an exception
   * was thrown.
   *
   * @param input    source of (key, CrawlURL) records
   * @param output   collector passed through to the mapper
   * @param reporter reporter passed through to the mapper and to commitTask
   * @throws IOException propagated from the reader, the mapper, or the commit
   */
  @Override
  public void run(RecordReader<Text, CrawlURL> input,
      OutputCollector<Text, ParseOutput> output, Reporter reporter)
      throws IOException {
    try {
      // a single key/value pair is allocated up front and recycled per record
      final Text recordKey = input.createKey();
      final CrawlURL recordValue = input.createValue();

      boolean stoppedEarly = false;
      // the flag is tested first so no further record is read after an early stop
      while (!stoppedEarly && input.next(recordKey, recordValue)) {
        // progress/position go to the mapper before it sees the record
        parserMapper().updateProgressAndPosition(input.getProgress(), input.getPos());
        getMapper().map(recordKey, recordValue, output, reporter);
        // remember whether the mapper asked us to skip the rest of the stream
        stoppedEarly = parserMapper().wasTerminatedEarly();
      }

      // reaching this point means nothing threw: report the (possibly
      // partial) success of this map task
      parserMapper().commitTask(reporter);
    } finally {
      getMapper().close();
    }
  }
}