package uk.bl.wa.hadoop.mapreduce.nlp;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.PropertyConfigurator;

import uk.bl.wa.hadoop.WritableArchiveRecord;
import uk.bl.wa.hadoop.indexer.WARCIndexerMapper;
import uk.bl.wa.hadoop.indexer.WritableSolrRecord;
import uk.bl.wa.hadoop.mapreduce.mdx.MDX;
import uk.bl.wa.hadoop.mapreduce.mdx.MDXWritable;
import uk.bl.wa.solr.SolrRecord;

/**
 * Hadoop mapper that passes each archive record through a
 * {@link WARCIndexerMapper} and emits the result as an {@link MDXWritable}
 * keyed by the hash reported by the {@link MDX} record.
 */
@SuppressWarnings( { "deprecation" } )
public class WARCSentenceMapper extends MapReduceBase implements
        Mapper<Text, WritableArchiveRecord, Text, MDXWritable> {

    private static final Log LOG = LogFactory.getLog(WARCSentenceMapper.class);

    /** Classpath location of the log4j properties applied at construction. */
    private static final String LOG4J_OVERRIDE_RESOURCE = "/log4j-override.properties";

    /** Indexer that does the per-record work; created in {@link #configure(JobConf)}. */
    private WARCIndexerMapper wim;

    /**
     * Creates the mapper and attempts to re-configure log4j from the
     * override properties resource. A failure to do so is logged and
     * otherwise ignored, so the mapper is still usable.
     */
    public WARCSentenceMapper() {
        reconfigureLogging();
    }

    /**
     * Loads the log4j override properties from the classpath and applies them.
     * A missing resource or a read failure is logged rather than propagated.
     */
    private void reconfigureLogging() {
        InputStream in = getClass().getResourceAsStream(LOG4J_OVERRIDE_RESOURCE);
        if (in == null) {
            // getResourceAsStream signals "not found" with null; passing that
            // on to Properties.load would throw a NullPointerException.
            LOG.error("Failed to load log4j config from properties file: "
                    + LOG4J_OVERRIDE_RESOURCE + " not found on classpath.");
            return;
        }
        try {
            Properties props = new Properties();
            props.load(in);
            PropertyConfigurator.configure(props);
        } catch (IOException e) {
            LOG.error("Failed to load log4j config from properties file.", e);
        } finally {
            // Properties.load leaves the stream open, so close it here.
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing the stream fails.
            }
        }
    }

    /**
     * Creates and configures the wrapped {@link WARCIndexerMapper} the first
     * time this is called; later calls leave the existing instance untouched.
     *
     * @param job the job configuration handed on to the wrapped indexer
     */
    @Override
    public void configure(JobConf job) {
        if (wim == null) {
            wim = new WARCIndexerMapper();
            wim.configure(job);
        }
    }

    /**
     * Runs one archive record through the wrapped indexer and, unless the
     * record was skipped, collects it as an {@link MDXWritable} keyed by the
     * MDX hash.
     *
     * @param key      the input key, passed through to the indexer
     * @param value    the archive record to process
     * @param output   collector receiving the (hash, MDX) pair
     * @param reporter progress reporter, passed through to the indexer
     * @throws IOException declared by the {@link Mapper} interface
     */
    @Override
    public void map(Text key, WritableArchiveRecord value,
            OutputCollector<Text, MDXWritable> output, Reporter reporter)
            throws IOException {
        // Use the main indexing code:
        WritableSolrRecord wsolr = wim.innerMap(key, value, reporter);

        // Ignore skipped records, where wsolr will be null:
        if (wsolr != null) {
            SolrRecord solr = wsolr.getSolrRecord();

            // Wrap up the result:
            MDX mdx = MDX.fromWritableSolrRecord(solr);

            // Wrap up the key:
            Text oKey = new Text(mdx.getHash());
            // Alternative key, based on record type + url + timestamp:
            // Text oKey = new Text(mdx.getUrl() + "\t" + mdx.getTs() + "\t"
            //         + mdx.getRecordType());

            // Collect
            MDXWritable result = new MDXWritable(mdx);
            output.collect(oKey, result);
        }
    }
}