package uk.bl.wa.hadoop.indexer;

import java.io.IOException;
import java.io.InputStream;
import java.security.NoSuchAlgorithmException;
import java.util.Properties;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.PropertyConfigurator;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.util.SurtPrefixSet;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.map.JsonMappingException;

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

import uk.bl.wa.annotation.Annotations;
import uk.bl.wa.annotation.Annotator;
import uk.bl.wa.apache.solr.hadoop.Solate;
import uk.bl.wa.hadoop.WritableArchiveRecord;
import uk.bl.wa.indexer.WARCIndexer;
import uk.bl.wa.solr.SolrRecord;
import uk.bl.wa.solr.SolrWebServer;
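/**
 * Hadoop mapper that runs a {@link WARCIndexer} over each
 * {@link WritableArchiveRecord} and emits a {@link WritableSolrRecord} keyed
 * by the target Solr shard. Written against the older
 * {@code org.apache.hadoop.mapred} API, hence the class-level deprecation
 * suppression.
 */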
@SuppressWarnings("deprecation")
public class WARCIndexerMapper extends MapReduceBase implements
        Mapper<Text, WritableArchiveRecord, IntWritable, WritableSolrRecord> {

    private static final Log LOG = LogFactory.getLog(WARCIndexerMapper.class);

    enum MyCounters {
        NUM_RECORDS, NUM_ERRORS, NUM_NULLS, NUM_EMPTY_HEADERS
    }

    private String mapTaskId;
    private String inputFile;
    private int noRecords = 0;

    private WARCIndexer windex;
    private Solate sp = null;
    private int numShards = 1;
    private Config config;

    public WARCIndexerMapper() {
        try {
            // Re-configure logging from a bundled override file, if present.
            // Guard against a missing resource, which would otherwise cause
            // a NullPointerException in Properties.load():
            InputStream ins = getClass()
                    .getResourceAsStream("/log4j-override.properties");
            if (ins != null) {
                Properties props = new Properties();
                props.load(ins);
                PropertyConfigurator.configure(props);
            } else {
                LOG.warn("No log4j-override.properties found on the classpath.");
            }
        } catch (IOException e1) {
            LOG.error("Failed to load log4j config from properties file.");
        }
    }

    @Override
    public void configure(JobConf job) {
        if (this.config == null) {
            innerConfigure(ConfigFactory.parseString(job
                    .get(WARCIndexerRunner.CONFIG_PROPERTIES)));
        }
        // Other properties:
        mapTaskId = job.get("mapred.task.id");
        inputFile = job.get("map.input.file");
        LOG.info("Got task.id " + mapTaskId + " and input.file " + inputFile);
    }

    public void innerConfigure(Config jobConfig) {
        try {
            // Get config from job property:
            config = jobConfig;

            // Initialise indexer:
            this.windex = new WARCIndexer(config);

            // Decide whether to try to apply annotations:
            boolean applyAnnotations = false;
            if (config.hasPath(WARCIndexerRunner.CONFIG_APPLY_ANNOTATIONS)) {
                applyAnnotations = config
                        .getBoolean(WARCIndexerRunner.CONFIG_APPLY_ANNOTATIONS);
            }
            if (applyAnnotations) {
                LOG.info("Attempting to load annotations from 'annotations.json'...");
                Annotations ann = Annotations.fromJsonFile("annotations.json");
                LOG.info("Attempting to load OA SURTS from 'openAccessSurts.txt'...");
                SurtPrefixSet oaSurts = Annotator
                        .loadSurtPrefix("openAccessSurts.txt");
                windex.setAnnotations(ann, oaSurts);
            }

            // Set up sharding, using SolrCloud-aware partitioning when
            // ZooKeeper connection details are available:
            numShards = config.getInt(SolrWebServer.NUM_SHARDS);
            if (config.hasPath(SolrWebServer.CONF_ZOOKEEPERS)) {
                String zkHost = config.getString(SolrWebServer.CONF_ZOOKEEPERS);
                String collection = config.getString(SolrWebServer.COLLECTION);
                sp = new Solate(zkHost, collection, numShards);
            }
        } catch (NoSuchAlgorithmException e) {
            LOG.error("WARCIndexerMapper.configure(): " + e.getMessage());
        } catch (JsonParseException e) {
            LOG.error("WARCIndexerMapper.configure(): " + e.getMessage());
        } catch (JsonMappingException e) {
            LOG.error("WARCIndexerMapper.configure(): " + e.getMessage());
        } catch (IOException e) {
            LOG.error("WARCIndexerMapper.configure(): " + e.getMessage());
        }
    }

    public WritableSolrRecord innerMap(Text key, WritableArchiveRecord value,
            Reporter reporter) throws IOException {
        ArchiveRecord rec = value.getRecord();
        ArchiveRecordHeader header = rec.getHeader();
        noRecords++;

        SolrRecord solr = new SolrRecord(key.toString(), header);
        try {
            if (!header.getHeaderFields().isEmpty()) {
                // Do the indexing:
                solr = windex.extract(key.toString(), rec);

                // If there is no result, report it:
                if (solr == null) {
                    LOG.debug("WARCIndexer returned NULL for: " + header.getUrl());
                    reporter.incrCounter(MyCounters.NUM_NULLS, 1);
                    return null;
                }

                // Increment record counter:
                reporter.incrCounter(MyCounters.NUM_RECORDS, 1);
            } else {
                // Report headerless records:
                reporter.incrCounter(MyCounters.NUM_EMPTY_HEADERS, 1);
            }
        } catch (Exception e) {
            LOG.error(e.getClass().getName() + ": " + e.getMessage() + "; "
                    + header.getUrl() + "; " + header.getOffset(), e);
            // Increment error counter:
            reporter.incrCounter(MyCounters.NUM_ERRORS, 1);
            // Store it:
            solr.addParseException(e);
        } catch (OutOfMemoryError e) {
            // Allow processing to continue if a record causes OOME:
            LOG.error("OOME " + e.getClass().getName() + ": " + e.getMessage()
                    + "; " + header.getUrl() + "; " + header.getOffset());
            // Increment error counter:
            reporter.incrCounter(MyCounters.NUM_ERRORS, 1);
            // Store it:
            solr.addParseException(e);
        }

        // Wrap up and collect the result:
        WritableSolrRecord wsolr = new WritableSolrRecord(solr);

        // Get the right key for the right partition:
        if (sp != null) {
            wsolr.setPartition(sp.getPartition(null, solr.getSolrDocument()));
        }

        // Occasionally update application-level status:
        if ((noRecords % 1000) == 0) {
            reporter.setStatus(noRecords + " processed from " + inputFile);
            // Also assure the framework that we are making progress:
            reporter.progress();
        }
        return wsolr;
    }

    @Override
    public void map(Text key, WritableArchiveRecord value,
            OutputCollector<IntWritable, WritableSolrRecord> output,
            Reporter reporter) throws IOException {
        WritableSolrRecord wsolr = this.innerMap(key, value, reporter);

        // Pass to the reduce stage if successful:
        if (wsolr != null) {
            // Get the right key for the right partition:
            IntWritable oKey = null;
            if (sp != null) {
                oKey = new IntWritable(wsolr.getPartition());
            } else {
                // Otherwise assign a shard at random. Math.random() returns a
                // double in [0, 1), so the cast keeps the key within the valid
                // range [0, numShards - 1]:
                int iKey = (int) (Math.random() * numShards);
                oKey = new IntWritable(iKey);
            }
            output.collect(oKey, wsolr);
        }
    }
}