package uk.bl.wa.hadoop.indexer.mdx;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.PropertyConfigurator;
import org.apache.solr.common.SolrInputField;
import org.json.JSONException;

import uk.bl.wa.hadoop.WritableArchiveRecord;
import uk.bl.wa.hadoop.indexer.WARCIndexerMapper;
import uk.bl.wa.hadoop.indexer.WritableSolrRecord;
import uk.bl.wa.hadoop.mapreduce.mdx.MDX;
import uk.bl.wa.solr.SolrFields;
import uk.bl.wa.solr.SolrRecord;

/**
 * Mapper that runs each archive record through the main indexing code
 * ({@link WARCIndexerMapper}) and emits the result as an {@link MDX} string,
 * keyed by the record's hash.
 */
@SuppressWarnings( { "deprecation" } )
public class WARCMDXMapper extends MapReduceBase implements
        Mapper<Text, WritableArchiveRecord, Text, Text> {

    private static final Log LOG = LogFactory.getLog( WARCMDXMapper.class );

    /** Classpath location of the log4j configuration applied at construction. */
    private static final String LOG4J_OVERRIDE = "/log4j-override.properties";

    /** Delegate that performs the actual indexing; created in {@link #configure(JobConf)}. */
    private WARCIndexerMapper wim;

    /**
     * Creates the mapper and re-configures log4j from
     * {@code /log4j-override.properties} when that resource is on the
     * classpath. A missing or unreadable resource is logged and otherwise
     * ignored, leaving the existing logging configuration in place.
     */
    public WARCMDXMapper() {
        // Re-configure logging:
        InputStream in = getClass().getResourceAsStream(LOG4J_OVERRIDE);
        if (in == null) {
            // getResourceAsStream returns null for a missing resource;
            // Properties.load(null) would throw a NullPointerException.
            LOG.warn("No " + LOG4J_OVERRIDE
                    + " found on classpath; keeping existing logging configuration.");
            return;
        }
        try {
            Properties props = new Properties();
            props.load(in);
            PropertyConfigurator.configure(props);
        } catch (IOException e) {
            LOG.error("Failed to load log4j config from properties file.", e);
        } finally {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing the stream fails.
            }
        }
    }

    /**
     * Creates and configures the delegate indexer the first time this is
     * called; later calls leave the existing delegate untouched.
     *
     * @param job the job configuration passed on to the delegate
     */
    @Override
    public void configure(JobConf job) {
        if (wim == null) {
            wim = new WARCIndexerMapper();
            wim.configure(job);
        }
    }

    /**
     * Indexes one archive record and emits it as an MDX string keyed by its
     * hash. Records the indexer skips (a {@code null} result) produce no
     * output. A record that cannot be converted to MDX is logged and dropped.
     *
     * @param key      the input key, passed through to the indexer
     * @param value    the archive record to index
     * @param output   collector receiving the (hash, MDX string) pair
     * @param reporter progress reporter, passed through to the indexer
     * @throws IOException if the indexer or the collector fails
     */
    @Override
    public void map(Text key, WritableArchiveRecord value,
            OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {

        // Use the main indexing code:
        WritableSolrRecord wsolr = wim.innerMap(key, value, reporter);

        // Ignore skipped records, where wsolr will be NULL:
        if (wsolr == null) {
            return;
        }

        SolrRecord solr = wsolr.getSolrRecord();
        try {
            // Wrap up the result:
            MDX mdx = fromWritableSolrRecord(solr);

            // Wrap up the key:
            Text oKey = new Text(mdx.getHash());
            // Alternative key, based on record type + url + timestamp
            // Text oKey = new Text(mdx.getUrl() + "\t" + mdx.getTs() + "\t"
            // + mdx.getRecordType());

            // Collect
            Text result = new Text(mdx.toString());
            output.collect(oKey, result);
        } catch (JSONException e) {
            // Report through the logger (with the cause) instead of stderr.
            LOG.error("Failed to convert record to MDX for key " + key, e);
        }
    }

    /**
     * Builds an {@link MDX} from a Solr record.
     *
     * The hash, URL, timestamp and record type are taken from the
     * corresponding Solr fields, falling back to the string {@code "unset"}
     * when a field has no value. Every field of the Solr document is then
     * copied across: fields with more than one value as a list of their
     * string forms, all others as their first value.
     *
     * @param solr the Solr record to convert
     * @return the populated MDX
     * @throws JSONException if a value cannot be stored in the MDX
     */
    public static MDX fromWritableSolrRecord(SolrRecord solr)
            throws JSONException {
        MDX m = new MDX();
        m.setHash(stringValueOrUnset(solr.getFieldValue(SolrFields.HASH)));
        m.setUrl(stringValueOrUnset(solr.getFieldValue(SolrFields.SOLR_URL)));
        m.setTs(stringValueOrUnset(
                solr.getFieldValue(SolrFields.WAYBACK_DATE)));
        m.setRecordType(stringValueOrUnset(
                solr.getFieldValue(SolrFields.SOLR_RECORD_TYPE)));

        // Pass though Solr fields:
        for (String f : solr.getSolrDocument().getFieldNames()) {
            SolrInputField v = solr.getSolrDocument().get(f);
            if (v.getValueCount() > 1) {
                List<String> vals = new ArrayList<String>();
                for (Object val : v.getValues()) {
                    vals.add(val.toString());
                }
                m.put(f, vals);
            } else {
                m.put(f, v.getFirstValue());
            }
        }

        return m;
    }

    /**
     * Returns the string form of {@code val}, or {@code "unset"} when it is
     * {@code null}.
     */
    private static String stringValueOrUnset(Object val) {
        if (val == null) {
            return "unset";
        }
        return val.toString();
    }

}