package uk.bl.wa.hadoop.recrawl; import static org.archive.format.warc.WARCConstants.HEADER_KEY_PAYLOAD_DIGEST; import static org.archive.format.warc.WARCConstants.HEADER_KEY_TYPE; import static org.archive.modules.CoreAttributeConstants.A_FETCH_BEGAN_TIME; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_CONTENT_DIGEST; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_ETAG_HEADER; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_FETCH_HISTORY; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_LAST_MODIFIED_HEADER; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_REFERENCE_LENGTH; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_STATUS; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WRITE_TAG; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.Serializable; import java.text.SimpleDateFormat; import java.util.Date; import java.util.HashMap; import org.apache.commons.codec.binary.Base64; import org.apache.commons.httpclient.HttpParser; import org.apache.commons.lang.SerializationUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; import org.archive.format.warc.WARCConstants.WARCRecordType; import org.archive.io.ArchiveRecord; import org.archive.io.ArchiveRecordHeader; import org.archive.modules.CrawlURI; import org.archive.modules.extractor.LinkContext; import org.archive.net.UURI; import org.archive.net.UURIFactory; import org.archive.util.SURT; import uk.bl.wa.hadoop.WritableArchiveRecord; /** * Generates persistlog-like data from a series of WARC records. Relies heavily * on the response->request->metadata sequence of records generated by Heritrix. * * @author rcoram * */ @SuppressWarnings({ "unchecked", "deprecation" }) public class PersistLogMapper extends MapReduceBase implements Mapper<Text, WritableArchiveRecord, Text, Text> { private static final Log LOGGER = LogFactory.getLog(PersistLogMapper.class); ArchiveRecordHeader responseHeader = null; CrawlURI curi = null; BufferedReader input = null; HashMap<String, String> map = null; String line; String[] data; HashMap<String, Object> latestFetch = null; SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); @Override public void map(Text key, WritableArchiveRecord value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { ArchiveRecord record = value.getRecord(); String type = (String) record.getHeader().getHeaderValue( HEADER_KEY_TYPE); try { if (type.equals(WARCRecordType.response.toString()) || type.equals(WARCRecordType.revisit.toString())) { responseHeader = record.getHeader(); latestFetch = new HashMap<String, Object>(); if (responseHeader.getUrl().startsWith("http")) { String statusLine = HttpParser.readLine(record, "UTF-8"); if (statusLine != null && statusLine.startsWith("HTTP")) { int status = Integer .parseInt(statusLine.split("\\s+")[1].trim()); latestFetch.put(A_STATUS, status); } } } else { if (type.equals(WARCRecordType.metadata.toString())) { // We MUST have hit a response/revisit which matches this // metadata record. if (responseHeader == null || !record.getHeader().getUrl() .equals(responseHeader.getUrl()) || latestFetch == null || latestFetch.size() == 0) { LOGGER.warn("Found metadata:" + record.getHeader().getUrl() + " without response."); return; } // Store the key:value pairs... map = new HashMap<String, String>(); input = new BufferedReader(new InputStreamReader( value.getPayloadAsStream())); while ((line = input.readLine()) != null) { data = line.split(": ", 2); if (data != null && data.length == 2) map.put(data[0].toLowerCase(), data[1]); } UURI uuri = UURIFactory .getInstance(responseHeader.getUrl()); String pathFromSeed = map.get("hopsFromSeed"); UURI via = null; if (map.get("via") != null) via = UURIFactory.getInstance(map.get("via")); LinkContext viaContext = LinkContext.NAVLINK_MISC; curi = new CrawlURI(uuri, pathFromSeed, via, viaContext); curi.addPersistentDataMapKey(A_FETCH_HISTORY); Date date = format.parse(responseHeader.getDate()); latestFetch.put(A_FETCH_BEGAN_TIME, date.getTime()); latestFetch.put(A_CONTENT_DIGEST, responseHeader .getHeaderValue(HEADER_KEY_PAYLOAD_DIGEST)); latestFetch.put(A_WRITE_TAG, responseHeader.getReaderIdentifier()); latestFetch.put(A_REFERENCE_LENGTH, responseHeader.getLength()); if (map.get(A_ETAG_HEADER) != null) latestFetch.put(A_ETAG_HEADER, map.get(A_ETAG_HEADER)); if (map.get(A_LAST_MODIFIED_HEADER) != null) latestFetch.put(A_LAST_MODIFIED_HEADER, map.get(A_LAST_MODIFIED_HEADER)); HashMap<String, Object>[] history = new HashMap[] { latestFetch }; curi.getData().put(A_FETCH_HISTORY, history); String surt = SURT.fromURI(curi.getUURI().getEscapedURI(), true); String persist = new String( Base64.encodeBase64(SerializationUtils .serialize((Serializable) curi .getPersistentDataMap()))); output.collect(new Text(surt), new Text(surt + " " + persist)); responseHeader = null; latestFetch = null; } } } catch (Exception e) { e.printStackTrace(); } } }