package org.archive.hadoop.mapreduce;

import java.io.IOException;
import java.util.logging.Logger;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper that parses space-delimited CDX index lines for a global Wayback
 * merge, filters out unwanted captures, and emits
 * {@code "urlKey timestamp"} keys with the remaining CDX fields as values.
 *
 * <p>Filtering applied, in order:
 * <ul>
 *   <li>lines with fewer than 9 or more than 10 columns are dropped;</li>
 *   <li>at most {@code cdx.daily.limit} records are emitted per
 *       (urlKey, YYYYMMDD day) pair (0 disables the limit);</li>
 *   <li>records whose robot flags contain {@code "A"} (noArchive) are
 *       dropped;</li>
 *   <li>502/504 responses captured in {@code live-20*.arc.gz} files are
 *       dropped;</li>
 *   <li>records with a non-numeric response code (except {@code warc/*}
 *       record types, which have none) or offset are dropped.</li>
 * </ul>
 *
 * <p>The digest column is abbreviated to its first 3 characters to shrink
 * the merged index.
 *
 * <p>NOT thread-safe: per-day counting state and reused Text buffers assume
 * the single-threaded Hadoop mapper lifecycle. The per-day limit also
 * assumes input arrives sorted by urlKey then timestamp — TODO(review):
 * confirm against the merge job's input ordering.
 */
public class GlobalWaybackMergeMapper extends Mapper<Object, Text, Text, Text>
		implements Configurable {

	private static final Logger LOGGER =
			Logger.getLogger(GlobalWaybackMergeMapper.class.getName());

	/** Configuration key holding the per-day capture limit. */
	private static final String DAY_LIMIT_CONFIG = "cdx.daily.limit";
	/** Default maximum number of captures emitted per (urlKey, day). */
	private static final int DEFAULT_DAY_LIMIT = 111;

	private Configuration conf;
	/** Per-day emit limit; 0 disables the limit entirely. */
	private int dayLimit;

	// Reusable output buffers (standard Hadoop object-reuse idiom).
	private Text outKey = new Text();
	private Text outValue = new Text();

	// State for per-(urlKey, day) capture counting across map() calls.
	private String lastDayUrl = null;
	private String lastDay = null;
	private int lastDayCount = 0;

	/**
	 * Sets the per-(urlKey, day) emit limit on a job configuration.
	 *
	 * @param conf  job configuration to update
	 * @param limit maximum records per url per day; 0 disables limiting
	 */
	public static void setDailyLimit(Configuration conf, int limit) {
		conf.setInt(DAY_LIMIT_CONFIG, limit);
	}

	@Override
	public Configuration getConf() {
		return conf;
	}

	@Override
	public void setConf(Configuration conf) {
		this.conf = conf;
		dayLimit = conf.getInt(DAY_LIMIT_CONFIG, DEFAULT_DAY_LIMIT);
	}

	/**
	 * Parses one CDX line, applies the filters documented on the class, and
	 * writes {@code "urlKey timestamp"} -> remaining fields.
	 *
	 * @throws IOException          on context write failure
	 * @throws InterruptedException on context write interruption
	 */
	@Override
	public void map(Object y, Text value, Context context)
			throws IOException, InterruptedException {
		String vs = value.toString();
		String parts[] = vs.split(" ");
		// 9 columns without robot flags, 10 with them.
		if ((parts.length < 9) || (parts.length > 10)) {
			LOGGER.warning("Bad input(column count): " + vs);
			return;
		}
		int idx = 0;
		String urlKey = parts[idx++];
		String timestamp = parts[idx++];
		// First 8 digits of the (normally 14-digit) timestamp are YYYYMMDD.
		String activeDay = timestamp.substring(0, Math.min(timestamp.length(), 8));

		if (dayLimit != 0 && overDayLimit(urlKey, activeDay, vs)) {
			return;
		}

		String origUrl = parts[idx++];
		String mime = parts[idx++];
		String responseCode = parts[idx++];
		String digest = parts[idx++];
		String redirect = parts[idx++];
		String robotFlags = (parts.length == 10) ? parts[idx++] : null;
		String offset = parts[idx++];
		String filename = parts[idx];

		// "A" flag == noArchive robots directive: drop the capture.
		if (robotFlags != null && robotFlags.contains("A")) {
			LOGGER.fine("Skipping noArchive-record:" + vs);
			return;
		}
		try {
			if (mime.contains("warc/")) {
				// Many warc/* record types carry no numeric response code;
				// let them through without validation.
			} else {
				int code = Integer.parseInt(responseCode);
				// Filter out live web captures that are proxy errors.
				if ((code == 502) || (code == 504)) {
					if (filename.startsWith("live-20")
							&& filename.endsWith(".arc.gz")) {
						LOGGER.finer("Skipping live web 50X:" + vs);
						return;
					}
				}
			}
		} catch (NumberFormatException e) {
			LOGGER.fine("Bad input(response code): " + vs);
			return;
		}
		try {
			// Validate only; the original string form is emitted.
			Long.parseLong(offset);
		} catch (NumberFormatException e) {
			LOGGER.warning("Bad input(offset): " + vs);
			return;
		}
		// Abbreviate the digest to 3 characters to shrink the index.
		if (digest.length() > 3) {
			digest = digest.substring(0, 3);
		}
		// Only records that are actually emitted count toward the day limit.
		lastDayCount++;
		outKey.set(String.format("%s %s", urlKey, timestamp));
		outValue.set(String.format("%s %s %s %s %s %s %s",
				origUrl, mime, responseCode, digest, redirect, offset, filename));
		context.write(outKey, outValue);
	}

	/**
	 * Updates the per-(urlKey, day) tracking state and reports whether the
	 * current record exceeds the configured daily limit.
	 *
	 * <p>Mutates {@code lastDayUrl}/{@code lastDay}/{@code lastDayCount};
	 * only call when {@code dayLimit != 0}.
	 *
	 * @param urlKey    canonicalized url key of the current record
	 * @param activeDay YYYYMMDD day of the current record
	 * @param vs        full input line, for logging
	 * @return true if this record should be dropped as over-limit
	 */
	private boolean overDayLimit(String urlKey, String activeDay, String vs) {
		if (lastDayUrl == null || !lastDayUrl.equals(urlKey)) {
			// First record seen, or a new URL: reset tracking.
			lastDayUrl = urlKey;
			lastDay = activeDay;
			lastDayCount = 0;
			return false;
		}
		if (!lastDay.equals(activeDay)) {
			// Same URL, new day: reset the counter.
			lastDay = activeDay;
			lastDayCount = 0;
			return false;
		}
		// TODO: uniqueness checking - now we are just throwing away
		// anything - would be nice to omit dupes, first.
		// Was 'lastDayCount > dayLimit', which emitted dayLimit+1 records
		// per day; '>=' enforces the configured limit exactly.
		if (lastDayCount >= dayLimit) {
			LOGGER.fine("Too many for one day:" + vs);
			return true;
		}
		return false;
	}
}