package org.archive.hadoop.mapreduce;

import java.io.IOException;
import java.util.logging.Logger;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.archive.util.StringFieldExtractor;
import org.archive.util.StringFieldExtractor.StringTuple;

/**
 * Reducer which emits merged CDX lines, enforcing a per-URL daily capture
 * limit and filtering out malformed records, captures blocked by robots,
 * and live-web captures of 502/504 gateway errors.
 */
public class GlobalWaybackCDXReducer extends Reducer<Text, Text, Text, Text>
        implements Configurable {

    private static final Logger LOGGER =
            Logger.getLogger(GlobalWaybackCDXReducer.class.getName());

    private Configuration conf;

    private static final int DEFAULT_DAY_LIMIT = 111;
    private static final String DAY_LIMIT_CONFIG = "cdx.daily.limit";
    private static final char DELIMITER = ' ';
    private static final int DATE_FIELD = 2;

    private int dayLimit;
    private StringFieldExtractor sfe;

    // Running state carried across reduce() calls, counting captures per
    // (url, day). Keys arrive at the reducer sorted, so a counter suffices.
    private String lastDayUrl = null;
    private String lastDay = null;
    private int lastDayCount = 0;

    public static void setDailyLimit(Configuration conf, int limit) {
        conf.setInt(DAY_LIMIT_CONFIG, limit);
    }

    /**
     * Returns the day portion (yyyyMMdd) of a 14-digit CDX timestamp.
     */
    private static String dayPart(final String timestamp) {
        if (timestamp == null) {
            return null;
        }
        return timestamp.substring(0, Math.min(timestamp.length(), 8));
    }

    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        if (dayLimit != 0) {
            // Split the key ("url timestamp") into its url and timestamp parts:
            String ks = key.toString();
            StringTuple st = sfe.split(ks);
            String activeDay = dayPart(st.second);
            String url = st.first;
            if (lastDayUrl == null) {
                // First key seen by this reducer:
                lastDayUrl = url;
                lastDay = activeDay;
                lastDayCount = 0;
            } else if (lastDayUrl.equals(url)) {
                // On the same url - is it the same day?
                if (lastDay.equals(activeDay)) {
                    // Same day: leave the counter alone.
                    // TODO: uniqueness checking - for now we just throw away
                    // everything over the limit; it would be nice to omit
                    // dupes first.
                } else {
                    // A new day:
                    lastDay = activeDay;
                    lastDayCount = 0;
                }
            } else {
                // A new URL:
                lastDayUrl = url;
                lastDay = activeDay;
                lastDayCount = 0;
            }
        }
        for (Text value : values) {
            if (dayLimit != 0 && lastDayCount > dayLimit) {
                // Too many captures for this day - skip the rest.
                // (When dayLimit is 0, the per-day limit is disabled.)
                break;
            }
            String vs = value.toString();
            String[] parts = vs.split(" ");
            // Expect 7 fields, or 8 when robot flags are present:
            if ((parts.length < 7) || (parts.length > 8)) {
                LOGGER.warning("Bad input(column count): " + key.toString()
                        + " " + vs);
                continue;
            }
            int idx = 0;
            String origUrl = parts[idx++];
            String mime = parts[idx++];
            String responseCode = parts[idx++];
            String digest = parts[idx++];
            String redirect = parts[idx++];
            String robotFlags = null;
            if (parts.length == 8) {
                robotFlags = parts[idx++];
            }
            String offset = parts[idx++];
            String filename = parts[idx];

            // Now - do we output? Skip anything carrying the robots "A" flag:
            if ((robotFlags != null) && robotFlags.contains("A")) {
                continue;
            }
            try {
                int code = Integer.parseInt(responseCode);
                // Filter out live-web captures that are 502/504 gateway errors:
                if ((code == 502) || (code == 504)) {
                    if (filename.startsWith("live-20")
                            && filename.endsWith(".arc.gz")) {
                        // discard:
                        continue;
                    }
                }
            } catch (NumberFormatException e) {
                LOGGER.warning("Bad input(response code): " + key.toString()
                        + " " + vs);
                continue;
            }
            try {
                Long.parseLong(offset);
            } catch (NumberFormatException e) {
                LOGGER.warning("Bad input(offset): " + key.toString()
                        + " " + vs);
                continue;
            }
            // Truncate the digest to its first 3 characters:
            if (digest.length() > 3) {
                digest = digest.substring(0, 3);
            }
            lastDayCount++;
            value.set(String.format("%s %s %s %s %s %s %s", origUrl, mime,
                    responseCode, digest, redirect, offset, filename));
            context.write(key, value);
        }
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
        dayLimit = conf.getInt(DAY_LIMIT_CONFIG, DEFAULT_DAY_LIMIT);
        sfe = new StringFieldExtractor(DELIMITER, DATE_FIELD);
    }

    @Override
    public Configuration getConf() {
        return conf;
    }
}
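
// ---------------------------------------------------------------------------
// A minimal driver sketch (not part of the original source) showing how this
// reducer could be wired into a job. The class name, job name, and the
// example limit of 50 are hypothetical placeholders; only setDailyLimit()
// and the reducer itself come from the code above. Job is referenced by its
// fully qualified name so the import list above stays untouched.
// ---------------------------------------------------------------------------
class GlobalWaybackCDXJobSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Cap the number of captures emitted per URL per day; 0 disables it.
        GlobalWaybackCDXReducer.setDailyLimit(conf, 50);

        org.apache.hadoop.mapreduce.Job job = org.apache.hadoop.mapreduce.Job
                .getInstance(conf, "global-wayback-cdx-sketch");
        job.setJarByClass(GlobalWaybackCDXReducer.class);
        job.setReducerClass(GlobalWaybackCDXReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Mapper, input format, and input/output paths are omitted here -
        // they depend on where the sorted CDX lines come from.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}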