package org.archive.hadoop.mapreduce;
import java.io.IOException;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.archive.url.URLKeyMaker;
import org.archive.url.WaybackURLKeyMaker;
public class CDXMapper extends Mapper<Object, Text, Text, Text>
implements Configurable {
private static final Logger LOG =
Logger.getLogger(CDXMapper.class.getName());
// Note the (unbelievably) new "S" for "Size in Compressed Bytes..."
public final static String NEW_CDX_HEADER =
"CDX N b a m s k r M S V g";
private static String TEXT_OUTPUT_DELIM_CONFIG = "text.output.delim";
public static int MODE_GLOBAL = 0;
public static int MODE_FULL = 1;
private Configuration conf;
private Text key = new Text();
private Text value = new Text();
private String delim = " ";
StringBuilder keySB = new StringBuilder();
StringBuilder valSB = new StringBuilder();
private boolean omitNoArchive = false;
private URLKeyMaker keyMaker = new WaybackURLKeyMaker();
public static String DEFAULT_GZ_LEN = "-";
public StringPair convert(String cdxLine) {
if(cdxLine.startsWith(" CDX ")) {
return new StringPair("", NEW_CDX_HEADER);
}
String[] parts = cdxLine.split(delim);
int offsetIdx = 8;
String metaInstructions = "-";
if(parts.length == 9) {
offsetIdx = 7;
} else if(parts.length == 10) {
metaInstructions = parts[7];
if(omitNoArchive) {
if(metaInstructions.contains("A")) {
return null;
}
}
} else {
LOG.warning("Skipping line:" + cdxLine);
return null;
}
// don't care about the old key:
// String urlKey = parts[0];
String timestamp = parts[1];
String origUrl = parts[2];
String mime = parts[3];
String responseCode = parts[4];
String digest = parts[5];
String redirect = parts[6];
String offset = parts[offsetIdx];
String filename = parts[offsetIdx+1];
String urlKey = keyMaker.makeKey(origUrl);
keySB.setLength(0);
keySB.append(urlKey).append(delim).append(timestamp);
valSB.setLength(0);
valSB.append(origUrl).append(delim);
valSB.append(mime).append(delim);
valSB.append(responseCode).append(delim);
valSB.append(digest).append(delim);
valSB.append(redirect).append(delim);
valSB.append(metaInstructions).append(delim);
valSB.append(DEFAULT_GZ_LEN).append(delim);
valSB.append(offset).append(delim);
valSB.append(filename);
return new StringPair(keySB.toString(), valSB.toString());
}
public void map(Object y, Text textLine, Context context) throws IOException,
InterruptedException {
String cdxLine = textLine.toString();
StringPair st = convert(cdxLine);
if(st != null) {
key.set(st.first);
value.set(st.second);
context.write(key, value);
}
}
public Configuration getConf() {
return conf;
}
public void setConf(Configuration conf) {
this.conf = conf;
delim = conf.get(TEXT_OUTPUT_DELIM_CONFIG, delim);
}
public class StringPair {
public String first;
public String second;
public StringPair(String first, String second) {
this.first = first;
this.second = second;
}
}
}