package org.archive.hadoop.pig.udf;
import java.io.IOException;
import org.apache.commons.lang.BooleanUtils;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.archive.hadoop.mapreduce.CDXMapper;
/**
 * Pig UDF that re-canonicalizes either a bare URL or a whole CDX line by
 * delegating to a {@link CDXMapper}.
 *
 * <p>Accepted input tuples:
 * <ul>
 *   <li>one field without a space: passed to
 *       {@link CDXMapper#canonicalizeUrl(String)};</li>
 *   <li>one field containing a space: passed to
 *       {@link CDXMapper#convertLine(String)};</li>
 *   <li>two fields: joined with a single space and passed to
 *       {@link CDXMapper#convertLine(String)}.</li>
 * </ul>
 *
 * <p>A null tuple, or a null value in any field that would be used, yields a
 * null result instead of an exception.
 */
public class Recanonicalize extends EvalFunc<String> {

    /** Mapper that performs the actual canonicalization / line conversion. */
    CDXMapper converter;

    /** Creates a UDF whose mapper is constructed with {@code isSurt = true}. */
    public Recanonicalize()
    {
        this(true);
    }

    /**
     * Creates a UDF from a textual flag, as supplied by a Pig {@code DEFINE}
     * statement.
     *
     * @param isSurt textual boolean, parsed with
     *               {@link BooleanUtils#toBoolean(String)}
     */
    public Recanonicalize(String isSurt)
    {
        this(BooleanUtils.toBoolean(isSurt));
    }

    /**
     * Creates a UDF with an explicitly configured mapper.
     *
     * @param isSurt flag forwarded to the {@link CDXMapper} constructor
     *               (presumably selects SURT-form keys; confirm against CDXMapper)
     */
    public Recanonicalize(boolean isSurt)
    {
        converter = new CDXMapper(isSurt);
        // NOTE(review): the exact effect of these two switches is defined in
        // CDXMapper, which is not visible from this file.
        converter.setNoRedirect(true);
        converter.setSkipOnCanonFail(true);
    }

    /**
     * Re-canonicalizes the URL or CDX line carried by {@code tuple}.
     *
     * @param tuple input of one field (a URL or a full CDX line) or two fields
     *              (key and value halves of a CDX line)
     * @return the mapper's output, or {@code null} when the tuple itself or
     *         any field that would be used is null
     * @throws IOException if the tuple has a size other than 1 or 2
     */
    @Override
    public String exec(Tuple tuple) throws IOException {
        if (tuple == null || tuple.isNull()) {
            return null;
        }

        if (tuple.size() == 1) {
            String line = (String)tuple.get(0);
            // Null fields are common in Pig data; propagate null rather than
            // failing with a NullPointerException on contains().
            if (line == null) {
                return null;
            }
            // If only the url, then convert url instead of whole cdx line
            if (!line.contains(" ")) {
                return converter.canonicalizeUrl(line);
            } else {
                return converter.convertLine(line);
            }
        } else if (tuple.size() == 2) {
            String key = (String)tuple.get(0);
            String value = (String)tuple.get(1);
            // Without this guard a null half would be concatenated as the
            // literal text "null" and handed to the mapper as a bogus line.
            if (key == null || value == null) {
                return null;
            }
            return converter.convertLine(key + " " + value);
        } else {
            throw new IOException("CDX tuple must be length 1 or 2");
        }
    }
}