package org.archive.hadoop.pig.udf;
import java.io.IOException;
import java.text.ParseException;
import java.util.Date;
import org.apache.pig.FilterFunc;
import org.apache.pig.data.Tuple;
import org.archive.accesscontrol.AccessControlClient;
import org.archive.accesscontrol.RobotsUnavailableException;
import org.archive.accesscontrol.RuleOracleUnavailableException;
import org.archive.util.ArchiveUtils;
public class AccessControlAllowCapture extends FilterFunc {
protected AccessControlClient client;
protected String accessGroup;
protected Date retrievalDate;
public final static String BLOCK = "block";
public final static String BLOCK_MESSAGE = "block-message";
public AccessControlAllowCapture(String oracleUrl, String accessGroup)
{
this.client = new AccessControlClient(oracleUrl);
this.accessGroup = accessGroup;
// not really used, so just initing once
this.retrievalDate = new Date();
}
@Override
public Boolean exec(Tuple input) throws IOException {
if (input == null || input.isNull() || (input.size() < 2)) {
return false;
}
String url = input.get(0).toString();
String date = input.get(1).toString();
Date captureDate = null;
String policy = null;
try {
captureDate = ArchiveUtils.getDate(date);
policy = client.getPolicy(ArchiveUtils.addImpliedHttpIfNecessary(url), captureDate, retrievalDate, accessGroup);
} catch (RobotsUnavailableException e) {
//should never happen here, not checking robots
throw new IOException("Oracle Failed", e);
} catch (RuleOracleUnavailableException e) {
throw new IOException("Oracle Failed", e);
} catch (ParseException e) {
throw new IOException("Date Parse Failed", e);
}
// Blocked policies are "block" and "block-message"
if (policy.equals(BLOCK) || policy.equals(BLOCK_MESSAGE)) {
return false;
}
return true;
}
}