package org.archive.accesscontrol;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import org.archive.accesscontrol.model.Rule;
import org.archive.accesscontrol.model.RuleSet;
import org.archive.accesscontrol.robotstxt.CachingRobotClient;
import org.archive.accesscontrol.robotstxt.RobotClient;
import org.archive.net.PublicSuffixes;
import org.archive.util.ArchiveUtils;
import org.archive.util.SURT;
/**
 * The Exclusions Client provides a facade for accessing a remote or local
 * exclusions oracle.
 *
 * Rules are obtained through a {@link RuleDao}; when the matching rule's
 * policy is "robots" (and robot lookups are enabled) the decision is
 * delegated to a {@link RobotClient}.
 *
 * In future it will perform heavy caching to prevent queries about related and
 * recently-accessed pages from needing to hit the oracle.
 *
 * @author aosborne
 */
public class AccessControlClient {
    /** Policy value meaning "defer to the site's robots.txt". */
    private static final String POLICY_ROBOTS = "robots";
    /** Policy value returned when robots.txt permits access. */
    private static final String POLICY_ALLOW = "allow";
    /** Policy value returned when robots.txt forbids access. */
    private static final String POLICY_BLOCK = "block";

    protected RuleDao ruleDao;
    protected RobotClient robotClient;
    private boolean robotLookupsEnabled = true;
    private boolean robotPreparationEnabled = true;
    private String robotUserAgent = "wayback-access-control";

    /**
     * Create a client backed by the given rule source and robots.txt client.
     *
     * @param ruleDao
     *            source of access-control rule trees.
     * @param robotClient
     *            client used to evaluate "robots" policies.
     */
    public AccessControlClient(RuleDao ruleDao, RobotClient robotClient) {
        this.ruleDao = ruleDao;
        this.robotClient = robotClient;
    }

    /**
     * Create a new (caching) client to query a remote oracle.
     *
     * @param oracleUrl
     *            Base url of the oracle webapp. eg.
     *            "http://localhost:8080/exclusions-oracle/"
     */
    public AccessControlClient(String oracleUrl) {
        this(new CachingRuleDao(oracleUrl), new CachingRobotClient());
    }

    /**
     * Resolve the effective policy for a url given its matching rule.
     *
     * If robot lookups are enabled and the rule's policy is "robots", the
     * robots client is consulted and the result is "allow" or "block".
     * Otherwise the rule's own policy is returned unchanged.
     *
     * @param url
     *            URL of the requested document.
     * @param rule
     *            the matching rule, or null if none was found.
     * @return the policy to enforce.
     * @throws RobotsUnavailableException
     *             if the robots client fails with an IOException.
     * @throws RuntimeException
     *             if rule is null.
     */
    private String getPolicy(String url, Rule rule)
            throws RobotsUnavailableException {
        if (rule == null) {
            throw new RuntimeException("No applicable rule found. "
                    + "Please make sure you have a default rule set"
                    + " on the root SURT '(' in the oracle.");
        }
        if (robotLookupsEnabled && POLICY_ROBOTS.equals(rule.getPolicy())) {
            try {
                return robotClient.isRobotPermitted(url, robotUserAgent)
                        ? POLICY_ALLOW
                        : POLICY_BLOCK;
            } catch (IOException e) {
                throw new RobotsUnavailableException(e);
            }
        }
        return rule.getPolicy();
    }

    /**
     * Return the best-matching policy for the requested document.
     *
     * @param url
     *            URL of the requested document.
     * @param captureDate
     *            Date the document was archived.
     * @param retrievalDate
     *            Date of retrieval (usually now).
     * @param who
     *            Group name of the user accessing the document.
     * @return Access-control policy that should be enforced. eg "robots",
     *         "block" or "allow".
     * @throws RobotsUnavailableException
     *             if a robots.txt lookup was needed and failed.
     * @throws RuleOracleUnavailableException
     *             if the rule lookup fails.
     */
    public String getPolicy(String url, Date captureDate, Date retrievalDate,
            String who) throws RobotsUnavailableException,
            RuleOracleUnavailableException {
        return getPolicy(url, getRule(url, captureDate, retrievalDate, who));
    }

    /**
     * Return the best-matching policy for the requested document.
     *
     * @param url
     *            URL of the requested document.
     * @param captureDate
     *            Date the document was archived.
     * @param retrievalDate
     *            Date of retrieval (usually now).
     * @param groups
     *            Group names of the user accessing the document.
     * @return Access-control policy that should be enforced. eg "robots",
     *         "block" or "allow".
     * @throws RobotsUnavailableException
     *             if a robots.txt lookup was needed and failed.
     * @throws RuleOracleUnavailableException
     *             if the rule lookup fails.
     */
    public String getPolicy(String url, Date captureDate, Date retrievalDate,
            Collection<String> groups) throws RobotsUnavailableException,
            RuleOracleUnavailableException {
        return getPolicy(url, getRule(url, captureDate, retrievalDate, groups));
    }

    /**
     * Return the most specific matching rule for the requested document.
     *
     * The url is converted to a SURT, its authority is reduced to the
     * public-suffix assignment level, and the rule tree for
     * {@code scheme + "(" + publicSuffix} is fetched from the rule DAO and
     * asked for its matching rule.
     *
     * @param url
     *            URL of the requested document.
     * @param captureDate
     *            Date the document was archived.
     * @param retrievalDate
     *            Date of retrieval (usually now).
     * @param who
     *            Group name of the user accessing the document.
     * @return whatever the rule set reports as the matching rule; callers
     *         in this class treat null as "no applicable rule".
     * @throws RuleOracleUnavailableException
     *             if the rule lookup fails.
     */
    public Rule getRule(String url, Date captureDate, Date retrievalDate,
            String who) throws RuleOracleUnavailableException {
        String surt = SURT.fromURI(ArchiveUtils.addImpliedHttpIfNecessary(url));
        String publicSuffix = PublicSuffixes
                .reduceSurtToAssignmentLevel(getSurtAuthority(surt));
        RuleSet rules = ruleDao.getRuleTree(getScheme(surt) + "(" + publicSuffix);
        return rules.getMatchingRule(surt, captureDate, retrievalDate, who);
    }

    /**
     * Return the most permissive matching rule across all of the user's
     * groups.
     *
     * Each group is looked up individually and the rule whose policy string
     * sorts lowest is kept (so "allow" wins over "block", which wins over
     * "robots"). On ties the rule from the earliest group is kept. Groups
     * for which no rule matches are skipped.
     *
     * @param url
     *            URL of the requested document.
     * @param captureDate
     *            Date the document was archived.
     * @param retrievalDate
     *            Date of retrieval (usually now).
     * @param groups
     *            Group names of the user accessing the document.
     * @return the selected rule, or null if groups is empty or no group
     *         produced a matching rule.
     * @throws RuleOracleUnavailableException
     *             if a rule lookup fails.
     */
    @Deprecated
    public Rule getRule(String url, Date captureDate, Date retrievalDate,
            Collection<String> groups)
            throws RuleOracleUnavailableException {
        Rule bestRule = null;
        for (String who : groups) {
            Rule rule = getRule(url, captureDate, retrievalDate, who);
            if (rule == null) {
                // No rule for this group; it must not crash the comparison
                // below nor displace a rule found for another group.
                continue;
            }
            /* We compare policies not the rules themselves as
             * a user should have full access to something one of their
             * groups has access to, even if another group they are
             * member of does not.
             */
            if (bestRule == null
                    || rule.getPolicy().compareTo(bestRule.getPolicy()) < 0) {
                bestRule = rule;
            }
        }
        return bestRule;
    }

    /**
     * This method allows the client to prepare for lookups from a given set of
     * urls. This can warm up a cache and/or enable a mass data transfer to be
     * done in parallel.
     *
     * @param urls
     *            URLs that are expected to be looked up.
     */
    public void prepare(Collection<String> urls) {
        ArrayList<String> publicSuffixes = new ArrayList<String>(urls.size());
        for (String url : urls) {
            String surt = SURT.fromURI(ArchiveUtils.addImpliedHttpIfNecessary(url));
            // NOTE(review): getRule() queries the DAO with
            // scheme + "(" + publicSuffix, whereas only the bare public
            // suffix is passed here -- confirm RuleDao.prepare expects this.
            publicSuffixes.add(PublicSuffixes
                    .reduceSurtToAssignmentLevel(getSurtAuthority(surt)));
        }
        ruleDao.prepare(publicSuffixes);
        if (robotPreparationEnabled) {
            robotClient.prepare(urls, robotUserAgent);
        }
    }

    /**
     * Extract the authority portion of a SURT, i.e. the text between
     * "://(" and the first ")".
     *
     * @param surt
     *            SURT string to inspect.
     * @return the authority, or the input unchanged when either marker is
     *         missing or no non-empty text lies between them.
     */
    protected String getSurtAuthority(String surt) {
        int indexOfOpen = surt.indexOf("://(");
        int indexOfClose = surt.indexOf(")");
        if (indexOfOpen == -1 || indexOfClose == -1
                || ((indexOfOpen + 4) >= indexOfClose)) {
            return surt;
        }
        return surt.substring(indexOfOpen + 4, indexOfClose);
    }

    /**
     * Extract the scheme prefix (including the trailing "://") of a SURT.
     *
     * @param surt
     *            SURT string to inspect.
     * @return the prefix up to and including "://" when the first ":" in
     *         the string begins "://"; otherwise the empty string.
     */
    protected static String getScheme(String surt) {
        int i = surt.indexOf("://");
        int j = surt.indexOf(":");
        if (i >= 0 && i == j) {
            return surt.substring(0, i + 3);
        } else {
            return "";
        }
    }

    /**
     * @return the user agent passed to the robots client.
     */
    public String getRobotUserAgent() {
        return robotUserAgent;
    }

    /**
     * @param robotUserAgent
     *            the user agent to pass to the robots client.
     */
    public void setRobotUserAgent(String robotUserAgent) {
        this.robotUserAgent = robotUserAgent;
    }

    /**
     * @return true if "robots" policies are resolved via the robots client.
     */
    public boolean isRobotLookupsEnabled() {
        return robotLookupsEnabled;
    }

    /**
     * @param robotLookupsEnabled
     *            when false, a "robots" policy is returned as-is instead of
     *            being resolved to "allow" or "block".
     */
    public void setRobotLookupsEnabled(boolean robotLookupsEnabled) {
        this.robotLookupsEnabled = robotLookupsEnabled;
    }

    /**
     * @return true if {@link #prepare(Collection)} also prepares the robots
     *         client.
     */
    public boolean isRobotPreparationEnabled() {
        return robotPreparationEnabled;
    }

    /**
     * @param robotPreparationEnabled
     *            whether {@link #prepare(Collection)} should also prepare
     *            the robots client.
     */
    public void setRobotPreparationEnabled(boolean robotPreparationEnabled) {
        this.robotPreparationEnabled = robotPreparationEnabled;
    }

    /**
     * Use a proxy server when fetching robots.txt data.
     *
     * @param host
     *            proxy host name.
     * @param port
     *            proxy port.
     */
    public void setRobotProxy(String host, int port) {
        robotClient.setRobotProxy(host, port);
    }
}