package org.archive.wayback.accesscontrol.robotstxt;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Class which acts as an aggregation of RobotsDirectives.
*
* If given a host String, will return a list of additional robot URLs that
* need to be added to the current aggregation.
*
* Allows a user to then add new RobotsDirectives for one or more robot URLs.
*
* Finally, allows the aggregation to be queried to see if any of the
* directives block a particular path.
*
*
* @author brad
*
*/
public class RobotsDirectiveAggregation {
    private static final Logger LOGGER =
            Logger.getLogger(RobotsDirectiveAggregation.class.getName());

    private static final String HTTP_PREFIX = "http://";
    private static final String ROBOT_SUFFIX = "/robots.txt";

    /** Matches a numbered "www" prefix, e.g. the "www2." in "www2.example.com". */
    private static final String WWWN_REGEX = "^www[0-9]+\\.";
    private static final Pattern WWWN_PATTERN = Pattern.compile(WWWN_REGEX);

    /** Directives seen so far, keyed by the robots.txt URL they were loaded from. */
    private Map<String, RobotsDirectives> cache =
            new HashMap<String, RobotsDirectives>();

    /**
     * Build the robots.txt URL for the given host.
     *
     * Note: this previously appended into a shared instance-level
     * StringBuilder, which made the class unsafe for concurrent callers;
     * plain concatenation of three short strings has no shared state and
     * is equally efficient.
     *
     * @param host host name, e.g. "example.com"
     * @return "http://HOST/robots.txt"
     */
    private String hostToRobotUrlString(final String host) {
        String robotUrl = HTTP_PREFIX + host + ROBOT_SUFFIX;
        LOGGER.fine("Adding robot URL:" + robotUrl);
        return robotUrl;
    }

    /**
     * @param resultHost
     * @return a List of all robots.txt urls to attempt for this HOST:
     *     If HOST starts with "www.DOMAIN":
     *         [ http://HOST/robots.txt,
     *           http://DOMAIN/robots.txt ]
     *     If HOST starts with "www[0-9]+.DOMAIN":
     *         [ http://HOST/robots.txt,
     *           http://www.DOMAIN/robots.txt,
     *           http://DOMAIN/robots.txt ]
     *     Otherwise:
     *         [ http://HOST/robots.txt,
     *           http://www.HOST/robots.txt ]
     */
    List<String> hostToRobotUrlStrings(final String resultHost) {
        List<String> list = new ArrayList<String>();
        list.add(hostToRobotUrlString(resultHost));
        if (resultHost.startsWith("www")) {
            if (resultHost.startsWith("www.")) {
                // "www.DOMAIN" -> also try the bare "DOMAIN"
                list.add(hostToRobotUrlString(resultHost.substring(4)));
            } else {
                Matcher m = WWWN_PATTERN.matcher(resultHost);
                if (m.find()) {
                    // "www3.DOMAIN" -> also try "www.DOMAIN" and bare "DOMAIN".
                    // Hosts like "wwwfoo.com" match neither branch and get
                    // no alternates, by design.
                    String massagedHost = resultHost.substring(m.end());
                    list.add(hostToRobotUrlString("www." + massagedHost));
                    list.add(hostToRobotUrlString(massagedHost));
                }
            }
        } else {
            // bare "DOMAIN" -> also try "www.DOMAIN"
            list.add(hostToRobotUrlString("www." + resultHost));
        }
        return list;
    }

    /**
     * Determine which robots.txt URLs relevant to host are not yet in the
     * aggregation.
     *
     * @param host host to be checked
     * @return List of robots.txt URLs that must be fetched and added (via
     *         {@link #addDirectives(String, RobotsDirectives)}) before
     *         {@link #isBlocked(String)} can give a complete answer for host
     */
    public List<String> getMissingRobotUrls(String host) {
        List<String> missing = new ArrayList<String>();
        for (String need : hostToRobotUrlStrings(host)) {
            if (!cache.containsKey(need)) {
                missing.add(need);
            }
        }
        return missing;
    }

    /**
     * Add directives retrieved from a robots.txt URL to the aggregation,
     * replacing any previous entry for the same URL.
     *
     * @param url robots.txt URL the directives were loaded from
     * @param directives parsed rules for that URL
     */
    public void addDirectives(String url, RobotsDirectives directives) {
        cache.put(url, directives);
    }

    /**
     * @param path path portion of the URL being checked
     * @return true if ANY of the aggregated directives disallow the path
     */
    public boolean isBlocked(String path) {
        for (RobotsDirectives directives : cache.values()) {
            if (!directives.allows(path)) {
                return true;
            }
        }
        return false;
    }
}