package org.archive.accesscontrol.robotstxt;

import java.io.IOException;
import java.util.Collection;

import org.apache.commons.httpclient.URIException;
import org.archive.accesscontrol.RobotsUnavailableException;
import org.archive.net.LaxURI;

/**
 * A client for checking whether a robot is allowed to access a URL according
 * to the site's robots.txt file.
 *
 * @author aosborne
 */
public abstract class RobotClient {
    /**
     * Returns true if a robot with the given user-agent is allowed to access
     * the given url.
     *
     * @param url the url the robot wants to access
     * @param userAgent the robot's user-agent string
     * @return true if access is permitted, false if it is blocked
     * @throws IOException a local problem occurred when attempting to fetch the robots.txt
     * @throws RobotsUnavailableException a remote problem: no robots.txt was found or the server is down
     */
    public boolean isRobotPermitted(String url, String userAgent)
            throws IOException, RobotsUnavailableException {
        RobotRules rules = getRulesForUrl(url, userAgent);
        return !rules.blocksPathForUA(new LaxURI(url, false).getPath(), userAgent);
    }

    /**
     * Fetch the applicable ruleset for the given url and robot.
     *
     * @param url the url the robot wants to access
     * @param userAgent the robot's user-agent string
     * @return the rules parsed from the site's robots.txt
     * @throws IOException a local problem occurred when attempting to fetch the robots.txt
     * @throws RobotsUnavailableException a remote problem: no robots.txt was found or the server is down
     */
    public abstract RobotRules getRulesForUrl(String url, String userAgent)
            throws IOException, RobotsUnavailableException;

    /**
     * Returns the robots.txt url for the site hosting the given url.
     *
     * @param url any url on the site
     * @return the url of the site's /robots.txt
     * @throws URIException if the given url cannot be parsed
     */
    public static String robotsUrlForUrl(String url) throws URIException {
        LaxURI uri = new LaxURI(url, false);
        uri.setPath("/robots.txt");
        uri.setQuery(null);
        uri.setFragment(null);
        return uri.toString();
    }

    /**
     * Prepare the cache to look up info for a given set of urls. The fetches
     * happen in parallel, so this is also a good way to speed up bulk lookups.
     *
     * This may be a no-op.
     *
     * @param urls the urls that are about to be looked up
     * @param userAgent the robot's user-agent string
     */
    public abstract void prepare(Collection<String> urls, String userAgent);

    /**
     * Use a proxy server when fetching robots.txt data.
     *
     * @param host the proxy hostname
     * @param port the proxy port
     */
    public abstract void setRobotProxy(String host, int port);
}
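
// --- Usage sketch (not part of the original source) ---
// A minimal, illustrative example of how a caller might use a RobotClient. It assumes
// the application obtains a concrete RobotClient implementation elsewhere (none is
// named here) and relies only on the imports already declared at the top of this file.
// The class and method names below (RobotClientUsageExample, checkUrl) are hypothetical.
class RobotClientUsageExample {
    /**
     * Checks a single url against its site's robots.txt and prints the result.
     * The RobotClient instance is assumed to be supplied by whatever concrete
     * implementation the application has configured.
     */
    static void checkUrl(RobotClient client, String url, String userAgent)
            throws IOException, RobotsUnavailableException, URIException {
        // The robots.txt that governs this url, e.g. "http://example.org/robots.txt".
        String robotsUrl = RobotClient.robotsUrlForUrl(url);

        // Optionally warm the client's cache before looking up; may be a no-op.
        client.prepare(java.util.Collections.singletonList(url), userAgent);

        boolean permitted = client.isRobotPermitted(url, userAgent);
        System.out.println(userAgent + (permitted ? " may" : " may not")
                + " fetch " + url + " according to " + robotsUrl);
    }
}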