package org.archive.accesscontrol.robotstxt;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.logging.Logger;

import org.archive.accesscontrol.LruCache;
import org.archive.accesscontrol.RobotsUnavailableException;

/**
 * The CachingRobotClient wraps another RobotClient and caches the rules it
 * fetches in an LRU cache.
 *
 * @author aosborne
 */
public class CachingRobotClient extends RobotClient {
    private static final Logger LOGGER = Logger.getLogger(
            CachingRobotClient.class.getName());

    protected LruCache<String, RobotRules> cache = new LruCache<String, RobotRules>();
    protected RobotClient client;

    private static final int PREPARE_THREAD_COUNT = 15;

    public RobotClient getClient() {
        return client;
    }

    public void setClient(RobotClient client) {
        this.client = client;
    }

    public CachingRobotClient() {
        this.client = new HttpRobotClient();
    }

    public CachingRobotClient(RobotClient client) {
        this.client = client;
    }

    @Override
    public RobotRules getRulesForUrl(String url, String userAgent)
            throws IOException, RobotsUnavailableException {
        String robotsUrl = robotsUrlForUrl(url);
        RobotRules rules;

        synchronized (cache) {
            rules = cache.get(robotsUrl);
        }

        if (rules == null) {
            // Cache miss: fetch via the wrapped client. Concurrent callers may
            // fetch the same robots.txt more than once; the last result wins.
            rules = client.getRulesForUrl(url, userAgent);
            synchronized (cache) {
                cache.put(robotsUrl, rules);
            }
        }
        return rules;
    }

    public LruCache<String, RobotRules> getCache() {
        return cache;
    }

    class FetchThread extends Thread {
        private List<String> urls;
        private String userAgent;

        public FetchThread(List<String> urls, String userAgent) {
            this.urls = urls;
            this.userAgent = userAgent;
        }

        @Override
        public void run() {
            while (true) {
                String url;
                // The work list is shared between fetch threads, so pop under a lock.
                synchronized (urls) {
                    if (urls.isEmpty()) {
                        break;
                    }
                    url = urls.remove(0);
                }
                try {
                    getRulesForUrl(url, userAgent);
                } catch (IOException e) {
                    LOGGER.warning("Error fetching robots.txt for " + url + ": " + e);
                } catch (RobotsUnavailableException e) {
                    LOGGER.warning("Robots.txt unavailable for " + url + ": " + e);
                }
            }
        }
    }

    /**
     * Prepare the cache by looking up rules for a given set of URLs. The
     * fetches happen in parallel, so this is also a good way to speed up bulk
     * lookups.
     */
    public void prepare(Collection<String> urls, String userAgent) {
        List<String> safeUrls = new ArrayList<String>(urls);
        FetchThread[] threads = new FetchThread[PREPARE_THREAD_COUNT];

        for (int i = 0; i < PREPARE_THREAD_COUNT; i++) {
            threads[i] = new FetchThread(safeUrls, userAgent);
            threads[i].start();
        }
        for (int i = 0; i < PREPARE_THREAD_COUNT; i++) {
            try {
                threads[i].join();
            } catch (InterruptedException e) {
                // Preserve the interrupt flag so callers can observe it.
                Thread.currentThread().interrupt();
            }
        }
    }

    @Override
    public void setRobotProxy(String host, int port) {
        client.setRobotProxy(host, port);
    }
}
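
// Usage sketch (illustrative, not part of the original source): wrap an
// HttpRobotClient in a CachingRobotClient, pre-warm the cache with prepare(),
// then serve individual lookups from the cache. The URLs and user-agent
// string below are hypothetical.
//
//     CachingRobotClient robots = new CachingRobotClient(new HttpRobotClient());
//     List<String> urls = Arrays.asList(
//             "http://example.org/page1", "http://example.org/page2");
//     robots.prepare(urls, "example-agent");
//     RobotRules rules = robots.getRulesForUrl("http://example.org/page1", "example-agent");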