/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.accesscontrol.robotstxt;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.Resource;
import org.archive.wayback.exception.LiveDocumentNotAvailableException;
import org.archive.wayback.exception.LiveWebCacheUnavailableException;
import org.archive.wayback.exception.LiveWebTimeoutException;
import org.archive.wayback.liveweb.LiveWebCache;
import org.archive.wayback.resourceindex.filters.ExclusionFilter;
import org.archive.wayback.util.ObjectFilter;
import org.archive.wayback.util.url.UrlOperations;
import org.archive.wayback.webapp.PerformanceLogger;
/**
* CaptureSearchResult Filter that uses a LiveWebCache to retrieve robots.txt
* documents from the live web, and filters SearchResults based on the rules
* therein.
*
* This class caches parsed RobotRules that are retrieved, so using the same
* instance to filter multiple SearchResults from the same host will be more
* efficient.
*
 * Instances are expected to be transient, created per request: the
 * internally cached StringBuilder is not thread-safe.
*
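 * A minimal usage sketch (the cache, result, and user agent values below
 * are illustrative assumptions, not part of this class):
 * <pre>
 * // assumes a configured LiveWebCache and a CaptureSearchResult in hand
 * LiveWebCache cache = ...;
 * CaptureSearchResult result = ...;
 * long oneDayMS = 24L * 60 * 60 * 1000; // treat cached robots.txt as valid for a day
 * RobotExclusionFilter filter =
 *     new RobotExclusionFilter(cache, "ia_archiver", oneDayMS);
 * // returns ObjectFilter.FILTER_INCLUDE, FILTER_EXCLUDE, or FILTER_ABORT
 * int decision = filter.filterObject(result);
 * </pre>
 *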
* @author brad
* @version $Date$, $Revision$
*/
public class RobotExclusionFilter extends ExclusionFilter {
private final static Logger LOGGER =
Logger.getLogger(RobotExclusionFilter.class.getName());
protected final static String HTTP_PREFIX = "http://";
protected final static String ROBOT_SUFFIX = "/robots.txt";
protected final static String WWWN_REGEX = "^www[0-9]+\\.";
protected final static Pattern WWWN_PATTERN = Pattern.compile(WWWN_REGEX);
private LiveWebCache webCache = null;
private HashMap<String,RobotRules> rulesCache = null;
private long maxCacheMS = 0;
private String userAgent = null;
protected StringBuilder sb = null;
private final static RobotRules emptyRules = new RobotRules();
private boolean notifiedSeen = false;
private boolean notifiedPassed = false;
/**
* Construct a new RobotExclusionFilter that uses webCache to pull
 * robots.txt documents. Filtering is based on userAgent, and cached
* documents newer than maxCacheMS in the webCache are considered valid.
*
* @param webCache LiveWebCache from which documents can be retrieved
* @param userAgent String user agent to use for requests to the live web.
* @param maxCacheMS long number of milliseconds to cache documents in the
* LiveWebCache
*/
public RobotExclusionFilter(LiveWebCache webCache, String userAgent,
long maxCacheMS) {
rulesCache = new HashMap<String,RobotRules>();
this.webCache = webCache;
this.userAgent = userAgent;
this.maxCacheMS = maxCacheMS;
sb = new StringBuilder(100);
}
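/**
 * Build the robots.txt URL String for a host: for example, the host
 * "example.com" maps to "http://example.com/robots.txt".
 *
 * @param host hostname to build a robots.txt URL for
 * @return the robots.txt URL String for host
 */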
protected String hostToRobotUrlString(String host) {
sb.setLength(0);
sb.append(HTTP_PREFIX).append(host).append(ROBOT_SUFFIX);
String robotUrl = sb.toString();
LOGGER.fine("Adding robot URL:" + robotUrl);
return robotUrl;
}
/**
 * Return a List of all robots.txt URLs to attempt for this HOST:
 *
 * If HOST starts with "www.DOMAIN":
 *   [ http://HOST/robots.txt,
 *     http://DOMAIN/robots.txt ]
 *
 * If HOST starts with "www[0-9]+.DOMAIN":
 *   [ http://HOST/robots.txt,
 *     http://www.DOMAIN/robots.txt,
 *     http://DOMAIN/robots.txt ]
 *
 * Otherwise:
 *   [ http://HOST/robots.txt,
 *     http://www.HOST/robots.txt ]
 *
 * @param resultHost the hostname of the search result
 * @return List of robots.txt URL Strings to try, in order
 */
protected List<String> searchResultToRobotUrlStrings(String resultHost) {
ArrayList<String> list = new ArrayList<String>();
list.add(hostToRobotUrlString(resultHost));
if(resultHost.startsWith("www")) {
if(resultHost.startsWith("www.")) {
list.add(hostToRobotUrlString(resultHost.substring(4)));
} else {
Matcher m = WWWN_PATTERN.matcher(resultHost);
if(m.find()) {
String massagedHost = resultHost.substring(m.end());
list.add(hostToRobotUrlString("www." + massagedHost));
list.add(hostToRobotUrlString(massagedHost));
}
}
} else {
list.add(hostToRobotUrlString("www." + resultHost));
}
return list;
}
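/**
 * Look up (or fetch and cache) the RobotRules governing the host of the
 * given result. Returns empty (allow-all) rules when no robots.txt could
 * be found, and null on errors: a bad host or URL, an IOException, or a
 * live web cache that is unavailable or timed out (which also notifies
 * the filterGroup, when present).
 */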
private RobotRules getRules(CaptureSearchResult result) {
RobotRules rules = null;
RobotRules tmpRules = null;
String host;
try {
host = result.getOriginalHost();
} catch(Exception e) {
LOGGER.warning("ROBOT: Failed to get host from("+result.getOriginalUrl()+")");
return null;
}
List<String> urlStrings = searchResultToRobotUrlStrings(host);
Iterator<String> itr = urlStrings.iterator();
String firstUrlString = null;
// Loop through them all. As soon as we get a response, store it in
// the cache under the FIRST url we tried and return it.
// If we get no response for any of the robots.txt URLs, use "empty"
// (allow-all) rules, and record those in the cache too.
while(rules == null && itr.hasNext()) {
String urlString = itr.next();
if(firstUrlString == null) {
firstUrlString = urlString;
}
if(rulesCache.containsKey(urlString)) {
LOGGER.fine("ROBOT: Cached("+urlString+")");
rules = rulesCache.get(urlString);
if(!urlString.equals(firstUrlString)) {
LOGGER.fine("Adding extra url("+firstUrlString+") for prev cached rules("+urlString+")");
rulesCache.put(firstUrlString, rules);
}
} else {
long start = System.currentTimeMillis();
try {
LOGGER.fine("ROBOT: NotCached - Downloading("+urlString+")");
tmpRules = new RobotRules();
Resource resource = webCache.getCachedResource(new URL(urlString),
maxCacheMS,true);
if(resource.getStatusCode() != 200) {
LOGGER.info("ROBOT: NotAvailable("+urlString+")");
throw new LiveDocumentNotAvailableException(urlString);
}
tmpRules.parse(resource);
rulesCache.put(firstUrlString,tmpRules);
rules = tmpRules;
LOGGER.fine("ROBOT: Downloaded("+urlString+")");
} catch (LiveDocumentNotAvailableException e) {
LOGGER.info("ROBOT: LiveDocumentNotAvailableException("+urlString+")");
} catch (MalformedURLException e) {
LOGGER.warning("ROBOT: MalformedURLException("+urlString+")");
return null;
} catch (IOException e) {
LOGGER.warning("ROBOT: IOException("+urlString+"):"+e.getLocalizedMessage());
return null;
} catch (LiveWebCacheUnavailableException e) {
LOGGER.severe("ROBOT: LiveWebCacheUnavailableException("+urlString+")");
if (filterGroup != null) {
filterGroup.setLiveWebGone();
}
return null;
} catch (LiveWebTimeoutException e) {
LOGGER.severe("ROBOT: LiveDocumentTimedOutException("+urlString+")");
if (filterGroup != null) {
filterGroup.setRobotTimedOut();
}
return null;
} finally {
long elapsed = System.currentTimeMillis() - start;
PerformanceLogger.noteElapsed("RobotRequest", elapsed, urlString);
}
}
}
if(rules == null) {
// Special case: no robots.txt was available; cache and use empty (allow-all) rules.
rulesCache.put(firstUrlString,emptyRules);
rules = emptyRules;
LOGGER.fine("No rules available, using emptyRules for:" + firstUrlString);
}
return rules;
}
/* (non-Javadoc)
 * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object)
 */
public int filterObject(CaptureSearchResult r) {
if(!notifiedSeen) {
if(filterGroup != null) {
filterGroup.setSawRobots();
}
notifiedSeen = true;
}
String resultURL = r.getOriginalUrl();
String path = UrlOperations.getURLPath(resultURL);
if(path.equals(ROBOT_SUFFIX)) {
if(!notifiedPassed) {
if(filterGroup != null) {
filterGroup.setPassedRobots();
}
notifiedPassed = true;
}
return ObjectFilter.FILTER_INCLUDE;
}
int filterResult = ObjectFilter.FILTER_EXCLUDE;
RobotRules rules = getRules(r);
if(rules == null) {
if((filterGroup == null) || (filterGroup.getRobotTimedOut() || filterGroup.getLiveWebGone())) {
return ObjectFilter.FILTER_ABORT;
}
} else {
if(!rules.blocksPathForUA(path, userAgent)) {
if(!notifiedPassed) {
if(filterGroup != null) {
filterGroup.setPassedRobots();
}
notifiedPassed = true;
}
filterResult = ObjectFilter.FILTER_INCLUDE;
LOGGER.finer("ROBOT: ALLOWED("+resultURL+")");
} else {
LOGGER.fine("ROBOT: BLOCKED("+resultURL+")");
}
}
return filterResult;
}
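/**
 * @return the LiveWebCache used to retrieve robots.txt documents
 */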
public LiveWebCache getWebCache() {
return webCache;
}
}