/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.accesscontrol.robotstxt;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.Resource;
import org.archive.wayback.exception.LiveDocumentNotAvailableException;
import org.archive.wayback.exception.LiveWebCacheUnavailableException;
import org.archive.wayback.exception.LiveWebTimeoutException;
import org.archive.wayback.liveweb.LiveWebCache;
import org.archive.wayback.resourceindex.filters.ExclusionFilter;
import org.archive.wayback.util.ObjectFilter;
import org.archive.wayback.util.url.UrlOperations;
import org.archive.wayback.webapp.PerfStats;
/**
* CaptureSearchResult Filter that uses a LiveWebCache to retrieve robots.txt
* documents from the live web, and filters SearchResults based on the rules
* therein.
*
* This class caches parsed RobotRules that are retrieved, so using the same
* instance to filter multiple SearchResults from the same host will be more
* efficient.
*
* Instances are expected to be transient for each request: The internally
* cached StringBuilder is not thread safe.
*
* @author brad
* @version $Date$, $Revision$
*/
public class RobotExclusionFilter extends ExclusionFilter {

    private final static Logger LOGGER =
            Logger.getLogger(RobotExclusionFilter.class.getName());

    /** Path appended to each candidate host to form its robots.txt URL. */
    protected final static String ROBOT_SUFFIX = "/robots.txt";

    /** Matches hosts of the form "wwwN.DOMAIN" (e.g. "www3.example.com"). */
    // NOTE(review): now final -- WWWN_PATTERN is compiled once at class init,
    // so reassigning this string never had any effect.
    protected final static String WWWN_REGEX = "^www[0-9]+\\.";
    protected final static Pattern WWWN_PATTERN = Pattern.compile(WWWN_REGEX);

    /** Source for live-web robots.txt documents. */
    private LiveWebCache webCache = null;

    /** Per-instance cache: robots.txt URL string -> parsed rules. */
    private HashMap<String,RobotRules> rulesCache = null;

    /** Max age, in ms, of a cached robots.txt document considered valid. */
    private long maxCacheMS = 0;

    /** User agent whose rules within robots.txt are applied. */
    private String userAgent = null;

    /** Scratch buffer reused to build URLs; makes instances non-thread-safe. */
    protected StringBuilder sb = null;

    /**
     * Shared allow-everything rules, cached under the first candidate URL
     * when no robots.txt document could be retrieved for any candidate.
     */
    private final static RobotRules emptyRules = new RobotRules();

    // One-shot flags so each filterGroup notification fires at most once per
    // request (instances are expected to be per-request -- see class javadoc).
    private boolean notifiedSeen = false;
    private boolean notifiedPassed = false;

    /** Timing buckets reported through PerfStats. */
    enum PerfStat
    {
        RobotsFetchTotal,
        RobotsTotal;
    }

    /** Lazily-created cache: capture urlKey -> previous filter decision. */
    protected HashMap<String, Integer> pathsCache = null;

    /**
     * Construct a new RobotExclusionFilter that uses webCache to pull
     * robots.txt documents. Filtering is based on userAgent, and cached
     * documents newer than maxCacheMS in the webCache are considered valid.
     *
     * @param webCache LiveWebCache from which documents can be retrieved
     * @param userAgent String user agent to use for requests to the live web.
     * @param maxCacheMS long number of milliseconds to cache documents in the
     * LiveWebCache
     */
    public RobotExclusionFilter(LiveWebCache webCache, String userAgent,
            long maxCacheMS) {

        rulesCache = new HashMap<String,RobotRules>();

        this.webCache = webCache;
        this.userAgent = userAgent;
        this.maxCacheMS = maxCacheMS;
        sb = new StringBuilder(100);
    }

    /**
     * Build the robots.txt URL for a host: scheme + host + "/robots.txt".
     * A trailing dot on the host (absolute DNS form) is stripped first.
     *
     * @param host hostname, possibly ending with "."
     * @param scheme scheme prefix including separator, e.g. "http://"
     * @return robots.txt URL String for the host
     */
    protected String hostToRobotUrlString(String host, String scheme) {
        sb.setLength(0);
        sb.append(scheme);
        sb.append(host);
        // delete the trailing '.' -- it is the last character appended so far
        if (host.endsWith(".")) {
            sb.deleteCharAt(scheme.length() + host.length() - 1);
        }
        sb.append(ROBOT_SUFFIX);
        String robotUrl = sb.toString();
        LOGGER.fine("Adding robot URL:" + robotUrl);
        return robotUrl;
    }

    /*
     * Return a List of all robots.txt urls to attempt for this HOST:
     * If HOST starts with "www.DOMAIN":
     *   [
     *     http://HOST/robots.txt,
     *     http://DOMAIN/robots.txt
     *   ]
     * If HOST starts with "www[0-9]+.DOMAIN":
     *   [
     *     http://www.DOMAIN/robots.txt,
     *     http://DOMAIN/robots.txt,
     *     http://HOST/robots.txt,
     *   ]
     * Otherwise:
     *   [
     *     http://www.HOST/robots.txt
     *     http://HOST/robots.txt,
     *   ]
     */
    //TODO: Take a look at this again.. this is the current scheme
    // (from RedisRobotExclusionFilter)
    protected List<String> searchResultToRobotUrlStrings(String resultHost, String scheme) {
        ArrayList<String> list = new ArrayList<String>();

        if (resultHost.startsWith("www")) {
            if (resultHost.startsWith("www.")) {
                list.add(hostToRobotUrlString(resultHost, scheme));
                // also try the bare domain with "www." stripped
                list.add(hostToRobotUrlString(resultHost.substring(4), scheme));
            } else {
                Matcher m = WWWN_PATTERN.matcher(resultHost);
                if (m.find()) {
                    // "wwwN.DOMAIN": try "www.DOMAIN" and bare "DOMAIN" first
                    String massagedHost = resultHost.substring(m.end());
                    list.add(hostToRobotUrlString("www." + massagedHost, scheme));
                    list.add(hostToRobotUrlString(massagedHost, scheme));
                }
                list.add(hostToRobotUrlString(resultHost, scheme));
            }
        } else {
            list.add(hostToRobotUrlString(resultHost, scheme));
            list.add(hostToRobotUrlString("www." + resultHost, scheme));
        }
        return list;
    }

    /**
     * Retrieve (or fetch and cache) the RobotRules applying to the host of
     * the given capture.
     *
     * Tries each candidate robots.txt URL in order; the first successful
     * response (cached or downloaded) is parsed and stored in rulesCache
     * under the FIRST candidate URL so subsequent captures for the same host
     * hit the cache immediately. If no candidate yields a document, shared
     * emptyRules (allow everything) are cached and returned.
     *
     * @param result capture whose original host determines the robots.txt
     * @return the applicable RobotRules, or null on hard failure (bad host,
     *         malformed URL, I/O error, live-web outage or timeout -- the
     *         latter two also flagged on filterGroup when present)
     */
    private RobotRules getRules(CaptureSearchResult result) {
        RobotRules rules = null;
        RobotRules tmpRules = null;
        String host;
        try {
            host = result.getOriginalHost();
        } catch (Exception e) {
            LOGGER.warning("ROBOT: Failed to get host from("+result.getOriginalUrl()+")");
            return null;
        }
        String scheme = UrlOperations.urlToScheme(result.getOriginalUrl());
        List<String> urlStrings = searchResultToRobotUrlStrings(host, scheme);
        Iterator<String> itr = urlStrings.iterator();
        String firstUrlString = null;

        // loop through them all. As soon as we get a response, store that
        // in the cache for the FIRST url we tried and return it..
        // If we get no responses for any of the robot URLs, use "empty" rules,
        // and record that in the cache, too.
        while (rules == null && itr.hasNext()) {
            String urlString = itr.next();
            if (firstUrlString == null) {
                firstUrlString = urlString;
            }
            if (rulesCache.containsKey(urlString)) {
                LOGGER.fine("ROBOT: Cached("+urlString+")");
                rules = rulesCache.get(urlString);
                if (!urlString.equals(firstUrlString)) {
                    // alias the first candidate to these rules so the next
                    // lookup for this host short-circuits on its first try
                    LOGGER.fine("Adding extra url("+firstUrlString+") for prev cached rules("+urlString+")");
                    rulesCache.put(firstUrlString, rules);
                }
            } else {
                Resource resource = null;
                try {
                    PerfStats.timeStart(PerfStat.RobotsFetchTotal);
                    if (LOGGER.isLoggable(Level.FINE)) {
                        LOGGER.fine("ROBOT: NotCached - Downloading("+urlString+")");
                    }

                    tmpRules = new RobotRules();
                    resource = webCache.getCachedResource(new URL(urlString),
                            maxCacheMS, true);

                    // non-200 (e.g. 404) means no robots.txt here; try the
                    // next candidate via the not-available path below
                    if (resource.getStatusCode() != 200) {
                        LOGGER.info("ROBOT: NotAvailable("+urlString+")");
                        throw new LiveDocumentNotAvailableException(urlString);
                    }
                    tmpRules.parse(resource);
                    rulesCache.put(firstUrlString, tmpRules);
                    rules = tmpRules;
                    if (LOGGER.isLoggable(Level.FINE)) {
                        LOGGER.fine("ROBOT: Downloaded("+urlString+")");
                    }
                } catch (LiveDocumentNotAvailableException e) {
                    // not fatal: fall through and try the next candidate URL
                    LOGGER.info("ROBOT: LiveDocumentNotAvailableException("+urlString+")");
                } catch (MalformedURLException e) {
                    LOGGER.warning("ROBOT: MalformedURLException("+urlString+")");
                    return null;
                } catch (IOException e) {
                    LOGGER.warning("ROBOT: IOException("+urlString+"):"+e.getLocalizedMessage());
                    return null;
                } catch (LiveWebCacheUnavailableException e) {
                    LOGGER.severe("ROBOT: LiveWebCacheUnavailableException("+urlString+")");
                    if (filterGroup != null) {
                        filterGroup.setLiveWebGone();
                    }
                    return null;
                } catch (LiveWebTimeoutException e) {
                    LOGGER.severe("ROBOT: LiveDocumentTimedOutException("+urlString+")");
                    if (filterGroup != null) {
                        filterGroup.setRobotTimedOut();
                    }
                    return null;
                } finally {
                    if (resource != null) {
                        try {
                            resource.close();
                        } catch (IOException ignored) {
                            // best-effort close; nothing useful to do here
                        }
                    }
                    PerfStats.timeEnd(PerfStat.RobotsFetchTotal);
                }
            }
        }
        if (rules == null) {
            // special-case, allow empty rules if no longer available.
            rulesCache.put(firstUrlString, emptyRules);
            rules = emptyRules;
            LOGGER.fine("No rules available, using emptyRules for:" + firstUrlString);
        }
        return rules;
    }

    /**
     * Decide whether the capture is visible under its host's robots.txt.
     *
     * robots.txt captures themselves, and captures flagged robot-ignore,
     * always pass. Decisions are memoized per urlKey in pathsCache.
     *
     * @param r capture to test
     * @return ObjectFilter.FILTER_INCLUDE if allowed, FILTER_EXCLUDE if
     *         blocked, or FILTER_ABORT when rules could not be fetched due
     *         to a live-web outage/timeout
     * @see org.archive.wayback.resourceindex.SearchResultFilter#filterSearchResult(org.archive.wayback.core.SearchResult)
     */
    public int filterObject(CaptureSearchResult r) {
        int filterResult = ObjectFilter.FILTER_EXCLUDE;

        try {
            PerfStats.timeStart(PerfStat.RobotsTotal);

            if (!notifiedSeen) {
                if (filterGroup != null) {
                    filterGroup.setSawRobots();
                }
                notifiedSeen = true;
            }
            String resultURL = r.getOriginalUrl();
            String path = UrlOperations.getURLPath(resultURL);

            // robots.txt files themselves are always includable
            if (path.equals(ROBOT_SUFFIX) || r.isRobotIgnore()) {
                if (!notifiedPassed) {
                    if (filterGroup != null) {
                        filterGroup.setPassedRobots();
                    }
                    notifiedPassed = true;
                }
                return ObjectFilter.FILTER_INCLUDE;
            }

            if (pathsCache == null) {
                pathsCache = new HashMap<String,Integer>();
            } else {
                // reuse a previous decision for the same urlKey
                Integer result = pathsCache.get(r.getUrlKey());
                if (result != null) {
                    return result;
                }
            }

            RobotRules rules = getRules(r);

            if (rules == null) {
                // no rules and the live web is gone or timed out: abort the
                // whole request rather than silently excluding everything
                if ((filterGroup == null) || (filterGroup.getRobotTimedOut() || filterGroup.getLiveWebGone())) {
                    filterResult = ObjectFilter.FILTER_ABORT;
                }
            } else {
                if (!rules.blocksPathForUA(path, userAgent)) {
                    if (!notifiedPassed) {
                        if (filterGroup != null) {
                            filterGroup.setPassedRobots();
                        }
                        notifiedPassed = true;
                    }
                    filterResult = ObjectFilter.FILTER_INCLUDE;
                    LOGGER.finer("ROBOT: ALLOWED("+resultURL+")");
                } else {
                    LOGGER.fine("ROBOT: BLOCKED("+resultURL+")");
                }
            }

            pathsCache.put(r.getUrlKey(), filterResult);
        } finally {
            PerfStats.timeEnd(PerfStat.RobotsTotal, false);
        }

        return filterResult;
    }

    /**
     * @return the LiveWebCache used to fetch robots.txt documents
     */
    public LiveWebCache getWebCache() {
        return webCache;
    }
}