package org.archive.wayback.accesscontrol.robotstxt;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.Resource;
import org.archive.wayback.exception.LiveDocumentNotAvailableException;
import org.archive.wayback.exception.LiveWebCacheUnavailableException;
import org.archive.wayback.exception.LiveWebTimeoutException;
import org.archive.wayback.liveweb.LiveWebCache;
import org.archive.wayback.resourceindex.filters.ExclusionFilter;
import org.archive.wayback.util.ObjectFilter;
import org.archive.wayback.util.url.UrlOperations;
import org.archive.wayback.webapp.PerformanceLogger;
public class HRobotExclusionFilter extends ExclusionFilter {

	private final static String ROBOT_SUFFIX = "/robots.txt";
	private final static Logger LOGGER =
		Logger.getLogger(HRobotExclusionFilter.class.getName());

	// TODO: robots.txt should really be decoded using the charset it was
	// served with; hard-coding UTF-8 is a known approximation.
	private Charset cs = Charset.forName("UTF-8");

	// Accumulates parsed robots.txt directives for every host consulted
	// during this filter's lifetime.
	private RobotsDirectiveAggregation aggregation = null;
	private LiveWebCache webCache = null;
	private String userAgent = null;
	// One-shot flags so the filterGroup is notified at most once per
	// filter instance.
	private boolean notifiedSeen = false;
	private boolean notifiedPassed = false;

	// Shared "allow everything" directive applied when a robots.txt is
	// missing or returns non-200 (fail-open policy).
	private static final FixedRobotsDirectives ALLOW_ROBOT_DIRECTIVE =
		new FixedRobotsDirectives(true);

	/**
	 * Construct a new HRobotExclusionFilter that uses webCache to pull
	 * robots.txt documents. Filtering is based on userAgent.
	 *
	 * @param webCache LiveWebCache from which documents can be retrieved
	 * @param userAgent String user agent to use for requests to the live web.
	 * @param maxCacheMS long number of milliseconds to cache documents in the
	 *            LiveWebCache. NOTE(review): currently unused -- fetches are
	 *            issued with a maxCacheMS of 0 (see updateAggregation); the
	 *            parameter is retained for constructor compatibility.
	 */
	public HRobotExclusionFilter(LiveWebCache webCache, String userAgent,
			long maxCacheMS) {
		aggregation = new RobotsDirectiveAggregation();
		this.webCache = webCache;
		this.userAgent = userAgent;
	}

	/**
	 * Fetch and parse any robots.txt documents relevant to host that are not
	 * yet folded into the aggregation. An unavailable or non-200 robots.txt
	 * is treated as "allow all".
	 *
	 * @param host hostname whose missing robots.txt URLs should be fetched
	 * @throws LiveWebCacheUnavailableException if the cache backend is down
	 * @throws LiveWebTimeoutException if a live-web fetch timed out
	 * @throws MalformedURLException if a computed robots URL is invalid
	 * @throws IOException on errors reading or parsing the document
	 */
	private void updateAggregation(String host)
			throws LiveWebCacheUnavailableException,
			LiveWebTimeoutException, MalformedURLException, IOException {
		List<String> missing = aggregation.getMissingRobotUrls(host);
		for (String robotUrl : missing) {
			long start = System.currentTimeMillis();
			try {
				Resource resource = webCache.getCachedResource(new URL(robotUrl),
						0, true);
				if (resource.getStatusCode() != 200) {
					LOGGER.info("ROBOT: Non200(" + robotUrl + ")");
					// consider it an allow:
					aggregation.addDirectives(robotUrl, ALLOW_ROBOT_DIRECTIVE);
				} else {
					// Parse the document and record the directives applying
					// to our user agent. BUGFIX: close the reader when done
					// so the underlying resource is released (was leaked).
					BufferedReader br = new BufferedReader(
							new InputStreamReader(resource, cs));
					try {
						Robotstxt robotsTxt = new Robotstxt(br);
						RobotsDirectives directives =
								robotsTxt.getDirectivesFor(userAgent);
						aggregation.addDirectives(robotUrl, directives);
					} finally {
						br.close();
					}
				}
			} catch (LiveDocumentNotAvailableException e) {
				if (LOGGER.isLoggable(Level.INFO)) {
					LOGGER.info("ROBOT: LiveDocumentNotAvailableException("
							+ robotUrl + ")");
				}
				// consider it an allow:
				aggregation.addDirectives(robotUrl, ALLOW_ROBOT_DIRECTIVE);
			}
			long elapsed = System.currentTimeMillis() - start;
			PerformanceLogger.noteElapsed("RobotRequest", elapsed, robotUrl);
		}
	}

	/**
	 * Decide whether the capture r is permitted by the applicable robots.txt
	 * rules. Captures of robots.txt documents themselves are always included.
	 *
	 * @param r capture under consideration
	 * @return FILTER_INCLUDE when allowed, FILTER_EXCLUDE when blocked (or on
	 *         an IOException while fetching rules), FILTER_ABORT when the
	 *         live web is gone or a robots fetch timed out.
	 */
	public int filterObject(CaptureSearchResult r) {
		if (!notifiedSeen) {
			if (filterGroup != null) {
				filterGroup.setSawRobots();
			}
			notifiedSeen = true;
		}
		String originalURL = r.getOriginalUrl();
		String path = UrlOperations.getURLPath(originalURL);
		// robots.txt captures are always viewable:
		if (path.equals(ROBOT_SUFFIX)) {
			if (!notifiedPassed) {
				if (filterGroup != null) {
					filterGroup.setPassedRobots();
				}
				notifiedPassed = true;
			}
			return ObjectFilter.FILTER_INCLUDE;
		}
		String host = UrlOperations.urlToHost(originalURL);
		try {
			updateAggregation(host);
			if (!aggregation.isBlocked(path)) {
				// BUGFIX: this (allowed) branch previously logged
				// "ROBOT: BLOCKED(...)" with mismatched guard/log levels.
				if (LOGGER.isLoggable(Level.FINE)) {
					LOGGER.fine("ROBOT: ALLOWED(" + originalURL + ")");
				}
				if (!notifiedPassed) {
					if (filterGroup != null) {
						filterGroup.setPassedRobots();
					}
					notifiedPassed = true;
				}
				return ObjectFilter.FILTER_INCLUDE;
			}
		} catch (LiveWebCacheUnavailableException e) {
			LOGGER.severe("ROBOT: LiveWebCacheUnavailableException("
					+ originalURL + ")");
			// BUGFIX: guard against null filterGroup (checked everywhere else)
			if (filterGroup != null) {
				filterGroup.setLiveWebGone();
			}
		} catch (LiveWebTimeoutException e) {
			// BUGFIX: message previously named the wrong exception type
			// ("LiveDocumentTimedOutException").
			LOGGER.severe("ROBOT: LiveWebTimeoutException("
					+ originalURL + ")");
			if (filterGroup != null) {
				filterGroup.setRobotTimedOut();
			}
		} catch (MalformedURLException e) {
			LOGGER.warning("ROBOT: MalformedURLException(" +
					originalURL + ")");
		} catch (IOException e) {
			// BUGFIX: was e.printStackTrace(); log with the exception as
			// cause so it reaches the configured log handlers.
			LOGGER.log(Level.WARNING,
					"ROBOT: IOException(" + originalURL + ")", e);
			return ObjectFilter.FILTER_EXCLUDE;
		}
		// BUGFIX: null-guard filterGroup before dereferencing.
		if (filterGroup != null &&
				(filterGroup.getRobotTimedOut() || filterGroup.getLiveWebGone())) {
			return ObjectFilter.FILTER_ABORT;
		}
		if (LOGGER.isLoggable(Level.FINE)) {
			LOGGER.fine("ROBOT: BLOCKED(" + originalURL + ")");
		}
		return ObjectFilter.FILTER_EXCLUDE;
	}
}