/* RobotRules
*
* $Id$
*
* Created on 2:51:20 PM Mar 12, 2007.
*
* Copyright (C) 2007 Internet Archive.
*
* This file is part of wayback-svn.
*
* wayback-svn is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* wayback-svn is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with wayback-svn; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.archive.accesscontrol.robotstxt;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Logger;
/**
* Class which parses a robots.txt file, storing the rules contained therein,
* and then allows for testing if path/userAgent tuples are blocked by those
* rules.
*
* @author brad
* @version $Date$, $Revision$
*/
public class RobotRules {
private static final long serialVersionUID = 2917420727021840982L;
private static final Logger LOGGER = Logger.getLogger(RobotRules.class
.getName());
/**
* Special name for User-agent which matches all values
*/
public static final String GLOBAL_USER_AGENT = "*";
private boolean bSyntaxErrors = false;
private HashMap<String, ArrayList<String>> rules =
new HashMap<String, ArrayList<String>>();
private LinkedList<String> userAgents = new LinkedList<String>();
/**
* @return true if the robots.txt file looked suspicious, currently meaning
* we found a Disallow rule that was not preceded by a "User-agent:" line
*/
public boolean hasSyntaxErrors() {
return bSyntaxErrors;
}
/**
* @return a List of all UserAgents Found in the Robots.txt document
*/
public List<String> getUserAgentsFound() {
return userAgents;
}
/**
* Read rules from InputStream argument into this RobotRules, as a
* side-effect, sets the bSyntaxErrors property.
*
* @param is
* @throws IOException
*/
public void parse(InputStream is) throws IOException {
BufferedReader br = new BufferedReader(new InputStreamReader(
(InputStream) is));
String read;
ArrayList<String> current = null;
while (br != null) {
do {
read = br.readLine();
// Skip comments & blanks
} while ((read != null) && ((read = read.trim()).startsWith("#") ||
read.length() == 0));
if (read == null) {
br.close();
br = null;
} else {
int commentIndex = read.indexOf("#");
if (commentIndex > -1) {
// Strip trailing comment
read = read.substring(0, commentIndex);
}
read = read.trim();
if (read.matches("(?i)^User-agent:.*")) {
String ua = read.substring(11).trim().toLowerCase();
if (current == null || current.size() != 0) {
// only create new rules-list if necessary
// otherwise share with previous user-agent
current = new ArrayList<String>();
}
rules.put(ua, current);
LOGGER.fine("Found User-agent(" + ua + ") rules...");
continue;
}
if (read.matches("(?i)Disallow:.*")) {
if (current == null) {
// buggy robots.txt
bSyntaxErrors = true;
continue;
}
String path = read.substring(9).trim();
current.add(path);
continue;
}
// unknown line; do nothing for now
// TODO: check for "Allow" lines, and flag a syntax error if
// we encounter any unknown lines?
}
}
}
private boolean blocksPath(String path, String curUA, List<String> uaRules) {
Iterator<String> disItr = uaRules.iterator();
while (disItr.hasNext()) {
String disallowedPath = disItr.next();
if (disallowedPath.length() == 0) {
LOGGER.fine("UA(" + curUA
+ ") has empty disallow: Go for it!");
return false;
} else {
LOGGER.fine("UA(" + curUA + ") has ("
+ disallowedPath + ") blocked...("
+ disallowedPath.length() + ")");
if (disallowedPath.equals("/") || path.startsWith(disallowedPath)) {
LOGGER.fine("THIS APPLIES!!!");
return true;
}
}
}
return false;
}
/**
* Checks first the specified ua UserAgent, if rules are present for it,
* and then falls back to using rules for the '*' UserAgent.
*
* @param path
* @param ua
* @return boolean value where true indicates the path is blocked for ua
*/
public boolean blocksPathForUA(String path, String ua) {
if(rules.containsKey(ua.toLowerCase())) {
return blocksPath(path,ua,rules.get(ua.toLowerCase()));
} else if(rules.containsKey(GLOBAL_USER_AGENT)) {
return blocksPath(path,GLOBAL_USER_AGENT,
rules.get(GLOBAL_USER_AGENT));
}
return false;
}
}