/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.accesscontrol.robotstxt;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.archive.wayback.util.ByteOp;
/**
* Class which parses a robots.txt file, storing the rules contained therein,
* and then allows for testing if path/userAgent tuples are blocked by those
* rules.
*
* @author brad
* @version $Date$, $Revision$
*/
public class RobotRules {

    private static final Logger LOGGER = Logger.getLogger(RobotRules.class
            .getName());

    /**
     * Special name for User-agent which matches all values
     */
    public static final String GLOBAL_USER_AGENT = "*";

    protected static final Pattern USER_AGENT_PATTERN = Pattern.compile("(?i)^User-agent\\s*:(.*)");
    protected static final Pattern DISALLOW_PATTERN = Pattern.compile("(?i)Disallow\\s*:(.*)");
    protected static final Pattern ALLOW_PATTERN = Pattern.compile("(?i)Allow\\s*:(.*)");

    // Set when a Disallow: rule is seen before any User-agent: line.
    private boolean bSyntaxErrors = false;

    // Lower-cased User-agent token -> list of disallowed path prefixes.
    // Consecutive User-agent lines may share one list (see parse()).
    private HashMap<String, ArrayList<String>> rules =
        new HashMap<String, ArrayList<String>>();

    // Distinct User-agent tokens, in the order first encountered.
    private LinkedList<String> userAgents = new LinkedList<String>();

    /**
     * @return true if the robots.txt file looked suspicious, currently meaning
     * we found a Disallow rule that was not preceded by a "User-agent:" line
     */
    public boolean hasSyntaxErrors() {
        return bSyntaxErrors;
    }

    /**
     * @return a List of all UserAgents Found in the Robots.txt document
     */
    public List<String> getUserAgentsFound() {
        return userAgents;
    }

    /**
     * Read rules from InputStream argument into this RobotRules, as a
     * side-effect, sets the bSyntaxErrors property.
     *
     * The stream is decoded as UTF-8 and read to EOF; the wrapping reader is
     * always closed, even when reading fails part-way (the original leaked it
     * on exception).
     *
     * @param is InputStream containing the robots.txt document
     * @throws IOException for usual reasons
     */
    public void parse(InputStream is) throws IOException {
        // NOTE(review): original used the project constant ByteOp.UTF8 here;
        // Charset.forName("UTF-8") is the equivalent stdlib charset.
        BufferedReader br = new BufferedReader(new InputStreamReader(is,
                Charset.forName("UTF-8")));
        try {
            // true once an Allow: rule was seen for the current rules-list;
            // forces the next User-agent line to start a fresh list
            boolean allowRuleFound = false;
            // true if curr or last significant line read was a User-agent line
            boolean currLineUA = false;
            boolean lastLineUA = false;
            // rules-list being filled for the most recent User-agent line(s)
            ArrayList<String> current = null;
            while (true) {
                lastLineUA = currLineUA;
                String line = nextSignificantLine(br);
                if (line == null) {
                    break; // EOF
                }
                currLineUA = false;
                // Strip trailing comment, then surrounding whitespace
                int commentIndex = line.indexOf("#");
                if (commentIndex > -1) {
                    line = line.substring(0, commentIndex);
                }
                line = line.trim();

                Matcher uaMatcher = USER_AGENT_PATTERN.matcher(line);
                if (uaMatcher.matches()) {
                    String ua = uaMatcher.group(1).trim().toLowerCase();
                    if (current == null || current.size() != 0 || allowRuleFound || !lastLineUA) {
                        // only create new rules-list if necessary
                        // otherwise share with previous user-agent
                        current = new ArrayList<String>();
                    }
                    rules.put(ua, current);
                    if (!userAgents.contains(ua)) {
                        // FIX: record the agent; previously this list was
                        // never populated and getUserAgentsFound() always
                        // returned an empty list
                        userAgents.add(ua);
                    }
                    allowRuleFound = false;
                    currLineUA = true;
                    LOGGER.fine("Found User-agent(" + ua + ") rules...");
                    continue;
                }

                Matcher disallowMatcher = DISALLOW_PATTERN.matcher(line);
                if (disallowMatcher.matches()) {
                    if (current == null) {
                        // Disallow before any User-agent: buggy robots.txt
                        bSyntaxErrors = true;
                        continue;
                    }
                    String path = disallowMatcher.group(1).trim();
                    // Disallow: without path is just ignored.
                    if (!path.isEmpty()) {
                        current.add(path);
                    }
                    continue;
                }

                Matcher allowMatcher = ALLOW_PATTERN.matcher(line);
                if (allowMatcher.matches()) {
                    // Mark that there was an allow rule to clear the current
                    // list for next user-agent
                    allowRuleFound = true;
                }
                // unknown line; do nothing for now
                // TODO: check for "Allow" lines, and flag a syntax error if
                // we encounter any unknown lines?
            }
        } finally {
            br.close();
        }
    }

    /**
     * Reads lines until one is neither blank nor a whole-line comment.
     *
     * @param br reader positioned within the robots.txt document
     * @return the next significant line, trimmed, or null at end of stream
     * @throws IOException on read failure
     */
    private static String nextSignificantLine(BufferedReader br)
            throws IOException {
        String read;
        do {
            read = br.readLine();
            // Skip comments & blanks
        } while ((read != null) && ((read = read.trim()).startsWith("#") ||
                read.length() == 0));
        return read;
    }

    /**
     * Tests path against one user-agent's Disallow prefixes.
     *
     * @param path server-relative path being checked
     * @param curUA user-agent whose rules are being applied (kept for
     *        signature compatibility; not consulted here)
     * @param uaRules disallowed path prefixes for that user-agent
     * @return true if path matches any Disallow prefix, or if "/" disallows
     *         everything
     */
    private boolean blocksPath(String path, String curUA, List<String> uaRules) {
        for (String disallowedPath : uaRules) {
            if (disallowedPath.isEmpty()) {
                // This is for extra caution. Empty path shouldn't be added
                // to uaRules in the first place.
                continue;
            }
            if (disallowedPath.equals("/") || path.startsWith(disallowedPath)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Checks first the specified ua UserAgent, if rules are present for it,
     * and then falls back to using rules for the '*' UserAgent.
     *
     * @param path String server relative path to check for access
     * @param ua String user agent to check for access
     * @return boolean value where true indicates the path is blocked for ua
     */
    public boolean blocksPathForUA(String path, String ua) {
        final String lcua = ua.toLowerCase();
        if (rules.containsKey(lcua)) {
            return blocksPath(path, ua, rules.get(lcua));
        }
        if (rules.containsKey(GLOBAL_USER_AGENT)) {
            return blocksPath(path, GLOBAL_USER_AGENT,
                    rules.get(GLOBAL_USER_AGENT));
        }
        return false;
    }
}