/* $Id: RobotsManager.java 988245 2010-08-23 18:39:35Z kwright $ */ /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.manifoldcf.crawler.connectors.webcrawler; import java.nio.charset.StandardCharsets; import java.util.*; import java.io.*; import org.apache.manifoldcf.core.interfaces.*; import org.apache.manifoldcf.crawler.interfaces.*; import org.apache.manifoldcf.authorities.interfaces.*; import org.apache.manifoldcf.crawler.interfaces.CacheKeyFactory; import org.apache.manifoldcf.crawler.system.ManifoldCF; import org.apache.manifoldcf.crawler.system.Logging; /** This class manages the database table into which we write robots.txt files for hosts. The data resides in the database, * as well as in cache (up to a certain point). The result is that there is a memory limited, database-backed repository * of robots files that we can draw on. * * <br><br> * <b>robotsdata</b> * <table border="1" cellpadding="3" cellspacing="0"> * <tr class="TableHeadingColor"> * <th>Field</th><th>Type</th><th>Description        </th> * <tr><td>hostname</td><td>VARCHAR(255)</td><td>Primary Key</td></tr> * <tr><td>robotsdata</td><td>BIGINT</td><td></td></tr> * <tr><td>expirationtime</td><td>BLOB</td><td></td></tr> * </table> * <br><br> * */ public class RobotsManager extends org.apache.manifoldcf.core.database.BaseTable { public static final String _rcsid = "@(#)$Id: RobotsManager.java 988245 2010-08-23 18:39:35Z kwright $"; // Robots cache class. Only one needed. protected static RobotsCacheClass robotsCacheClass = new RobotsCacheClass(); // Database fields protected final static String hostField = "hostname"; protected final static String robotsField = "robotsdata"; protected final static String expirationField = "expirationtime"; // Cache manager. This handle is set up during the constructor. ICacheManager cacheManager; /** Constructor. Note that one robotsmanager handle is only useful within a specific thread context, * so the calling connector object logic must recreate the handle whenever the thread context changes. *@param tc is the thread context. *@param database is the database handle. */ public RobotsManager(IThreadContext tc, IDBInterface database) throws ManifoldCFException { super(database,"robotsdata"); cacheManager = CacheManagerFactory.make(tc); } /** Install the manager. */ public void install() throws ManifoldCFException { // Standard practice: outer loop on install methods, no transactions while (true) { Map existing = getTableSchema(null,null); if (existing == null) { // Install the table. HashMap map = new HashMap(); map.put(hostField,new ColumnDescription("VARCHAR(255)",true,false,null,null,false)); map.put(expirationField,new ColumnDescription("BIGINT",false,false,null,null,false)); map.put(robotsField,new ColumnDescription("BLOB",false,true,null,null,false)); performCreate(map,null); } else { // Upgrade code, if needed, goes here } // Handle indexes, if needed break; } } /** Uninstall the manager. */ public void deinstall() throws ManifoldCFException { performDrop(null); } /** Read robots.txt data from the cache or from the database. *@param hostName is the host for which the data is desired. *@param currentTime is the time of the check. *@return null if the record needs to be fetched, true if fetch is allowed. */ public Boolean checkFetchAllowed(String userAgent, String hostName, long currentTime, String pathString, IProcessActivity activities) throws ManifoldCFException { // Build description objects HostDescription[] objectDescriptions = new HostDescription[1]; StringSetBuffer ssb = new StringSetBuffer(); ssb.add(getRobotsKey(hostName)); objectDescriptions[0] = new HostDescription(hostName,new StringSet(ssb)); HostExecutor exec = new HostExecutor(this,activities,objectDescriptions[0]); cacheManager.findObjectsAndExecute(objectDescriptions,null,exec,getTransactionID()); // We do the expiration check here, rather than in the query, so that caching // is possible. RobotsData rd = exec.getResults(); if (rd == null || rd.getExpirationTime() <= currentTime) return null; return new Boolean(rd.isFetchAllowed(userAgent,pathString)); } /** Write robots.txt, replacing any existing row. *@param hostName is the host. *@param expirationTime is the time this data should expire. *@param data is the robots data stream. May be null. */ public void writeRobotsData(String hostName, long expirationTime, InputStream data) throws ManifoldCFException, IOException { TempFileInput tfi = null; try { if (data != null) { try { tfi = new TempFileInput(data); } catch (ManifoldCFException e) { if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) throw e; throw new IOException("Fetch failed: "+e.getMessage()); } } StringSetBuffer ssb = new StringSetBuffer(); ssb.add(getRobotsKey(hostName)); StringSet cacheKeys = new StringSet(ssb); ICacheHandle ch = cacheManager.enterCache(null,cacheKeys,getTransactionID()); try { beginTransaction(); try { // See whether the instance exists ArrayList params = new ArrayList(); params.add(hostName); IResultSet set = performQuery("SELECT * FROM "+getTableName()+" WHERE "+ hostField+"=?",params,null,null); HashMap values = new HashMap(); values.put(expirationField,new Long(expirationTime)); if (tfi != null) values.put(robotsField,tfi); if (set.getRowCount() > 0) { // Update params.clear(); params.add(hostName); performUpdate(values," WHERE "+hostField+"=?",params,null); } else { // Insert values.put(hostField,hostName); // We only need the general key because this is new. performInsert(values,null); } cacheManager.invalidateKeys(ch); } catch (ManifoldCFException e) { signalRollback(); throw e; } catch (Error e) { signalRollback(); throw e; } finally { endTransaction(); } } finally { cacheManager.leaveCache(ch); } } finally { if (tfi != null) tfi.discard(); } } // Protected methods and classes /** Construct a key which represents an individual host name. *@param hostName is the name of the connector. *@return the cache key. */ protected static String getRobotsKey(String hostName) { return "ROBOTS_"+hostName; } /** Read robots data, if it exists. *@return null if the data doesn't exist at all. Return robots data if it does. */ protected RobotsData readRobotsData(String hostName, IProcessActivity activities) throws ManifoldCFException { try { ArrayList list = new ArrayList(); list.add(hostName); IResultSet set = performQuery("SELECT "+robotsField+","+expirationField+" FROM "+getTableName()+ " WHERE "+hostField+"=?",list,null,null); if (set.getRowCount() == 0) return null; if (set.getRowCount() > 1) throw new ManifoldCFException("Unexpected number of robotsdata rows matching '"+hostName+"': "+Integer.toString(set.getRowCount())); IResultRow row = set.getRow(0); long expiration = ((Long)row.getValue(expirationField)).longValue(); BinaryInput bi = (BinaryInput)row.getValue(robotsField); if (bi == null) return new RobotsData(null,expiration,hostName,activities); try { InputStream is = bi.getStream(); return new RobotsData(is,expiration,hostName,activities); } finally { bi.discard(); } } catch (InterruptedIOException e) { throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED); } catch (IOException e) { throw new ManifoldCFException("IO error reading robots data for "+hostName+": "+e.getMessage(),e); } } /** Convert a string from the robots file into a readable form that does NOT contain NUL characters (since postgresql does not accept those). */ protected static String makeReadable(String inputString) { StringBuilder sb = new StringBuilder(); int i = 0; while (i < inputString.length()) { char y = inputString.charAt(i++); if (y >= ' ') sb.append(y); else { sb.append('^'); sb.append((char)(y + '@')); } } return sb.toString(); } /** This is a cached data item. */ protected static class RobotsData { protected long expiration; protected ArrayList records = null; /** Constructor. */ public RobotsData(InputStream is, long expiration, String hostName, IProcessActivity activities) throws IOException, ManifoldCFException { this.expiration = expiration; if (is == null) { records = null; return; } Reader r = new InputStreamReader(is, StandardCharsets.UTF_8); try { BufferedReader br = new BufferedReader(r); try { parseRobotsTxt(br,hostName,activities); } finally { br.close(); } } finally { r.close(); } } /** Check if fetch is allowed */ public boolean isFetchAllowed(String userAgent, String pathString) { if (records == null) return true; boolean wasDisallowed = false; boolean wasAllowed = false; // First matching user-agent takes precedence, according to the following chunk of spec: // "These name tokens are used in User-agent lines in /robots.txt to // identify to which specific robots the record applies. The robot // must obey the first record in /robots.txt that contains a User- // Agent line whose value contains the name token of the robot as a // substring. The name comparisons are case-insensitive. If no such // record exists, it should obey the first record with a User-agent // line with a "*" value, if present. If no record satisfied either // condition, or no records are present at all, access is unlimited." boolean sawAgent = false; String userAgentUpper = userAgent.toUpperCase(Locale.ROOT); int i = 0; while (i < records.size()) { Record r = (Record)records.get(i++); if (r.isAgentMatch(userAgentUpper,false)) { if (r.isDisallowed(pathString)) wasDisallowed = true; if (r.isAllowed(pathString)) wasAllowed = true; sawAgent = true; break; } } if (sawAgent == false) { i = 0; while (i < records.size()) { Record r = (Record)records.get(i++); if (r.isAgentMatch("*",true)) { if (r.isDisallowed(pathString)) wasDisallowed = true; if (r.isAllowed(pathString)) wasAllowed = true; sawAgent = true; break; } } } if (sawAgent == false) return true; // Allowed always overrides disallowed if (wasAllowed) return true; if (wasDisallowed) return false; // No match -> crawl allowed return true; } /** Get expiration */ public long getExpirationTime() { return expiration; } /** Parse the robots.txt file using a reader. * Is NOT expected to close the stream. */ protected void parseRobotsTxt(BufferedReader r, String hostName, IProcessActivity activities) throws IOException, ManifoldCFException { boolean parseCompleted = false; boolean robotsWasHtml = false; boolean foundErrors = false; String description = null; long startParseTime = System.currentTimeMillis(); try { records = new ArrayList(); Record record = null; boolean seenAction = false; while (true) { String x = r.readLine(); if (x == null) break; int numSignPos = x.indexOf("#"); if (numSignPos != -1) x = x.substring(0,numSignPos); String lowercaseLine = x.toLowerCase(Locale.ROOT).trim(); if (lowercaseLine.startsWith("user-agent:")) { if (seenAction) { records.add(record); record = null; seenAction = false; } if (record == null) record = new Record(); String agentName = x.substring("User-agent:".length()).trim(); record.addAgent(agentName); } else if (lowercaseLine.startsWith("user-agent")) { if (seenAction) { records.add(record); record = null; seenAction = false; } if (record == null) record = new Record(); String agentName = x.substring("User-agent".length()).trim(); record.addAgent(agentName); } else if (lowercaseLine.startsWith("disallow:")) { if (record == null) { description = "Disallow without User-agent"; Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description); foundErrors = true; } else { String disallowPath = x.substring("Disallow:".length()).trim(); // The spec says that a blank disallow means let everything through. if (disallowPath.length() > 0) record.addDisallow(disallowPath); seenAction = true; } } else if (lowercaseLine.startsWith("disallow")) { if (record == null) { description = "Disallow without User-agent"; Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description); foundErrors = true; } else { String disallowPath = x.substring("Disallow".length()).trim(); // The spec says that a blank disallow means let everything through. if (disallowPath.length() > 0) record.addDisallow(disallowPath); seenAction = true; } } else if (lowercaseLine.startsWith("allow:")) { if (record == null) { description = "Allow without User-agent"; Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description); foundErrors = true; } else { String allowPath = x.substring("Allow:".length()).trim(); // The spec says that a blank disallow means let everything through. if (allowPath.length() > 0) record.addAllow(allowPath); seenAction = true; } } else if (lowercaseLine.startsWith("allow")) { if (record == null) { description = "Allow without User-agent"; Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description); foundErrors = true; } else { String allowPath = x.substring("Allow".length()).trim(); // The spec says that a blank disallow means let everything through. if (allowPath.length() > 0) record.addAllow(allowPath); seenAction = true; } } else if (lowercaseLine.startsWith("crawl-delay:")) { // We don't complain about this, but right now we don't listen to it either. } else if (lowercaseLine.startsWith("crawl-delay")) { // We don't complain about this, but right now we don't listen to it either. } else { // If it's not just a blank line, complain if (x.trim().length() > 0) { String problemLine = makeReadable(x); description = "Unknown robots.txt line: '"+problemLine+"'"; Logging.connectors.warn("Web: Unknown robots.txt line from '"+hostName+"': '"+problemLine+"'"); if (x.indexOf("<html") != -1 || x.indexOf("<HTML") != -1) { // Looks like some kind of an html file, probably as a result of a redirection, so just abort as if we have a page error robotsWasHtml = true; parseCompleted = true; break; } foundErrors = true; } } } if (record != null) records.add(record); parseCompleted = true; } finally { // Log the fact that we attempted to parse robots.txt, as well as what happened // These are the following situations we will report: // (1) INCOMPLETE - Parsing did not complete - if the stream was interrupted // (2) HTML - Robots was html - if the robots data seemed to be html // (3) ERRORS - Robots had errors - if the robots data was accepted but had errors in it // (4) SUCCESS - Robots parsed successfully - if the robots data was parsed without problem String status; if (parseCompleted) { if (robotsWasHtml) { status = "HTML"; description = "Robots file contained HTML, skipped"; } else { if (foundErrors) { status = "ERRORS"; // description should already be set } else { status = "SUCCESS"; description = null; } } } else { status = "INCOMPLETE"; description = "Parsing was interrupted"; } activities.recordActivity(new Long(startParseTime),WebcrawlerConnector.ACTIVITY_ROBOTSPARSE, null,hostName,status,description,null); } } } /** Check if path matches specification */ protected static boolean doesPathMatch(String path, String spec) { // For robots 1.0, this function would do just this: // return path.startsWith(spec); // However, we implement the "google bot" spec, which allows wildcard matches that are, in fact, regular-expression-like in some ways. // The "specification" can be found here: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40367 return doesPathMatch(path,0,spec,0); } /** Recursive method for matching specification to path. */ protected static boolean doesPathMatch(String path, int pathIndex, String spec, int specIndex) { while (true) { if (specIndex == spec.length()) // Hit the end of the specification! We're done. return true; char specChar = spec.charAt(specIndex++); if (specChar == '*') { // Found a specification wildcard. // Eat up all the '*' characters at this position - otherwise each additional one increments the exponent of how long this can take, // making denial-of-service via robots parsing a possibility. while (specIndex < spec.length()) { if (spec.charAt(specIndex) != '*') break; specIndex++; } // It represents zero or more characters, so we must recursively try for a match against all remaining characters in the path string. while (true) { boolean match = doesPathMatch(path,pathIndex,spec,specIndex); if (match) return true; if (path.length() == pathIndex) // Nothing further to try, and no match return false; pathIndex++; // Try again } } else if (specChar == '$' && specIndex == spec.length()) { // Found a specification end-of-path character. // (It can only be legitimately the last character of the specification.) return pathIndex == path.length(); } if (pathIndex == path.length()) // Hit the end of the path! (but not the end of the specification!) return false; if (path.charAt(pathIndex) != specChar) return false; // On to the next match pathIndex++; } } /** This is the object description for a robots host object. * This is the key that is used to look up cached data. */ protected static class HostDescription extends org.apache.manifoldcf.core.cachemanager.BaseDescription { protected String hostName; protected String criticalSectionName; protected StringSet cacheKeys; public HostDescription(String hostName, StringSet invKeys) { super("robotscache"); this.hostName = hostName; criticalSectionName = getClass().getName()+"-"+hostName; cacheKeys = invKeys; } public String getHostName() { return hostName; } public int hashCode() { return hostName.hashCode(); } public boolean equals(Object o) { if (!(o instanceof HostDescription)) return false; HostDescription d = (HostDescription)o; return d.hostName.equals(hostName); } public String getCriticalSectionName() { return criticalSectionName; } /** Get the cache keys for an object (which may or may not exist yet in * the cache). This method is called in order for cache manager to throw the correct locks. * @return the object's cache keys, or null if the object should not * be cached. */ public StringSet getObjectKeys() { return cacheKeys; } /** Get the object class for an object. The object class is used to determine * the group of objects treated in the same LRU manner. * @return the newly created object's object class, or null if there is no * such class, and LRU behavior is not desired. */ public ICacheClass getObjectClass() { return robotsCacheClass; } } /** Cache class for robots. * An instance of this class describes the cache class for robots data caching. There's * only ever a need for one, so that will be created statically. */ protected static class RobotsCacheClass implements ICacheClass { /** Get the name of the object class. * This determines the set of objects that are treated in the same * LRU pool. *@return the class name. */ public String getClassName() { // We count all the robot data, so this is a constant string. return "ROBOTSCLASS"; } /** Get the maximum LRU count of the object class. *@return the maximum number of the objects of the particular class * allowed. */ public int getMaxLRUCount() { // Hardwired for the moment; 2000 robots data records will be cached, // and no more. return 2000; } } /** This is the executor object for locating robots host objects. * This object furnishes the operations the cache manager needs to rebuild objects that it needs that are * not in the cache at the moment. */ protected static class HostExecutor extends org.apache.manifoldcf.core.cachemanager.ExecutorBase { // Member variables protected RobotsManager thisManager; protected RobotsData returnValue; protected HostDescription thisHost; protected IProcessActivity activities; /** Constructor. *@param manager is the RobotsManager class instance. *@param objectDescription is the desired object description. */ public HostExecutor(RobotsManager manager, IProcessActivity activities, HostDescription objectDescription) { super(); thisManager = manager; this.activities = activities; thisHost = objectDescription; returnValue = null; } /** Get the result. *@return the looked-up or read cached instance. */ public RobotsData getResults() { return returnValue; } /** Create a set of new objects to operate on and cache. This method is called only * if the specified object(s) are NOT available in the cache. The specified objects * should be created and returned; if they are not created, it means that the * execution cannot proceed, and the execute() method will not be called. * @param objectDescriptions is the set of unique identifier of the object. * @return the newly created objects to cache, or null, if any object cannot be created. * The order of the returned objects must correspond to the order of the object descriptinos. */ public Object[] create(ICacheDescription[] objectDescriptions) throws ManifoldCFException { // I'm not expecting multiple values to be request, so it's OK to walk through the objects // and do a request at a time. RobotsData[] rval = new RobotsData[objectDescriptions.length]; int i = 0; while (i < rval.length) { HostDescription desc = (HostDescription)objectDescriptions[i]; // I need to cache both the data and the expiration date, and pick up both when I // do the query. This is because I don't want to cache based on request time, since that // would screw up everything! rval[i] = thisManager.readRobotsData(desc.getHostName(),activities); i++; } return rval; } /** Notify the implementing class of the existence of a cached version of the * object. The object is passed to this method so that the execute() method below * will have it available to operate on. This method is also called for all objects * that are freshly created as well. * @param objectDescription is the unique identifier of the object. * @param cachedObject is the cached object. */ public void exists(ICacheDescription objectDescription, Object cachedObject) throws ManifoldCFException { // Cast what came in as what it really is HostDescription objectDesc = (HostDescription)objectDescription; RobotsData robotsData = (RobotsData)cachedObject; if (objectDesc.equals(thisHost)) returnValue = robotsData; } /** Perform the desired operation. This method is called after either createGetObject() * or exists() is called for every requested object. */ public void execute() throws ManifoldCFException { // Does nothing; we only want to fetch objects in this cacher. } } /** This class represents a record in a robots.txt file. It contains one or * more user-agents, and one or more disallows. */ protected static class Record { protected ArrayList userAgents = new ArrayList(); protected ArrayList disallows = new ArrayList(); protected ArrayList allows = new ArrayList(); /** Constructor. */ public Record() { } /** Add a user-agent. */ public void addAgent(String agentName) { userAgents.add(agentName); } /** Add a disallow. */ public void addDisallow(String disallowPath) { disallows.add(disallowPath); } /** Add an allow. */ public void addAllow(String allowPath) { allows.add(allowPath); } /** See if user-agent matches. */ public boolean isAgentMatch(String agentNameUpper, boolean exactMatch) { int i = 0; while (i < userAgents.size()) { String agent = ((String)userAgents.get(i++)).toUpperCase(Locale.ROOT); if (exactMatch && agent.trim().equals(agentNameUpper)) return true; if (!exactMatch && agentNameUpper.indexOf(agent) != -1) return true; } return false; } /** See if path is disallowed. Only called if user-agent has already * matched. (This checks if there's an explicit match with one of the * Disallows clauses.) */ public boolean isDisallowed(String path) { int i = 0; while (i < disallows.size()) { String disallow = (String)disallows.get(i++); if (doesPathMatch(path,disallow)) return true; } return false; } /** See if path is allowed. Only called if user-agent has already * matched. (This checks if there's an explicit match with one of the * Allows clauses). */ public boolean isAllowed(String path) { int i = 0; while (i < allows.size()) { String allow = (String)allows.get(i++); if (doesPathMatch(path,allow)) return true; } return false; } } }