/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.commoncrawl.service.crawler;
// JDK imports
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.LineNumberReader;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.StringTokenizer;
// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
// Hadoop imports
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
/**
 * This class handles the parsing of <code>robots.txt</code> files.
 * It emits {@link RobotRuleSet} objects, which describe the download
 * permissions the file grants or denies to particular robots.
*
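 * A minimal usage sketch (the robots.txt content here is hypothetical):
 * <pre>
 *   byte[] robotsTxt =
 *       "User-agent: *\nDisallow: /private\n".getBytes();
 *   RobotRulesParser parser = new RobotRulesParser(new String[] {"mybot"});
 *   RobotRuleSet rules = parser.parseRules(robotsTxt, 0, robotsTxt.length);
 *   rules.isAllowed("/index.html");  // true
 *   rules.isAllowed("/private/x");   // false
 * </pre>
 *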
* @author Tom Pierce
* @author Mike Cafarella
* @author Doug Cutting
*/
public final class RobotRulesParser implements Configurable {
public static final Log LOG = LogFactory.getLog(RobotRulesParser.class);
private boolean allowForbidden = false;
private static final String CHARACTER_ENCODING= "UTF-8";
private static final int NO_PRECEDENCE= Integer.MAX_VALUE;
private static final RobotRuleSet EMPTY_RULES= new RobotRuleSet();
  private static final RobotRuleSet FORBID_ALL_RULES = getForbidAllRules();
private Configuration conf;
  private HashMap robotNames;  // lowercased agent name -> precedence (Integer)
/**
* This class holds the rules which were parsed from a robots.txt
* file, and can test paths against those rules.
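   *
   * Matching is by path prefix, in the order the rules appear; the first
   * matching rule wins. For example, rules parsed from the (hypothetical)
   * stanza
   * <pre>
   *   User-agent: *
   *   Allow: /public
   *   Disallow: /
   * </pre>
   * allow <code>/public/page.html</code> but forbid <code>/other</code>.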
*/
public static class RobotRuleSet {
ArrayList tmpEntries = new ArrayList();
RobotsEntry[] entries = null;
    long expireTime;
    long crawlDelay = -1;  // milliseconds; -1 if not set
    public boolean explicitMention = false;  // our agent was matched by name, not just via "*"
    /**
     * A single rule: a path prefix plus whether access to paths with
     * that prefix is allowed.
     */
private class RobotsEntry {
String prefix;
boolean allowed;
RobotsEntry(String prefix, boolean allowed) {
this.prefix= prefix;
this.allowed= allowed;
}
}
    /**
     * Appends a rule for the given path prefix; <code>allow</code> is
     * <code>true</code> for an Allow rule, <code>false</code> for Disallow.
     */
private void addPrefix(String prefix, boolean allow) {
if (tmpEntries == null) {
tmpEntries= new ArrayList();
if (entries != null) {
for (int i= 0; i < entries.length; i++)
tmpEntries.add(entries[i]);
}
entries= null;
}
tmpEntries.add(new RobotsEntry(prefix, allow));
}
    /**
     * Removes all rules; used when a stanza contains an empty Allow or
     * Disallow line.
     */
private void clearPrefixes() {
if (tmpEntries == null) {
tmpEntries= new ArrayList();
entries= null;
} else {
tmpEntries.clear();
}
}
/**
* Change when the ruleset goes stale.
*/
public void setExpireTime(long expireTime) {
this.expireTime = expireTime;
}
/**
     * Get the time at which the ruleset goes stale.
*/
public long getExpireTime() {
return expireTime;
}
/**
* Get Crawl-Delay, in milliseconds. This returns -1 if not set.
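     * For example, a <code>Crawl-Delay: 5</code> line in robots.txt is
     * parsed and stored here as 5000.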
*/
public long getCrawlDelay() {
return crawlDelay;
}
/**
     * Set Crawl-Delay, in milliseconds.
*/
public void setCrawlDelay(long crawlDelay) {
this.crawlDelay = crawlDelay;
}
/**
* Returns <code>false</code> if the <code>robots.txt</code> file
* prohibits us from accessing the given <code>url</code>, or
* <code>true</code> otherwise.
*/
public boolean isAllowed(URL url) {
      String path = url.getPath();
if ((path == null) || "".equals(path)) {
path= "/";
}
return isAllowed(path);
}
/**
* Returns <code>false</code> if the <code>robots.txt</code> file
* prohibits us from accessing the given <code>path</code>, or
* <code>true</code> otherwise.
*/
public boolean isAllowed(String path) {
try {
path= URLDecoder.decode(path, CHARACTER_ENCODING);
} catch (Exception e) {
        // just ignore it; we can still try to match path prefixes
}
if (entries == null) {
entries= new RobotsEntry[tmpEntries.size()];
entries= (RobotsEntry[])
tmpEntries.toArray(entries);
tmpEntries= null;
}
int pos= 0;
int end= entries.length;
while (pos < end) {
if (path.startsWith(entries[pos].prefix))
return entries[pos].allowed;
pos++;
}
return true;
}
    /**
     * Renders the rules as Allow/Disallow lines, one rule per line.
     */
    public String toString() {
      isAllowed("x"); // force conversion to the RobotsEntry[] representation
StringBuffer buf= new StringBuffer();
for (int i= 0; i < entries.length; i++)
if (entries[i].allowed)
buf.append("Allow: " + entries[i].prefix
+ System.getProperty("line.separator"));
else
buf.append("Disallow: " + entries[i].prefix
+ System.getProperty("line.separator"));
return buf.toString();
}
}
RobotRulesParser() { }
public RobotRulesParser(Configuration conf) {
setConf(conf);
}
/* ---------------------------------- *
* <implementation:Configurable> *
* ---------------------------------- */
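  /**
   * Reads the agent names from the supplied configuration. A sketch of
   * the properties consulted (the values shown are hypothetical):
   * <pre>
   *   http.agent.name        mybot
   *   http.robots.agents     mybot,othercrawler
   *   http.robots.403.allow  false
   * </pre>
   */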
public void setConf(Configuration conf) {
this.conf = conf;
allowForbidden = conf.getBoolean("http.robots.403.allow", false);
//
// Grab the agent names we advertise to robots files.
//
String agentName = conf.get("http.agent.name");
if (null == agentName) {
throw new RuntimeException("Agent name not configured!");
}
String agentNames = conf.get("http.robots.agents","");
StringTokenizer tok = new StringTokenizer(agentNames, ",");
ArrayList agents = new ArrayList();
while (tok.hasMoreTokens()) {
agents.add(tok.nextToken().trim());
}
//
// If there are no agents for robots-parsing, use our
// default agent-string. If both are present, our agent-string
// should be the first one we advertise to robots-parsing.
//
if (agents.size() == 0) {
agents.add(agentName);
if (LOG.isFatalEnabled()) {
LOG.fatal("No agents listed in 'http.robots.agents' property!");
}
} else if (!((String)agents.get(0)).equalsIgnoreCase(agentName)) {
agents.add(0, agentName);
if (LOG.isFatalEnabled()) {
LOG.fatal("Agent we advertise (" + agentName
+ ") not listed first in 'http.robots.agents' property!");
}
}
setRobotNames((String[]) agents.toArray(new String[agents.size()]));
}
public Configuration getConf() {
return conf;
}
/* ---------------------------------- *
* <implementation:Configurable> *
* ---------------------------------- */
private void setRobotNames(String[] robotNames) {
this.robotNames= new HashMap();
for (int i= 0; i < robotNames.length; i++) {
this.robotNames.put(robotNames[i].toLowerCase(), new Integer(i));
}
// always make sure "*" is included
if (!this.robotNames.containsKey("*"))
this.robotNames.put("*", new Integer(robotNames.length));
}
/**
* Creates a new <code>RobotRulesParser</code> which will use the
* supplied <code>robotNames</code> when choosing which stanza to
* follow in <code>robots.txt</code> files. Any name in the array
* may be matched. The order of the <code>robotNames</code>
   * determines the precedence: if several names are matched, only the
* rules associated with the robot name having the smallest index
* will be used.
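   * For example, with <code>robotNames = {"mybot", "crawler"}</code>, a
   * stanza matching "mybot" (index 0) takes precedence over one matching
   * "crawler" (index 1), and both take precedence over the implicit "*".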
*/
public RobotRulesParser(String[] robotNames) {
setRobotNames(robotNames);
}
/**
* Returns a {@link RobotRuleSet} object which encapsulates the
* rules parsed from the supplied <code>robotContent</code>.
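   * For example, given the (hypothetical) content
   * <pre>
   *   User-agent: mybot
   *   Disallow: /private
   *
   *   User-agent: *
   *   Disallow: /
   * </pre>
   * a parser constructed with <code>{"mybot"}</code> returns only the
   * first stanza's rules, since "mybot" outranks "*".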
*/
  public RobotRuleSet parseRules(byte[] robotContent, int offset, int length) {
if (robotContent == null)
return EMPTY_RULES;
    String content= new String(robotContent, offset, length);
StringTokenizer lineParser= new StringTokenizer(content, "\n\r");
RobotRuleSet bestRulesSoFar= null;
int bestPrecedenceSoFar= NO_PRECEDENCE;
RobotRuleSet currentRules= new RobotRuleSet();
int currentPrecedence= NO_PRECEDENCE;
boolean addRules= false; // in stanza for our robot
boolean doneAgents= false; // detect multiple agent lines
while (lineParser.hasMoreTokens()) {
String line= lineParser.nextToken();
// trim out comments and whitespace
int hashPos= line.indexOf("#");
if (hashPos >= 0)
line= line.substring(0, hashPos);
line= line.trim();
if ( (line.length() >= 11)
&& (line.substring(0, 11).equalsIgnoreCase("User-agent:")) ) {
if (doneAgents) {
if (currentPrecedence < bestPrecedenceSoFar) {
bestPrecedenceSoFar= currentPrecedence;
bestRulesSoFar= currentRules;
currentPrecedence= NO_PRECEDENCE;
currentRules= new RobotRuleSet();
}
addRules= false;
}
doneAgents= false;
String agentNames= line.substring(line.indexOf(":") + 1);
agentNames= agentNames.trim();
StringTokenizer agentTokenizer= new StringTokenizer(agentNames);
while (agentTokenizer.hasMoreTokens()) {
// for each agent listed, see if it's us:
String agentName= agentTokenizer.nextToken().toLowerCase();
Integer precedenceInt= (Integer) robotNames.get(agentName);
if (precedenceInt != null) {
int precedence= precedenceInt.intValue();
if (precedence != robotNames.size()) {
if (!agentName.equals("*")) {
currentRules.explicitMention = true;
}
}
if ( (precedence < currentPrecedence)
&& (precedence < bestPrecedenceSoFar) )
currentPrecedence= precedence;
}
}
if (currentPrecedence < bestPrecedenceSoFar)
addRules= true;
} else if ( (line.length() >= 9)
&& (line.substring(0, 9).equalsIgnoreCase("Disallow:")) ) {
doneAgents= true;
String path= line.substring(line.indexOf(":") + 1);
path= path.trim();
try {
path= URLDecoder.decode(path, CHARACTER_ENCODING);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn("error parsing robots rules- can't decode path: " + path);
}
}
if (path.length() == 0) { // "empty rule"
if (addRules)
currentRules.clearPrefixes();
} else { // rule with path
if (addRules)
currentRules.addPrefix(path, false);
}
} else if ( (line.length() >= 6)
&& (line.substring(0, 6).equalsIgnoreCase("Allow:")) ) {
doneAgents= true;
String path= line.substring(line.indexOf(":") + 1);
path= path.trim();
if (path.length() == 0) {
// "empty rule"- treat same as empty disallow
if (addRules)
currentRules.clearPrefixes();
} else { // rule with path
if (addRules)
currentRules.addPrefix(path, true);
}
} else if ( (line.length() >= 12)
&& (line.substring(0, 12).equalsIgnoreCase("Crawl-Delay:"))) {
doneAgents = true;
if (addRules) {
long crawlDelay = -1;
String delay = line.substring("Crawl-Delay:".length(), line.length()).trim();
if (delay.length() > 0) {
try {
crawlDelay = Long.parseLong(delay) * 1000; // sec to millisec
} catch (Exception e) {
LOG.debug("can not parse Crawl-Delay:" + e.toString());
}
currentRules.setCrawlDelay(crawlDelay);
}
}
}
}
if (currentPrecedence < bestPrecedenceSoFar) {
bestPrecedenceSoFar= currentPrecedence;
bestRulesSoFar= currentRules;
}
if (bestPrecedenceSoFar == NO_PRECEDENCE)
return EMPTY_RULES;
return bestRulesSoFar;
}
/**
* Returns a <code>RobotRuleSet</code> object appropriate for use
* when the <code>robots.txt</code> file is empty or missing; all
* requests are allowed.
*/
static RobotRuleSet getEmptyRules() {
return EMPTY_RULES;
}
/**
* Returns a <code>RobotRuleSet</code> object appropriate for use
* when the <code>robots.txt</code> file is not fetched due to a
* <code>403/Forbidden</code> response; all requests are
* disallowed.
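   * The empty prefix matches every path, since every string starts with
   * the empty string.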
*/
static RobotRuleSet getForbidAllRules() {
RobotRuleSet rules= new RobotRuleSet();
rules.addPrefix("", false);
return rules;
}
private final static int BUFSIZE= 2048;
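  // Example invocation of the test driver below (file names hypothetical):
  //   java org.commoncrawl.service.crawler.RobotRulesParser robots.txt urls.txt mybot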
/** command-line main for testing */
public static void main(String[] argv) {
if (argv.length < 3) {
System.out.println("Usage:");
System.out.println(" java <robots-file> <url-file> <agent-name>+");
System.out.println("");
System.out.println("The <robots-file> will be parsed as a robots.txt file,");
System.out.println("using the given <agent-name> to select rules. URLs ");
System.out.println("will be read (one per line) from <url-file>, and tested");
System.out.println("against the rules.");
System.exit(-1);
}
try {
FileInputStream robotsIn= new FileInputStream(argv[0]);
File testsInFile = new File(argv[1]);
LineNumberReader testsIn = null;
String singleTestLine = null;
if (testsInFile.isFile())
testsIn= new LineNumberReader(new FileReader(argv[1]));
else
singleTestLine = argv[1];
String[] robotNames= new String[argv.length - 2];
for (int i= 0; i < argv.length - 2; i++)
robotNames[i]= argv[i+2];
ArrayList bufs= new ArrayList();
byte[] buf= new byte[BUFSIZE];
int totBytes= 0;
int rsize= robotsIn.read(buf);
while (rsize >= 0) {
totBytes+= rsize;
if (rsize != BUFSIZE) {
byte[] tmp= new byte[rsize];
System.arraycopy(buf, 0, tmp, 0, rsize);
bufs.add(tmp);
} else {
bufs.add(buf);
buf= new byte[BUFSIZE];
}
rsize= robotsIn.read(buf);
      }
      robotsIn.close();
byte[] robotsBytes= new byte[totBytes];
int pos= 0;
for (int i= 0; i < bufs.size(); i++) {
byte[] currBuf= (byte[]) bufs.get(i);
int currBufLen= currBuf.length;
System.arraycopy(currBuf, 0, robotsBytes, pos, currBufLen);
pos+= currBufLen;
}
RobotRulesParser parser=
new RobotRulesParser(robotNames);
RobotRuleSet rules= parser.parseRules(robotsBytes,0,robotsBytes.length);
System.out.println("Rules:");
System.out.println(rules);
System.out.println();
if (testsIn != null) {
        String testPath= testsIn.readLine();
        while (testPath != null) {
          testPath= testPath.trim();
          System.out.println( (rules.isAllowed(testPath) ?
                               "allowed" : "not allowed")
                              + ":\t" + testPath);
          testPath= testsIn.readLine();
        }
        testsIn.close();
}
else {
System.out.println( (rules.isAllowed(singleTestLine) ?
"allowed" : "not allowed")
+ ":\t" + singleTestLine);
}
} catch (Exception e) {
e.printStackTrace();
}
}
}