/* Robots.java * * Created Sep 1, 2005 * * Copyright (C) 2005 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.limewire.http.httpclient.robots; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; import org.limewire.io.InvalidDataException; import org.limewire.logging.Log; import org.limewire.logging.LogFactory; import org.limewire.util.StringUtils; /** * Utility class for parsing and representing 'robots.txt' format * directives, into a list of named user-agents and map from user-agents * to RobotsDirectives. * <p> * Based on Robotstxt from Heritrix the web crawler from * http://crawler.archive.org/ */ public class RobotsTxt { private static final Log LOG = LogFactory.getLog(RobotsTxt.class); // all user agents contained in this robots.txt // may be thinned of irrelevant entries private final List<String> userAgents = new ArrayList<String>(5); // map user-agents to directives private final Map<String,RobotsDirectives> agentsToDirectives = new HashMap<String,RobotsDirectives>(); private final static RobotsDirectives NO_DIRECTIVES = new RobotsDirectives(); public RobotsTxt(String robotsTxt) throws InvalidDataException { // current is the disallowed paths for the preceding User-Agent(s) RobotsDirectives current = null; // whether a non-'User-Agent' directive has been encountered boolean hasDirectivesYet = false; String catchall = null; for (String line : robotsTxt.split("\n")) { if (StringUtils.isEmpty(line)) { continue; } if (line.trim().startsWith("#")) { LOG.debugf("skipping comment line {0}", line); continue; } // remove any html markup line = line.replaceAll("<[^>]+>",""); int commentIndex = line.indexOf("#"); if (commentIndex > -1) { // Strip trailing comment line = line.substring(0, commentIndex); } line = line.trim(); if (line.matches("(?i)^User-agent:.*")) { String ua = line.substring(11).trim().toLowerCase(Locale.US); if (current == null || hasDirectivesYet ) { // only create new rules-list if necessary // otherwise share with previous user-agent current = new RobotsDirectives(); hasDirectivesYet = false; } if (ua.equals("*")) { ua = ""; catchall = ua; } else { userAgents.add(ua); } agentsToDirectives.put(ua, current); } else if (line.matches("(?i)Disallow:.*")) { if (current == null) { throw new InvalidDataException(); } String path = line.substring(9).trim(); current.addDisallow(path); hasDirectivesYet = true; } else if (line.matches("(?i)Crawl-delay:.*")) { if (current == null) { throw new InvalidDataException(); } // consider a crawl-delay, even though we don't // yet understand it, as sufficient to end a // grouping of User-Agent lines hasDirectivesYet = true; String val = line.substring(12).trim(); val = val.split("[^\\d\\.]+")[0]; try { current.setCrawlDelay(Float.parseFloat(val)); } catch (NumberFormatException nfe) { // ignore } } else if (line.matches("(?i)Allow:.*")) { if (current == null) { throw new InvalidDataException(); } String path = line.substring(6).trim(); current.addAllow(path); hasDirectivesYet = true; } else { LOG.debugf("unknown line {0}", line); } } if (catchall != null) { userAgents.add(catchall); } } public List<String> getUserAgents() { return userAgents; } public RobotsDirectives getDirectivesFor(String ua) { // find matching ua for(String uaListed : userAgents) { if (ua.contains(uaListed)) { return agentsToDirectives.get(uaListed); } } // no applicable user-agents, so empty directives return NO_DIRECTIVES; } }