/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.protocol.http.api;

import org.apache.nutch.protocol.http.api.RobotRulesParser.RobotRuleSet;

import junit.framework.TestCase;

/**
 * JUnit test for {@link RobotRulesParser}: verifies that robots.txt rules
 * are applied correctly for single and multiple agent names, and that the
 * Crawl-delay directive is honored.
 */
public class TestRobotRulesParser extends TestCase {

  private static final String LF = "\n";
  private static final String CR = "\r";
  private static final String CRLF = "\r\n";

  // Expected results when no robots.txt is available (EMPTY_RULES):
  // every path is allowed.
  private static final boolean[] ACCEPT_ALL = {
    true,   // "/a"
    true,   // "/a/"
    true,   // "/a/bloh/foo.html"
    true,   // "/b"
    true,   // "/b/a"
    true,   // "/b/a/index.html"
    true,   // "/b/b/foo.html"
    true,   // "/c"
    true,   // "/c/a"
    true,   // "/c/a/index.html"
    true,   // "/c/b/foo.html"
    true,   // "/d"
    true,   // "/d/a"
    true,   // "/e/a/index.html"
    true,   // "/e/d"
    true,   // "/e/d/foo.html"
    true,   // "/e/doh.html"
    true,   // "/f/index.html"
    true,   // "/foo/bar/baz.html"
    true,   // "/f/"
  };

  private static final String[] ROBOTS_STRINGS = new String[] {
    "User-Agent: Agent1 #foo" + CR
    + "Disallow: /a" + CR
    + "Disallow: /b/a" + CR
    + "#Disallow: /c" + CR
    + "" + CR
    + "" + CR
    + "User-Agent: Agent2 Agent3#foo" + CR
    + "User-Agent: Agent4" + CR
    + "Disallow: /d" + CR
    + "Disallow: /e/d/" + CR
    + "" + CR
    + "User-Agent: *" + CR
    + "Disallow: /foo/bar/" + CR,
    null   // Used to test EMPTY_RULES
  };

  private static final String[] AGENT_STRINGS = new String[] {
    "Agent1",
    "Agent2",
    "Agent3",
    "Agent4",
    "Agent5",
  };

  // For each robots.txt string and each agent: true if the agent does not
  // appear in that robots.txt (only Agent5 is absent).
  private static final boolean[][] NOT_IN_ROBOTS_STRING = new boolean[][] {
    {
      false,
      false,
      false,
      false,
      true,
    },
    {
      false,
      false,
      false,
      false,
      true,
    }
  };

  private static final String[] TEST_PATHS = new String[] {
    "/a",
    "/a/",
    "/a/bloh/foo.html",
    "/b",
    "/b/a",
    "/b/a/index.html",
    "/b/b/foo.html",
    "/c",
    "/c/a",
    "/c/a/index.html",
    "/c/b/foo.html",
    "/d",
    "/d/a",
    "/e/a/index.html",
    "/e/d",
    "/e/d/foo.html",
    "/e/doh.html",
    "/f/index.html",
    "/foo/bar/baz.html",
    "/f/",
  };

  // Expected isAllowed() results, indexed by
  // [robots string][agent][test path].
  private static final boolean[][][] ALLOWED = new boolean[][][] {
    { // ROBOTS_STRINGS[0]
      { // Agent1
        false,  // "/a"
        false,  // "/a/"
        false,  // "/a/bloh/foo.html"
        true,   // "/b"
        false,  // "/b/a"
        false,  // "/b/a/index.html"
        true,   // "/b/b/foo.html"
        true,   // "/c"
        true,   // "/c/a"
        true,   // "/c/a/index.html"
        true,   // "/c/b/foo.html"
        true,   // "/d"
        true,   // "/d/a"
        true,   // "/e/a/index.html"
        true,   // "/e/d"
        true,   // "/e/d/foo.html"
        true,   // "/e/doh.html"
        true,   // "/f/index.html"
        true,   // "/foo/bar/baz.html"
        true,   // "/f/"
      },
      { // Agent2
        true,   // "/a"
        true,   // "/a/"
        true,   // "/a/bloh/foo.html"
        true,   // "/b"
        true,   // "/b/a"
        true,   // "/b/a/index.html"
        true,   // "/b/b/foo.html"
        true,   // "/c"
        true,   // "/c/a"
        true,   // "/c/a/index.html"
        true,   // "/c/b/foo.html"
        false,  // "/d"
        false,  // "/d/a"
        true,   // "/e/a/index.html"
        true,   // "/e/d"
        false,  // "/e/d/foo.html"
        true,   // "/e/doh.html"
        true,   // "/f/index.html"
        true,   // "/foo/bar/baz.html"
        true,   // "/f/"
      },
      { // Agent3
        true,   // "/a"
        true,   // "/a/"
        true,   // "/a/bloh/foo.html"
        true,   // "/b"
        true,   // "/b/a"
        true,   // "/b/a/index.html"
        true,   // "/b/b/foo.html"
        true,   // "/c"
        true,   // "/c/a"
        true,   // "/c/a/index.html"
        true,   // "/c/b/foo.html"
        false,  // "/d"
        false,  // "/d/a"
        true,   // "/e/a/index.html"
        true,   // "/e/d"
        false,  // "/e/d/foo.html"
        true,   // "/e/doh.html"
        true,   // "/f/index.html"
        true,   // "/foo/bar/baz.html"
        true,   // "/f/"
      },
      { // Agent4
        true,   // "/a"
        true,   // "/a/"
        true,   // "/a/bloh/foo.html"
        true,   // "/b"
        true,   // "/b/a"
        true,   // "/b/a/index.html"
        true,   // "/b/b/foo.html"
        true,   // "/c"
        true,   // "/c/a"
        true,   // "/c/a/index.html"
        true,   // "/c/b/foo.html"
        false,  // "/d"
        false,  // "/d/a"
        true,   // "/e/a/index.html"
        true,   // "/e/d"
        false,  // "/e/d/foo.html"
        true,   // "/e/doh.html"
        true,   // "/f/index.html"
        true,   // "/foo/bar/baz.html"
        true,   // "/f/"
      },
      { // Agent5 / "*"
        true,   // "/a"
        true,   // "/a/"
        true,   // "/a/bloh/foo.html"
        true,   // "/b"
        true,   // "/b/a"
        true,   // "/b/a/index.html"
        true,   // "/b/b/foo.html"
        true,   // "/c"
        true,   // "/c/a"
        true,   // "/c/a/index.html"
        true,   // "/c/b/foo.html"
        true,   // "/d"
        true,   // "/d/a"
        true,   // "/e/a/index.html"
        true,   // "/e/d"
        true,   // "/e/d/foo.html"
        true,   // "/e/doh.html"
        true,   // "/f/index.html"
        false,  // "/foo/bar/baz.html"
        true,   // "/f/"
      }
    },
    { // ROBOTS_STRINGS[1] (null robots.txt, EMPTY_RULES)
      ACCEPT_ALL,  // Agent1
      ACCEPT_ALL,  // Agent2
      ACCEPT_ALL,  // Agent3
      ACCEPT_ALL,  // Agent4
      ACCEPT_ALL,  // Agent5
    }
  };

  public TestRobotRulesParser(String name) {
    super(name);
  }

  /** Tests each robots.txt string against each agent name individually. */
  public void testRobotsOneAgent() {
    for (int i = 0; i < ROBOTS_STRINGS.length; i++) {
      for (int j = 0; j < AGENT_STRINGS.length; j++) {
        testRobots(i, new String[] { AGENT_STRINGS[j] },
                   TEST_PATHS, ALLOWED[i][j]);
      }
    }
  }

  /**
   * Tests each robots.txt string against pairs of agent names: the rules
   * for the first agent that appears in the robots.txt should apply.
   */
  public void testRobotsTwoAgents() {
    for (int i = 0; i < ROBOTS_STRINGS.length; i++) {
      for (int j = 0; j < AGENT_STRINGS.length; j++) {
        for (int k = 0; k < AGENT_STRINGS.length; k++) {
          int key = j;
          if (NOT_IN_ROBOTS_STRING[i][j])
            key = k;
          testRobots(i, new String[] { AGENT_STRINGS[j], AGENT_STRINGS[k] },
                     TEST_PATHS, ALLOWED[i][key]);
        }
      }
    }
  }

  /**
   * Tests that a Crawl-delay directive for the configured agent is reported
   * in milliseconds, and that -1 is returned when no delay applies.
   */
  public void testCrawlDelay() {
    RobotRulesParser p = new RobotRulesParser(new String[] { "nutchbot" });
    String delayRule1 = "User-agent: nutchbot" + CR
                      + "Crawl-delay: 10" + CR
                      + "User-agent: foobot" + CR
                      + "Crawl-delay: 20" + CR
                      + "User-agent: *" + CR
                      + "Disallow:/baz" + CR;
    String delayRule2 = "User-agent: foobot" + CR
                      + "Crawl-delay: 20" + CR
                      + "User-agent: *" + CR
                      + "Disallow:/baz" + CR;
    RobotRuleSet rules = p.parseRules(delayRule1.getBytes());
    long crawlDelay = rules.getCrawlDelay();
    assertTrue("testing crawl delay for agent nutchbot - rule 1",
               (crawlDelay == 10000));

    rules = p.parseRules(delayRule2.getBytes());
    crawlDelay = rules.getCrawlDelay();
    assertTrue("testing crawl delay for agent nutchbot - rule 2",
               (crawlDelay == -1));
  }

  // helper: parses ROBOTS_STRINGS[robotsString] for the given agents and
  // checks isAllowed() for every test path against the expected results
  public void testRobots(int robotsString, String[] agents, String[] paths,
                         boolean[] allowed) {
    String agentsString = agents[0];
    for (int i = 1; i < agents.length; i++)
      agentsString = agentsString + "," + agents[i];
    RobotRulesParser p = new RobotRulesParser(agents);
    RobotRuleSet rules = p.parseRules(ROBOTS_STRINGS[robotsString] != null
                                      ? ROBOTS_STRINGS[robotsString].getBytes()
                                      : null);
    for (int i = 0; i < paths.length; i++) {
      assertTrue("testing robots file " + robotsString + ", on agents ("
                 + agentsString + "), and path " + TEST_PATHS[i] + "; got "
                 + rules.isAllowed(TEST_PATHS[i]) + ", rules are: " + LF + rules,
                 rules.isAllowed(TEST_PATHS[i]) == allowed[i]);
    }
  }

}