/*
 * This file is part of the Wayback archival access software
 * (http://archive-access.sourceforge.net/projects/wayback/).
 *
 * Licensed to the Internet Archive (IA) by one or more individual
 * contributors.
 *
 * The IA licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.archive.wayback.accesscontrol.robotstxt;

import java.io.ByteArrayInputStream;
import java.io.IOException;

import junit.framework.TestCase;

/**
 * Tests for {@link RobotRules}.
 *
 * <p>References:</p>
 * <ul>
 * <li>https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
 * - referred to as the "Google/Yahoo/Bing/Ask (GYBA) convention".</li>
 * <li>http://www.robotstxt.org/orig.html</li>
 * <li>http://www.robotstxt.org/norobots-rfc.txt - referred to as "RFC".</li>
 * </ul>
 *
 * <p>Note: the GYBA convention document and the RFC differ in terminology.
 * What GYBA calls a "group" is called a "record" in the RFC.</p>
 *
 * <p>Several assertions are kept as comments on purpose: they describe
 * behavior that is expected by the references above but is disabled until
 * a decision is made to fix it.</p>
 *
 * @author brad
 */
public class RobotRulesTest extends TestCase {

    /** User-agent name that every assertion in this class queries with. */
    public static final String WB_UA = "ia_archiver";

    /** Rules object under test; replaced with a new instance by {@link #setUp()}. */
    RobotRules rr;

    /**
     * Creates a new, empty {@link RobotRules} for the test about to run.
     */
    public void setUp() {
        rr = new RobotRules();
    }

    /**
     * Feeds robots.txt text to {@link #rr} and checks that the parser
     * reported no syntax errors.
     *
     * @param txt robots.txt content; it is converted to UTF-8 bytes before parsing
     * @return the same {@link RobotRules} instance held in {@link #rr}
     * @throws IOException if encoding or parsing the text fails
     */
    protected RobotRules load(String txt) throws IOException {
        final byte[] content = txt.getBytes("UTF-8");
        rr.parse(new ByteArrayInputStream(content));
        assertFalse(rr.hasSyntaxErrors());
        return rr;
    }

    /**
     * Asserts that the currently loaded rules block {@code path} for {@link #WB_UA}.
     */
    private void assertBlocked(String path) {
        assertTrue(rr.blocksPathForUA(path, WB_UA));
    }

    /**
     * Asserts that the currently loaded rules do not block {@code path} for {@link #WB_UA}.
     */
    private void assertAllowed(String path) {
        assertFalse(rr.blocksPathForUA(path, WB_UA));
    }

    /**
     * Basic test: directive names are matched regardless of letter case.
     */
    public void testDirectivesAreCaseInsensitive() throws Exception {
        final String robotsTxt =
            "user-Agent: *\n" +
            "disAllow: /\n";
        load(robotsTxt);

        assertBlocked("/");
    }

    /**
     * A {@code Disallow:} with an empty path has no effect.
     */
    public void testEmptyDisallowHasNoEffect() throws Exception {
        final String robotsTxt =
            "User-agent: *\n" +
            "Disallow:\n" +
            "Disallow: /wp-admin\n";
        load(robotsTxt);

        assertAllowed("/");
        // A path-less Disallow: should simply be ignored. It must not
        // disable the other, non-empty Disallow lines. ARI-4212.
        assertBlocked("/wp-admin/index.php");
    }

    /**
     * White space is optional before and after "{@code :}" and before the
     * end of the line.
     *
     * @throws Exception if loading the robots.txt text fails
     */
    public void testLessSpaceExtraSpace() throws Exception {
        final String robotsTxt =
            "User-agent :* \n" +
            "Disallow:/ \n";
        load(robotsTxt);

        assertBlocked("/index.html");
    }

    /**
     * White space is also allowed at the beginning of a line.
     *
     * @throws Exception if loading the robots.txt text fails
     */
    public void testExtraSpace2() throws Exception {
        final String robotsTxt =
            " User-agent: *\n" +
            " Disallow:/\n";
        load(robotsTxt);

        assertBlocked("/index.html");
    }

    /**
     * Comments, both on a line of their own and trailing a directive, are
     * not treated as part of the rules.
     */
    public void testComments() throws Exception {
        final String robotsTxt =
            "# User-agent: *\n" +
            "User-agent: google-news\n" +
            "Disallow: /post # block post CGI\n" +
            "\n" +
            "User-agent: ia_archiver # Wayback\n" +
            "Disallow: /dontshow/\n";
        load(robotsTxt);

        assertAllowed("/post");
        assertBlocked("/dontshow/secret.html");
    }

    /**
     * LF, CRLF and CR are recognized as end-of-line.
     * <p>This test itself uses bare CR terminators only; the other tests in
     * this class use LF.</p>
     */
    public void testEOLs() throws Exception {
        final String robotsTxt =
            "User-agent: *\r" +
            "Disallow: /\r";
        load(robotsTxt);

        assertBlocked("/");
    }

    // ==== group parsing and user-agent matching tests ====

    /**
     * User-agent names are compared case-insensitively.
     */
    public void testUserAgentIsCaseInsensitive() throws Exception {
        final String robotsTxt =
            "User-agent: IA_Archiver\n" +
            "Disallow: /\n";
        load(robotsTxt);

        assertBlocked("/");
    }

    /**
     * Multiple records for different User-agents.
     * <p>While the RFC states "the format logically consists of a non-empty set
     * or records, separated by blank lines", Google's documentation makes no
     * mention of blank lines as a group separator - instead, it recognizes a
     * sequence of User-agent: lines as the start of a "group". So this sample
     * is a syntax error per the RFC, but okay according to the
     * Google/Yahoo/Bing/Ask convention.</p>
     *
     * @throws Exception if loading the robots.txt text fails
     */
    public void testNonBlocksPathForUA() throws Exception {
        final String robotsTxt =
            "User-agent: *\n" +
            "Allow: /\n" +
            "User-agent: Google-bot\n" +
            "Disallow: /\n";
        load(robotsTxt);

        assertAllowed("/");
    }

    /**
     * A sitemap (non-allow/disallow) directive follows User-agent; the next
     * User-agent should start a new group.
     * ia_archiver is ALLOWED because the disallow rule applies to B only.
     */
    public void testMultiUAWithOtherLinesLine() throws Exception {
        final String robotsTxt =
            "User-agent: *\n" +
            "Sitemap: X\n" +
            "\n" +
            "User-agent: B\n" +
            "Disallow: /\n";
        load(robotsTxt);

        // ALLOWED
        assertAllowed("/");
    }

    /**
     * Same as the previous test case, but with a {@code Crawl-delay:} line,
     * which shall end the group as well.
     */
    public void testMultiUAWithOtherLinesLine2() throws Exception {
        final String robotsTxt =
            "User-agent: *\n" +
            "Crawl-delay: 30\n" +
            "\n" +
            "User-agent: B\n" +
            "Disallow: /\n";
        load(robotsTxt);

        // ALLOWED
        assertAllowed("/");
    }

    /**
     * Blank lines inside a group: a syntax error per the RFC, but fine under
     * the Google/Yahoo/Bing/Ask convention. Blank lines are permitted within
     * a group to improve readability; they are simply ignored and do not end
     * the group.
     */
    public void testBlankLineInGroup() throws Exception {
        final String robotsTxt =
            "User-agent: *\n" +
            "Disallow: /media/\n" +
            "\n" +
            "Disallow: /cgi-bin/\n" +
            "# images\n" +
            "Disallow: /img/\n" +
            "Disallow: /icon/\n" +
            "\n" +
            "\n" +
            "Disallow: /actions/\n" +
            "\n" +
            "# sensitive stuff\n" +
            "Disallow: /etc\n";
        load(robotsTxt);

        assertBlocked("/cgi-bin/");
        assertBlocked("/img/");
        assertBlocked("/icon/");
        assertBlocked("/etc/");
    }

    /**
     * Multiple User-agent: lines for a single record.
     * ia_archiver is BLOCKED because the disallow rule applies to both
     * "all" and B.
     *
     * @throws Exception if loading the robots.txt text fails
     */
    public void testMultiUA() throws Exception {
        final String robotsTxt =
            "User-agent: *\n" +
            "User-agent: B\n" +
            "Disallow: /\n";
        load(robotsTxt);

        // BLOCKED
        assertBlocked("/");
    }

    // ==== path matching tests ===

    /**
     * Path matching basics: substring-based, <code>/</code> is not special,
     * and matching is case-sensitive.
     */
    public void testSubpath() throws Exception {
        final String robotsTxt =
            "User-agent: *\n" +
            "Disallow: /media\n" +
            "Disallow: /hidden/\n";
        load(robotsTxt);

        assertAllowed("/");
        assertBlocked("/media/theme.mp3");
        assertBlocked("/media?order=2");
        assertAllowed("/images/logo.png");
        assertAllowed("/hidden");
        assertBlocked("/hidden/index.html");
        // path match is case-sensitive
        assertAllowed("/Hidden/notreally.mp3");
    }

    /**
     * Characters may be %-escaped, but %2f (<code>/</code>) is special.
     * (TODO: additional tests: robots.txt is assumed to be UTF-8 encoded.
     * Non-7bit-ASCII characters are allowed, and can also be %-escaped.)
     *
     * @throws Exception if loading the robots.txt text fails
     */
    public void testPercentEncodedPath() throws Exception {
        final String robotsTxt =
            "User-agent: *\n" +
            "Disallow: /%7Ethomas/\n" +
            "Disallow: /a%2fb.html\n" +
            "Disallow: /n/m.png\n";
        load(robotsTxt);

        // (some assertions are disabled until we decide to fix it)

        // Not sure about these two. If RobotRules expects a canonicalized
        // URL, only one of them should be tested.
        // assertTrue(rr.blocksPathForUA("/~thomas/welcome.html", WB_UA));
        // assertTrue(rr.blocksPathForUA("/%7ethomas/", WB_UA));

        // RFC states %2f does not match "/"
        assertBlocked("/a%2fb.html");
        assertAllowed("/a/b.html");
        // assertTrue(rr.blocksPathForUA("/n%2fm.png", WB_UA));
    }

    /**
     * <p>By GYBA convention, if multiple disallow and allow directives match
     * the URL, the most specific rule - based on the length of the path -
     * wins over less specific (shorter) ones.</p>
     * <p>The RFC says differently: "a robot must attempt to match
     * the paths in Allow and Disallow lines against the URL, in the
     * order they occur in the record. The first match found is used."</p>
     * <p>We follow the GYBA convention here.</p>
     *
     * @throws Exception if loading the robots.txt text fails
     */
    public void testMostSpecificPathPrevails() throws Exception {
        final String robotsTxt =
            "User-agent: *\n" +
            "Disallow: /\n" +
            "Allow: /media\n" +
            "Disallow: /media/private\n";
        load(robotsTxt);

        // (some assertions are disabled until we decide to fix it)
        assertBlocked("/index.html");
        // assertFalse(rr.blocksPathForUA("/media/toc.html", WB_UA));
        // assertFalse(rr.blocksPathForUA("/media/priv", WB_UA));
        assertBlocked("/media/private.html");
        assertBlocked("/media/private/voice.mp3");
    }

    /**
     * A wildcard in the path matches any characters, including <code>/</code>.
     * <code>/*</code> is the same as <code>/</code>.
     *
     * @throws Exception if loading the robots.txt text fails
     */
    public void testWildcardMatch() throws Exception {
        final String robotsTxt =
            "User-agent: *\n" +
            "Disallow: /media/*\n" +
            "Disallow: /cgi/*.php\n";
        load(robotsTxt);

        // (some assertions are disabled until we decide to fix it)
        assertAllowed("/media");
        // assertTrue(rr.blocksPathForUA("/media/", WB_UA));
        // assertTrue(rr.blocksPathForUA("/media/theme.mp3", WB_UA));

        // Perhaps we don't implement non-trailing wildcards. Comment out
        // the following three if we don't.
        // assertTrue(rr.blocksPathForUA("/cgi/messy.php", WB_UA));
        // assertTrue(rr.blocksPathForUA("/cgi/really.phpt", WB_UA));
        assertAllowed("/cgi/noexec.txt");
    }

    /**
     * Google/Bing/Yahoo/Ask extension: <code>$</code> matches the end of the path.
     */
    public void testEndOfPath() throws Exception {
        final String robotsTxt =
            "User-agent: *\n" +
            "Disallow: /exactly$\n";
        load(robotsTxt);

        // (some assertions are disabled until we decide to fix it)

        // Perhaps we don't support the end-of-path marker. Comment out these
        // three asserts if that's the case.
        // assertTrue(rr.blocksPathForUA("/exactly", WB_UA));
        assertAllowed("/exactly/");
        assertAllowed("/exactly/it.html");
    }
}