package org.apache.solr.util; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; import java.util.HashMap; import java.util.HashSet; import java.util.Set; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.util.SimplePostTool.PageFetcher; import org.apache.solr.util.SimplePostTool.PageFetcherResult; import org.junit.Before; import org.junit.Test; public class SimplePostToolTest extends SolrTestCaseJ4 { SimplePostTool t_file, t_file_auto, t_file_rec, t_web, t_test; PageFetcher pf; @Before public void setUp() throws Exception { super.setUp(); String[] args = {"-"}; System.setProperty("data", "files"); t_file = SimplePostTool.parseArgsAndInit(args); System.setProperty("auto", "yes"); t_file_auto = SimplePostTool.parseArgsAndInit(args); System.setProperty("recursive", "yes"); t_file_rec = SimplePostTool.parseArgsAndInit(args); System.setProperty("data", "web"); t_web = SimplePostTool.parseArgsAndInit(args); System.setProperty("params", "param1=foo¶m2=bar"); t_test = SimplePostTool.parseArgsAndInit(args); pf = new MockPageFetcher(); SimplePostTool.pageFetcher = pf; SimplePostTool.mockMode = true; } @Test public void testParseArgsAndInit() { assertEquals(false, t_file.auto); assertEquals(true, t_file_auto.auto); assertEquals(0, t_file_auto.recursive); assertEquals(999, t_file_rec.recursive); assertEquals(true, t_file.commit); assertEquals(false, t_file.optimize); assertEquals(null, t_file.out); assertEquals(1, t_web.recursive); assertEquals(10, t_web.delay); assertNotNull(t_test.solrUrl); } @Test public void testNormalizeUrlEnding() { assertEquals("http://example.com", SimplePostTool.normalizeUrlEnding("http://example.com/")); assertEquals("http://example.com", SimplePostTool.normalizeUrlEnding("http://example.com/#foo?bar=baz")); assertEquals("http://example.com/index.html", SimplePostTool.normalizeUrlEnding("http://example.com/index.html#hello")); } @Test public void testComputeFullUrl() throws MalformedURLException { assertEquals("http://example.com/index.html", t_web.computeFullUrl(new URL("http://example.com/"), "/index.html")); assertEquals("http://example.com/index.html", t_web.computeFullUrl(new URL("http://example.com/foo/bar/"), "/index.html")); assertEquals("http://example.com/fil.html", t_web.computeFullUrl(new URL("http://example.com/foo.htm?baz#hello"), "fil.html")); // TODO: How to know what is the base if URL path ends with "foo"?? // assertEquals("http://example.com/fil.html", t_web.computeFullUrl(new URL("http://example.com/foo?baz#hello"), "fil.html")); assertEquals(null, t_web.computeFullUrl(new URL("http://example.com/"), "fil.jpg")); assertEquals(null, t_web.computeFullUrl(new URL("http://example.com/"), "mailto:hello@foo.bar")); assertEquals(null, t_web.computeFullUrl(new URL("http://example.com/"), "ftp://server/file")); } @Test public void testTypeSupported() { assertTrue(t_web.typeSupported("application/pdf")); assertTrue(t_web.typeSupported("text/xml")); assertFalse(t_web.typeSupported("text/foo")); t_web.fileTypes = "doc,xls,ppt"; t_web.globFileFilter = t_web.getFileFilterFromFileTypes(t_web.fileTypes); assertFalse(t_web.typeSupported("application/pdf")); assertTrue(t_web.typeSupported("application/msword")); } @Test public void testIsOn() { assertTrue(SimplePostTool.isOn("true")); assertTrue(SimplePostTool.isOn("1")); assertFalse(SimplePostTool.isOn("off")); } @Test public void testAppendParam() { assertEquals("http://example.com?foo=bar", SimplePostTool.appendParam("http://example.com", "foo=bar")); assertEquals("http://example.com/?a=b&foo=bar", SimplePostTool.appendParam("http://example.com/?a=b", "foo=bar")); } @Test public void testAppendUrlPath() throws MalformedURLException { assertEquals(new URL("http://example.com/a?foo=bar"), SimplePostTool.appendUrlPath(new URL("http://example.com?foo=bar"), "/a")); } @Test public void testGuessType() { File f = new File("foo.doc"); assertEquals("application/msword", SimplePostTool.guessType(f)); f = new File("foobar"); assertEquals(null, SimplePostTool.guessType(f)); } @Test public void testDoFilesMode() { t_file_auto.recursive = 0; File dir = getFile("exampledocs"); int num = t_file_auto.postFiles(new File[] {dir}, 0, null, null); assertEquals(2, num); } @Test public void testDoWebMode() { // Uses mock pageFetcher t_web.delay = 0; t_web.recursive = 5; int num = t_web.postWebPages(new String[] {"http://example.com/#removeme"}, 0, null); assertEquals(5, num); t_web.recursive = 1; num = t_web.postWebPages(new String[] {"http://example.com/"}, 0, null); assertEquals(3, num); // Without respecting robots.txt SimplePostTool.pageFetcher.robotsCache.clear(); t_web.recursive = 5; num = t_web.postWebPages(new String[] {"http://example.com/#removeme"}, 0, null); assertEquals(6, num); } @Test public void testRobotsExclusion() throws MalformedURLException { assertFalse(SimplePostTool.pageFetcher.isDisallowedByRobots(new URL("http://example.com/"))); assertTrue(SimplePostTool.pageFetcher.isDisallowedByRobots(new URL("http://example.com/disallowed"))); assertTrue("There should be two entries parsed from robots.txt", SimplePostTool.pageFetcher.robotsCache.get("example.com").size() == 2); } class MockPageFetcher extends PageFetcher { HashMap<String,String> htmlMap = new HashMap<String,String>(); HashMap<String,Set<URL>> linkMap = new HashMap<String,Set<URL>>(); public MockPageFetcher() throws IOException { (new SimplePostTool()).super(); htmlMap.put("http://example.com", "<html><body><a href=\"http://example.com/page1\">page1</a><a href=\"http://example.com/page2\">page2</a></body></html>"); htmlMap.put("http://example.com/index.html", "<html><body><a href=\"http://example.com/page1\">page1</a><a href=\"http://example.com/page2\">page2</a></body></html>"); htmlMap.put("http://example.com/page1", "<html><body><a href=\"http://example.com/page1/foo\"></body></html>"); htmlMap.put("http://example.com/page1/foo", "<html><body><a href=\"http://example.com/page1/foo/bar\"></body></html>"); htmlMap.put("http://example.com/page1/foo/bar", "<html><body><a href=\"http://example.com/page1\"></body></html>"); htmlMap.put("http://example.com/page2", "<html><body><a href=\"http://example.com/\"><a href=\"http://example.com/disallowed\"/></body></html>"); htmlMap.put("http://example.com/disallowed", "<html><body><a href=\"http://example.com/\"></body></html>"); Set<URL> s = new HashSet<URL>(); s.add(new URL("http://example.com/page1")); s.add(new URL("http://example.com/page2")); linkMap.put("http://example.com", s); linkMap.put("http://example.com/index.html", s); s = new HashSet<URL>(); s.add(new URL("http://example.com/page1/foo")); linkMap.put("http://example.com/page1", s); s = new HashSet<URL>(); s.add(new URL("http://example.com/page1/foo/bar")); linkMap.put("http://example.com/page1/foo", s); s = new HashSet<URL>(); s.add(new URL("http://example.com/disallowed")); linkMap.put("http://example.com/page2", s); // Simulate a robots.txt file with comments and a few disallows StringBuilder sb = new StringBuilder(); sb.append("# Comments appear after the \"#\" symbol at the start of a line, or after a directive\n"); sb.append("User-agent: * # match all bots\n"); sb.append("Disallow: # This is void\n"); sb.append("Disallow: /disallow # Disallow this path\n"); sb.append("Disallow: /nonexistingpath # Disallow this path\n"); this.robotsCache.put("example.com", SimplePostTool.pageFetcher. parseRobotsTxt(new ByteArrayInputStream(sb.toString().getBytes("UTF-8")))); } @Override public PageFetcherResult readPageFromUrl(URL u) { PageFetcherResult res = (new SimplePostTool()).new PageFetcherResult(); if (isDisallowedByRobots(u)) { res.httpStatus = 403; return res; } res.httpStatus = 200; res.contentType = "text/html"; try { res.content = htmlMap.get(u.toString()).getBytes("UTF-8"); } catch (UnsupportedEncodingException e) { throw new RuntimeException(); } return res; } @Override public Set<URL> getLinksFromWebPage(URL u, InputStream is, String type, URL postUrl) { Set<URL> s = linkMap.get(SimplePostTool.normalizeUrlEnding(u.toString())); if(s == null) s = new HashSet<URL>(); return s; } } }