/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.util; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.URL; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.HashSet; import java.util.Set; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.util.SimplePostTool.PageFetcher; import org.apache.solr.util.SimplePostTool.PageFetcherResult; import org.junit.Before; import org.junit.Test; /** * NOTE: do *not* use real hostnames, not even "example.com", in this test. * * Even though a MockPageFetcher is used to prevent real HTTP requests from being * executed, the use of the URL class in SimplePostTool causes attempted resolution of * the hostnames. */ public class SimplePostToolTest extends SolrTestCaseJ4 { SimplePostTool t_file, t_file_auto, t_file_rec, t_web, t_test; PageFetcher pf; @Before public void initVariousPostTools() throws Exception { String[] args = {"-"}; // Add a dummy core/collection property so that the SimplePostTool // doesn't fail fast. System.setProperty("c", "testcollection"); System.setProperty("data", "files"); t_file = SimplePostTool.parseArgsAndInit(args); System.setProperty("auto", "yes"); t_file_auto = SimplePostTool.parseArgsAndInit(args); System.setProperty("recursive", "yes"); t_file_rec = SimplePostTool.parseArgsAndInit(args); System.setProperty("data", "web"); t_web = SimplePostTool.parseArgsAndInit(args); System.setProperty("params", "param1=foo¶m2=bar"); System.setProperty("url", "http://user:password@localhost:5150/solr/update"); t_test = SimplePostTool.parseArgsAndInit(args); pf = new MockPageFetcher(); for (SimplePostTool mockable : new SimplePostTool[]{t_web, t_file_auto}) { mockable.pageFetcher = pf; mockable.mockMode = true; } } @Test public void testParseArgsAndInit() { assertEquals(false, t_file.auto); assertEquals(true, t_file_auto.auto); assertEquals(0, t_file_auto.recursive); assertEquals(999, t_file_rec.recursive); assertEquals(true, t_file.commit); assertEquals(false, t_file.optimize); assertEquals(null, t_file.out); assertEquals(1, t_web.recursive); assertEquals(10, t_web.delay); assertEquals("http://user:password@localhost:5150/solr/update?param1=foo¶m2=bar",t_test.solrUrl.toExternalForm()); } @Test public void testNormalizeUrlEnding() { assertEquals("http://[ff01::114]", SimplePostTool.normalizeUrlEnding("http://[ff01::114]/")); assertEquals("http://[ff01::114]", SimplePostTool.normalizeUrlEnding("http://[ff01::114]/#foo?bar=baz")); assertEquals("http://[ff01::114]/index.html", SimplePostTool.normalizeUrlEnding("http://[ff01::114]/index.html#hello")); } @Test public void testComputeFullUrl() throws MalformedURLException { assertEquals("http://[ff01::114]/index.html", t_web.computeFullUrl(new URL("http://[ff01::114]/"), "/index.html")); assertEquals("http://[ff01::114]/index.html", t_web.computeFullUrl(new URL("http://[ff01::114]/foo/bar/"), "/index.html")); assertEquals("http://[ff01::114]/fil.html", t_web.computeFullUrl(new URL("http://[ff01::114]/foo.htm?baz#hello"), "fil.html")); // TODO: How to know what is the base if URL path ends with "foo"?? // assertEquals("http://[ff01::114]/fil.html", t_web.computeFullUrl(new URL("http://[ff01::114]/foo?baz#hello"), "fil.html")); assertEquals(null, t_web.computeFullUrl(new URL("http://[ff01::114]/"), "fil.jpg")); assertEquals(null, t_web.computeFullUrl(new URL("http://[ff01::114]/"), "mailto:hello@foo.bar")); assertEquals(null, t_web.computeFullUrl(new URL("http://[ff01::114]/"), "ftp://server/file")); } @Test public void testTypeSupported() { assertTrue(t_web.typeSupported("application/pdf")); assertTrue(t_web.typeSupported("application/xml")); assertFalse(t_web.typeSupported("text/foo")); t_web.fileTypes = "doc,xls,ppt"; t_web.fileFilter = t_web.getFileFilterFromFileTypes(t_web.fileTypes); assertFalse(t_web.typeSupported("application/pdf")); assertTrue(t_web.typeSupported("application/msword")); } @Test public void testIsOn() { assertTrue(SimplePostTool.isOn("true")); assertTrue(SimplePostTool.isOn("1")); assertFalse(SimplePostTool.isOn("off")); } @Test public void testAppendParam() { assertEquals("http://[ff01::114]?foo=bar", SimplePostTool.appendParam("http://[ff01::114]", "foo=bar")); assertEquals("http://[ff01::114]/?a=b&foo=bar", SimplePostTool.appendParam("http://[ff01::114]/?a=b", "foo=bar")); } @Test public void testAppendUrlPath() throws MalformedURLException { assertEquals(new URL("http://[ff01::114]/a?foo=bar"), SimplePostTool.appendUrlPath(new URL("http://[ff01::114]?foo=bar"), "/a")); } @Test public void testGuessType() { File f = new File("foo.doc"); assertEquals("application/msword", SimplePostTool.guessType(f)); f = new File("foobar"); assertEquals("application/octet-stream", SimplePostTool.guessType(f)); f = new File("foo.jsonl"); assertEquals("application/json", SimplePostTool.guessType(f)); } @Test public void testDoFilesMode() { t_file_auto.recursive = 0; File dir = getFile("exampledocs"); int num = t_file_auto.postFiles(new File[] {dir}, 0, null, null); assertEquals(2, num); } @Test public void testDoWebMode() { // Uses mock pageFetcher t_web.delay = 0; t_web.recursive = 5; int num = t_web.postWebPages(new String[] {"http://[ff01::114]/#removeme"}, 0, null); assertEquals(5, num); t_web.recursive = 1; num = t_web.postWebPages(new String[] {"http://[ff01::114]/"}, 0, null); assertEquals(3, num); // Without respecting robots.txt t_web.pageFetcher.robotsCache.clear(); t_web.recursive = 5; num = t_web.postWebPages(new String[] {"http://[ff01::114]/#removeme"}, 0, null); assertEquals(6, num); } @Test public void testRobotsExclusion() throws MalformedURLException { assertFalse(t_web.pageFetcher.isDisallowedByRobots(new URL("http://[ff01::114]/"))); assertTrue(t_web.pageFetcher.isDisallowedByRobots(new URL("http://[ff01::114]/disallowed"))); assertTrue("There should be two entries parsed from robots.txt", t_web.pageFetcher.robotsCache.get("[ff01::114]").size() == 2); } static class MockPageFetcher extends PageFetcher { HashMap<String,String> htmlMap = new HashMap<>(); HashMap<String,Set<URL>> linkMap = new HashMap<>(); public MockPageFetcher() throws IOException { (new SimplePostTool()).super(); htmlMap.put("http://[ff01::114]", "<html><body><a href=\"http://[ff01::114]/page1\">page1</a><a href=\"http://[ff01::114]/page2\">page2</a></body></html>"); htmlMap.put("http://[ff01::114]/index.html", "<html><body><a href=\"http://[ff01::114]/page1\">page1</a><a href=\"http://[ff01::114]/page2\">page2</a></body></html>"); htmlMap.put("http://[ff01::114]/page1", "<html><body><a href=\"http://[ff01::114]/page1/foo\"></body></html>"); htmlMap.put("http://[ff01::114]/page1/foo", "<html><body><a href=\"http://[ff01::114]/page1/foo/bar\"></body></html>"); htmlMap.put("http://[ff01::114]/page1/foo/bar", "<html><body><a href=\"http://[ff01::114]/page1\"></body></html>"); htmlMap.put("http://[ff01::114]/page2", "<html><body><a href=\"http://[ff01::114]/\"><a href=\"http://[ff01::114]/disallowed\"/></body></html>"); htmlMap.put("http://[ff01::114]/disallowed", "<html><body><a href=\"http://[ff01::114]/\"></body></html>"); Set<URL> s = new HashSet<>(); s.add(new URL("http://[ff01::114]/page1")); s.add(new URL("http://[ff01::114]/page2")); linkMap.put("http://[ff01::114]", s); linkMap.put("http://[ff01::114]/index.html", s); s = new HashSet<>(); s.add(new URL("http://[ff01::114]/page1/foo")); linkMap.put("http://[ff01::114]/page1", s); s = new HashSet<>(); s.add(new URL("http://[ff01::114]/page1/foo/bar")); linkMap.put("http://[ff01::114]/page1/foo", s); s = new HashSet<>(); s.add(new URL("http://[ff01::114]/disallowed")); linkMap.put("http://[ff01::114]/page2", s); // Simulate a robots.txt file with comments and a few disallows StringBuilder sb = new StringBuilder(); sb.append("# Comments appear after the \"#\" symbol at the start of a line, or after a directive\n"); sb.append("User-agent: * # match all bots\n"); sb.append("Disallow: # This is void\n"); sb.append("Disallow: /disallow # Disallow this path\n"); sb.append("Disallow: /nonexistingpath # Disallow this path\n"); this.robotsCache.put("[ff01::114]", super. parseRobotsTxt(new ByteArrayInputStream(sb.toString().getBytes(StandardCharsets.UTF_8)))); } @Override public PageFetcherResult readPageFromUrl(URL u) { PageFetcherResult res = new PageFetcherResult(); if (isDisallowedByRobots(u)) { res.httpStatus = 403; return res; } res.httpStatus = 200; res.contentType = "text/html"; res.content = ByteBuffer.wrap( htmlMap.get(u.toString()).getBytes(StandardCharsets.UTF_8)); return res; } @Override public Set<URL> getLinksFromWebPage(URL u, InputStream is, String type, URL postUrl) { Set<URL> s = linkMap.get(SimplePostTool.normalizeUrlEnding(u.toString())); if(s == null) s = new HashSet<>(); return s; } } }