/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package focusedCrawler.crawler.crawlercommons.fetcher.http; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.io.IOException; import java.net.ConnectException; import java.nio.charset.Charset; import java.util.HashSet; import java.util.Set; import javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import org.apache.http.HttpHeaders; import org.apache.http.HttpStatus; import org.eclipse.jetty.server.Handler; import org.eclipse.jetty.server.Request; import org.eclipse.jetty.server.Server; import org.eclipse.jetty.server.ServerConnector; import org.eclipse.jetty.server.handler.AbstractHandler; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; import focusedCrawler.crawler.crawlercommons.fetcher.AbortedFetchException; import focusedCrawler.crawler.crawlercommons.fetcher.AbortedFetchReason; import focusedCrawler.crawler.crawlercommons.fetcher.BaseFetcher; import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult; import focusedCrawler.crawler.crawlercommons.fetcher.IOFetchException; import focusedCrawler.crawler.crawlercommons.fetcher.Payload; import focusedCrawler.crawler.crawlercommons.fetcher.RedirectFetchException; import focusedCrawler.crawler.crawlercommons.fetcher.RedirectFetchException.RedirectExceptionReason; import focusedCrawler.crawler.crawlercommons.fetcher.http.BaseHttpFetcher.RedirectMode; import focusedCrawler.crawler.crawlercommons.test.RandomResponseHandler; import focusedCrawler.crawler.crawlercommons.test.ResourcesResponseHandler; import focusedCrawler.crawler.crawlercommons.test.SimulationWebServer; import focusedCrawler.crawler.crawlercommons.test.TestUtils; public class SimpleHttpFetcherTest { private SimulationWebServer _webServer; @Before public void setUp() throws Exception { _webServer = new SimulationWebServer(); } @After public void shutDown() throws Exception { _webServer.stopServer(); } private void startServer(Handler handler, int port) throws Exception { _webServer.startServer(handler, port); // Thread.sleep(100000); } private void stopServer() throws Exception { _webServer.stopServer(); } private Server getServer() { return _webServer.getServer(); } // TODO - merge this code with RedirectResponseHandler class in // crawlercommons.test package. private class RedirectResponseHandler extends AbstractHandler { private boolean _permanent; public RedirectResponseHandler() { this(false); } public RedirectResponseHandler(boolean permanent) { super(); _permanent = permanent; } @Override public void handle(String pathInContext, Request baseRequest, HttpServletRequest request, HttpServletResponse response) throws IOException { if (pathInContext.endsWith("base")) { if (_permanent) { // Can't use sendRedirect, as that forces it to be a temp // redirect. response.setStatus(HttpServletResponse.SC_MOVED_PERMANENTLY); response.addHeader("Location", "http://localhost:8089/redirect"); response.flushBuffer(); } else { response.sendRedirect("http://localhost:8089/redirect"); } } else { response.setStatus(HttpServletResponse.SC_OK); response.setContentType("text/plain"); String content = "redirected"; response.setContentLength(content.length()); response.getOutputStream().write(content.getBytes("UTF-8")); } } } private class LanguageResponseHandler extends AbstractHandler { private String _englishContent; private String _foreignContent; public LanguageResponseHandler(String englishContent, String foreignContent) { _englishContent = englishContent; _foreignContent = foreignContent; } @Override public void handle(String target, Request baseRequest, HttpServletRequest request, HttpServletResponse response) throws IOException, ServletException { String language = request.getHeader(HttpHeaders.ACCEPT_LANGUAGE); String content; if ((language != null) && (language.contains("en"))) { content = _englishContent; } else { content = _foreignContent; } response.setStatus(HttpStatus.SC_OK); response.setContentType("text/plain"); response.setContentLength(content.length()); response.getOutputStream().write(content.getBytes("UTF-8")); } } private class MimeTypeResponseHandler extends AbstractHandler { private String _mimeType; public MimeTypeResponseHandler(String mimeType) { _mimeType = mimeType; } @Override public void handle(String pathInContext, Request baseRequest, HttpServletRequest request, HttpServletResponse response) throws IOException { String content = "test"; response.setStatus(HttpStatus.SC_OK); if (_mimeType != null) { response.setContentType(_mimeType); } response.setContentLength(content.length()); response.getOutputStream().write(content.getBytes("UTF-8")); } } @Test public final void testConnectionTimeout() throws Exception { startServer(new ResourcesResponseHandler(), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); String url = "http://localhost:8088/simple-page.html"; try { fetcher.get(url); fail("Exception not thrown"); } catch (IOFetchException e) { assertTrue(e.getCause() instanceof ConnectException); } } @Test public final void testStaleConnection() throws Exception { startServer(new ResourcesResponseHandler(), 8089); ServerConnector sc = (ServerConnector) getServer().getConnectors()[0]; sc.setSoLingerTime(-1); BaseFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); String url = "http://localhost:8089/simple-page.html"; fetcher.get(url); // TODO KKr - control keep-alive (linger?) value for Jetty, so we can // set it // to something short and thus make this sleep delay much shorter. Thread.sleep(2000); fetcher.get(url); } @Test public final void testSlowServerTermination() throws Exception { // Need to read in more than 2 8K blocks currently, due to how // HttpClientFetcher // is designed...so use 20K bytes. And the duration is 2 seconds, so 10K // bytes/sec. startServer(new RandomResponseHandler(20000, 2 * 1000L), 8089); // Set up for a minimum response rate of 20000 bytes/second. BaseHttpFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); fetcher.setMinResponseRate(20000); String url = "http://localhost:8089/test.html"; try { fetcher.get(url); fail("Aborted fetch exception not thrown"); } catch (AbortedFetchException e) { assertEquals(AbortedFetchReason.SLOW_RESPONSE_RATE, e.getAbortReason()); } } @Test public final void testNotTerminatingSlowServers() throws Exception { // Return 1K bytes at 2K bytes/second - would normally trigger an // error. startServer(new RandomResponseHandler(1000, 500), 8089); // Set up for no minimum response rate. BaseHttpFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); fetcher.setMinResponseRate(BaseHttpFetcher.NO_MIN_RESPONSE_RATE); String url = "http://localhost:8089/test.html"; fetcher.get(url); } @Test public final void testLargeContent() throws Exception { BaseFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); startServer(new RandomResponseHandler(fetcher.getDefaultMaxContentSize() * 2), 8089); String url = "http://localhost:8089/test.html"; FetchedResult result = fetcher.get(url); assertEquals(HttpStatus.SC_OK, result.getStatusCode()); assertTrue("Content size should be truncated", result.getContent().length <= fetcher.getDefaultMaxContentSize()); } @Test public final void testTruncationWithKeepAlive() throws Exception { startServer(new ResourcesResponseHandler(), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); fetcher.setDefaultMaxContentSize(1000); fetcher.setMaxContentSize("image/png", 5000); String urlToFetch = "http://localhost:8089/karlie.html"; FetchedResult result1 = fetcher.get(urlToFetch); assertEquals(HttpStatus.SC_OK, result1.getStatusCode()); FetchedResult result2 = fetcher.get(urlToFetch); assertEquals(HttpStatus.SC_OK, result2.getStatusCode()); // Verify that we got the same data from each fetch request. assertEquals(1000, result1.getContent().length); assertEquals(1000, result2.getContent().length); byte[] bytes1 = result1.getContent(); byte[] bytes2 = result2.getContent(); for (int i = 0; i < bytes1.length; i++) { assertEquals(bytes1[i], bytes2[i]); } urlToFetch = "http://localhost:8089/bixolabs_mining.png"; FetchedResult result3 = fetcher.get(urlToFetch); assertTrue(result3.getContent().length > 1000); fetcher.setMaxContentSize("image/png", 1500); try { fetcher.get(urlToFetch); fail("Aborted fetch exception not thrown"); } catch (AbortedFetchException e) { Assert.assertEquals(AbortedFetchReason.CONTENT_SIZE, e.getAbortReason()); } } @Test public final void testLargeHtml() throws Exception { startServer(new ResourcesResponseHandler(), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); String url = "http://localhost:8089/karlie.html"; FetchedResult result = fetcher.get(url); assertTrue("Content size should be truncated", result.getContentLength() <= fetcher.getDefaultMaxContentSize()); } @Test public final void testContentTypeHeader() throws Exception { startServer(new ResourcesResponseHandler(), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); String url = "http://localhost:8089/simple-page.html"; FetchedResult result = fetcher.get(url); String contentType = result.getHeaders().get(HttpHeaders.CONTENT_TYPE); assertNotNull(contentType); assertEquals("text/html", contentType); } @Test public final void testTempRedirectHandling() throws Exception { startServer(new RedirectResponseHandler(), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); String url = "http://localhost:8089/base"; FetchedResult result = fetcher.get(url); assertEquals("Redirected URL", "http://localhost:8089/redirect", result.getFetchedUrl()); assertNull(result.getNewBaseUrl()); assertEquals(1, result.getNumRedirects()); } @Test public final void testPermRedirectHandling() throws Exception { startServer(new RedirectResponseHandler(true), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); String url = "http://localhost:8089/base"; Payload payload = new Payload(); payload.put("payload-field-1", 1); FetchedResult result = fetcher.get(url, payload); assertEquals("Redirected URL", "http://localhost:8089/redirect", result.getFetchedUrl()); assertEquals("New base URL", "http://localhost:8089/redirect", result.getNewBaseUrl()); assertEquals(1, result.getNumRedirects()); assertEquals(1, result.getPayload().get("payload-field-1")); } @Test public final void testRedirectPolicy() throws Exception { startServer(new RedirectResponseHandler(true), 8089); BaseHttpFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); fetcher.setRedirectMode(RedirectMode.FOLLOW_TEMP); String url = "http://localhost:8089/base"; try { fetcher.get(url); fail("Exception should have been thrown"); } catch (RedirectFetchException e) { assertEquals("Redirected URL", "http://localhost:8089/redirect", e.getRedirectedUrl()); assertEquals(RedirectExceptionReason.PERM_REDIRECT_DISALLOWED, e.getReason()); } stopServer(); // Now try setting the mode to follow none startServer(new RedirectResponseHandler(false), 8089); fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); fetcher.setRedirectMode(RedirectMode.FOLLOW_NONE); try { fetcher.get(url); fail("Exception should have been thrown"); } catch (RedirectFetchException e) { assertEquals("Redirected URL", "http://localhost:8089/redirect", e.getRedirectedUrl()); assertEquals(RedirectExceptionReason.TEMP_REDIRECT_DISALLOWED, e.getReason()); } } @Test public final void testAcceptLanguage() throws Exception { final String englishContent = "English"; final String foreignContent = "Foreign"; startServer(new LanguageResponseHandler(englishContent, foreignContent), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); String url = "http://localhost:8089/"; FetchedResult result = fetcher.get(url); String contentStr = new String(result.getContent(), 0, result.getContentLength(), Charset.defaultCharset()); assertTrue(englishContent.equals(contentStr)); } @Test public final void testMimeTypeFiltering() throws Exception { startServer(new MimeTypeResponseHandler("text/xml"), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); Set<String> validMimeTypes = new HashSet<String>(); validMimeTypes.add("text/html"); fetcher.setValidMimeTypes(validMimeTypes); String url = "http://localhost:8089/"; try { fetcher.get(url); fail("Fetch should have failed"); } catch (AbortedFetchException e) { assertEquals(AbortedFetchReason.INVALID_MIMETYPE, e.getAbortReason()); } } @Test public final void testMimeTypeFilteringNoContentType() throws Exception { startServer(new MimeTypeResponseHandler(null), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); Set<String> validMimeTypes = new HashSet<String>(); validMimeTypes.add("text/html"); validMimeTypes.add(""); // We want unknown (not reported) mime-types // too. fetcher.setValidMimeTypes(validMimeTypes); String url = "http://localhost:8089/"; try { fetcher.get(url); } catch (AbortedFetchException e) { fail("Fetch should not have failed if no mime-type is specified"); } } @Test public final void testMimeTypeFilteringWithCharset() throws Exception { startServer(new MimeTypeResponseHandler("text/html; charset=UTF-8"), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); Set<String> validMimeTypes = new HashSet<String>(); validMimeTypes.add("text/html"); fetcher.setValidMimeTypes(validMimeTypes); String url = "http://localhost:8089/"; try { fetcher.get(url); } catch (AbortedFetchException e) { fail("Fetch should have worked"); } } @Test public final void testHostAddress() throws Exception { startServer(new ResourcesResponseHandler(), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); String url = "http://localhost:8089/simple-page.html"; FetchedResult result = fetcher.get(url); String hostAddress = result.getHostAddress(); assertNotNull(hostAddress); assertEquals("127.0.0.1", hostAddress); } @Test public final void testMissingPage() throws Exception { startServer(new ResourcesResponseHandler(), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT); String url = "http://localhost:8089/this-page-will-not-exist.html"; FetchedResult result = fetcher.get(url); assertEquals(HttpStatus.SC_NOT_FOUND, result.getStatusCode()); assertEquals(url, result.getFetchedUrl()); assertEquals("127.0.0.1", result.getHostAddress()); assertTrue(new String(result.getContent(), "UTF-8").contains("Error 404")); } }