/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.bolt; import java.io.IOException; import java.util.HashMap; import java.util.List; import org.apache.storm.task.OutputCollector; import org.junit.Assert; import org.junit.Before; import org.junit.Test; import com.digitalpebble.stormcrawler.Constants; import com.digitalpebble.stormcrawler.Metadata; import com.digitalpebble.stormcrawler.TestUtil; import com.digitalpebble.stormcrawler.parse.ParsingTester; import com.digitalpebble.stormcrawler.util.RobotsTags; public class JSoupParserBoltTest extends ParsingTester { /* * * some sample tags: * * <meta name="robots" content="index,follow"> <meta name="robots" * content="noindex,follow"> <meta name="robots" content="index,nofollow"> * <meta name="robots" content="noindex,nofollow"> * * <META HTTP-EQUIV="Pragma" CONTENT="no-cache"> */ public static String[] tests = { "<html><head><title>test page</title>" + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> " + "</head><body>" + " some text" + "</body></html>", "<html><head><title>test page</title>" + "<meta name=\"robots\" content=\"all\"> " + "</head><body>" + " some text" + "</body></html>", "<html><head><title>test page</title>" + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> " + "</head><body>" + " some text" + "</body></html>", "<html><head><title>test page</title>" + "<meta name=\"robots\" content=\"none\"> " + "</head><body>" + " some text" + "</body></html>", "<html><head><title>test page</title>" + "<meta name=\"robots\" content=\"noindex,nofollow\"> " + "</head><body>" + " some text" + "</body></html>", "<html><head><title>test page</title>" + "<meta name=\"robots\" content=\"noindex,follow\"> " + "</head><body>" + " some text" + "</body></html>", "<html><head><title>test page</title>" + "<meta name=\"robots\" content=\"index,nofollow\"> " + "</head><body>" + " some text" + "</body></html>", "<html><head><title>test page</title>" + "<meta name=\"robots\" content=\"index,follow\"> " + "<base href=\"http://www.nutch.org/\">" + "</head><body>" + " some text" + "</body></html>", "<html><head><title>test page</title>" + "<meta name=\"robots\"> " + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>" + " some text" + "</body></html>", }; public static final boolean[][] answers = { { true, true, true }, // NONE { false, false, false }, // all { true, true, true }, // nOnE { true, true, true }, // none { true, true, false }, // noindex,nofollow { true, false, false }, // noindex,follow { false, true, false }, // index,nofollow { false, false, false }, // index,follow { false, false, false }, // missing! }; @Before public void setupParserBolt() { bolt = new JSoupParserBolt(); setupParserBolt(bolt); } @Test /** * Checks that content in script is not included in the text representation **/ public void testNoScriptInText() throws IOException { bolt.prepare(new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output)); parse("http://www.digitalpebble.com", "digitalpebble.com.html"); List<Object> parsedTuple = output.getEmitted().remove(0); // check in the metadata that the values match String text = (String) parsedTuple.get(3); Assert.assertFalse( "Text should not contain the content of script tags", text.contains("urchinTracker")); } @Test /** * Checks that individual links marked as rel="nofollow" are not followed **/ public void testNoFollowOutlinks() throws IOException { bolt.prepare(new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output)); parse("http://www.digitalpebble.com", "digitalpebble.com.html"); List<List<Object>> statusTuples = output .getEmitted(Constants.StatusStreamName); Assert.assertEquals(10, statusTuples.size()); } @Test public void testHTTPRobots() throws IOException { bolt.prepare(new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output)); Metadata metadata = new Metadata(); metadata.setValues("X-Robots-Tag", new String[] { "noindex", "nofollow" }); parse("http://www.digitalpebble.com", "digitalpebble.com.html", metadata); List<List<Object>> statusTuples = output .getEmitted(Constants.StatusStreamName); // no outlinks at all Assert.assertEquals(0, statusTuples.size()); Assert.assertEquals(1, output.getEmitted().size()); List<Object> parsedTuple = output.getEmitted().remove(0); // check in the metadata that the values match metadata = (Metadata) parsedTuple.get(2); Assert.assertNotNull(metadata); boolean isNoIndex = Boolean.parseBoolean(metadata .getFirstValue(RobotsTags.ROBOTS_NO_INDEX)); boolean isNoFollow = Boolean.parseBoolean(metadata .getFirstValue(RobotsTags.ROBOTS_NO_FOLLOW)); boolean isNoCache = Boolean.parseBoolean(metadata .getFirstValue(RobotsTags.ROBOTS_NO_CACHE)); Assert.assertEquals("incorrect noIndex", true, isNoIndex); Assert.assertEquals("incorrect noFollow", true, isNoFollow); Assert.assertEquals("incorrect noCache", false, isNoCache); } @Test public void testRobotsMetaProcessor() throws IOException { bolt.prepare(new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output)); for (int i = 0; i < tests.length; i++) { byte[] bytes = tests[i].getBytes(); parse("http://www.digitalpebble.com", bytes, new Metadata()); Assert.assertEquals(1, output.getEmitted().size()); List<Object> parsedTuple = output.getEmitted().remove(0); // check in the metadata that the values match Metadata metadata = (Metadata) parsedTuple.get(2); Assert.assertNotNull(metadata); boolean isNoIndex = Boolean.parseBoolean(metadata .getFirstValue(RobotsTags.ROBOTS_NO_INDEX)); boolean isNoFollow = Boolean.parseBoolean(metadata .getFirstValue(RobotsTags.ROBOTS_NO_FOLLOW)); boolean isNoCache = Boolean.parseBoolean(metadata .getFirstValue(RobotsTags.ROBOTS_NO_CACHE)); Assert.assertEquals("incorrect noIndex value on doc " + i, answers[i][0], isNoIndex); Assert.assertEquals("incorrect noFollow value on doc " + i, answers[i][1], isNoFollow); Assert.assertEquals("incorrect noCache value on doc " + i, answers[i][2], isNoCache); } } @Test public void testHTMLRedir() throws IOException { bolt.prepare(new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output)); parse("http://www.somesite.com", "redir.html"); List<List<Object>> statusTuples = output .getEmitted(Constants.StatusStreamName); // one for the redir + one for the discovered Assert.assertEquals(2, statusTuples.size()); } }