/** * */ package uk.bl.wa.parsers; /* * #%L * warc-indexer * %% * Copyright (C) 2013 - 2015 The UK Web Archive * %% * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as * published by the Free Software Foundation, either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this program. If not, see * <http://www.gnu.org/licenses/gpl-2.0.html>. * #L% */ import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.util.HashMap; import java.util.Map; import com.typesafe.config.ConfigFactory; import junit.framework.Assert; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.parser.ParseError; import org.jsoup.parser.Parser; import org.junit.Before; import org.junit.Test; import org.xml.sax.SAXException; /** * @author Andrew Jackson <Andrew.Jackson@bl.uk> * */ public class HtmlFeatureParserTest { /** * @throws java.lang.Exception */ @Before public void setUp() throws Exception { } @Test public void JSoupElementsAddedTest() throws Exception { String testHtml = "<b>test<p></b>"; String baseUri = "http://example.com/dummy.html"; // ByteArrayInputStream stream = new ByteArrayInputStream( testHtml.getBytes()); stream.mark(100000); Parser parser = Parser.xmlParser(); parser.setTrackErrors(1000); Document doc = Jsoup.parse(stream, null, baseUri, parser); System.out.println("Parsed as:\n" + doc); System.out.println("Got " + parser.getErrors().size() + " under isTrackErrors = " + parser.isTrackErrors()); for (ParseError err : parser.getErrors()) { System.out.println("Got: " + err); } // Also run in through the handler class: stream.reset(); innerBasicParseTest(stream, baseUri, 2); } @Test public void testNormalise() { final String[][] TESTS = new String[][] { // Expected, input // {"http://example.org", "http://www.example.org"}, {"http://example.org/foo", "http://www.example.org/foo"}, {"http://example.org", "http://example.org"}, {"http://example.org", "http://example.org?"}, //{"http://example.org", "https://example.org?"}, {"http://example.org", "http://user@example.org"}, {"http://example.org/foo", "http://user@www.example.org/foo"}, // {"http://example.org", "http://user@www.example.org"}, {"http://example.org", "http://eXample.org"}, {"http://example.org", "http://example.ORG"}, // {"http://example.org", "http://example.org/"}, // {"http://example.org", "http://example.org/index.html"} }; Map<String, String> normMap = new HashMap<String, String>(); normMap.put(HtmlFeatureParser.CONF_LINKS_NORMALISE, "true"); HtmlFeatureParser normParser = new HtmlFeatureParser(ConfigFactory.parseMap(normMap)); normMap.put(HtmlFeatureParser.CONF_LINKS_NORMALISE, "false"); HtmlFeatureParser skipParser = new HtmlFeatureParser(ConfigFactory.parseMap(normMap)); for (String[] test: TESTS) { Assert.assertEquals("Normalisation of '" + test[1] + "'", test[0], normParser.normaliseLink(test[1])); Assert.assertEquals("Non-normalisation of '" + test[1] + "'", test[1], skipParser.normaliseLink(test[1])); } } private static void printMetadata(Metadata metadata) { for (String name : metadata.names()) { for (String value : metadata.getValues(name)) { System.out.println(name + ": " + value); } } } private void innerBasicParseTest(InputStream stream, String uri, int numElements) throws IOException, SAXException, TikaException { HtmlFeatureParser hfp = new HtmlFeatureParser(); Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, uri); hfp.parse(stream, null, metadata, null); printMetadata(metadata); // for (ParseError err : hfp.getParseErrors()) { // System.out.println("PARSE-ERROR: " + err); // } Assert.assertEquals("Number of distinct elements was wrong,", numElements, metadata.getValues(HtmlFeatureParser.DISTINCT_ELEMENTS).length ); } /** * Test method for * {@link uk.bl.wa.parsers.HtmlFeatureParser#parse(java.io.InputStream, org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata, org.apache.tika.parser.ParseContext)} * . * * @throws Exception */ @Test public void testParseInputStreamContentHandlerMetadataParseContext() throws Exception { String baseUri = "http://en.wikipedia.org/wiki/Mona_Lisa"; File ml = new File( "src/test/resources/wikipedia-mona-lisa/Mona_Lisa.html"); URL url = ml.toURI().toURL(); innerBasicParseTest(url.openStream(), baseUri, 43); } }