/*   Copyright 2012 Tim Garrett, Mothsoft LLC
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package com.mothsoft.alexis.engine.textual;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;

import org.junit.Test;

/**
 * Tests for {@link WebContentParser}, exercised through a
 * {@link WebContentParserImpl} instance.
 */
public class WebContentParserTest {

    /** Classpath resource holding the sample HTML article. */
    private static final String TEST_ARTICLE = "test-article.html";

    private final WebContentParser wcp = new WebContentParserImpl();

    /**
     * Plain text supplied as a UTF-8 stream is expected to come back from
     * {@code parse} unchanged.
     */
    @Test
    public void testParseInputStreamText() throws IOException {
        final String text = "Hello, I am a document.";
        final InputStream is = new ByteArrayInputStream(text.getBytes(Charset.forName("UTF-8")));
        assertEquals("Hello, I am a document.", wcp.parse(is));
    }

    // FIXME - Boilerpipe isn't doing a very good job. The first sentence of
    // this article doesn't even make it in with the ArticleExtractor. Consider
    // writing a stack-based parser that tracks probable content tags and
    // discards on inferred HTML semantic structure rather than trying to do it
    // with Boilerpipe's algorithms

    /**
     * Parsing the sample article from the classpath is expected to yield text
     * containing a known phrase from the article.
     */
    @Test
    public void testParseInputStreamHTML() throws IOException {
        final InputStream is = this.getClass().getClassLoader().getResourceAsStream(TEST_ARTICLE);

        // getResourceAsStream returns null when the resource is absent; fail
        // with a clear message instead of passing null into the parser.
        assertTrue("Test resource not found on classpath: " + TEST_ARTICLE, is != null);

        final String document;
        try {
            document = wcp.parse(is);
        } finally {
            // Always release the stream; closing an already-closed stream is
            // harmless should the parser have closed it itself.
            is.close();
        }

        // The extracted text is reported only on failure rather than being
        // printed to stdout on every run.
        assertTrue("Expected phrase missing from extracted text: " + document,
                document.contains("The self-proclaimed mastermind of"));
    }

    /**
     * An HTML fragment passed to {@code parseHTML} is expected to be returned
     * with its markup removed and its text content intact.
     */
    @Test
    public void testParseHTML() throws IOException {
        final String html = "I hate <b>HTML</b> when I am expecting <em>only</em> plain text.";
        assertEquals("I hate HTML when I am expecting only plain text.", wcp.parseHTML(html));
    }

}