/* * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ------------------- * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit * http://www.manning.com/ingersoll */ package com.tamingtext.tika; import com.tamingtext.TamingTextTestJ4; import junit.framework.TestCase; import java.io.ByteArrayInputStream; import java.io.InputStream; import java.io.FileInputStream; import java.io.File; import java.nio.charset.Charset; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.html.HtmlParser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.sax.LinkContentHandler; import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.sax.ToHTMLContentHandler; import org.junit.*; import org.xml.sax.ContentHandler; /** * Demonstrate basic Tika usage * **/ public class TikaTest extends TamingTextTestJ4 { @Test public void testTika() throws Exception { //<start id="tika"/> InputStream input = new FileInputStream( new File("src/test/resources/pdfBox-sample.pdf"));//<co id="tika.is"/> ContentHandler textHandler = new BodyContentHandler();//<co id="tika.handler"/> Metadata metadata = new Metadata();//<co id="tika.metadata"/> Parser parser = new AutoDetectParser();//<co id="tika.parser"/> ParseContext context = new ParseContext(); parser.parse(input, textHandler, metadata, context);//<co id="tika.parse"/> System.out.println("Title: " + metadata.get(Metadata.TITLE));//<co id="tika.title"/> System.out.println("Body: " + textHandler.toString());//<co id="tika.body"/> /* <calloutlist> <callout arearefs="tika.is"><para>Create the <classname>InputStream</classname> to read in the content</para></callout> <callout arearefs="tika.handler"><para>The <classname>BodyContentHandler</classname> is a Tika-provided <classname>ContentHandler</classname> that extracts just the "body" of the InputStream</para></callout> <callout arearefs="tika.metadata"><para>The <classname>Metadata</classname> object will hold metadata like author, title, etc. about the content in a map.</para></callout> <callout arearefs="tika.parser"><para>The <classname>AutoDetectParser</classname> will figure out the MIME type of the document automatically when parse is called. Since we know our input is a PDF file, we could have used the <classname>PDFParser</classname> instead.</para></callout> <callout arearefs="tika.parse"><para>Execute the parse</para></callout> <callout arearefs="tika.title"><para>Get the title from the <classname>Metadata</classname> instance</para></callout> <callout arearefs="tika.body"><para>Print out the body from the <classname>ContentHandler</classname></para></callout> </calloutlist> */ //<end id="tika"/> } @Test public void testHtml() throws Exception { String html = "<html><head><title>The Big Brown Shoe</title></head><body><p>The best pizza place " + "in the US is <a href=\"http://antoniospizzas.com/\">Antonio's Pizza</a>.</p>" + "<p>It is located in Amherst, MA.</p></body></html>"; //<start id="tika-html"/> InputStream input = new ByteArrayInputStream(html.getBytes(Charset.forName("UTF-8"))); ContentHandler text = new BodyContentHandler();//<co id="html.text.co"/> LinkContentHandler links = new LinkContentHandler();//<co id="html.link.co"/> ContentHandler handler = new TeeContentHandler(links, text);//<co id="html.merge"/> Metadata metadata = new Metadata();//<co id="html.store"/> Parser parser = new HtmlParser();//<co id="html.parser"/> ParseContext context = new ParseContext(); parser.parse(input, handler, metadata, context);//<co id="html.parse"/> System.out.println("Title: " + metadata.get(Metadata.TITLE)); System.out.println("Body: " + text.toString()); System.out.println("Links: " + links.getLinks()); /* <calloutlist> <callout arearefs="html.text.co"><para>Construct a ContentHandler that will just extract between the body tags.</para></callout> <callout arearefs="html.link.co"><para>Construct ContentHandler that knows about HTML links</para></callout> <callout arearefs="html.merge"><para>Wrap up our ContentHandlers into one</para></callout> <callout arearefs="html.store"><para>Metadata is a simple storage mechanism where the extracted metadata gets stored</para></callout> <callout arearefs="html.parser"><para>We know the input is HTML, so construct a Parser to parse it</para></callout> <callout arearefs="html.parse"><para>Do the parse</para></callout> </calloutlist> */ //<end id="tika-html"/> } }