TikaTest.java example

Explorer

book-master
- src
  - main
    - java
      - com
        tamingtext
        classifier
        bayes
        BayesUpdateRequestProcessor.java
        BayesUpdateRequestProcessorFactory.java
        ClassifyDocument.java
        ExtractTrainingData.java
        maxent
        CategoryDataStream.java
        NameFinderFeatureGenerator.java
        TestMaxent.java
        TrainMaxent.java
        mlt
        CategoryHits.java
        MoreLikeThisCategorizer.java
        TestMoreLikeThis.java
        TrainMoreLikeThis.java
        fuzzy
        LevenshteinDistance.java
        MovieMatcher.java
        OverlapMeasures.java
        SpellCorrector.java
        TrieNode.java
        TypeAheadResponseWriter.java
        opennlp
        PooledGenericModelReader.java
        PooledGenericModelSerializer.java
        PooledTokenNameFinderModel.java
        qa
        AnswerTypeClassifier.java
        AnswerTypeContextGenerator.java
        AnswerTypeEventStream.java
        ChunkParser.java
        PassageRankingComponent.java
        QAParams.java
        QuestionQParser.java
        QuestionQParserPlugin.java
        WexWikiContentSource.java
        WikipediaIndexer.java
        WikipediaWexIndexer.java
        tagging
        LuceneCategoryExtractor.java
        LuceneTagExtractor.java
        tagrecommender
        CountStackOverflowTags.java
        ExtractStackOverflowData.java
        MoreLikeThisRequest.java
        StackOverflowParser.java
        StackOverflowPost.java
        StackOverflowStream.java
        StackOverflowTagTransformer.java
        TagRecommenderClient.java
        TestStackOverflowTagger.java
        Util.java
        texttamer
        solr
        NameFilter.java
        NameFilterFactory.java
        SentenceTokenizer.java
        SentenceTokenizerFactory.java
        util
        Constants.java
        FileUtil.java
        MemoryStatus.java
        NameFinderFactory.java
        OpenNLPUtil.java
        SentenceDetectorFactory.java
        SplitInput.java
        StringUtil.java
        TamingTextDriver.java
  - test
    - java
      - com
        tamingtext
        TTTestCaseJ4.java
        TamingTextTestJ4.java
        carrot2
        Carrot2ExampleTest.java
        classifier
        bayes
        BayesUpdateRequestProcessorTest.java
        ExtractTrainingDataTest.java
        mlt
        MoreLikeThisQueryTest.java
        frankenstein
        Frankenstein.java
        fuzzy
        LevenshteinDistanceTest.java
        OverlapMeasuresTest.java
        TrieNodeTest.java
        mahout
        VectorExamplesTest.java
        opennlp
        AnswerTypeTest.java
        ChunkParserTest.java
        NameFinderTest.java
        POSTaggerTest.java
        ParserTest.java
        qa
        MockQuestionQParserPlugin.java
        PassageRankingComponentTest.java
        QATest.java
        sentences
        SentenceDetectionTest.java
        snowball
        SnowballStemmerTest.java
        solr
        SolrJTest.java
        texttamer
        solr
        NameFilterTest.java
        SentenceTokenizerTest.java
        tika
        TikaTest.java
        util
        StringUtilTest.java

/*
 * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 * -------------------
 * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
 * http://www.manning.com/ingersoll
 */

package com.tamingtext.tika;

import com.tamingtext.TamingTextTestJ4;
import junit.framework.TestCase;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.FileInputStream;
import java.io.File;
import java.nio.charset.Charset;

import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.LinkContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.ToHTMLContentHandler;
import org.junit.*;
import org.xml.sax.ContentHandler;


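/**
 * Demonstrates basic Apache Tika usage: extracting body text and metadata from a
 * PDF file with {@code AutoDetectParser}, and body text, metadata and links from
 * an in-memory HTML string with {@code HtmlParser}.
 */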
public class TikaTest extends TamingTextTestJ4 {
  /**
   * Parses the sample PDF at {@code src/test/resources/pdfBox-sample.pdf} with an
   * {@link AutoDetectParser} and prints the extracted title metadata and body text
   * to standard output.
   *
   * <p>The test makes no assertions; it passes as long as opening and parsing the
   * file does not throw.
   *
   * @throws Exception if the file cannot be opened or the parse fails
   */
  @Test
  public void testTika() throws Exception {
    //<start id="tika"/>
    InputStream input = new FileInputStream(
            new File("src/test/resources/pdfBox-sample.pdf"));//<co id="tika.is"/>
    try {
      ContentHandler textHandler = new BodyContentHandler();//<co id="tika.handler"/>
      Metadata metadata = new Metadata();//<co id="tika.metadata"/>
      Parser parser = new AutoDetectParser();//<co id="tika.parser"/>
      ParseContext context = new ParseContext();
      parser.parse(input, textHandler, metadata, context);//<co id="tika.parse"/>
      System.out.println("Title: " + metadata.get(Metadata.TITLE));//<co id="tika.title"/>
      System.out.println("Body: " + textHandler.toString());//<co id="tika.body"/>
    } finally {
      // Parser.parse() consumes the stream but does not close it, so the
      // caller must release the underlying file handle.
      input.close();
    }
    /*
<calloutlist>
    <callout arearefs="tika.is"><para>Create the <classname>InputStream</classname> to read in the content</para></callout>
    <callout arearefs="tika.handler"><para>The <classname>BodyContentHandler</classname> is a Tika-provided <classname>ContentHandler</classname> that extracts just the "body" of the InputStream</para></callout>
  <callout arearefs="tika.metadata"><para>The <classname>Metadata</classname> object will hold metadata like author, title, etc. about the content in a map.</para></callout>
  <callout arearefs="tika.parser"><para>The <classname>AutoDetectParser</classname> will figure out the MIME type of the document automatically when parse is called.  Since we know our input is a PDF file, we could have used the <classname>PDFParser</classname> instead.</para></callout>
  <callout arearefs="tika.parse"><para>Execute the parse</para></callout>
  <callout arearefs="tika.title"><para>Get the title from the <classname>Metadata</classname> instance</para></callout>
  <callout arearefs="tika.body"><para>Print out the body from the <classname>ContentHandler</classname></para></callout>

</calloutlist>
*/
    //<end id="tika"/>
  }

  /**
   * Parses a small in-memory HTML document with an {@link HtmlParser}, feeding the
   * parse events to both a {@link BodyContentHandler} and a
   * {@link LinkContentHandler} through a {@link TeeContentHandler}, then prints the
   * title metadata, the body text and the collected links to standard output.
   *
   * <p>The test makes no assertions; it passes as long as the parse does not throw.
   *
   * @throws Exception if the parse fails
   */
  @Test
  public void testHtml() throws Exception {
    String html = "<html><head><title>The Big Brown Shoe</title></head><body><p>The best pizza place " +
            "in the US is <a href=\"http://antoniospizzas.com/\">Antonio's Pizza</a>.</p>" +
            "<p>It is located in Amherst, MA.</p></body></html>";
    //<start id="tika-html"/>
    // In-memory stream: no underlying OS resource, so no explicit close is needed.
    InputStream input = new ByteArrayInputStream(html.getBytes(Charset.forName("UTF-8")));
    ContentHandler text = new BodyContentHandler();//<co id="html.text.co"/>
    LinkContentHandler links = new LinkContentHandler();//<co id="html.link.co"/>
    ContentHandler handler = new TeeContentHandler(links, text);//<co id="html.merge"/>
    Metadata metadata = new Metadata();//<co id="html.store"/>
    Parser parser = new HtmlParser();//<co id="html.parser"/>
    ParseContext context = new ParseContext();
    parser.parse(input, handler, metadata, context);//<co id="html.parse"/>
    System.out.println("Title: " + metadata.get(Metadata.TITLE));
    System.out.println("Body: " + text.toString());
    System.out.println("Links: " + links.getLinks());
    /*
    <calloutlist>
        <callout arearefs="html.text.co"><para>Construct a ContentHandler that will just extract between the body tags.</para></callout>
        <callout arearefs="html.link.co"><para>Construct ContentHandler that knows about HTML links</para></callout>
        <callout arearefs="html.merge"><para>Wrap up our ContentHandlers into one</para></callout>
        <callout arearefs="html.store"><para>Metadata is a simple storage mechanism where the extracted metadata gets stored</para></callout>
        <callout arearefs="html.parser"><para>We know the input is HTML, so construct a Parser to parse it</para></callout>
        <callout arearefs="html.parse"><para>Do the parse</para></callout>
    </calloutlist>
    */
    //<end id="tika-html"/>
  }

}