/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.example; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.Reader; import java.net.URL; import java.nio.CharBuffer; import java.util.HashMap; import java.util.Locale; import java.util.Map; import java.util.zip.GZIPInputStream; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.html.HtmlMapper; import org.apache.tika.parser.html.HtmlParser; import org.apache.tika.parser.html.IdentityHtmlMapper; import org.apache.tika.parser.txt.TXTParser; import org.apache.tika.parser.xml.XMLParser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.LinkContentHandler; import org.apache.tika.sax.TeeContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; public class TIAParsingExample { public static String parseToStringExample() throws Exception { File document = new File("example.doc"); String content = new Tika().parseToString(document); System.out.print(content); return content; } public static void parseToReaderExample() throws Exception { File document = new File("example.doc"); try (Reader reader = new Tika().parse(document)) { char[] buffer = new char[1000]; int n = reader.read(buffer); while (n != -1) { System.out.append(CharBuffer.wrap(buffer, 0, n)); n = reader.read(buffer); } } } public static void parseFileInputStream(String filename) throws Exception { Parser parser = new AutoDetectParser(); ContentHandler handler = new DefaultHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); try (InputStream stream = new FileInputStream(new File(filename))) { parser.parse(stream, handler, metadata, context); } } public static void parseURLStream(String address) throws Exception { Parser parser = new AutoDetectParser(); ContentHandler handler = new DefaultHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); try (InputStream stream = new GZIPInputStream(new URL(address).openStream())) { parser.parse(stream, handler, metadata, context); } } public static void parseTikaInputStream(String filename) throws Exception { Parser parser = new AutoDetectParser(); ContentHandler handler = new DefaultHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); try (InputStream stream = TikaInputStream.get(new File(filename))) { parser.parse(stream, handler, metadata, context); } } public static File tikaInputStreamGetFile(String filename) throws Exception { try (InputStream stream = TikaInputStream.get(new File(filename))) { TikaInputStream tikaInputStream = TikaInputStream.get(stream); File file = tikaInputStream.getFile(); return file; } } public static void useHtmlParser() throws Exception { InputStream stream = new ByteArrayInputStream(new byte[0]); ContentHandler handler = new DefaultHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); Parser parser = new HtmlParser(); parser.parse(stream, handler, metadata, context); } public static void useCompositeParser() throws Exception { InputStream stream = new ByteArrayInputStream(new byte[0]); ContentHandler handler = new DefaultHandler(); ParseContext context = new ParseContext(); Map<MediaType, Parser> parsersByType = new HashMap<MediaType, Parser>(); parsersByType.put(MediaType.parse("text/html"), new HtmlParser()); parsersByType.put(MediaType.parse("application/xml"), new XMLParser()); CompositeParser parser = new CompositeParser(); parser.setParsers(parsersByType); parser.setFallback(new TXTParser()); Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "text/html"); parser.parse(stream, handler, metadata, context); } public static void useAutoDetectParser() throws Exception { InputStream stream = new ByteArrayInputStream(new byte[0]); ContentHandler handler = new DefaultHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); Parser parser = new AutoDetectParser(); parser.parse(stream, handler, metadata, context); } public static void testTeeContentHandler(String filename) throws Exception { InputStream stream = new ByteArrayInputStream(new byte[0]); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); Parser parser = new AutoDetectParser(); LinkContentHandler linkCollector = new LinkContentHandler(); try (OutputStream output = new FileOutputStream(new File(filename))) { ContentHandler handler = new TeeContentHandler( new BodyContentHandler(output), linkCollector); parser.parse(stream, handler, metadata, context); } } public static void testLocale() throws Exception { InputStream stream = new ByteArrayInputStream(new byte[0]); ContentHandler handler = new DefaultHandler(); Metadata metadata = new Metadata(); Parser parser = new AutoDetectParser(); ParseContext context = new ParseContext(); context.set(Locale.class, Locale.ENGLISH); parser.parse(stream, handler, metadata, context); } public static void testHtmlMapper() throws Exception { InputStream stream = new ByteArrayInputStream(new byte[0]); ContentHandler handler = new DefaultHandler(); Metadata metadata = new Metadata(); Parser parser = new AutoDetectParser(); ParseContext context = new ParseContext(); context.set(HtmlMapper.class, new IdentityHtmlMapper()); parser.parse(stream, handler, metadata, context); } public static void testCompositeDocument() throws Exception { InputStream stream = new ByteArrayInputStream(new byte[0]); ContentHandler handler = new DefaultHandler(); Metadata metadata = new Metadata(); Parser parser = new AutoDetectParser(); ParseContext context = new ParseContext(); context.set(Parser.class, new ParserDecorator(parser) { private static final long serialVersionUID = 4424210691523343833L; @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // custom processing of the component document } }); parser.parse(stream, handler, metadata, context); } }