/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.utils; //JDK imports import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.net.URL; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaMimeKeys; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; /** * Contains utility methods for parsing documents. Intended to provide simple * entry points into the Tika framework. */ public class ParseUtils implements TikaMimeKeys { /** * Returns a parser that can handle the specified MIME type, and is set to * receive input from a stream opened from the specified URL. NB: Close the * input stream when it is no longer needed! * * @param config * @param mimeType * the document's MIME type * @return a parser appropriate to this MIME type * @throws TikaException */ public static Parser getParser(String mimeType, TikaConfig config) throws TikaException { return config.getParser(MediaType.parse(mimeType)); } /** * Returns a parser that can handle the specified MIME type, and is set to * receive input from a stream opened from the specified URL. The MIME type * is determined automatically. NB: Close the input stream when it is no * longer needed! * * @param documentUrl * URL pointing to the document to parse * @param config * @return a parser appropriate to this MIME type and ready to read input * from the specified document * @throws TikaException */ public static Parser getParser(URL documentUrl, TikaConfig config) throws TikaException { String mimetype = config.getMimeRepository().getMimeType(documentUrl) .getName(); return getParser(mimetype, config); } /** * Returns a parser that can handle the specified MIME type, and is set to * receive input from a stream opened from the specified URL. NB: Close the * input stream when it is no longer needed! * * @param documentFile * File object pointing to the document to parse * @param config * @return a parser appropriate to this MIME type and ready to read input * from the specified document * @throws TikaException */ public static Parser getParser(File documentFile, TikaConfig config) throws TikaException { String mimetype = config.getMimeRepository().getMimeType(documentFile) .getName(); return getParser(mimetype, config); } /** * Gets the string content of a document read from an input stream. * * @param stream the stream from which to read document data * @param config * @param mimeType MIME type of the data * @return the string content parsed from the document */ public static String getStringContent( InputStream stream, TikaConfig config, String mimeType) throws TikaException, IOException { try { Parser parser = config.getParser(MediaType.parse(mimeType)); ContentHandler handler = new BodyContentHandler(); parser.parse(stream, handler, new Metadata()); return handler.toString(); } catch (SAXException e) { throw new TikaException("Unexpected SAX error", e); } } /** * Gets the string content of a document read from an input stream. * * @param documentUrl * URL pointing to the document to parse * @param config * @return the string content parsed from the document */ public static String getStringContent(URL documentUrl, TikaConfig config) throws TikaException, IOException { String mime = config.getMimeRepository().getMimeType(documentUrl) .getName(); return getStringContent(documentUrl, config, mime); } /** * Gets the string content of a document read from an input stream. * * @param documentUrl * URL pointing to the document to parse * @param config * @param mimeType * MIME type of the data * @return the string content parsed from the document */ public static String getStringContent( URL documentUrl, TikaConfig config, String mimeType) throws TikaException, IOException { InputStream stream = documentUrl.openStream(); try { return getStringContent(stream, config, mimeType); } finally { stream.close(); } } /** * Gets the string content of a document read from an input stream. * * @param documentFile * File object pointing to the document to parse * @param config * @param mimeType * MIME type of the data * @return the string content parsed from the document */ public static String getStringContent( File documentFile, TikaConfig config, String mimeType) throws TikaException, IOException { InputStream stream = new BufferedInputStream(new FileInputStream( documentFile)); try { return getStringContent(stream, config, mimeType); } finally { stream.close(); } } /** * Gets the string content of a document read from an input stream. * * @param documentFile * File object pointing to the document to parse * @param config * @return the string content parsed from the document */ public static String getStringContent(File documentFile, TikaConfig config) throws TikaException, IOException { String mime = config.getMimeRepository().getMimeType(documentFile).getName(); return getStringContent(documentFile, config, mime); } }