/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Map;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.utils.ExceptionUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Utility class to handle common issues with embedded documents.
 * <p>
 * Use statically if all that is needed is getting the EmbeddedDocumentExtractor.
 * Otherwise, instantiate an instance.
 * <p>
 * Note: This is not thread safe.  Make sure to instantiate one per thread.
 */
public class EmbeddedDocumentUtil implements Serializable {

    private final ParseContext context;
    private final EmbeddedDocumentExtractor embeddedDocumentExtractor;

    //these are lazily initialized and can be null
    private TikaConfig tikaConfig;
    private MimeTypes mimeTypes;
    private Detector detector;

    /**
     * Creates a utility bound to the given context and resolves the
     * {@link EmbeddedDocumentExtractor} for it right away via
     * {@link #getEmbeddedDocumentExtractor(ParseContext)}.
     *
     * @param context parse context that all lookups of this instance go through
     */
    public EmbeddedDocumentUtil(ParseContext context) {
        this.context = context;
        this.embeddedDocumentExtractor = getEmbeddedDocumentExtractor(context);
    }

    /**
     * This offers a uniform way to get an EmbeddedDocumentExtractor from a ParseContext.
     * As of Tika 1.15, an AutoDetectParser will automatically be added to parse
     * embedded documents if no Parser.class is specified in the ParseContext.
     * <p>
     * If you'd prefer not to parse embedded documents, set Parser.class
     * to {@link org.apache.tika.parser.EmptyParser} in the ParseContext.
     * <p>
     * Note that this method may modify the context: when neither an extractor
     * nor a parser is registered, an {@link AutoDetectParser} is stored under
     * {@code Parser.class}.
     *
     * @param context context to look the extractor up in
     * @return the extractor registered in the context, or a new
     *         {@link ParsingEmbeddedDocumentExtractor} built on the context
     */
    public static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) {
        EmbeddedDocumentExtractor registered = context.get(EmbeddedDocumentExtractor.class);
        if (registered != null) {
            return registered;
        }
        //ensure that an AutoDetectParser is
        //available for parsing embedded docs TIKA-2096
        if (context.get(Parser.class) == null) {
            TikaConfig contextConfig = context.get(TikaConfig.class);
            if (contextConfig == null) {
                context.set(Parser.class, new AutoDetectParser());
            } else {
                context.set(Parser.class, new AutoDetectParser(contextConfig));
            }
        }
        return new ParsingEmbeddedDocumentExtractor(context);
    }

    /**
     * @return the {@link PasswordProvider} registered in the context,
     *         or <code>null</code> if there is none
     */
    public PasswordProvider getPasswordProvider() {
        return context.get(PasswordProvider.class);
    }

    /**
     * Returns the detector to use, preferring one registered in the context
     * and falling back to the detector of {@link #getTikaConfig()}.
     * The result is cached in this instance.
     *
     * @return the detector
     */
    public Detector getDetector() {
        //be as lazy as possible and cache the detector
        if (detector == null) {
            detector = context.get(Detector.class);
            if (detector == null) {
                detector = getTikaConfig().getDetector();
            }
        }
        return detector;
    }

    /**
     * Returns the mime type registry to use, preferring one registered in the
     * context and falling back to the repository of {@link #getTikaConfig()}.
     * The result is cached in this instance.
     *
     * @return the mime types registry
     */
    public MimeTypes getMimeTypes() {
        //be as lazy as possible and cache the mimeTypes
        if (mimeTypes == null) {
            mimeTypes = context.get(MimeTypes.class);
            if (mimeTypes == null) {
                mimeTypes = getTikaConfig().getMimeRepository();
            }
        }
        return mimeTypes;
    }

    /**
     * Returns the {@link TikaConfig} registered in the context or, if there is
     * none, {@link TikaConfig#getDefaultConfig()}.
     * The result is cached in this instance.
     *
     * @return the config
     */
    public TikaConfig getTikaConfig() {
        //be as lazy as possible and cache the TikaConfig
        if (tikaConfig == null) {
            tikaConfig = context.get(TikaConfig.class);
            if (tikaConfig == null) {
                tikaConfig = TikaConfig.getDefaultConfig();
            }
        }
        return tikaConfig;
    }

    /**
     * Works out a file extension for an embedded stream.
     * <p>
     * The content type already present in the metadata is tried first; if it is
     * missing or cannot be resolved, the stream is run through the detector.
     * When the type had to be detected, {@link Metadata#CONTENT_TYPE} is
     * set (or corrected) to the detected type.
     * <p>
     * The mime registry and detector come from {@link #getMimeTypes()} and
     * {@link #getDetector()}, so instances registered in the ParseContext
     * are honored here as well.
     *
     * @param is stream to run detection on if the metadata does not yield a type
     * @param metadata metadata of the embedded document; may be updated
     * @return the value of {@link MimeType#getExtension()} for the resolved type,
     *         or <code>.bin</code> if no type could be resolved
     */
    public String getExtension(TikaInputStream is, Metadata metadata) {
        String mimeString = metadata.get(Metadata.CONTENT_TYPE);
        MimeTypes types = getMimeTypes();
        MimeType mimeType = null;
        boolean detected = false;

        if (mimeString != null) {
            try {
                mimeType = types.forName(mimeString);
            } catch (MimeTypeException e) {
                //swallow -- fall through to detection
            }
        }

        if (mimeType == null) {
            try {
                MediaType mediaType = getDetector().detect(is, metadata);
                mimeType = types.forName(mediaType.toString());
                detected = true;
                //nothing to rewind if no stream was handed in
                if (is != null) {
                    is.reset();
                }
            } catch (IOException e) {
                //swallow -- this is best effort
            } catch (MimeTypeException e) {
                //swallow -- this is best effort
            }
        }

        if (mimeType == null) {
            return ".bin";
        }
        if (detected) {
            //set or correct the mime type
            metadata.set(Metadata.CONTENT_TYPE, mimeType.toString());
        }
        return mimeType.getExtension();
    }

    /**
     * Returns the {@link TikaConfig} registered in the context or, if there is
     * none, {@link TikaConfig#getDefaultConfig()}.  Unlike
     * {@link #getTikaConfig()}, the lookup is repeated on every call and
     * nothing is cached; prefer {@link #getTikaConfig()} in new code.
     *
     * @return the config
     */
    public TikaConfig getConfig() {
        TikaConfig config = context.get(TikaConfig.class);
        if (config == null) {
            config = TikaConfig.getDefaultConfig();
        }
        return config;
    }

    /**
     * Adds the filtered stack trace of <code>t</code> to the metadata under
     * {@link TikaCoreProperties#TIKA_META_EXCEPTION_WARNING}.
     *
     * @param t throwable to record
     * @param m metadata to add the stack trace to
     */
    public static void recordException(Throwable t, Metadata m) {
        String ex = ExceptionUtils.getFilteredStackTrace(t);
        m.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ex);
    }

    /**
     * Adds the filtered stack trace of <code>t</code> to the metadata under
     * {@link TikaCoreProperties#TIKA_META_EXCEPTION_EMBEDDED_STREAM}.
     *
     * @param t throwable to record
     * @param m metadata to add the stack trace to
     */
    public static void recordEmbeddedStreamException(Throwable t, Metadata m) {
        String ex = ExceptionUtils.getFilteredStackTrace(t);
        m.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM, ex);
    }

    /**
     * Delegates to the extractor resolved at construction time.
     *
     * @param m metadata of the embedded document
     * @return whatever the extractor's <code>shouldParseEmbedded</code> returns
     */
    public boolean shouldParseEmbedded(Metadata m) {
        return getEmbeddedDocumentExtractor().shouldParseEmbedded(m);
    }

    private EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
        return embeddedDocumentExtractor;
    }

    /**
     * Delegates to the extractor resolved at construction time.
     *
     * @param inputStream stream of the embedded document
     * @param handler handler passed through to the extractor
     * @param metadata metadata of the embedded document
     * @param outputHtml passed through to the extractor
     * @throws IOException if the extractor throws it
     * @throws SAXException if the extractor throws it
     */
    public void parseEmbedded(InputStream inputStream, ContentHandler handler,
                              Metadata metadata, boolean outputHtml)
            throws IOException, SAXException {
        embeddedDocumentExtractor.parseEmbedded(inputStream, handler, metadata, outputHtml);
    }

    /**
     * Tries to find an existing parser within the ParseContext.
     * It looks inside of CompositeParsers and ParserDecorators.
     * The use case is when a parser needs to parse an internal stream
     * that is _part_ of the document, e.g. rtf body inside an msg.
     * <p>
     * Can return <code>null</code> if the context contains no parser or
     * the correct parser can't be found.
     * <p>
     * Matching is on the exact runtime class; subclasses of
     * <code>clazz</code> do not match.
     *
     * @param clazz parser class to search for
     * @param context context whose <code>Parser.class</code> entry is searched
     * @return the first parser found whose class equals <code>clazz</code>,
     *         or <code>null</code>
     */
    public static Parser tryToFindExistingLeafParser(Class<?> clazz, ParseContext context) {
        Parser p = context.get(Parser.class);
        if (equals(p, clazz)) {
            return p;
        }
        if (p instanceof ParserDecorator) {
            p = ((ParserDecorator) p).getWrappedParser();
            if (equals(p, clazz)) {
                return p;
            }
        }
        if (p instanceof CompositeParser) {
            //findInComposite only ever returns a match or null
            return findInComposite((CompositeParser) p, clazz, context);
        }
        return null;
    }

    /**
     * Searches the parsers of a composite, unwrapping one decorator per
     * candidate and recursing into nested composites.
     *
     * @return the first parser whose class equals <code>clazz</code>, or <code>null</code>
     */
    private static Parser findInComposite(CompositeParser p, Class<?> clazz,
                                          ParseContext context) {
        Map<MediaType, Parser> map = p.getParsers(context);
        for (Parser candidate : map.values()) {
            if (equals(candidate, clazz)) {
                return candidate;
            }
            if (candidate instanceof ParserDecorator) {
                candidate = ((ParserDecorator) candidate).getWrappedParser();
                if (equals(candidate, clazz)) {
                    return candidate;
                }
            }
            if (candidate instanceof CompositeParser) {
                Parser nested = findInComposite((CompositeParser) candidate, clazz, context);
                if (nested != null) {
                    return nested;
                }
            }
        }
        return null;
    }

    /**
     * @return true only if <code>parser</code> is non-null and its runtime
     *         class is exactly <code>clazz</code>
     */
    private static boolean equals(Parser parser, Class<?> clazz) {
        return parser != null && parser.getClass().equals(clazz);
    }
}