/*
 * ModeShape (http://www.modeshape.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.modeshape.extractor.tika;

import java.io.IOException;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;
import javax.jcr.RepositoryException;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.modeshape.common.collection.Collections;
import org.modeshape.common.logging.Logger;
import org.modeshape.common.util.StringUtil;
import org.modeshape.jcr.api.Binary;
import org.modeshape.jcr.api.text.TextExtractor;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * A {@link TextExtractor} that uses the Apache Tika library.
 * <p>
 * This extractor will automatically discover all of the Tika {@link Parser} implementations that are defined in
 * <code>META-INF/services/org.apache.tika.parser.Parser</code> text files accessible via the current classloader and that contain
 * the class names of the Parser implementations (one class name per line in each file).
 * </p>
 * <p>
 * This text extractor can be configured in a ModeShape configuration by specifying several optional properties:
 * <ul>
 * <li><strong>excludedMimeTypes</strong> - The comma- or whitespace-separated list of MIME types that should be excluded from
 * text extraction, even if there is a Tika Parser available for that MIME type. By default, the MIME types for
 * {@link #DEFAULT_EXCLUDED_MIME_TYPES package files} are excluded, though explicitly setting any excluded MIME types will
 * override these default.</li>
 * <li><strong>includedMimeTypes</strong> - The comma- or whitespace-separated list of MIME types that should be included in text
 * extraction. This extractor will ignore any MIME types in this list that are not covered by Tika Parser implementations.</li>
 * </ul>
 * </p>
 */
public class TikaTextExtractor extends TextExtractor {

    protected static final Logger LOGGER = Logger.getLogger(TikaTextExtractor.class);

    /**
     * The MIME types that are excluded by default. Currently, this list consists of:
     * <ul>
     * <li>application/x-archive</li>
     * <li>application/x-bzip</li>
     * <li>application/x-bzip2</li>
     * <li>application/x-cpio</li>
     * <li>application/x-gtar</li>
     * <li>application/x-gzip</li>
     * <li>application/x-tar</li>
     * <li>application/zip</li>
     * <li>application/vnd.teiid.vdb</li>
     * <li>image/*</li>
     * <li>audio/*</li>
     * <li>video/*</li>
     * </ul>
     */
    protected static final Set<MediaType> DEFAULT_EXCLUDED_MIME_TYPES = Collections.unmodifiableSet(
            MediaType.application("x-archive"),
            MediaType.application("x-bzip"),
            MediaType.application("x-bzip2"),
            MediaType.application("x-cpio"),
            MediaType.application("x-gtar"),
            MediaType.application("x-gzip"),
            MediaType.application("x-tar"),
            MediaType.application("zip"),
            MediaType.application("vnd.teiid.vdb"),
            MediaType.image("*"),
            MediaType.audio("*"),
            MediaType.video("*"));

    /** Media types rejected by {@link #supportsMimeType(String)}; seeded with the defaults by the constructor. */
    private final Set<MediaType> excludedMediaTypes = new HashSet<>();

    /** Optional allow-list; when empty, every parser-supported type that is not excluded is accepted. */
    private final Set<MediaType> includedMediaTypes = new HashSet<>();

    /** Media types for which the discovered Tika parsers are registered; filled when the parser is first built. */
    private final Set<MediaType> parserSupportedMediaTypes = new HashSet<>();

    /**
     * The write limit for the Tika parser, representing the maximum number of characters that should be extracted by the
     * TIKA parser; set via reflection
     */
    private Integer writeLimit;

    /** Holds the lazily created parser; empty until {@link #initialize()} has completed once. */
    private final AtomicReference<DefaultParser> parser = new AtomicReference<>();

    /**
     * No-arg constructor is required because this is instantiated by reflection.
     */
    public TikaTextExtractor() {
        this.excludedMediaTypes.addAll(DEFAULT_EXCLUDED_MIME_TYPES);
    }

    /**
     * Determines whether text can be extracted from content of the given MIME type.
     * <p>
     * A type is rejected when it cannot be parsed as a media type, when it equals an excluded type, or when an excluded
     * entry uses the <code>*</code> subtype and has the same primary type. Otherwise the type must be one the Tika parsers
     * support and, if any included types are configured, must also be listed among them.
     * </p>
     *
     * @param mimeType the MIME type string to check
     * @return <code>true</code> if this extractor will process content of that type, <code>false</code> otherwise
     */
    @Override
    public boolean supportsMimeType( String mimeType ) {
        MediaType mediaType = MediaType.parse(mimeType);
        if (mediaType == null) {
            logger().debug("Invalid mime-type: {0}", mimeType);
            return false;
        }
        // Make sure the supported/excluded/included sets have been populated before consulting them
        initialize();

        for (MediaType excludedMediaType : excludedMediaTypes) {
            if (excludedMediaType.equals(mediaType)) {
                return false;
            }
            // A wildcard subtype excludes the whole primary type (e.g. image/*)
            boolean wildcardSubtype = excludedMediaType.getSubtype().equalsIgnoreCase("*");
            if (wildcardSubtype && mediaType.getType().equalsIgnoreCase(excludedMediaType.getType())) {
                return false;
            }
        }

        if (!parserSupportedMediaTypes.contains(mediaType)) {
            return false;
        }
        return includedMediaTypes.isEmpty() || includedMediaTypes.contains(mediaType);
    }

    /**
     * Parses the binary content with Tika and records any non-blank body text on the supplied output.
     * <p>
     * Failures raised by the parser are logged rather than propagated, and whatever text was collected before the failure
     * is still recorded.
     * </p>
     *
     * @param binary the binary value whose content is to be parsed
     * @param output the output on which the extracted text is recorded
     * @param context the extraction context, used when preparing the parser metadata
     * @throws Exception if the stream processing or metadata preparation fails
     */
    @Override
    public void extractFrom( final Binary binary,
                             final TextExtractor.Output output,
                             final Context context ) throws Exception {
        final DefaultParser parser = initialize();
        // Snapshot the limit so the lambda sees one consistent value
        final Integer writeLimit = this.writeLimit;
        processStream(binary, stream -> {
            Metadata metadata = prepareMetadata(binary, context);
            //TODO author=Horia Chiorean date=1/30/13 description=//TIKA 1.2 TXTParser seems to have a bug, always adding 1 ignorable whitespace to the actual chars to be parsed
            //https://issues.apache.org/jira/browse/TIKA-1069
            ContentHandler textHandler = writeLimit == null ? new BodyContentHandler() : new BodyContentHandler(writeLimit + 1);
            try {
                LOGGER.debug("Using TikaTextExtractor to extract text");
                // Parse the input stream ...
                parser.parse(stream, textHandler, metadata, new ParseContext());
            } catch (SAXException sae) {
                LOGGER.warn(TikaI18n.parseExceptionWhileExtractingText, sae.getMessage());
            } catch (NoClassDefFoundError ncdfe) {
                LOGGER.warn(TikaI18n.warnNoClassDefFound, ncdfe.getMessage());
            } catch (Throwable e) {
                LOGGER.error(e, TikaI18n.errorWhileExtractingTextFrom, e.getMessage());
            } finally {
                // Record all of the text in the body ...
                String text = textHandler.toString().trim();
                if (!StringUtil.isBlank(text)) {
                    output.recordText(text);
                    // Pass the text as a parameter: no concatenation up front, and the extracted
                    // content is never interpreted as the message pattern.
                    LOGGER.debug("TikaTextExtractor found text: {0}", text);
                }
            }
            return null;
        });
    }

    /**
     * Creates a new tika metadata object used by the parser. This will contain the mime-type of the content being parsed, if this
     * is available to the underlying context. If not, Tika's autodetection mechanism is used to try and get the mime-type.
     *
     * @param binary a <code>org.modeshape.jcr.api.Binary</code> instance of the content being parsed
     * @param context the extraction context; may not be null
     * @return a <code>Metadata</code> instance.
     * @throws java.io.IOException if auto-detecting the mime-type via Tika fails
     * @throws RepositoryException if error obtaining MIME-type of the binary parameter
     */
    protected final Metadata prepareMetadata( final Binary binary,
                                              final Context context ) throws IOException, RepositoryException {
        Metadata metadata = new Metadata();

        String mimeType = binary.getMimeType();
        if (StringUtil.isBlank(mimeType)) {
            // Call the detector (we don't know the name) ...
            mimeType = context.mimeTypeOf(null, binary);
        }
        if (!StringUtil.isBlank(mimeType)) {
            metadata.set(Metadata.CONTENT_TYPE, mimeType);
        }
        return metadata;
    }

    /**
     * Lazily initializes the {@link DefaultParser} instance and returns it.
     * <p>
     * The parser is built at most once. Building it also fills the supported, excluded and included media type sets, so the
     * cached instance is checked first and the build only runs while no parser has been stored yet.
     * </p>
     *
     * @return the default parser; same as {@link #parser}
     */
    protected DefaultParser initialize() {
        DefaultParser existing = parser.get();
        if (existing != null) {
            return existing;
        }
        // Building the parser mutates the media type sets, so let only one caller do it
        synchronized (parser) {
            existing = parser.get();
            if (existing == null) {
                existing = newDefaultParser();
                parser.set(existing);
            }
            return existing;
        }
    }

    /**
     * Builds the Tika parser using this class' classloader and, as a side effect, records the media types the discovered
     * parsers support and converts the configured excluded/included MIME type strings into media types.
     *
     * @return the newly created parser
     */
    private DefaultParser newDefaultParser() {
        ServiceLoader serviceLoader = new ServiceLoader(
                this.getClass().getClassLoader(),
                (classname, throwable) -> LOGGER.debug(throwable, "error while loading parser for {0}", classname));
        DefaultParser defaultParser = new DefaultParser(MediaTypeRegistry.getDefaultRegistry(), serviceLoader);

        LOGGER.debug("Initializing Tika Text Extractor");
        Map<MediaType, Parser> parsers = defaultParser.getParsers();
        LOGGER.debug("Tika parsers found: {0}",parsers.size());
        for (MediaType mediaType : parsers.keySet()) {
            parserSupportedMediaTypes.add(mediaType);
            LOGGER.debug("Tika Text Extractor will support the {0} media-type",mediaType);
        }

        // NOTE(review): configured exclusions are added to the set the constructor seeded with the
        // defaults; confirm whether they are meant to replace the defaults instead.
        convertStringMimeTypesToMediaTypes(getExcludedMimeTypes(), excludedMediaTypes);
        convertStringMimeTypesToMediaTypes(getIncludedMimeTypes(), includedMediaTypes);
        LOGGER.debug("Initialized {0}", this);
        return defaultParser;
    }

    /**
     * Parses each configured MIME type string and adds the resulting media types to the target set. Blank and unparsable
     * entries are skipped.
     *
     * @param mimeTypes the configured MIME type strings; each entry may itself hold several comma- or whitespace-separated
     *        types
     * @param mediaTypes the set that receives the parsed media types
     */
    private void convertStringMimeTypesToMediaTypes(Set<String> mimeTypes, Set<MediaType> mediaTypes) {
        for (String mimeTypeEntry : mimeTypes) {
            //allow each mime type entry to be an array in itself
            String[] multipleMimeTypes = mimeTypeEntry.split("[,\\s]");
            for (String mimeType : multipleMimeTypes) {
                if (StringUtil.isBlank(mimeType)) {
                    continue;
                }
                MediaType mediaType = MediaType.parse(mimeType.trim());
                if (mediaType == null) {
                    logger().debug("Invalid media type: {0}", mimeType);
                    continue;
                }
                mediaTypes.add(mediaType);
            }
        }
    }

    /**
     * Sets the write limit for the Tika parser, representing the maximum number of characters that should be extracted by the
     * TIKA parser.
     *
     * @param writeLimit an {@link Integer} which represents the write limit; may be null
     * @see BodyContentHandler#BodyContentHandler(int)
     */
    protected void setWriteLimit( Integer writeLimit ) {
        this.writeLimit = writeLimit;
    }

    /**
     * @return the live set of excluded media types
     */
    protected Set<MediaType> getExcludedMediaTypes() {
        return excludedMediaTypes;
    }

    /**
     * @return the live set of included media types
     */
    protected Set<MediaType> getIncludedMediaTypes() {
        return includedMediaTypes;
    }

    /**
     * @return the live set of media types supported by the discovered parsers
     */
    protected Set<MediaType> getParserSupportedMediaTypes() {
        return parserSupportedMediaTypes;
    }

    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder("TikaTextExtractor{");
        sb.append("excludedMediaTypes=").append(excludedMediaTypes);
        sb.append(", includedMediaTypes=").append(includedMediaTypes);
        sb.append(", parserSupportedMediaTypes=").append(parserSupportedMediaTypes);
        sb.append(", writeLimit=").append(writeLimit != null ? writeLimit : "unlimited");
        sb.append('}');
        return sb.toString();
    }
}