/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.cxf.jaxrs.ext.search.tika;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Collections;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.cxf.common.logging.LogUtils;
import org.apache.cxf.jaxrs.ext.search.SearchBean;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.ToTextContentHandler;
/**
 * Extracts content and metadata from input streams by delegating to one or more
 * Apache Tika {@link Parser} instances, optionally using a {@link Detector} to
 * determine the media type of the stream when no hint is supplied by the caller.
 */
public class TikaContentExtractor {
    private static final Logger LOG = LogUtils.getL7dLogger(TikaContentExtractor.class);

    /** Candidate parsers; never null (a null list is normalised to an empty one). */
    private final List<Parser> parsers;
    /** Detector used when no media type hint is given; null disables detection. */
    private final Detector detector;

    /**
     * Create new Tika-based content extractor using AutoDetectParser and no media type detection.
     */
    public TikaContentExtractor() {
        this(new AutoDetectParser(), false);
    }

    /**
     * Create new Tika-based content extractor using the provided parser instance
     * and no media type detection.
     * @param parser parser instance
     */
    public TikaContentExtractor(final Parser parser) {
        this(parser, false);
    }

    /**
     * Create new Tika-based content extractor using the provided parser instances
     * and a {@link DefaultDetector} for media type detection.
     * @param parsers parser instances
     */
    public TikaContentExtractor(final List<Parser> parsers) {
        this(parsers, new DefaultDetector());
    }

    /**
     * Create new Tika-based content extractor using the provided parser instances
     * and the provided detector.
     * @param parsers parser instances; a null list is treated as an empty list, in which
     * case every extraction attempt returns null
     * @param detector detector used to determine the media type when no hint is supplied,
     * may be null to disable detection
     */
    public TikaContentExtractor(final List<Parser> parsers, Detector detector) {
        // A null list would otherwise surface as an uncaught NullPointerException in extract()
        this.parsers = parsers != null ? parsers : Collections.<Parser>emptyList();
        this.detector = detector;
    }

    /**
     * Create new Tika-based content extractor using the provided parser instance and
     * optional media type validation. If validation is enabled, a {@link DefaultDetector}
     * is installed so that the media type of the input can be detected; otherwise no
     * detector is used.
     * <p>
     * NOTE(review): when a single parser is configured it is used for every input without
     * an explicit check against its supported media types - confirm this is intended.
     * @param parser parser instance
     * @param validateMediaType enable or disable media type validation
     */
    public TikaContentExtractor(final Parser parser, final boolean validateMediaType) {
        this(Collections.singletonList(parser), validateMediaType ? new DefaultDetector() : null);
    }

    /**
     * Extract the content and metadata from the input stream using a text-collecting
     * content handler and no media type hint. If a detector is configured it may be run
     * against the input stream in order to determine the type of content.
     * @param in input stream to extract the content and metadata from
     * @return the extracted content and metadata or null if extraction is not possible or was unsuccessful
     */
    public TikaContent extract(final InputStream in) {
        return extract(in, (javax.ws.rs.core.MediaType)null);
    }

    /**
     * Extract the content and metadata from the input stream with a media type hint,
     * using a text-collecting content handler.
     * @param in input stream to extract the content and metadata from
     * @param mt JAX-RS MediaType of the stream content, may be null
     * @return the extracted content and metadata or null if extraction is not possible or was unsuccessful
     */
    public TikaContent extract(final InputStream in, javax.ws.rs.core.MediaType mt) {
        return extract(in, new ToTextContentHandler(), mt);
    }

    /**
     * Extract the content and metadata from the input stream using a custom content handler
     * and no media type hint. If a detector is configured it may be run against the input
     * stream in order to determine the type of content.
     * @param in input stream to extract the content and metadata from
     * @param handler custom ContentHandler
     * @return the extracted content and metadata or null if extraction is not possible
     * or was unsuccessful
     */
    public TikaContent extract(final InputStream in, final ContentHandler handler) {
        return extract(in, handler, (javax.ws.rs.core.MediaType)null);
    }

    /**
     * Extract the content and metadata from the input stream with a media type hint,
     * using a custom content handler and a default parse context.
     * @param in input stream to extract the content and metadata from
     * @param handler custom ContentHandler
     * @param mt JAX-RS MediaType of the stream content, may be null
     * @return the extracted content and metadata or null if extraction is not possible
     * or was unsuccessful
     */
    public TikaContent extract(final InputStream in, final ContentHandler handler,
                               javax.ws.rs.core.MediaType mt) {
        return extract(in, handler, mt, (ParseContext)null);
    }

    /**
     * Extract the content and metadata from the input stream using a custom content handler
     * and parse context, with no media type hint. If a detector is configured it may be run
     * against the input stream in order to determine the type of content.
     * @param in input stream to extract the content and metadata from
     * @param handler custom ContentHandler
     * @param context custom context
     * @return the extracted content and metadata or null if extraction is not possible
     * or was unsuccessful
     */
    public TikaContent extract(final InputStream in, final ContentHandler handler, ParseContext context) {
        return extract(in, handler, (javax.ws.rs.core.MediaType)null, context);
    }

    /**
     * Extract the content and metadata from the input stream with a media type hint.
     * <p>
     * The media type is taken from the hint when one is given; otherwise, if a detector is
     * configured and the stream supports mark/reset, it is detected from the stream. The
     * resolved media type (if any) is recorded in the metadata as the content type and is
     * used to pick a parser when more than one parser is configured.
     * @param in input stream to extract the content and metadata from; null yields null
     * @param handler custom ContentHandler; may be null to request metadata-only extraction
     * @param mtHint JAX-RS MediaType of the stream content, may be null
     * @param context custom context; a new ParseContext is used when null
     * @return the extracted content and metadata or null if extraction is not possible
     * or was unsuccessful
     */
    public TikaContent extract(final InputStream in, ContentHandler handler,
                               javax.ws.rs.core.MediaType mtHint, ParseContext context) {
        if (in == null) {
            return null;
        }
        // Create the context up front so that parser selection never hands a null
        // context to Parser.getSupportedTypes()
        if (context == null) {
            context = new ParseContext();
        }
        final Metadata metadata = new Metadata();
        try {
            final MediaType mediaType = resolveMediaType(in, mtHint, metadata);
            if (mediaType != null) {
                metadata.set(Metadata.CONTENT_TYPE, mediaType.toString());
            }

            final Parser parser = selectParser(mediaType, context);
            if (parser == null) {
                return null;
            }

            if (context.get(Parser.class) == null) {
                // to process the embedded attachments
                context.set(Parser.class,
                            parser instanceof AutoDetectParser ? parser : new AutoDetectParser());
            }

            try {
                parser.parse(in, handler, metadata, context);
            } catch (Exception ex) {
                // Starting from Tika 1.6 PDFParser (with other parsers to be updated in the future) will skip
                // the content processing if the content handler is null. This can be used to optimize the
                // extraction process. If we get an exception with a null handler then a given parser is still
                // not ready to accept null handlers so lets retry with IgnoreContentHandler.
                if (handler == null) {
                    // NOTE(review): the retry reads from the same stream the failed attempt
                    // was given; it may already be partially consumed - confirm this is acceptable.
                    handler = new IgnoreContentHandler();
                    parser.parse(in, handler, metadata, context);
                } else {
                    throw ex;
                }
            }
            return new TikaContent(handler, metadata, mediaType);
        } catch (final IOException ex) {
            LOG.log(Level.WARNING, "Unable to extract media type from input stream", ex);
        } catch (final SAXException | TikaException ex) {
            LOG.log(Level.WARNING, "Unable to parse input stream", ex);
        }
        return null;
    }

    /**
     * Determine the media type of the stream: the hint wins when present, otherwise the
     * configured detector is consulted provided the stream supports mark/reset.
     * @return the resolved media type or null if it could not be determined
     */
    private MediaType resolveMediaType(final InputStream in, final javax.ws.rs.core.MediaType mtHint,
                                       final Metadata metadata) throws IOException {
        if (mtHint != null) {
            return MediaType.parse(mtHint.toString());
        }
        if (detector != null && in.markSupported()) {
            return detector.detect(in, metadata);
        }
        return null;
    }

    /**
     * Pick the parser to use: the only configured parser if there is exactly one, otherwise
     * the first parser that supports the media type (or simply the first parser when the
     * media type is unknown).
     * @return the selected parser or null if none is suitable
     */
    private Parser selectParser(final MediaType mediaType, final ParseContext context) {
        if (parsers.size() == 1) {
            return parsers.get(0);
        }
        for (final Parser candidate : parsers) {
            if (mediaType == null || candidate.getSupportedTypes(context).contains(mediaType)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Extract the metadata only from the input stream by parsing it with a null content
     * handler. If a detector is configured it may be run against the input stream in order
     * to determine the type of content.
     * @param in input stream to extract the metadata from
     * @return the extracted metadata container or null if extraction is not possible or was unsuccessful
     */
    public TikaContent extractMetadata(final InputStream in) {
        return extract(in, (ContentHandler)null);
    }

    /**
     * Extract the metadata only from the input stream and copy every metadata property
     * into a SearchBean.
     * @param in input stream to extract the metadata from
     * @return the extracted metadata converted to SearchBean or null if extraction is not possible
     * or was unsuccessful
     */
    public SearchBean extractMetadataToSearchBean(final InputStream in) {
        final TikaContent tc = extractMetadata(in);
        if (tc == null) {
            return null;
        }

        final Metadata metadata = tc.getMetadata();
        final SearchBean bean = new SearchBean();
        for (final String property : metadata.names()) {
            bean.set(property, metadata.get(property));
        }
        return bean;
    }

    /**
     * Extracted content, metadata and media type container.
     * <p>
     * NOTE(review): the class is Serializable but holds the ContentHandler in a
     * non-transient field; serialization presumably only works when the handler in use
     * is itself serializable - confirm.
     */
    public static class TikaContent implements Serializable {
        private static final long serialVersionUID = -1240120543378490963L;
        private final ContentHandler content;
        private final Metadata metadata;
        private final MediaType mediaType;

        public TikaContent(ContentHandler content, Metadata metadata, MediaType mediaType) {
            this.content = content;
            this.metadata = metadata;
            this.mediaType = mediaType;
        }

        /**
         * Return the content cached by ContentHandler
         * @return the content, may be empty or null if a custom non-caching ContentHandler was used
         * to parse the content
         */
        public String getContent() {
            return content instanceof ToTextContentHandler ? content.toString() : null;
        }

        /**
         * Return the metadata
         * @return the metadata
         */
        public Metadata getMetadata() {
            return metadata;
        }

        /**
         * Return the media type of the content, as parsed from the caller's hint or as
         * reported by the detector
         * @return the media type, null if neither a usable hint nor a detection result was available
         */
        public MediaType getMediaType() {
            return mediaType;
        }
    }

    /**
     * Content handler that discards all character data; used as a fallback for parsers
     * that fail when given a null handler.
     */
    private static class IgnoreContentHandler extends ToTextContentHandler {
        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            // Intentionally empty: character content is discarded
        }

        @Override
        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
            // Intentionally empty: whitespace is discarded
        }

        @Override
        public String toString() {
            return "";
        }
    }
}