/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.cxf.jaxrs.ext.search.tika; import java.io.InputStream; import java.util.Date; import java.util.List; import javax.ws.rs.ext.ParamConverterProvider; import org.apache.cxf.common.util.StringUtils; import org.apache.cxf.jaxrs.ext.search.ParamConverterUtils; import org.apache.cxf.jaxrs.ext.search.tika.TikaContentExtractor.TikaContent; import org.apache.lucene.document.Document; import org.apache.lucene.document.DoubleField; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.FloatField; import org.apache.lucene.document.IntField; import org.apache.lucene.document.LongField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.Parser; import org.apache.tika.sax.ToTextContentHandler; public class TikaLuceneContentExtractor { private final LuceneDocumentMetadata defaultDocumentMetadata; private final TikaContentExtractor extractor; /** * Create new Tika-based content extractor using the provided parser instance. * @param parser parser instance */ public TikaLuceneContentExtractor(final Parser parser) { this(parser, true); } /** * Create new Tika-based content extractor using the provided parser instance and * optional media type validation. If validation is enabled, the implementation * will try to detect the media type of the input and validate it against media typesthis.contentFieldName * supported by the parser. * @param parser parser instance * @param validateMediaType enabled or disable media type validation */ public TikaLuceneContentExtractor(final Parser parser, final boolean validateMediaType) { this(parser, validateMediaType, new LuceneDocumentMetadata()); } /** * Create new Tika-based content extractor using the provided parser instance and * optional media type validation. If validation is enabled, the implementation * will try to detect the media type of the input and validate it against media types * supported by the parser. * @param parser parser instancethis.contentFieldName * @param documentMetadata documentMetadata */ public TikaLuceneContentExtractor(final Parser parser, final LuceneDocumentMetadata documentMetadata) { this(parser, false, new LuceneDocumentMetadata()); } /** * Create new Tika-based content extractor using the provided parser instance and * optional media type validation. If validation is enabled, the implementation * will try to detect the media type of the input and validate it against media types * supported by the parser. * @param parser parser instancethis.contentFieldName * @param validateMediaType enabled or disable media type validation * @param documentMetadata documentMetadata */ public TikaLuceneContentExtractor(final Parser parser, final boolean validateMediaType, final LuceneDocumentMetadata documentMetadata) { this.extractor = new TikaContentExtractor(parser, validateMediaType); this.defaultDocumentMetadata = documentMetadata; } /** * Create new Tika-based content extractor using the provided parser instance and * optional media type validation. If validation is enabled, the implementation * will try to detect the media type of the input and validate it against media types * supported by the parser. * @param parser parser instancethis.contentFieldName * @param validateMediaType enabled or disable media type validation * @param documentMetadata documentMetadata */ public TikaLuceneContentExtractor(final List<Parser> parsers, final LuceneDocumentMetadata documentMetadata) { this.extractor = new TikaContentExtractor(parsers); this.defaultDocumentMetadata = documentMetadata; } /** * Extract the content and metadata from the input stream. Depending on media type validation, * the detector could be run against input stream in order to ensure that parser supports this * type of content. * @param in input stream to extract the content and metadata from * @return the extracted document or null if extraction is not possible or was unsuccessful */ public Document extract(final InputStream in) { return extractAll(in, null, true, true); } /** * Extract the content and metadata from the input stream. Depending on media type validation, * the detector could be run against input stream in order to ensure that parser supports this * type of content. * @param in input stream to extract the content and metadata from * @param documentMetadata documentMetadata * @return the extracted document or null if extraction is not possible or was unsuccessful */ public Document extract(final InputStream in, final LuceneDocumentMetadata documentMetadata) { return extractAll(in, documentMetadata, true, true); } /** * Extract the content only from the input stream. Depending on media type validation, * the detector could be run against input stream in order to ensure that parser supports this * type of content. * @param in input stream to extract the content from * @return the extracted document or null if extraction is not possible or was unsuccessful */ public Document extractContent(final InputStream in) { return extractAll(in, null, true, false); } /** * Extract the metadata only from the input stream. Depending on media type validation, * the detector could be run against input stream in order to ensure that parser supports this * type of content. * @param in input stream to extract the metadata from * @return the extracted document or null if extraction is not possible or was unsuccessful */ public Document extractMetadata(final InputStream in) { return extractAll(in, null, false, true); } /** * Extract the metadata only from the input stream. Depending on media type validation, * the detector could be run against input stream in order to ensure that parser supports this * type of content. * @param in input stream to extract the metadata from * @param documentMetadata documentMetadata * @return the extracted document or null if extraction is not possible or was unsuccessful */ public Document extractMetadata(final InputStream in, final LuceneDocumentMetadata documentMetadata) { return extractAll(in, documentMetadata, false, true); } private Document extractAll(final InputStream in, LuceneDocumentMetadata documentMetadata, boolean extractContent, boolean extractMetadata) { TikaContent content = extractor.extract(in, extractContent ? new ToTextContentHandler() : null); if (content == null) { return null; } final Document document = new Document(); if (documentMetadata == null) { documentMetadata = defaultDocumentMetadata; } if (content.getContent() != null) { document.add(getContentField(documentMetadata, content.getContent())); } if (extractMetadata) { Metadata metadata = content.getMetadata(); for (final String property: metadata.names()) { document.add(getField(documentMetadata, property, metadata.get(property))); } } if (!StringUtils.isEmpty(documentMetadata.getSource())) { document.add(new StringField(documentMetadata.getSourceFieldName(), documentMetadata.getSource(), Store.YES)); } return document; } private static Field getContentField(final LuceneDocumentMetadata documentMetadata, final String content) { return new TextField(documentMetadata.getContentFieldName(), content, Store.YES); } private static Field getField(final LuceneDocumentMetadata documentMetadata, final String name, final String value) { final Class< ? > type = documentMetadata.getFieldType(name); final ParamConverterProvider provider = documentMetadata.getFieldTypeConverter(); if (type != null) { if (Number.class.isAssignableFrom(type)) { if (Double.class.isAssignableFrom(type)) { return new DoubleField(name, ParamConverterUtils.getValue(Double.class, provider, value), Store.YES); } else if (Float.class.isAssignableFrom(type)) { return new FloatField(name, ParamConverterUtils.getValue(Float.class, provider, value), Store.YES); } else if (Long.class.isAssignableFrom(type)) { return new LongField(name, ParamConverterUtils.getValue(Long.class, provider, value), Store.YES); } else if (Integer.class.isAssignableFrom(type) || Byte.class.isAssignableFrom(type)) { return new IntField(name, ParamConverterUtils.getValue(Integer.class, provider, value), Store.YES); } } else if (Date.class.isAssignableFrom(type)) { final Date date = ParamConverterUtils.getValue(Date.class, provider, value); Field field = null; if (date != null) { field = new StringField(name, ParamConverterUtils.getString(Date.class, provider, date), Store.YES); } else { field = new StringField(name, value, Store.YES); } return field; } } return new StringField(name, value, Store.YES); } }