/*
* Hibernate, Relational Persistence for Idiomatic Java
*
* Copyright (c) 2012, Red Hat, Inc. and/or its affiliates or third-party contributors as
* indicated by the @author tags or express copyright attribution
* statements applied by the authors. All third-party contributions are
* distributed under license by Red Hat, Inc.
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.hibernate.search.bridge.builtin;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.URI;
import java.sql.Blob;
import java.sql.SQLException;
import org.apache.lucene.document.Document;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.WriteOutContentHandler;
import org.hibernate.search.bridge.FieldBridge;
import org.hibernate.search.bridge.LuceneOptions;
import org.hibernate.search.bridge.TikaMetadataProcessor;
import org.hibernate.search.bridge.TikaParseContextProvider;
import org.hibernate.search.util.impl.ClassLoaderHelper;
import org.hibernate.search.util.logging.impl.Log;
import org.hibernate.search.util.logging.impl.LoggerFactory;
import static com.google.common.base.Throwables.propagate;
import static org.apache.tika.io.IOUtils.closeQuietly;
/**
* Bridge implementation which uses Apache Tika to extract data from provided input.
*
* @author Hardy Ferentschik
*/
public class TikaBridge implements FieldBridge {
	private static final Log log = LoggerFactory.make();

	// Hook for users to customize/index Tika metadata; never null (defaults to a no-op).
	private TikaMetadataProcessor metadataProcessor;
	// Hook for users to supply a custom Tika ParseContext; never null (defaults to a no-op).
	private TikaParseContextProvider parseContextProvider;

	public TikaBridge() {
		// Install the no-op defaults; callers may override via the setters below.
		setMetadataProcessorClass( null );
		setParseContextProviderClass( null );
	}

	/**
	 * Sets the {@link TikaParseContextProvider} implementation to use for building the Tika
	 * {@code ParseContext} prior to parsing.
	 *
	 * @param parseContextProviderClass a class implementing {@code TikaParseContextProvider},
	 * or {@code null} to use a no-op provider returning an empty {@code ParseContext}
	 */
	public void setParseContextProviderClass(Class<?> parseContextProviderClass) {
		if ( parseContextProviderClass == null ) {
			parseContextProvider = new NoopParseContextProvider();
		}
		else {
			parseContextProvider = ClassLoaderHelper.instanceFromClass(
					TikaParseContextProvider.class,
					parseContextProviderClass,
					// describes the component in instantiation-failure messages
					"Tika parse context provider"
			);
		}
	}

	/**
	 * Sets the {@link TikaMetadataProcessor} implementation used to prepare the Tika
	 * {@code Metadata} before parsing and to optionally index extracted metadata afterwards.
	 *
	 * @param metadataProcessorClass a class implementing {@code TikaMetadataProcessor},
	 * or {@code null} to use a no-op processor
	 */
	public void setMetadataProcessorClass(Class<?> metadataProcessorClass) {
		if ( metadataProcessorClass == null ) {
			metadataProcessor = new NoopTikaMetadataProcessor();
		}
		else {
			metadataProcessor = ClassLoaderHelper.instanceFromClass(
					TikaMetadataProcessor.class,
					metadataProcessorClass,
					// describes the component in instantiation-failure messages
					"Tika metadata processor"
			);
		}
	}

	/**
	 * Extracts text from {@code value} via Tika's {@code AutoDetectParser} and adds it to the
	 * document under {@code name}; then gives the metadata processor a chance to index metadata.
	 *
	 * @param value the data to index; supported types are {@code Blob}, {@code byte[]},
	 * {@code String} (interpreted as a file path) and {@code URI} (interpreted as a file URI)
	 * @throws IllegalArgumentException if {@code value} is {@code null}
	 */
	@Override
	public void set(String name, Object value, Document document, LuceneOptions luceneOptions) {
		if ( value == null ) {
			throw new IllegalArgumentException( "null cannot be passed to Tika bridge" );
		}

		InputStream in = null;
		try {
			in = getInputStreamForData( value );
			Metadata metadata = metadataProcessor.prepareMetadata();
			ParseContext parseContext = parseContextProvider.getParseContext( name, value );

			StringWriter writer = new StringWriter();
			WriteOutContentHandler contentHandler = new WriteOutContentHandler( writer );

			Parser parser = new AutoDetectParser();
			parser.parse( in, contentHandler, metadata, parseContext );
			luceneOptions.addFieldToDocument( name, writer.toString(), document );

			// allow for optional indexing of metadata by the user
			metadataProcessor.set( name, value, document, luceneOptions, metadata );
		}
		catch ( Exception e ) {
			throw propagate( e );
		}
		finally {
			closeQuietly( in );
		}
	}

	// Resolves the supported input types to an InputStream; throws for anything else.
	private InputStream getInputStreamForData(Object object) throws Exception {
		InputStream in;
		if ( object instanceof Blob ) {
			try {
				in = ( ( Blob ) object ).getBinaryStream();
			}
			catch ( SQLException e ) {
				throw log.unableToGetInputStreamFromBlob( e );
			}
		}
		else if ( object instanceof byte[] ) {
			byte[] data = ( byte[] ) object;
			in = new ByteArrayInputStream( data );
		}
		else if ( object instanceof String ) {
			String path = ( String ) object;
			File file = new File( path );
			in = openInputStream( file );
		}
		else if ( object instanceof URI ) {
			URI uri = ( URI ) object;
			File file = new File( uri );
			in = openInputStream( file );
		}
		else {
			throw log.unsupportedTikaBridgeType();
		}
		return in;
	}

	// Opens a FileInputStream after validating existence, readability and that it's not a directory.
	private FileInputStream openInputStream(File file) throws IOException {
		if ( file.exists() ) {
			if ( file.isDirectory() ) {
				throw log.fileIsADirectory( file.toString() );
			}
			if ( !file.canRead() ) {
				throw log.fileIsNotReadable( file.toString() );
			}
		}
		else {
			throw log.fileDoesNotExist( file.toString() );
		}
		return new FileInputStream( file );
	}

	// Default metadata processor: fresh empty Metadata, no metadata indexing.
	private static class NoopTikaMetadataProcessor implements TikaMetadataProcessor {
		@Override
		public Metadata prepareMetadata() {
			return new Metadata();
		}

		@Override
		public void set(String name, Object value, Document document, LuceneOptions luceneOptions, Metadata metadata) {
		}
	}

	// Default parse context provider: fresh empty ParseContext for every call.
	private static class NoopParseContextProvider implements TikaParseContextProvider {
		@Override
		public ParseContext getParseContext(String name, Object value) {
			return new ParseContext();
		}
	}
}