/* * Copyright 2014 JBoss Inc * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.artificer.repository.hibernate.query; import org.apache.lucene.document.Document; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.WriteOutContentHandler; import org.hibernate.search.bridge.FieldBridge; import org.hibernate.search.bridge.LuceneOptions; import org.hibernate.search.bridge.TikaMetadataProcessor; import org.hibernate.search.bridge.TikaParseContextProvider; import org.hibernate.search.util.impl.ClassLoaderHelper; import org.hibernate.search.util.logging.impl.Log; import org.hibernate.search.util.logging.impl.LoggerFactory; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.InputStream; import java.io.StringWriter; import java.net.URI; import java.sql.Blob; import java.sql.SQLException; import static org.apache.tika.io.IOUtils.closeQuietly; /** * This is nearly an exact copy of Hibernate Search's built-in TikaBridge. However, due to a classloading * issue (HSEARCH-1885), the Tika jars are not visible to WF/EAP's Search module. So, we're providing it here... * * Also note that #set gracefully handles null values. Since ArtificerArtifact's 'content' * and 'contentPath' are mutually exclusive, one will always have a null. 
*
 * <p>Supported value types are {@link Blob}, {@code byte[]}, {@link String} (treated as a file system
 * path) and {@link URI} (converted with {@code new File(URI)}). A failure while parsing or indexing
 * the content is logged as a warning instead of being propagated.
 *
 * @author Brett Meyer.
 */
public class ArtificerTikaBridge implements FieldBridge {

    private static final Log log = LoggerFactory.make();

    // Expensive, so only do it once...
    private static final Parser PARSER = new AutoDetectParser();

    // The no-op defaults are assigned here rather than through the public setters, so that the
    // constructor never invokes an overridable method on a partially constructed instance.
    private TikaMetadataProcessor metadataProcessor = new NoopTikaMetadataProcessor();

    private TikaParseContextProvider parseContextProvider = new NoopParseContextProvider();

    /**
     * Creates a bridge that uses the no-op metadata processor and the no-op parse context provider.
     */
    public ArtificerTikaBridge() {
    }

    /**
     * Selects the provider that supplies the Tika {@link ParseContext} for each parsed value.
     *
     * @param parseContextProviderClass the {@link TikaParseContextProvider} implementation to
     *        instantiate, or {@code null} to fall back to the no-op provider
     */
    public void setParseContextProviderClass(Class<?> parseContextProviderClass) {
        if ( parseContextProviderClass == null ) {
            parseContextProvider = new NoopParseContextProvider();
            return;
        }
        // The description is only used for error reporting; it must name the component being loaded.
        parseContextProvider = ClassLoaderHelper.instanceFromClass(
                TikaParseContextProvider.class,
                parseContextProviderClass,
                "Tika parse context provider"
        );
    }

    /**
     * Selects the processor that prepares the Tika {@link Metadata} and optionally indexes it.
     *
     * @param metadataProcessorClass the {@link TikaMetadataProcessor} implementation to
     *        instantiate, or {@code null} to fall back to the no-op processor
     */
    public void setMetadataProcessorClass(Class<?> metadataProcessorClass) {
        if ( metadataProcessorClass == null ) {
            metadataProcessor = new NoopTikaMetadataProcessor();
            return;
        }
        // The description is only used for error reporting; it must name the component being loaded.
        metadataProcessor = ClassLoaderHelper.instanceFromClass(
                TikaMetadataProcessor.class,
                metadataProcessorClass,
                "Tika metadata processor"
        );
    }

    /**
     * Extracts the text of {@code value} with Tika and adds it to {@code document} under
     * {@code name}, then hands the collected metadata to the configured metadata processor.
     *
     * <p>A {@code null} value is silently ignored. Any exception raised while parsing or indexing
     * is logged as a warning and not rethrown. Exceptions raised while opening the input stream
     * (unsupported type, missing or unreadable file, Blob access failure) are propagated.
     *
     * @param name the field name to index the extracted text under
     * @param value the content to parse; see the class description for the supported types
     * @param document the Lucene document being built
     * @param luceneOptions the indexing options used to add the field
     */
    @Override
    public void set(String name, Object value, Document document, LuceneOptions luceneOptions) {
        if ( value == null ) {
            // Unlike the stock TikaBridge, a null is not an error here: the two bridged properties
            // are mutually exclusive, so one of them is always null.
            return;
        }

        InputStream in = getInputStreamForData( value );
        try {
            Metadata metadata = metadataProcessor.prepareMetadata();
            ParseContext parseContext = parseContextProvider.getParseContext( name, value );

            StringWriter writer = new StringWriter();
            WriteOutContentHandler contentHandler = new WriteOutContentHandler( writer );

            PARSER.parse( in, contentHandler, metadata, parseContext );
            luceneOptions.addFieldToDocument( name, writer.toString(), document );

            // allow for optional indexing of metadata by the user
            metadataProcessor.set( name, value, document, luceneOptions, metadata );
        }
        catch ( Exception e ) {
            // Deliberately best-effort: an unparseable document must not abort indexing.
            log.warn( "Tika was unable to parse the document -- full-text search may not work properly.", e );
        }
        finally {
            closeQuietly( in );
        }
    }

    /**
     * Opens a stream over the bridged value.
     *
     * @param object the value to read: a {@link Blob}, a {@code byte[]}, a {@link String} path or
     *        a {@link URI}
     * @return a stream over the value's content; the caller is responsible for closing it
     */
    private InputStream getInputStreamForData(Object object) {
        if ( object instanceof Blob ) {
            try {
                return ( (Blob) object ).getBinaryStream();
            }
            catch ( SQLException e ) {
                throw log.unableToGetInputStreamFromBlob( e );
            }
        }
        if ( object instanceof byte[] ) {
            return new ByteArrayInputStream( (byte[]) object );
        }
        if ( object instanceof String ) {
            return openInputStream( new File( (String) object ) );
        }
        if ( object instanceof URI ) {
            return openInputStream( new File( (URI) object ) );
        }
        throw log.unsupportedTikaBridgeType( object != null ? object.getClass() : null );
    }

    /**
     * Opens {@code file} for reading after checking that it exists, is not a directory and is
     * readable; each failed check raises the corresponding exception from the logger.
     *
     * @param file the file to open
     * @return an open stream over the file
     */
    private FileInputStream openInputStream(File file) {
        if ( !file.exists() ) {
            throw log.fileDoesNotExist( file.toString() );
        }
        if ( file.isDirectory() ) {
            throw log.fileIsADirectory( file.toString() );
        }
        if ( !file.canRead() ) {
            throw log.fileIsNotReadable( file.toString() );
        }
        try {
            return new FileInputStream( file );
        }
        catch ( FileNotFoundException e ) {
            // The file can still disappear between the checks above and the open.
            throw log.fileDoesNotExist( file.toString() );
        }
    }

    /**
     * Default metadata processor: supplies empty metadata and indexes none of it.
     */
    private static class NoopTikaMetadataProcessor implements TikaMetadataProcessor {
        @Override
        public Metadata prepareMetadata() {
            return new Metadata();
        }

        @Override
        public void set(String name, Object value, Document document, LuceneOptions luceneOptions, Metadata metadata) {
            // intentionally empty: no metadata is indexed by default
        }
    }

    /**
     * Default parse context provider: supplies a fresh, empty {@link ParseContext}.
     */
    private static class NoopParseContextProvider implements TikaParseContextProvider {
        @Override
        public ParseContext getParseContext(String name, Object value) {
            return new ParseContext();
        }
    }
}