/*
* Hibernate Search, full-text search for your domain model
*
* License: GNU Lesser General Public License (LGPL), version 2.1 or later
* See the lgpl.txt file in the root directory or <http://www.gnu.org/licenses/lgpl-2.1.html>.
*/
package org.hibernate.search.test.bridge.builtin;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.net.URI;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.hibernate.search.annotations.Store;
import org.hibernate.search.bridge.LuceneOptions;
import org.hibernate.search.bridge.TikaMetadataProcessor;
import org.hibernate.search.bridge.TikaParseContextProvider;
import org.hibernate.search.bridge.builtin.TikaBridge;
import org.hibernate.search.engine.impl.LuceneOptionsImpl;
import org.hibernate.search.engine.metadata.impl.BackReference;
import org.hibernate.search.engine.metadata.impl.DocumentFieldMetadata;
import org.hibernate.search.engine.metadata.impl.DocumentFieldPath;
import org.hibernate.search.exception.SearchException;
import org.hibernate.search.test.util.impl.ClasspathResourceAsFile;
import org.hibernate.search.engine.metadata.impl.PropertyMetadata;
import org.hibernate.search.engine.metadata.impl.TypeMetadata;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
/**
 * Tests for {@link TikaBridge}: text extraction from a PDF test document, the custom
 * {@link TikaMetadataProcessor} and {@link TikaParseContextProvider} hooks, and the errors
 * reported for an invalid configuration or an invalid path.
 *
 * @author Hardy Ferentschik
 */
public class TikaBridgeTest {

	/** Classpath location of the PDF used as indexing input. */
	private static final String TEST_DOCUMENT_PDF = "/org/hibernate/search/test/bridge/builtin/test-document-1.pdf";

	/** JUnit rule giving the tests access to the classpath PDF resource. */
	@Rule
	public ClasspathResourceAsFile testDocumentPdf = new ClasspathResourceAsFile( getClass(), TEST_DOCUMENT_PDF );

	/** Name of the document field the bridge is asked to populate. */
	private final String testFieldName = "content";

	private TikaBridge bridgeUnderTest;
	private Document testDocument;
	private LuceneOptions options;

	/**
	 * Creates a fresh bridge, an empty Lucene document and the {@link LuceneOptions} passed to the
	 * bridge, and resets the static invocation counters of the custom helper classes so that each
	 * test starts counting from zero.
	 */
	@Before
	public void setUp() {
		bridgeUnderTest = new TikaBridge();
		testDocument = new Document();
		DocumentFieldMetadata fieldMetadata =
				new DocumentFieldMetadata.Builder(
						new BackReference<TypeMetadata>(),
						new BackReference<PropertyMetadata>(), null,
						new DocumentFieldPath( "", "" ), // No field path
						Store.YES, Field.Index.ANALYZED, Field.TermVector.NO
				)
						.boost( 0F )
						.build();
		options = new LuceneOptionsImpl( fieldMetadata, 1f, 1f );

		// The counters are static, so they would otherwise leak between test methods
		CustomTikaMetadataProcessor.invocationCount = 0;
		CustomTikaParseContextProvider.invocationCount = 0;
	}

	/**
	 * Passes the test PDF to the bridge as a {@link URI} and checks the (trimmed) text stored in
	 * the test field.
	 */
	@Test
	public void testPdfToString() throws Exception {
		URI pdfUri = testDocumentPdf.get().toURI();
		bridgeUnderTest.set( testFieldName, pdfUri, testDocument, options );
		assertEquals(
				"Wrong extracted text",
				"Hibernate Search pdf test document",
				testDocument.get( testFieldName ).trim()
		);
	}

	/**
	 * Configuring a metadata processor class which does not implement
	 * {@link TikaMetadataProcessor} must be rejected with a {@link SearchException}.
	 */
	@Test
	public void testUnknownTikaMetadataProcessor() throws Exception {
		try {
			bridgeUnderTest.setMetadataProcessorClass( this.getClass() );
			fail( "Expected a SearchException for a class not implementing TikaMetadataProcessor" );
		}
		catch (SearchException e) {
			// NOTE(review): the expected message labels the component "Tika parse context provider"
			// although a metadata processor class is being set. Presumably this mirrors the message
			// currently produced by the bridge - confirm before changing either side.
			assertEquals(
					"Wrong error message",
					"Wrong configuration of Tika parse context provider: class org.hibernate.search.test.bridge.builtin.TikaBridgeTest does not implement interface org.hibernate.search.bridge.TikaMetadataProcessor",
					e.getMessage()
			);
		}
	}

	/**
	 * With a custom metadata processor configured, indexing the test document (given by its path)
	 * must invoke the processor's {@code set} method exactly once.
	 */
	@Test
	public void testPrepareMetadata() {
		bridgeUnderTest.setMetadataProcessorClass( CustomTikaMetadataProcessor.class );
		bridgeUnderTest.set( testFieldName, testDocumentPdf.get().getPath(), testDocument, options );
		assertEquals(
				"The set method of the custom metadata processor should have been called",
				1,
				CustomTikaMetadataProcessor.invocationCount
		);
	}

	/**
	 * The custom metadata processor adds the content type to the document under the field
	 * {@code "type"}; checks that the value is {@code application/pdf} for the test document.
	 */
	@Test
	public void testIndexingMetadata() {
		bridgeUnderTest.setMetadataProcessorClass( CustomTikaMetadataProcessor.class );
		bridgeUnderTest.set( testFieldName, testDocumentPdf.get().getPath(), testDocument, options );
		assertEquals(
				"The content type should have been indexed",
				"application/pdf",
				testDocument.get( "type" )
		);
	}

	/**
	 * Configuring a parse context provider class which does not implement
	 * {@link TikaParseContextProvider} must be rejected with a {@link SearchException}.
	 */
	@Test
	public void testUnknownTikaParseContextProvider() throws Exception {
		try {
			bridgeUnderTest.setParseContextProviderClass( this.getClass() );
			fail( "Expected a SearchException for a class not implementing TikaParseContextProvider" );
		}
		catch (SearchException e) {
			// NOTE(review): the expected message labels the component "Tika metadata processor"
			// although a parse context provider class is being set. Presumably this mirrors the
			// message currently produced by the bridge - confirm before changing either side.
			assertEquals(
					"Wrong error message",
					"Wrong configuration of Tika metadata processor: class org.hibernate.search.test.bridge.builtin.TikaBridgeTest does not implement interface org.hibernate.search.bridge.TikaParseContextProvider",
					e.getMessage()
			);
		}
	}

	/**
	 * With a custom parse context provider configured, indexing the test document (given by its
	 * path) must invoke the provider's {@code getParseContext} method exactly once.
	 */
	@Test
	public void testCustomTikaParseContextProvider() throws Exception {
		bridgeUnderTest.setParseContextProviderClass( CustomTikaParseContextProvider.class );
		bridgeUnderTest.set( testFieldName, testDocumentPdf.get().getPath(), testDocument, options );
		assertEquals(
				"The getParseContext method of the custom parse context provider should have been called",
				1,
				CustomTikaParseContextProvider.invocationCount
		);
	}

	/**
	 * Indexing the path {@code "/foo"} is expected to fail with a {@link SearchException} whose
	 * message starts with the code {@code HSEARCH000152}.
	 */
	@Test
	public void testInvalidPath() throws Exception {
		try {
			bridgeUnderTest.set( testFieldName, "/foo", testDocument, options );
			// Without this the test would silently pass when no exception is raised at all
			fail( "Expected a SearchException for an invalid path" );
		}
		catch (SearchException e) {
			assertTrue( "Wrong error type", e.getMessage().startsWith( "HSEARCH000152" ) );
		}
	}

	/**
	 * Metadata processor used by the tests: it seeds the metadata with a resource name, counts the
	 * calls to {@code set} and indexes the content type found in the metadata.
	 */
	public static class CustomTikaMetadataProcessor implements TikaMetadataProcessor {

		/** Number of {@code set} calls so far; reset by the test's {@code setUp()}. */
		public static int invocationCount = 0;

		/**
		 * @return a new {@link Metadata} instance carrying the resource name {@code "foo"}
		 */
		@Override
		public Metadata prepareMetadata() {
			Metadata meta = new Metadata();
			meta.add( Metadata.RESOURCE_NAME_KEY, "foo" );
			return meta;
		}

		/**
		 * Records the invocation, asserts that the resource name added in
		 * {@link #prepareMetadata()} is present in the given metadata, and adds the content type
		 * from the metadata to the document under the field {@code "type"}.
		 */
		@Override
		public void set(String name, Object value, Document document, LuceneOptions luceneOptions, Metadata metadata) {
			invocationCount++;
			assertEquals(
					"Metadata.RESOURCE_NAME_KEY should be set in the metadata",
					"foo",
					metadata.get( Metadata.RESOURCE_NAME_KEY )
			);
			// indexing the discovered content type
			luceneOptions.addFieldToDocument( "type", metadata.get( Metadata.CONTENT_TYPE ), document );
		}
	}

	/**
	 * Parse context provider used by the tests: it counts its invocations and returns an empty
	 * {@link ParseContext}.
	 */
	public static class CustomTikaParseContextProvider implements TikaParseContextProvider {

		/** Number of {@code getParseContext} calls so far; reset by the test's {@code setUp()}. */
		public static int invocationCount = 0;

		/**
		 * @return a new, unconfigured {@link ParseContext}
		 */
		@Override
		public ParseContext getParseContext(String name, Object value) {
			invocationCount++;
			return new ParseContext();
		}
	}
}