/*
* Hibernate, Relational Persistence for Idiomatic Java
*
* Copyright (c) 2012, Red Hat, Inc. and/or its affiliates or third-party contributors as
* indicated by the @author tags or express copyright attribution
* statements applied by the authors. All third-party contributions are
* distributed under license by Red Hat, Inc.
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.hibernate.search.test.bridge.builtin;
import java.io.File;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.junit.Before;
import org.junit.Test;
import org.hibernate.search.SearchException;
import org.hibernate.search.annotations.Store;
import org.hibernate.search.bridge.LuceneOptions;
import org.hibernate.search.bridge.TikaMetadataProcessor;
import org.hibernate.search.bridge.TikaParseContextProvider;
import org.hibernate.search.bridge.builtin.TikaBridge;
import org.hibernate.search.engine.impl.LuceneOptionsImpl;
import static junit.framework.Assert.assertEquals;
import static junit.framework.Assert.assertTrue;
import static junit.framework.Assert.fail;
/**
* @author Hardy Ferentschik
*/
public class TikaBridgeTest {
private static final String TEST_DOCUMENT_PDF = "/org/hibernate/search/test/bridge/builtin/test-document-1.pdf";
private static final String PATH_TO_TEST_DOCUMENT_PDF;
static {
try {
File pdfFile = new File( TikaBridgeTest.class.getResource( TEST_DOCUMENT_PDF ).toURI() );
PATH_TO_TEST_DOCUMENT_PDF = pdfFile.getAbsolutePath();
}
catch ( URISyntaxException e ) {
throw new RuntimeException( "Unable to determine file path for test document" );
}
}
private final String testFieldName = "content";
private TikaBridge bridgeUnderTest;
private Document testDocument;
private LuceneOptions options;
@Before
public void setUp() {
bridgeUnderTest = new TikaBridge();
testDocument = new Document();
options = new LuceneOptionsImpl( Store.YES, Field.Index.ANALYZED, Field.TermVector.NO, 0f );
CustomTikaMetadataProcessor.invocationCount = 0;
CustomTikaParseContextProvider.invocationCount = 0;
}
@Test(expected = IllegalArgumentException.class)
public void testNullDataThrowsException() {
bridgeUnderTest.set( testFieldName, null, testDocument, options );
}
@Test
public void testPdfToString() throws Exception {
URI pdfUri = TikaBridgeTest.class.getResource( TEST_DOCUMENT_PDF ).toURI();
bridgeUnderTest.set( testFieldName, pdfUri, testDocument, options );
assertEquals(
"Wrong extracted text",
"Hibernate Search pdf test document",
testDocument.get( testFieldName ).trim()
);
}
@Test
public void testUnknownTikaMetadataProcessor() throws Exception {
try {
bridgeUnderTest.setMetadataProcessorClass( this.getClass() );
fail();
}
catch ( SearchException e ) {
assertEquals(
"Wrong error message",
"Wrong configuration of Tika parse context provider: class org.hibernate.search.test.bridge.builtin.TikaBridgeTest does not implement interface org.hibernate.search.bridge.TikaMetadataProcessor",
e.getMessage()
);
}
}
@Test
public void testPrepareMetadata() {
bridgeUnderTest.setMetadataProcessorClass( CustomTikaMetadataProcessor.class );
bridgeUnderTest.set( testFieldName, PATH_TO_TEST_DOCUMENT_PDF, testDocument, options );
assertEquals(
"The set method of the custom metadata processor should have been called",
1,
CustomTikaMetadataProcessor.invocationCount
);
}
@Test
public void testIndexingMetadata() {
bridgeUnderTest.setMetadataProcessorClass( CustomTikaMetadataProcessor.class );
bridgeUnderTest.set( testFieldName, PATH_TO_TEST_DOCUMENT_PDF, testDocument, options );
assertEquals(
"The content type should have been indexed",
"application/pdf",
testDocument.get( "type" )
);
}
@Test
public void testUnknownTikaParseContextProvider() throws Exception {
try {
bridgeUnderTest.setParseContextProviderClass( this.getClass() );
fail();
}
catch ( SearchException e ) {
assertEquals(
"Wrong error message",
"Wrong configuration of Tika metadata processor: class org.hibernate.search.test.bridge.builtin.TikaBridgeTest does not implement interface org.hibernate.search.bridge.TikaParseContextProvider",
e.getMessage()
);
}
}
@Test
public void testCustomTikaParseContextProvider() throws Exception {
bridgeUnderTest.setParseContextProviderClass( CustomTikaParseContextProvider.class );
bridgeUnderTest.set( testFieldName, PATH_TO_TEST_DOCUMENT_PDF, testDocument, options );
assertEquals(
"The getParseContext method of the custom parse context provider should have been called",
1,
CustomTikaParseContextProvider.invocationCount
);
}
@Test
public void testInvalidPath() throws Exception {
try {
bridgeUnderTest.set( testFieldName, "/foo", testDocument, options );
} catch ( SearchException e ) {
assertTrue( "Wrong error type", e.getMessage().startsWith( "HSEARCH000152" ) );
}
}
public static class CustomTikaMetadataProcessor implements TikaMetadataProcessor {
public static int invocationCount = 0;
@Override
public Metadata prepareMetadata() {
Metadata meta = new Metadata();
meta.add( Metadata.RESOURCE_NAME_KEY, PATH_TO_TEST_DOCUMENT_PDF );
return meta;
}
@Override
public void set(String name, Object value, Document document, LuceneOptions luceneOptions, Metadata metadata) {
invocationCount++;
assertEquals(
"Metadata.RESOURCE_NAME_KEY should be set in the metadata",
PATH_TO_TEST_DOCUMENT_PDF,
metadata.get( Metadata.RESOURCE_NAME_KEY )
);
// indexing the discovered content type
luceneOptions.addFieldToDocument( "type", metadata.get( Metadata.CONTENT_TYPE ), document );
}
}
public static class CustomTikaParseContextProvider implements TikaParseContextProvider {
public static int invocationCount = 0;
@Override
public ParseContext getParseContext(String name, Object value) {
invocationCount++;
return new ParseContext();
}
}
}