/*
* Hibernate Search, full-text search for your domain model
*
* License: GNU Lesser General Public License (LGPL), version 2.1 or later
* See the lgpl.txt file in the root directory or <http://www.gnu.org/licenses/lgpl-2.1.html>.
*/
package org.hibernate.search.test.bridge.tika;
import static org.junit.Assert.assertEquals;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Files;
import java.sql.Blob;
import java.util.List;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.hibernate.CacheMode;
import org.hibernate.FlushMode;
import org.hibernate.ScrollMode;
import org.hibernate.ScrollableResults;
import org.hibernate.Session;
import org.hibernate.Transaction;
import org.hibernate.search.FullTextSession;
import org.hibernate.search.Search;
import org.hibernate.search.cfg.Environment;
import org.hibernate.search.test.SearchTestBase;
import org.hibernate.search.test.util.impl.ClasspathResourceAsFile;
import org.hibernate.search.testsupport.TestConstants;
import org.junit.Rule;
import org.junit.Test;
/**
* @author Hardy Ferentschik
*/
public class TikaBridgeInputTypeTest extends SearchTestBase {
private static final String TEST_DOCUMENT_PDF_1 = "/org/hibernate/search/test/bridge/tika/test-document-1.pdf";
private static final String TEST_DOCUMENT_PDF_2 = "/org/hibernate/search/test/bridge/tika/test-document-2.pdf";
@Rule
public ClasspathResourceAsFile testDocumentPdf1 = new ClasspathResourceAsFile( getClass(), TEST_DOCUMENT_PDF_1 );
@Rule
public ClasspathResourceAsFile testDocumentPdf2 = new ClasspathResourceAsFile( getClass(), TEST_DOCUMENT_PDF_2 );
@Test
public void testDefaultTikaBridgeWithListOfString() throws Exception {
try ( Session session = openSession() ) {
String content1 = testDocumentPdf1.get().getAbsolutePath();
String content2 = testDocumentPdf2.get().getAbsolutePath();
persistBook( session, new Book( content1, content2 ) );
indexBook( session );
List<Book> resultWithLucene = search( session, "contentAsListOfString", "Lucene" );
assertEquals( "there should be a match", 1, resultWithLucene.size() );
List<Book> resultWithTika = search( session, "contentAsListOfString", "Tika" );
assertEquals( "there should be a match", 1, resultWithTika.size() );
}
}
private List<Book> search(Session session, String field, String keyword) throws ParseException {
FullTextSession fullTextSession = Search.getFullTextSession( session );
Transaction transaction = fullTextSession.beginTransaction();
QueryParser parser = new QueryParser( field, TestConstants.standardAnalyzer );
Query query = parser.parse( keyword );
@SuppressWarnings("unchecked")
List<Book> result = fullTextSession.createFullTextQuery( query ).list();
transaction.commit();
fullTextSession.clear();
return result;
}
@Test
public void testDefaultTikaBridgeWithBlob() throws Exception {
try ( Session session = openSession() ) {
Blob content = dataAsBlob( testDocumentPdf1.get(), session );
persistBook( session, new Book( content ) );
persistBook( session, new Book() );
// we have to index manually. Using the Blob (streaming approach) the indexing would try to re-read the
// input stream of the blob after it was persisted into the database
indexBook( session );
searchBook( session, "contentAsBlob" );
}
}
@Test
public void testDefaultTikaBridgeWithByteArray() throws Exception {
try ( Session session = openSession() ) {
byte[] content = dataAsBytes( testDocumentPdf1.get() );
persistBook( session, new Book( content ) );
persistBook( session, new Book() );
indexBook( session );
searchBook( session, "contentAsBytes" );
}
}
@Test
public void testDefaultTikaBridgeWithURI() throws Exception {
try ( Session session = openSession() ) {
URI content = testDocumentPdf1.get().toURI();
persistBook( session, new Book( content ) );
persistBook( session, new Book() );
indexBook( session );
searchBook( session, "contentAsURI" );
}
}
@SuppressWarnings("unchecked")
private void searchBook(Session session, String field) throws ParseException {
FullTextSession fullTextSession = Search.getFullTextSession( session );
Transaction transaction = fullTextSession.beginTransaction();
QueryParser parser = new QueryParser(
field,
TestConstants.standardAnalyzer
);
Query query = parser.parse( "foo" );
List<Book> result = fullTextSession.createFullTextQuery( query ).list();
assertEquals( "there should be no match", 0, result.size() );
query = parser.parse( "Lucene" );
result = fullTextSession.createFullTextQuery( query ).list();
assertEquals( "there should be match", 1, result.size() );
query = parser.parse( "<NULL>" );
result = fullTextSession.createFullTextQuery( query ).list();
assertEquals( "there should be match", 1, result.size() );
result = fullTextSession.createFullTextQuery( new MatchAllDocsQuery() ).list();
assertEquals( "there should be match", 2, result.size() );
transaction.commit();
fullTextSession.clear();
}
private void persistBook(Session session, Book book) throws IOException {
Transaction tx = session.beginTransaction();
session.save( book );
session.flush();
tx.commit();
session.clear();
}
void indexBook(Session session) {
FullTextSession fullTextSession = org.hibernate.search.Search.getFullTextSession( session );
fullTextSession.setFlushMode( FlushMode.MANUAL );
fullTextSession.setCacheMode( CacheMode.IGNORE );
Transaction transaction = fullTextSession.beginTransaction();
int BATCH_SIZE = 10;
ScrollableResults results = fullTextSession.createCriteria( Book.class )
.setFetchSize( BATCH_SIZE )
.scroll( ScrollMode.FORWARD_ONLY );
int index = 0;
while ( results.next() ) {
index++;
fullTextSession.index( results.get( 0 ) );
if ( index % BATCH_SIZE == 0 ) {
fullTextSession.flushToIndexes();
fullTextSession.clear();
}
}
fullTextSession.flush();
transaction.commit();
fullTextSession.clear();
}
@Override
public Class<?>[] getAnnotatedClasses() {
return new Class[]{
Book.class
};
}
@Override
public void configure(Map<String, Object> cfg) {
super.configure( cfg );
cfg.put( Environment.INDEXING_STRATEGY, "manual" );
}
private Blob dataAsBlob(File file, Session session) throws IOException {
FileInputStream in = FileUtils.openInputStream( file );
return session.getLobHelper().createBlob( in, file.length() );
}
private byte[] dataAsBytes(File file) throws IOException {
return Files.readAllBytes( file.toPath() );
}
}