package com.manning.hsia.dvdstore;
import com.manning.hsia.test.ch13.SearchTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.hibernate.Transaction;
import org.hibernate.search.FullTextSession;
import org.hibernate.search.Search;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.util.PDFTextStripper;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
public class TestPDFTextExtractor extends SearchTestCase {
InputStream istream = null;
private Analyzer analyzer = new StandardAnalyzer();
@Test(groups="ch13")
public void testPDFExtractor() throws Exception {
FullTextSession session = Search.getFullTextSession( openSession() );
Transaction tx = session.beginTransaction();
PDDocument doc;
try {
File f = new File( "ch13/src/com/manning/hsia/dvdstore/file1.pdf" );
istream = new FileInputStream( f.getAbsolutePath() );
PDFParser p = new PDFParser( istream );
p.parse();
doc = p.getPDDocument();
Pdf pdf = getDocument( doc );
closeInputStream( istream );
closeDocument( doc );
pdf.setId(1);
buildIndex( pdf, session, tx );
tx = session.beginTransaction();
QueryParser parser = new QueryParser( "description", analyzer );
Query query = parser.parse( "description:salesman" );
org.hibernate.search.FullTextQuery hibQuery = session.createFullTextQuery( query, Pdf.class );
List results = hibQuery.list();
assert results.size() == 1 : "incorrect result size";
Pdf result = (Pdf) results.get( 0 );
assert result.getAuthor().startsWith( "John Griffin" ) : "incorrect author";
assert result.getDescription().startsWith( "Keanu Reeves" ) : "incorrect description";
for (Object element : session.createQuery( "from " + Pdf.class.getName() ).list()) {
session.delete( element );
}
tx.commit();
}
catch (Exception e) {
e.printStackTrace();
}
finally {
session.close();
}
}
private Pdf getDocument( PDDocument pd ) {
String description;
try {
PDFTextStripper stripper = new PDFTextStripper();
description = stripper.getText( pd );
}
catch (IOException e) {
closeDocument( pd );
throw new PDFExtractorException( "unable to extract text", e );
}
PDDocumentInformation info = pd.getDocumentInformation();
String author = info.getAuthor();
String title = info.getTitle();
String keywords = info.getKeywords();
String subject = info.getSubject();
Pdf doc = new Pdf();
doc.setDescription( description );
doc.setAuthor( author );
doc.setTitle( title );
doc.setKeywords( keywords );
doc.setSubject( subject );
return doc;
}
private void buildIndex( Pdf doc, FullTextSession session, Transaction tx ) {
session.save( doc );
tx.commit();
session.clear();
}
private void closeDocument( PDDocument pd ) {
try {
if ( pd != null ) {
pd.close();
}
}
catch (IOException e) {
// live with it
}
}
private static void closeInputStream( InputStream istream ) {
if ( istream != null ) {
try {
istream.close();
}
catch (IOException e) {
System.out.printf( "unable to close file input stream" );
}
}
}
public class PDFExtractorException extends RuntimeException {
public PDFExtractorException( String msg, Throwable e ) {
super( msg, e );
}
}
protected Class[] getMappings() {
return new Class[]{
Pdf.class
};
}
}