package com.manning.hsia.dvdstore; import com.manning.hsia.test.ch13.SearchTestCase; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.Query; import org.hibernate.Transaction; import org.hibernate.search.FullTextSession; import org.hibernate.search.Search; import org.hibernate.search.store.FSDirectoryProvider; import org.pdfbox.searchengine.lucene.LucenePDFDocument; import org.testng.annotations.Test; import java.io.*; import java.util.List; // this is testLuceneToPdfDoc public class TestPdfToDoc extends SearchTestCase { private Analyzer analyzer = new StandardAnalyzer(); @Test(groups="ch13") public void testPdfToDoc() throws Exception { FullTextSession session = Search.getFullTextSession( openSession() ); Transaction tx = session.beginTransaction(); File f = new File( "ch13/src/com/manning/hsia/dvdstore/file1.pdf" ); buildIndex( f.getAbsolutePath(), session, tx ); tx = session.beginTransaction(); try { QueryParser parser = new QueryParser( "description", analyzer ); Query query = parser.parse( "description" + ":salesman" ); org.hibernate.search.FullTextQuery hibQuery = session.createFullTextQuery( query, Pdf.class ); List<Pdf> results = hibQuery.list(); assert results.size() == 1 : "incorrect result size"; Pdf result = results.get( 0 ); assert result.getAuthor().startsWith( "John Griffin" ) : "incorrect author"; assert result.getDescription().startsWith( "Keanu Reeves" ) : "incorrect description"; for (Object element : session.createQuery( "from " + Pdf.class.getName() ) .list()) { session.delete( element ); } tx.commit(); } finally { session.close(); } } private void buildIndex( String filename, FullTextSession session, Transaction tx ) { // session = Search.getFullTextSession( openSession() ); Document doc = getDocument( filename ); Pdf pdf = getPdf( doc ); pdf.setId( 1 ); session.save( pdf ); tx.commit(); session.clear(); } private Document getDocument( String filename ) { Document doc; InputStream istream; File file = new File( filename ); LucenePDFDocument pdf = new LucenePDFDocument(); try { istream = new FileInputStream( file ); doc = pdf.convertDocument( istream ); } catch (Exception e) { throw new PDFExtractorException( "unable to create document", e ); } return doc; } private Pdf getPdf( Document doc ) { Pdf pdf = new Pdf(); pdf.setAuthor( doc.get( "Author" ) ); pdf.setKeywords( doc.get( "Keywords" ) ); pdf.setSubject( doc.get( "Subject" ) ); pdf.setTitle( doc.get( "Title" ) ); pdf.setSummary( doc.get( "summary" ) ); pdf.setContents( getContents( doc.getField( "contents" ) ) ); pdf.setDescription( pdf.getContents() ); return pdf; } private String getContents( Field field ) { StringReader reader = (StringReader) field.readerValue(); BufferedReader br = new BufferedReader( reader ); String in; StringBuilder sb = new StringBuilder(); try { while (( in = br.readLine() ) != null) { sb.append( in ); } } catch (IOException e) { System.out.println( "unable to retrieve contents field" ); } finally { try { br.close(); } catch (IOException e) { // Live with it. } } return sb.toString(); } protected Class[] getMappings() { return new Class[]{ Pdf.class }; } public class PDFExtractorException extends RuntimeException { public PDFExtractorException( String msg, Throwable e ) { super( msg, e ); } } protected void configure( org.hibernate.cfg.Configuration cfg ) { cfg.setProperty( "hibernate.search.default.directory_provider", FSDirectoryProvider.class.getName() ); File sub = locateBaseDir(); cfg.setProperty( "hibernate.search.default.indexBase", sub.getAbsolutePath() ); } }