package org.neo4j.rdf.fulltext;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import javax.transaction.SystemException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.NotFoundException;
import org.neo4j.helpers.Predicate;
import org.neo4j.helpers.collection.FilteringIterator;
import org.neo4j.helpers.collection.IteratorUtil;
import org.neo4j.helpers.collection.PrefetchingIterator;
import org.neo4j.rdf.fulltext.PersistentQueue.Entry;
import org.neo4j.rdf.fulltext.VerificationHook.Status;
import org.neo4j.rdf.model.Uri;
import org.neo4j.util.GraphDatabaseUtil;
/**
* A {@link FulltextIndex} using lucene.
* The query format (see the search method) is a plain lucene query, but with
* the addition that an AND operator is squeezed in between every word making
* it and AND search by default, instead of OR.
*
* When you call the index and removeIndex methods a temporary log is created
* and a call to the end method will write all those additions to the queue
* to be indexed in the near future. The "txId" i.e. transaction id is really
* just the javax.transaction.Transaction object's hashCode() value at the
* moment. That is what you'll have to pass in to the
* end( boolean commit, int txId ) method if you choose not to use the
* end( boolean commit ) method which figures it out itself, provided that
* you are in a transaction at the time of the call.
*/
public class SimpleFulltextIndex implements FulltextIndex
{
/**
 * The literal node id
 */
private static final String KEY_ID = "id";
// Field holding the analyzed literal text that queries run against.
private static final String KEY_INDEX = "index";
// Field holding the predicate URI, stored un-analyzed for exact-match deletes.
private static final String KEY_PREDICATE = "predicate";
// Field holding the raw literal (toString), stored un-analyzed so a
// remove can target the exact original value.
private static final String KEY_INDEX_SOURCE = "index_source";
// Separator between highlighted snippet fragments.
private static final String SNIPPET_DELIMITER = "...";
// Number of queue entries handled per writer flush in the indexing thread.
private static final int BATCH_SIZE = 100;
private LiteralReader literalReader = new SimpleLiteralReader();
private String directoryPath;
// Path of the persistent command queue file, derived from directoryPath.
private String queuePath;
private Directory directory;
// Whitespace tokenization + lower-casing only; no stop words or stemming.
private Analyzer analyzer = new Analyzer()
{
@Override
public TokenStream tokenStream( String fieldName, Reader reader )
{
return new LowerCaseFilter( new WhitespaceTokenizer( reader ) );
}
};
private GraphDatabaseService graphDb;
private GraphDatabaseUtil graphDbUtil;
// Commands buffered per transaction (keyed by Transaction.hashCode(),
// see class javadoc) until end() either queues or discards them.
private Map<Integer, Collection<Object[]>> toIndex =
Collections.synchronizedMap(
new HashMap<Integer, Collection<Object[]>>() );
private PersistentQueue indexingQueue;
private IndexingThread indexingThread;
// Formats highlighted snippet fragments (defaults to lucene's <B>...</B>).
private Formatter highlightFormatter;
// If non-null, only these predicates get indexed; others are ignored.
private Set<String> predicateFilter;
// Shared searcher, lazily (re)opened by getSearcher().
private IndexSearcher indexSearcher;
/**
 * Creates an index with no predicate filter and default highlight tags.
 * @param graphDb the graph database used to resolve result nodes.
 * @param storagePath directory where the lucene index is stored.
 */
public SimpleFulltextIndex( GraphDatabaseService graphDb, File storagePath )
{
this( graphDb, storagePath, null );
}
/**
 * Creates an index with default highlight tags.
 * @param graphDb the graph database used to resolve result nodes.
 * @param storagePath directory where the lucene index is stored.
 * @param predicateFilter if non-null, only these predicates are indexed.
 */
public SimpleFulltextIndex( GraphDatabaseService graphDb, File storagePath,
Collection<String> predicateFilter )
{
this( graphDb, storagePath, null, null, predicateFilter );
}
/**
 * Creates an index with custom snippet-highlighting tags.
 * @param graphDb the graph database used to resolve result nodes.
 * @param storagePath directory where the lucene index is stored.
 * @param highlightPreTag tag inserted before each highlighted term, or
 * null to use lucene's default.
 * @param highlightPostTag tag inserted after each highlighted term, or
 * null to use lucene's default.
 * @param predicateFilter if non-null, only these predicates are indexed.
 */
public SimpleFulltextIndex( GraphDatabaseService graphDb, File storagePath,
    String highlightPreTag, String highlightPostTag,
    Collection<String> predicateFilter )
{
    // Fall back to lucene's default tags when either custom tag is missing.
    this.highlightFormatter =
        ( highlightPreTag == null || highlightPostTag == null ) ?
            new SimpleHTMLFormatter() :
            new SimpleHTMLFormatter( highlightPreTag, highlightPostTag );
    if ( predicateFilter == null )
    {
        this.predicateFilter = null;
    }
    else
    {
        // Defensive copy so later mutations by the caller have no effect.
        this.predicateFilter = new HashSet<String>( predicateFilter );
    }
    this.directoryPath = storagePath.getAbsolutePath();
    this.queuePath = this.directoryPath + "-queue";
    this.graphDb = graphDb;
    this.graphDbUtil = new GraphDatabaseUtil( graphDb );
    startUpDirectoryAndThread();
}
// Opens the persistent command queue and the lucene directory, then starts
// the background indexing thread. Called from the constructor and again
// from clear() after a wipe.
private void startUpDirectoryAndThread()
{
this.indexingQueue = new PersistentQueue( new File( queuePath ) );
// Entries are marked completed explicitly, in batches, by IndexingThread.
this.indexingQueue.setAutoCompleteEntries( false );
try
{
cleanWriteLocks( new File( directoryPath ) );
createLuceneDirectory();
}
catch ( IOException e )
{
throw new RuntimeException( e );
}
this.indexingThread = new IndexingThread();
this.indexingThread.start();
}
/**
 * Recursively deletes stale lucene "write.lock" files under {@code path},
 * typically left behind by a previous unclean shutdown.
 * @param path directory to scan; non-directories are ignored.
 */
private void cleanWriteLocks( File path )
{
    if ( !path.isDirectory() )
    {
        return;
    }
    File[] children = path.listFiles();
    if ( children == null )
    {
        // listFiles() can return null on an I/O error or if the directory
        // vanished concurrently -- nothing to clean in that case.
        return;
    }
    for ( File file : children )
    {
        if ( file.isDirectory() )
        {
            cleanWriteLocks( file );
        }
        else if ( file.getName().equals( "write.lock" ) )
        {
            boolean success = file.delete();
            assert success;
        }
    }
}
/**
 * Wipes the entire index and its pending-command queue from disk, then
 * restarts with an empty index.
 */
public void clear()
{
internalShutDown();
delete();
startUpDirectoryAndThread();
}
/**
 * Ensures an index exists at {@code directoryPath} (creating an empty one
 * if needed), opens the lucene Directory and releases any stale lock.
 * @throws IOException if the directory can't be created or opened.
 */
private void createLuceneDirectory() throws IOException
{
    if ( !IndexReader.indexExists( directoryPath ) )
    {
        File dir = new File( directoryPath );
        // Previously the mkdirs() result was ignored; fail fast instead of
        // letting the IndexWriter constructor produce a confusing error.
        if ( !dir.mkdirs() && !dir.isDirectory() )
        {
            throw new IOException( "Couldn't create directory " +
                directoryPath );
        }
        // Opening a writer with create == true initializes an empty index.
        IndexWriter writer = new IndexWriter( directoryPath, analyzer,
            true, MaxFieldLength.UNLIMITED );
        writer.close();
    }
    directory = FSDirectory.getDirectory( directoryPath );
    if ( IndexWriter.isLocked( directory ) )
    {
        // Stale lock from a previous crash; safe to release since no other
        // writer can be active at startup.
        IndexWriter.unlock( directory );
    }
}
// Returns the already-opened lucene directory. The throws clause is kept
// for signature compatibility with callers that treat opening as lazy.
private Directory getDir() throws IOException
{
return this.directory;
}
// Opens a new IndexWriter over the directory. create == true truncates
// the existing index; callers here always pass false.
private IndexWriter getWriter( boolean create ) throws IOException
{
return new IndexWriter( getDir(), analyzer, create,
MaxFieldLength.UNLIMITED );
}
/**
 * Buffers an index addition of {@code literal} for {@code node} under
 * {@code predicate}. Nothing is written until end( true ) queues it for
 * the background indexing thread.
 */
public void index( Node node, Uri predicate, Object literal )
{
index( node.getId(), predicate.getUriAsString(), literal );
}
// Internal variant taking the raw node id and predicate string.
private void index( long nodeId, String predicate, Object literal )
{
enqueueCommand( true, nodeId, predicate, literal );
}
// Buffers an add/remove command under the current transaction so that
// end( commit ) can enqueue or discard it together with the tx outcome.
// Commands for predicates outside the filter are silently dropped.
// NOTE(review): the buffer key is Transaction.hashCode() (as described in
// the class javadoc) -- a hash collision between two live transactions
// would mix their commands; verify this is acceptable upstream.
private void enqueueCommand( boolean trueForIndex,
long nodeId, String predicate, Object literal )
{
if ( predicateFilter != null &&
!predicateFilter.contains( predicate ) )
{
return;
}
try
{
int key =
graphDbUtil.getTransactionManager().getTransaction().hashCode();
Collection<Object[]> commands = toIndex.get( key );
if ( commands == null )
{
commands = new ArrayList<Object[]>();
toIndex.put( key, commands );
}
// Command layout: [ add?, nodeId, predicate, literal ].
commands.add( new Object[] {
trueForIndex, nodeId, predicate, literal
} );
}
catch ( SystemException e )
{
throw new RuntimeException( e );
}
}
/**
 * Closes a lucene resource, printing (but otherwise swallowing) any
 * IOException so shutdown paths can proceed. Accepts an IndexWriter,
 * IndexReader or IndexSearcher; null is a no-op, any other type is a
 * programming error.
 */
protected void safeClose( Object object )
{
    if ( object == null )
    {
        return;
    }
    try
    {
        if ( object instanceof IndexWriter )
        {
            ( ( IndexWriter ) object ).close();
        }
        else if ( object instanceof IndexReader )
        {
            ( ( IndexReader ) object ).close();
        }
        else if ( object instanceof IndexSearcher )
        {
            ( ( IndexSearcher ) object ).close();
        }
        else
        {
            // Unknown type: not an I/O problem, so let it propagate.
            throw new RuntimeException( object.getClass().getName() );
        }
    }
    catch ( IOException e )
    {
        e.printStackTrace();
    }
}
/**
 * Writes one literal as a lucene document: the node id, the analyzed
 * literal text, the predicate and the raw literal source (the latter
 * three stored for snippets and exact-match deletion).
 * Runs on the indexing thread only.
 */
private void doIndex( IndexWriter writer, long nodeId, String predicate,
    Object literal )
{
    try
    {
        String indexedText = getLiteralReader().read( literal );
        Document document = new Document();
        document.add( new Field( KEY_ID, String.valueOf( nodeId ),
            Store.YES, Index.NOT_ANALYZED ) );
        document.add( new Field( KEY_INDEX, indexedText,
            Store.YES, Index.ANALYZED ) );
        document.add( new Field( KEY_PREDICATE, predicate,
            Store.YES, Index.NOT_ANALYZED ) );
        document.add( new Field( KEY_INDEX_SOURCE, literal.toString(),
            Store.YES, Index.NOT_ANALYZED ) );
        writer.addDocument( document );
    }
    catch ( IOException e )
    {
        throw new RuntimeException( e );
    }
}
/**
 * Buffers removal of the given (node, predicate, literal) triple from the
 * index. Nothing is removed until end( true ) queues the command.
 */
public void removeIndex( Node node, Uri predicate, Object literal )
{
removeIndex( node.getId(), predicate.getUriAsString(), literal );
}
// Internal variant taking the raw node id and predicate string.
private void removeIndex( long nodeId, String predicate, Object literal )
{
enqueueCommand( false, nodeId, predicate, literal );
}
/**
 * Deletes the document(s) exactly matching all three identifying terms:
 * node id, predicate and the raw literal source.
 * Runs on the indexing thread only.
 */
private void doRemoveIndex( IndexWriter writer,
    long nodeId, String predicate, Object literal )
{
    // Building the query can't throw, so only the delete sits in the try.
    BooleanQuery deletionQuery = new BooleanQuery();
    deletionQuery.add( new TermQuery(
        new Term( KEY_ID, String.valueOf( nodeId ) ) ), Occur.MUST );
    deletionQuery.add( new TermQuery(
        new Term( KEY_PREDICATE, predicate ) ), Occur.MUST );
    deletionQuery.add( new TermQuery(
        new Term( KEY_INDEX_SOURCE, literal.toString() ) ), Occur.MUST );
    try
    {
        writer.deleteDocuments( deletionQuery );
    }
    catch ( IOException e )
    {
        throw new RuntimeException( e );
    }
}
/**
 * Returns a searcher over the current state of the index, reopening the
 * underlying reader only when the index has actually changed.
 */
private synchronized IndexSearcher getSearcher() throws IOException
{
    if ( this.indexSearcher == null )
    {
        this.indexSearcher = new IndexSearcher( getDir() );
    }
    else
    {
        IndexReader current = this.indexSearcher.getIndexReader();
        IndexReader reopened = current.reopen();
        // reopen() never returns null: it returns the SAME instance when
        // the index is unchanged (the old null check was dead code), so
        // only swap searchers when we actually got a fresh reader.
        if ( reopened != current )
        {
            // NOTE(review): the previous reader is deliberately not closed
            // here -- results handed out earlier may still be iterating
            // over it -- so replaced readers leak. TODO add reference
            // counting (e.g. via leaveSearcher) and close retired readers.
            this.indexSearcher = new IndexSearcher( reopened );
        }
    }
    return this.indexSearcher;
}
// Intentionally empty: searchers are shared and reopened by getSearcher()
// rather than handed back; kept as a hook for future reference counting.
private void leaveSearcher( IndexSearcher searcher )
{
}
/**
 * Searches the index without generating highlighted snippets.
 * @param query the fulltext query (see class javadoc for format).
 */
public Iterable<RawQueryResult> search( String query )
{
return searchWithSnippets( query, 0 );
}
/**
 * Searches the index, generating highlighted snippets for (at most) the
 * first {@code snippetCountLimit} hits. Per the class javadoc, the words
 * of {@code query} are combined with AND by default rather than lucene's
 * standard OR.
 * @param query the fulltext query.
 * @param snippetCountLimit how many leading hits get snippets; 0 disables
 * snippet generation entirely.
 */
public Iterable<RawQueryResult> searchWithSnippets( String query,
    int snippetCountLimit )
{
    IndexSearcher searcher = null;
    try
    {
        searcher = getSearcher();
        QueryParser parser = new QueryParser( KEY_INDEX, analyzer );
        // The class contract promises AND between words by default;
        // QueryParser's own default is OR, so switch it explicitly.
        parser.setDefaultOperator( QueryParser.AND_OPERATOR );
        Query q = parser.parse( query );
        Hits hits = searcher.search( q, Sort.RELEVANCE );
        Highlighter highlighter = null;
        if ( snippetCountLimit > 0 )
        {
            highlighter = new Highlighter( highlightFormatter,
                new QueryScorer( q ) );
        }
        Iterator<RawQueryResult> resultIterator =
            new ResultIterator( hits, snippetCountLimit, highlighter );
        return IteratorUtil.asIterable( resultIterator );
    }
    catch ( IOException e )
    {
        throw new RuntimeException( e );
    }
    catch ( ParseException e )
    {
        throw new RuntimeException( e );
    }
    finally
    {
        leaveSearcher( searcher );
    }
}
// Filters out nulls and the sentinel instance used to signal that a hit
// should be silently dropped (duplicate, or node no longer exists).
private static Predicate<RawQueryResult> OK_RESULT = new Predicate<RawQueryResult>()
{
    public boolean accept( RawQueryResult result )
    {
        // Same as: result != null && result != SPECIAL_FILTERING_INSTANCE
        if ( result == null || result == SPECIAL_FILTERING_INSTANCE )
        {
            return false;
        }
        return true;
    }
};
// Wraps the raw hit iterator, filtering out hits flagged with
// SPECIAL_FILTERING_INSTANCE (duplicates and hits whose node is gone).
private class ResultIterator extends FilteringIterator<RawQueryResult>
{
ResultIterator( Hits hits, int snippetCountLimit,
Highlighter highlighter )
{
super( new RawResultIterator( hits, snippetCountLimit,
highlighter ), OK_RESULT );
}
}
// Sentinel yielded by RawResultIterator for hits that should be silently
// dropped; OK_RESULT filters it out before results reach callers.
private static final RawQueryResult SPECIAL_FILTERING_INSTANCE =
new RawQueryResult( null, 0, null );
// Lazily converts lucene Hits to RawQueryResults, one per call. Yields
// SPECIAL_FILTERING_INSTANCE (filtered out upstream) for duplicate
// documents and for hits whose node no longer exists in the graph.
private class RawResultIterator extends PrefetchingIterator<RawQueryResult>
{
private Hits hits;
private int hitsLength;
private int snippetCountLimit;
private Highlighter highlighter;
// Position of the next hit to fetch.
private int counter = 0;
// Node ids seen so far, for duplicate detection.
private Set<Long> ids = new HashSet<Long>();
// Accumulated timings; written below but never read in this class --
// presumably inspected via debugger/profiling only.
private long getIdTime = 0;
private long getSnippetTime = 0;
private long getNodeTime = 0;
RawResultIterator( Hits hits, int snippetCountLimit,
Highlighter highlighter )
{
this.hits = hits;
this.hitsLength = hits.length();
this.snippetCountLimit = snippetCountLimit;
this.highlighter = highlighter;
}
@Override
protected RawQueryResult fetchNextOrNull()
{
int docNum = counter;
if ( counter >= hitsLength )
{
return null;
}
counter++;
try
{
long t = System.currentTimeMillis();
Document doc = hits.doc( docNum );
long id = Long.parseLong( doc.get( KEY_ID ) );
getIdTime += ( System.currentTimeMillis() - t );
if ( !ids.add( id ) )
{
// It's a duplicate here, probably after a crash or
// something
removeDuplicate( doc );
return SPECIAL_FILTERING_INSTANCE;
}
float score = hits.score( docNum );
String snippet = null;
t = System.currentTimeMillis();
// Snippets only for the first snippetCountLimit hits, since
// highlighting is comparatively expensive.
if ( docNum < snippetCountLimit )
{
snippet = generateSnippet( doc, highlighter );
}
getSnippetTime += ( System.currentTimeMillis() - t );
try
{
t = System.currentTimeMillis();
Node node = graphDb.getNodeById( id );
getNodeTime += ( System.currentTimeMillis() - t );
return new RawQueryResult( node, score, snippet );
}
catch ( NotFoundException e )
{
// Ok, probably index lagging a bit behind, that's all.
// This also effectively hides many bugs, which is a
// BAAD thing.
return SPECIAL_FILTERING_INSTANCE;
}
}
catch ( IOException e )
{
throw new RuntimeException( e );
}
}
}
/**
 * Re-enqueues a remove followed by a single add for a document that
 * showed up twice in the index (e.g. after a crash), collapsing the
 * duplicates back to one entry.
 */
private void removeDuplicate( Document doc )
{
    long nodeId = Long.parseLong( doc.get( KEY_ID ) );
    String predicate = doc.get( KEY_PREDICATE );
    String literalSource = doc.get( KEY_INDEX_SOURCE );
    // Order matters: queue the removal first, then one fresh add.
    removeIndex( nodeId, predicate, literalSource );
    index( nodeId, predicate, literalSource );
}
/**
 * Builds a highlighted snippet for {@code doc} by running the highlighter
 * over each stored KEY_INDEX value, joining the best fragments with
 * {@link #SNIPPET_DELIMITER}. Fields that fail tokenization or
 * highlighting are skipped (best effort).
 */
private String generateSnippet( Document doc, Highlighter highlighter )
{
    // StringBuilder instead of StringBuffer: this is a purely local
    // accumulator, so the synchronized StringBuffer buys nothing.
    StringBuilder snippet = new StringBuilder();
    for ( Field field : doc.getFields( KEY_INDEX ) )
    {
        String text = field.stringValue();
        TokenStream tokenStream = analyzer.tokenStream( KEY_INDEX,
            new StringReader( text ) );
        try
        {
            String fragment = highlighter.getBestFragments(
                tokenStream, text, 2, SNIPPET_DELIMITER );
            if ( snippet.length() > 0 )
            {
                snippet.append( SNIPPET_DELIMITER );
            }
            snippet.append( fragment );
        }
        catch ( IOException e )
        {
            // Best effort: one failing field shouldn't kill the snippet.
            continue;
        }
        catch ( InvalidTokenOffsetsException e )
        {
            // Same as above -- skip this field's fragment.
            continue;
        }
    }
    return snippet.toString();
}
/**
 * Walks the index -- every document, or only those matching
 * {@code queryOrNullForAll} -- and lets {@code hook} verify each
 * (nodeId, predicate, literal) triple against the underlying store.
 * @param hook callback receiving progress and each triple to verify.
 * @param queryOrNullForAll restricts verification to matching documents,
 * or null to verify the whole index.
 * @return true if every verified document reported Status.OK.
 */
public boolean verify( VerificationHook hook, String queryOrNullForAll )
{
IndexSearcher searcher = null;
try
{
// A dedicated searcher (not the shared one) since this iterates raw
// document ids and is closed when done.
searcher = new IndexSearcher( getDir() );
Map<Status, MutableInteger> counts =
new HashMap<Status, MutableInteger>();
int maxDoc = 0;
final IndexReader reader = searcher.getIndexReader();
Iterator<Integer> hitsIterator = null;
if ( queryOrNullForAll == null )
{
// No query: iterate every document id in [0, maxDoc).
maxDoc = reader.maxDoc();
hitsIterator = new PrefetchingIterator<Integer>()
{
private int limit = reader.maxDoc();
private int counter;
@Override
protected Integer fetchNextOrNull()
{
int c = counter++;
return c < limit ? c : null;
}
};
}
else
{
// Query given: iterate only the matching documents' ids.
Query q = new QueryParser( KEY_INDEX, analyzer ).parse(
queryOrNullForAll );
final Hits hits = searcher.search( q, Sort.RELEVANCE );
maxDoc = hits.length();
hitsIterator = new PrefetchingIterator<Integer>()
{
private int counter;
@Override
protected Integer fetchNextOrNull()
{
try
{
int c = counter++;
return c < hits.length() ? hits.id( c ) : null;
}
catch ( IOException e )
{
throw new RuntimeException( e );
}
}
};
}
hook.verificationStarting( maxDoc );
while ( hitsIterator.hasNext() )
{
int docId = hitsIterator.next();
// Deleted-but-not-yet-merged documents are skipped, not verified.
if ( reader.isDeleted( docId ) )
{
hook.oneWasSkipped();
continue;
}
Document doc = reader.document( docId );
long nodeId = Long.parseLong( doc.get( KEY_ID ) );
Status status = hook.verify( nodeId,
doc.get( KEY_PREDICATE ), doc.get( KEY_INDEX_SOURCE ) );
// Tally one count per status.
MutableInteger count = counts.get( status );
if ( count == null )
{
count = new MutableInteger();
counts.put( status, count );
}
count.value++;
}
Map<Status, Integer> resultCounts = new HashMap<Status, Integer>();
int errors = 0;
for ( Map.Entry<Status, MutableInteger> count :
counts.entrySet() )
{
resultCounts.put( count.getKey(), count.getValue().value );
// Every status except OK counts as an error.
errors += ( count.getKey() == Status.OK ? 0 :
count.getValue().value );
}
hook.verificationCompleted( resultCounts );
return errors == 0;
}
catch ( ParseException e )
{
throw new RuntimeException( e );
}
catch ( IOException e )
{
throw new RuntimeException( e );
}
finally
{
safeClose( searcher );
}
}
// Tiny mutable counter, avoiding Integer re-boxing while tallying.
private static class MutableInteger
{
private int value;
}
/**
 * @return the reader used to convert literals into indexable text.
 */
public LiteralReader getLiteralReader()
{
return this.literalReader;
}
/**
 * Replaces the reader used to convert literals into indexable text.
 */
public void setLiteralReader( LiteralReader reader )
{
this.literalReader = reader;
}
/**
 * Ends command-buffering for the current transaction, deriving the tx id
 * from the Transaction object's hashCode() (see class javadoc). Must be
 * called inside a transaction.
 * @param commit true to queue the buffered commands for indexing, false
 * to discard them.
 */
public void end( boolean commit )
{
try
{
end( graphDbUtil.getTransactionManager().getTransaction().hashCode(),
commit );
}
catch ( SystemException e )
{
throw new RuntimeException( e );
}
}
/**
 * Moves the commands buffered for transaction {@code txId} onto the
 * persistent indexing queue, or drops them on rollback.
 * @param txId the transaction id (Transaction.hashCode(), see class doc).
 * @param commit true to queue the commands, false to discard them.
 */
public void end( int txId, boolean commit )
{
    Collection<Object[]> commands = toIndex.remove( txId );
    if ( commands == null )
    {
        // Nothing was buffered for this transaction.
        return;
    }
    if ( !commit )
    {
        // Rolled back: buffered commands are simply dropped.
        return;
    }
    for ( Object[] command : commands )
    {
        this.indexingQueue.add( command );
        this.indexingThread.hasItems = true;
    }
}
// Best-effort check: reflects the indexing thread's view of whether the
// queue is drained, without any synchronization.
public boolean queueIsEmpty()
{
return !this.indexingThread.hasItems;
}
/**
 * Shuts the index down: stops the indexing thread and closes the queue,
 * directory and searcher.
 */
public void shutDown()
{
internalShutDown();
}
// Halts the indexing thread and waits for it to flush its current batch,
// then closes queue, directory and searcher in that order. Also used by
// clear() before wiping the on-disk state.
private void internalShutDown()
{
indexingThread.halt();
try
{
indexingThread.join();
}
catch ( InterruptedException e )
{
e.printStackTrace();
}
indexingQueue.close();
try
{
directory.close();
}
catch ( IOException e )
{
throw new RuntimeException( e );
}
safeClose( this.indexSearcher );
this.indexSearcher = null;
}
/**
 * Background thread draining the persistent command queue: applies each
 * buffered add/remove to the lucene index, flushing the writer and
 * completing queue entries in batches of {@link #BATCH_SIZE} so a crash
 * in between simply replays the uncompleted entries.
 */
private class IndexingThread extends Thread
{
    // volatile: both flags cross threads without locking -- halted is set
    // by halt() from the shutdown path, and hasItems is written by end()
    // on transaction threads and read by queueIsEmpty(). Without volatile
    // this loop may never observe the updates.
    private volatile boolean halted;
    private volatile boolean hasItems;
    private IndexWriter writer;
    private Collection<Entry> entriesToComplete = new ArrayList<Entry>();
    private void halt()
    {
        this.halted = true;
    }
    @Override
    public void run()
    {
        while ( !halted )
        {
            try
            {
                hasItems = indexingQueue.hasNext();
                while ( !halted && hasItems )
                {
                    Entry entry = indexingQueue.next();
                    // Command layout: [ add?, nodeId, predicate, literal ].
                    Object[] data = entry.data();
                    ensureWriters();
                    if ( ( Boolean ) data[ 0 ] )
                    {
                        doIndex( writer, ( Long ) data[ 1 ],
                            ( String ) data[ 2 ], data[ 3 ] );
                    }
                    else
                    {
                        doRemoveIndex( writer, ( Long ) data[ 1 ],
                            ( String ) data[ 2 ], data[ 3 ] );
                    }
                    entriesToComplete.add( entry );
                    if ( entriesToComplete.size() >= BATCH_SIZE ||
                        !indexingQueue.hasNext() )
                    {
                        flushEntries();
                    }
                    hasItems = indexingQueue.hasNext();
                }
                // This is so that it flushes if the indexer gets halted.
                flushEntries();
                try
                {
                    // Idle poll: look for new work every 20ms, for up to
                    // 100ms, before re-entering the outer loop.
                    long time = System.currentTimeMillis();
                    while ( !halted &&
                        System.currentTimeMillis() - time < 100 )
                    {
                        hasItems = indexingQueue.hasNext();
                        Thread.sleep( 20 );
                    }
                }
                catch ( InterruptedException e )
                {
                    // Clear the flag: this loop is driven by 'halted',
                    // not by interruption.
                    Thread.interrupted();
                }
            }
            catch ( Throwable t )
            {
                // Keep the indexing thread alive no matter what; log and
                // go around again.
                t.printStackTrace();
            }
        }
    }
    /** Lazily opens the shared writer with batch-friendly buffering. */
    private void ensureWriters() throws Exception
    {
        if ( writer == null )
        {
            writer = getWriter( false );
            writer.setMaxBufferedDocs( BATCH_SIZE * 2 );
            writer.setMaxBufferedDeleteTerms( BATCH_SIZE * 2 );
        }
    }
    /**
     * Closes the writer (committing buffered work to disk) and only then
     * marks the queue entries as completed, preserving replay-on-crash.
     */
    private void flushEntries()
    {
        if ( writer == null )
        {
            return;
        }
        safeClose( writer );
        writer = null;
        indexingQueue.markAsCompleted( entriesToComplete.toArray(
            new Entry[ entriesToComplete.size() ] ) );
        entriesToComplete.clear();
    }
}
// Removes the index directory tree and the queue file from disk.
// NOTE(review): the queue file's delete() result is ignored -- TODO
// confirm a leftover queue file is harmless on the next startup.
private void delete()
{
deleteDir( new File( directoryPath ) );
new File( queuePath ).delete();
}
/**
 * Recursively deletes {@code dir} and everything beneath it. A missing
 * path is a no-op.
 */
protected void deleteDir( File dir )
{
    if ( !dir.exists() )
    {
        return;
    }
    File[] children = dir.listFiles();
    // listFiles() returns null when dir is actually a plain file or on an
    // I/O error; previously that caused an NPE in the for-loop. Fall
    // through and just attempt the delete itself in that case.
    if ( children != null )
    {
        for ( File child : children )
        {
            if ( child.isFile() )
            {
                child.delete();
            }
            else
            {
                deleteDir( child );
            }
        }
    }
    dir.delete();
}
}