package org.neo4j.rdf.fulltext;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import javax.transaction.SystemException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.NotFoundException;
import org.neo4j.helpers.Predicate;
import org.neo4j.helpers.collection.FilteringIterator;
import org.neo4j.helpers.collection.IteratorUtil;
import org.neo4j.helpers.collection.PrefetchingIterator;
import org.neo4j.rdf.fulltext.PersistentQueue.Entry;
import org.neo4j.rdf.fulltext.VerificationHook.Status;
import org.neo4j.rdf.model.Uri;
import org.neo4j.util.GraphDatabaseUtil;
/**
* A {@link FulltextIndex} using lucene.
* The query format (see the search method) is a plain lucene query, but with
* the addition that an AND operator is squeezed in between every word making
* it and AND search by default, instead of OR.
*
* When you call the index and removeIndex methods a temporary log is created
* and a call to the end method will write all those additions to the queue
* to be indexed in the near future. The "txId" i.e. transaction id is really
* just the javax.transaction.Transaction object's hashCode() value at the
* moment. That is what you'll have to pass in to the
* end( boolean commit, int txId ) method if you choose not to use the
* end( boolean commit ) method which figures it out itself, provided that
* you are in a transaction at the time of the call.
*/
public class SimpleFulltextIndex implements FulltextIndex
{
/**
 * The literal node id
 */
private static final String KEY_ID = "id";
// Field holding the analyzed literal text that queries run against.
private static final String KEY_INDEX = "index";
// Field holding the predicate URI, stored un-analyzed for exact-match deletes.
private static final String KEY_PREDICATE = "predicate";
// Field holding the raw literal (toString), stored un-analyzed so a
// remove can target the exact original value.
private static final String KEY_INDEX_SOURCE = "index_source";
// Separator between highlighted snippet fragments.
private static final String SNIPPET_DELIMITER = "...";
// Number of queue entries handled per writer flush in the indexing thread.
private static final int BATCH_SIZE = 100;
private LiteralReader literalReader = new SimpleLiteralReader();
private String directoryPath;
// Path of the persistent command queue file, derived from directoryPath.
private String queuePath;
private Directory directory;
// Whitespace tokenization + lower-casing only; no stop words or stemming.
private Analyzer analyzer = new Analyzer()
{
@Override
public TokenStream tokenStream( String fieldName, Reader reader )
{
return new LowerCaseFilter( new WhitespaceTokenizer( reader ) );
}
};
private GraphDatabaseService graphDb;
private GraphDatabaseUtil graphDbUtil;
// Commands buffered per transaction (keyed by Transaction.hashCode(),
// see class javadoc) until end() either queues or discards them.
private Map<Integer, Collection<Object[]>> toIndex =
Collections.synchronizedMap(
new HashMap<Integer, Collection<Object[]>>() );
private PersistentQueue indexingQueue;
private IndexingThread indexingThread;
// Formats highlighted snippet fragments (defaults to lucene's <B>...</B>).
private Formatter highlightFormatter;
// If non-null, only these predicates get indexed; others are ignored.
private Set<String> predicateFilter;
// Shared searcher, lazily (re)opened by getSearcher().
private IndexSearcher indexSearcher;
/**
 * Creates an index with no predicate filter and default highlight tags.
 * @param graphDb the graph database used to resolve result nodes.
 * @param storagePath directory where the lucene index is stored.
 */
public SimpleFulltextIndex( GraphDatabaseService graphDb, File storagePath )
{
this( graphDb, storagePath, null );
}
/**
 * Creates an index with default highlight tags.
 * @param graphDb the graph database used to resolve result nodes.
 * @param storagePath directory where the lucene index is stored.
 * @param predicateFilter if non-null, only these predicates are indexed.
 */
public SimpleFulltextIndex( GraphDatabaseService graphDb, File storagePath,
Collection<String> predicateFilter )
{
this( graphDb, storagePath, null, null, predicateFilter );
}
/**
 * Creates an index with custom snippet-highlighting tags.
 * @param graphDb the graph database used to resolve result nodes.
 * @param storagePath directory where the lucene index is stored.
 * @param highlightPreTag tag inserted before each highlighted term, or
 * null to use lucene's default.
 * @param highlightPostTag tag inserted after each highlighted term, or
 * null to use lucene's default.
 * @param predicateFilter if non-null, only these predicates are indexed.
 */
public SimpleFulltextIndex( GraphDatabaseService graphDb, File storagePath,
    String highlightPreTag, String highlightPostTag,
    Collection<String> predicateFilter )
{
    // Fall back to lucene's default tags when either custom tag is missing.
    this.highlightFormatter =
        ( highlightPreTag == null || highlightPostTag == null ) ?
            new SimpleHTMLFormatter() :
            new SimpleHTMLFormatter( highlightPreTag, highlightPostTag );
    if ( predicateFilter == null )
    {
        this.predicateFilter = null;
    }
    else
    {
        // Defensive copy so later mutations by the caller have no effect.
        this.predicateFilter = new HashSet<String>( predicateFilter );
    }
    this.directoryPath = storagePath.getAbsolutePath();
    this.queuePath = this.directoryPath + "-queue";
    this.graphDb = graphDb;
    this.graphDbUtil = new GraphDatabaseUtil( graphDb );
    startUpDirectoryAndThread();
}
// Opens the persistent command queue and the lucene directory, then starts
// the background indexing thread. Called from the constructor and again
// from clear() after a wipe.
private void startUpDirectoryAndThread()
{
this.indexingQueue = new PersistentQueue( new File( queuePath ) );
// Entries are marked completed explicitly, in batches, by IndexingThread.
this.indexingQueue.setAutoCompleteEntries( false );
try
{
cleanWriteLocks( new File( directoryPath ) );
createLuceneDirectory();
}
catch ( IOException e )
{
throw new RuntimeException( e );
}
this.indexingThread = new IndexingThread();
this.indexingThread.start();
}
/**
 * Recursively deletes stale lucene "write.lock" files under {@code path},
 * typically left behind by a previous unclean shutdown.
 * @param path directory to scan; non-directories are ignored.
 */
private void cleanWriteLocks( File path )
{
    if ( !path.isDirectory() )
    {
        return;
    }
    File[] children = path.listFiles();
    if ( children == null )
    {
        // listFiles() can return null on an I/O error or if the directory
        // vanished concurrently -- nothing to clean in that case.
        return;
    }
    for ( File file : children )
    {
        if ( file.isDirectory() )
        {
            cleanWriteLocks( file );
        }
        else if ( file.getName().equals( "write.lock" ) )
        {
            boolean success = file.delete();
            assert success;
        }
    }
}
/**
 * Wipes the entire index and its pending-command queue from disk, then
 * restarts with an empty index.
 */
public void clear()
{
internalShutDown();
delete();
startUpDirectoryAndThread();
}
/**
 * Ensures an index exists at {@code directoryPath} (creating an empty one
 * if needed), opens the lucene Directory and releases any stale lock.
 * @throws IOException if the directory can't be created or opened.
 */
private void createLuceneDirectory() throws IOException
{
    if ( !IndexReader.indexExists( directoryPath ) )
    {
        File dir = new File( directoryPath );
        // Previously the mkdirs() result was ignored; fail fast instead of
        // letting the IndexWriter constructor produce a confusing error.
        if ( !dir.mkdirs() && !dir.isDirectory() )
        {
            throw new IOException( "Couldn't create directory " +
                directoryPath );
        }
        // Opening a writer with create == true initializes an empty index.
        IndexWriter writer = new IndexWriter( directoryPath, analyzer,
            true, MaxFieldLength.UNLIMITED );
        writer.close();
    }
    directory = FSDirectory.getDirectory( directoryPath );
    if ( IndexWriter.isLocked( directory ) )
    {
        // Stale lock from a previous crash; safe to release since no other
        // writer can be active at startup.
        IndexWriter.unlock( directory );
    }
}
// Returns the already-opened lucene directory. The throws clause is kept
// for signature compatibility with callers that treat opening as lazy.
private Directory getDir() throws IOException
{
return this.directory;
}
// Opens a new IndexWriter over the directory. create == true truncates
// the existing index; callers here always pass false.
private IndexWriter getWriter( boolean create ) throws IOException
{
return new IndexWriter( getDir(), analyzer, create,
MaxFieldLength.UNLIMITED );
}
/**
 * Buffers an index addition of {@code literal} for {@code node} under
 * {@code predicate}. Nothing is written until end( true ) queues it for
 * the background indexing thread.
 */
public void index( Node node, Uri predicate, Object literal )
{
index( node.getId(), predicate.getUriAsString(), literal );
}
// Internal variant taking the raw node id and predicate string.
private void index( long nodeId, String predicate, Object literal )
{
enqueueCommand( true, nodeId, predicate, literal );
}
// Buffers an add/remove command under the current transaction so that
// end( commit ) can enqueue or discard it together with the tx outcome.
// Commands for predicates outside the filter are silently dropped.
// NOTE(review): the buffer key is Transaction.hashCode() (as described in
// the class javadoc) -- a hash collision between two live transactions
// would mix their commands; verify this is acceptable upstream.
private void enqueueCommand( boolean trueForIndex,
long nodeId, String predicate, Object literal )
{
if ( predicateFilter != null &&
!predicateFilter.contains( predicate ) )
{
return;
}
try
{
int key =
graphDbUtil.getTransactionManager().getTransaction().hashCode();
Collection<Object[]> commands = toIndex.get( key );
if ( commands == null )
{
commands = new ArrayList<Object[]>();
toIndex.put( key, commands );
}
// Command layout: [ add?, nodeId, predicate, literal ].
commands.add( new Object[] {
trueForIndex, nodeId, predicate, literal
} );
}
catch ( SystemException e )
{
throw new RuntimeException( e );
}
}
/**
 * Closes a lucene resource, printing (but otherwise swallowing) any
 * IOException so shutdown paths can proceed. Accepts an IndexWriter,
 * IndexReader or IndexSearcher; null is a no-op, any other type is a
 * programming error.
 */
protected void safeClose( Object object )
{
    if ( object == null )
    {
        return;
    }
    try
    {
        if ( object instanceof IndexWriter )
        {
            ( ( IndexWriter ) object ).close();
        }
        else if ( object instanceof IndexReader )
        {
            ( ( IndexReader ) object ).close();
        }
        else if ( object instanceof IndexSearcher )
        {
            ( ( IndexSearcher ) object ).close();
        }
        else
        {
            // Unknown type: not an I/O problem, so let it propagate.
            throw new RuntimeException( object.getClass().getName() );
        }
    }
    catch ( IOException e )
    {
        e.printStackTrace();
    }
}
/**
 * Writes one literal as a lucene document: the node id, the analyzed
 * literal text, the predicate and the raw literal source (the latter
 * three stored for snippets and exact-match deletion).
 * Runs on the indexing thread only.
 */
private void doIndex( IndexWriter writer, long nodeId, String predicate,
    Object literal )
{
    try
    {
        String indexedText = getLiteralReader().read( literal );
        Document document = new Document();
        document.add( new Field( KEY_ID, String.valueOf( nodeId ),
            Store.YES, Index.NOT_ANALYZED ) );
        document.add( new Field( KEY_INDEX, indexedText,
            Store.YES, Index.ANALYZED ) );
        document.add( new Field( KEY_PREDICATE, predicate,
            Store.YES, Index.NOT_ANALYZED ) );
        document.add( new Field( KEY_INDEX_SOURCE, literal.toString(),
            Store.YES, Index.NOT_ANALYZED ) );
        writer.addDocument( document );
    }
    catch ( IOException e )
    {
        throw new RuntimeException( e );
    }
}
/**
 * Buffers removal of the given (node, predicate, literal) triple from the
 * index. Nothing is removed until end( true ) queues the command.
 */
public void removeIndex( Node node, Uri predicate, Object literal )
{
removeIndex( node.getId(), predicate.getUriAsString(), literal );
}
// Internal variant taking the raw node id and predicate string.
private void removeIndex( long nodeId, String predicate, Object literal )
{
enqueueCommand( false, nodeId, predicate, literal );
}
/**
 * Deletes the document(s) exactly matching all three identifying terms:
 * node id, predicate and the raw literal source.
 * Runs on the indexing thread only.
 */
private void doRemoveIndex( IndexWriter writer,
    long nodeId, String predicate, Object literal )
{
    // Building the query can't throw, so only the delete sits in the try.
    BooleanQuery deletionQuery = new BooleanQuery();
    deletionQuery.add( new TermQuery(
        new Term( KEY_ID, String.valueOf( nodeId ) ) ), Occur.MUST );
    deletionQuery.add( new TermQuery(
        new Term( KEY_PREDICATE, predicate ) ), Occur.MUST );
    deletionQuery.add( new TermQuery(
        new Term( KEY_INDEX_SOURCE, literal.toString() ) ), Occur.MUST );
    try
    {
        writer.deleteDocuments( deletionQuery );
    }
    catch ( IOException e )
    {
        throw new RuntimeException( e );
    }
}
/**
 * Returns a searcher over the current state of the index, reopening the
 * underlying reader only when the index has actually changed.
 */
private synchronized IndexSearcher getSearcher() throws IOException
{
    if ( this.indexSearcher == null )
    {
        this.indexSearcher = new IndexSearcher( getDir() );
    }
    else
    {
        IndexReader current = this.indexSearcher.getIndexReader();
        IndexReader reopened = current.reopen();
        // reopen() never returns null: it returns the SAME instance when
        // the index is unchanged (the old null check was dead code), so
        // only swap searchers when we actually got a fresh reader.
        if ( reopened != current )
        {
            // NOTE(review): the previous reader is deliberately not closed
            // here -- results handed out earlier may still be iterating
            // over it -- so replaced readers leak. TODO add reference
            // counting (e.g. via leaveSearcher) and close retired readers.
            this.indexSearcher = new IndexSearcher( reopened );
        }
    }
    return this.indexSearcher;
}
// Intentionally empty: searchers are shared and reopened by getSearcher()
// rather than handed back; kept as a hook for future reference counting.
private void leaveSearcher( IndexSearcher searcher )
{
}
/**
 * Searches the index without generating highlighted snippets.
 * @param query the fulltext query (see class javadoc for format).
 */
public Iterable<RawQueryResult> search( String query )
{
return searchWithSnippets( query, 0 );
}
/**
 * Searches the index, generating highlighted snippets for (at most) the
 * first {@code snippetCountLimit} hits. Per the class javadoc, the words
 * of {@code query} are combined with AND by default rather than lucene's
 * standard OR.
 * @param query the fulltext query.
 * @param snippetCountLimit how many leading hits get snippets; 0 disables
 * snippet generation entirely.
 */
public Iterable<RawQueryResult> searchWithSnippets( String query,
    int snippetCountLimit )
{
    IndexSearcher searcher = null;
    try
    {
        searcher = getSearcher();
        QueryParser parser = new QueryParser( KEY_INDEX, analyzer );
        // The class contract promises AND between words by default;
        // QueryParser's own default is OR, so switch it explicitly.
        parser.setDefaultOperator( QueryParser.AND_OPERATOR );
        Query q = parser.parse( query );
        Hits hits = searcher.search( q, Sort.RELEVANCE );
        Highlighter highlighter = null;
        if ( snippetCountLimit > 0 )
        {
            highlighter = new Highlighter( highlightFormatter,
                new QueryScorer( q ) );
        }
        Iterator<RawQueryResult> resultIterator =
            new ResultIterator( hits, snippetCountLimit, highlighter );
        return IteratorUtil.asIterable( resultIterator );
    }
    catch ( IOException e )
    {
        throw new RuntimeException( e );
    }
    catch ( ParseException e )
    {
        throw new RuntimeException( e );
    }
    finally
    {
        leaveSearcher( searcher );
    }
}
// Filters out nulls and the sentinel instance used to signal that a hit
// should be silently dropped (duplicate, or node no longer exists).
private static Predicate<RawQueryResult> OK_RESULT = new Predicate<RawQueryResult>()
{
    public boolean accept( RawQueryResult result )
    {
        // Same as: result != null && result != SPECIAL_FILTERING_INSTANCE
        if ( result == null || result == SPECIAL_FILTERING_INSTANCE )
        {
            return false;
        }
        return true;
    }
};
// Wraps the raw hit iterator, filtering out hits flagged with
// SPECIAL_FILTERING_INSTANCE (duplicates and hits whose node is gone).
private class ResultIterator extends FilteringIterator<RawQueryResult>
{
ResultIterator( Hits hits, int snippetCountLimit,
Highlighter highlighter )
{
super( new RawResultIterator( hits, snippetCountLimit,
highlighter ), OK_RESULT );
}
}
// Sentinel yielded by RawResultIterator for hits that should be silently
// dropped; OK_RESULT filters it out before results reach callers.
private static final RawQueryResult SPECIAL_FILTERING_INSTANCE =
new RawQueryResult( null, 0, null );
// Lazily converts lucene Hits to RawQueryResults, one per call. Yields
// SPECIAL_FILTERING_INSTANCE (filtered out upstream) for duplicate
// documents and for hits whose node no longer exists in the graph.
private class RawResultIterator extends PrefetchingIterator<RawQueryResult>
{
private Hits hits;
private int hitsLength;
private int snippetCountLimit;
private Highlighter highlighter;
// Position of the next hit to fetch.
private int counter = 0;
// Node ids seen so far, for duplicate detection.
private Set<Long> ids = new HashSet<Long>();
// Accumulated timings; written below but never read in this class --
// presumably inspected via debugger/profiling only.
private long getIdTime = 0;
private long getSnippetTime = 0;
private long getNodeTime = 0;
RawResultIterator( Hits hits, int snippetCountLimit,
Highlighter highlighter )
{
this.hits = hits;
this.hitsLength = hits.length();
this.snippetCountLimit = snippetCountLimit;
this.highlighter = highlighter;
}
@Override
protected RawQueryResult fetchNextOrNull()
{
int docNum = counter;
if ( counter >= hitsLength )
{
return null;
}
counter++;
try
{
long t = System.currentTimeMillis();
Document doc = hits.doc( docNum );
long id = Long.parseLong( doc.get( KEY_ID ) );
getIdTime += ( System.currentTimeMillis() - t );
if ( !ids.add( id ) )
{
// It's a duplicate here, probably after a crash or
// something
removeDuplicate( doc );
return SPECIAL_FILTERING_INSTANCE;
}
float score = hits.score( docNum );
String snippet = null;
t = System.currentTimeMillis();
// Snippets only for the first snippetCountLimit hits, since
// highlighting is comparatively expensive.
if ( docNum < snippetCountLimit )
{
snippet = generateSnippet( doc, highlighter );
}
getSnippetTime += ( System.currentTimeMillis() - t );
try
{
t = System.currentTimeMillis();
Node node = graphDb.getNodeById( id );
getNodeTime += ( System.currentTimeMillis() - t );
return new RawQueryResult( node, score, snippet );
}
catch ( NotFoundException e )
{
// Ok, probably index lagging a bit behind, that's all.
// This also effectively hides many bugs, which is a
// BAAD thing.
return SPECIAL_FILTERING_INSTANCE;
}
}
catch ( IOException e )
{
throw new RuntimeException( e );
}
}
}
/**
 * Re-enqueues a remove followed by a single add for a document that
 * showed up twice in the index (e.g. after a crash), collapsing the
 * duplicates back to one entry.
 */
private void removeDuplicate( Document doc )
{
    long nodeId = Long.parseLong( doc.get( KEY_ID ) );
    String predicate = doc.get( KEY_PREDICATE );
    String literalSource = doc.get( KEY_INDEX_SOURCE );
    // Order matters: queue the removal first, then one fresh add.
    removeIndex( nodeId, predicate, literalSource );
    index( nodeId, predicate, literalSource );
}
/**
 * Builds a highlighted snippet for {@code doc} by running the highlighter
 * over each stored KEY_INDEX value, joining the best fragments with
 * {@link #SNIPPET_DELIMITER}. Fields that fail tokenization or
 * highlighting are skipped (best effort).
 */
private String generateSnippet( Document doc, Highlighter highlighter )
{
    // StringBuilder instead of StringBuffer: this is a purely local
    // accumulator, so the synchronized StringBuffer buys nothing.
    StringBuilder snippet = new StringBuilder();
    for ( Field field : doc.getFields( KEY_INDEX ) )
    {
        String text = field.stringValue();
        TokenStream tokenStream = analyzer.tokenStream( KEY_INDEX,
            new StringReader( text ) );
        try
        {
            String fragment = highlighter.getBestFragments(
                tokenStream, text, 2, SNIPPET_DELIMITER );
            if ( snippet.length() > 0 )
            {
                snippet.append( SNIPPET_DELIMITER );
            }
            snippet.append( fragment );
        }
        catch ( IOException e )
        {
            // Best effort: one failing field shouldn't kill the snippet.
            continue;
        }
        catch ( InvalidTokenOffsetsException e )
        {
            // Same as above -- skip this field's fragment.
            continue;
        }
    }
    return snippet.toString();
}
/**
 * Walks the index -- every document, or only those matching
 * {@code queryOrNullForAll} -- and lets {@code hook} verify each
 * (nodeId, predicate, literal) triple against the underlying store.
 * @param hook callback receiving progress and each triple to verify.
 * @param queryOrNullForAll restricts verification to matching documents,
 * or null to verify the whole index.
 * @return true if every verified document reported Status.OK.
 */
public boolean verify( VerificationHook hook, String queryOrNullForAll )
{
IndexSearcher searcher = null;
try
{
// A dedicated searcher (not the shared one) since this iterates raw
// document ids and is closed when done.
searcher = new IndexSearcher( getDir() );
Map<Status, MutableInteger> counts =
new HashMap<Status, MutableInteger>();
int maxDoc = 0;
final IndexReader reader = searcher.getIndexReader();
Iterator<Integer> hitsIterator = null;
if ( queryOrNullForAll == null )
{
// No query: iterate every document id in [0, maxDoc).
maxDoc = reader.maxDoc();
hitsIterator = new PrefetchingIterator<Integer>()
{
private int limit = reader.maxDoc();
private int counter;
@Override
protected Integer fetchNextOrNull()
{
int c = counter++;
return c < limit ? c : null;
}
};
}
else
{
// Query given: iterate only the matching documents' ids.
Query q = new QueryParser( KEY_INDEX, analyzer ).parse(
queryOrNullForAll );
final Hits hits = searcher.search( q, Sort.RELEVANCE );
maxDoc = hits.length();
hitsIterator = new PrefetchingIterator<Integer>()
{
private int counter;
@Override
protected Integer fetchNextOrNull()
{
try
{
int c = counter++;
return c < hits.length() ? hits.id( c ) : null;
}
catch ( IOException e )
{
throw new RuntimeException( e );
}
}
};
}
hook.verificationStarting( maxDoc );
while ( hitsIterator.hasNext() )
{
int docId = hitsIterator.next();
// Deleted-but-not-yet-merged documents are skipped, not verified.
if ( reader.isDeleted( docId ) )
{
hook.oneWasSkipped();
continue;
}
Document doc = reader.document( docId );
long nodeId = Long.parseLong( doc.get( KEY_ID ) );
Status status = hook.verify( nodeId,
doc.get( KEY_PREDICATE ), doc.get( KEY_INDEX_SOURCE ) );
// Tally one count per status.
MutableInteger count = counts.get( status );
if ( count == null )
{
count = new MutableInteger();
counts.put( status, count );
}
count.value++;
}
Map<Status, Integer> resultCounts = new HashMap<Status, Integer>();
int errors = 0;
for ( Map.Entry<Status, MutableInteger> count :
counts.entrySet() )
{
resultCounts.put( count.getKey(), count.getValue().value );
// Every status except OK counts as an error.
errors += ( count.getKey() == Status.OK ? 0 :
count.getValue().value );
}
hook.verificationCompleted( resultCounts );
return errors == 0;
}
catch ( ParseException e )
{
throw new RuntimeException( e );
}
catch ( IOException e )
{
throw new RuntimeException( e );
}
finally
{
safeClose( searcher );
}
}
// Tiny mutable counter, avoiding Integer re-boxing while tallying.
private static class MutableInteger
{
private int value;
}
/**
 * @return the reader used to convert literals into indexable text.
 */
public LiteralReader getLiteralReader()
{
return this.literalReader;
}
/**
 * Replaces the reader used to convert literals into indexable text.
 */
public void setLiteralReader( LiteralReader reader )
{
this.literalReader = reader;
}
/**
 * Ends command-buffering for the current transaction, deriving the tx id
 * from the Transaction object's hashCode() (see class javadoc). Must be
 * called inside a transaction.
 * @param commit true to queue the buffered commands for indexing, false
 * to discard them.
 */
public void end( boolean commit )
{
try
{
end( graphDbUtil.getTransactionManager().getTransaction().hashCode(),
commit );
}
catch ( SystemException e )
{
throw new RuntimeException( e );
}
}
/**
 * Moves the commands buffered for transaction {@code txId} onto the
 * persistent indexing queue, or drops them on rollback.
 * @param txId the transaction id (Transaction.hashCode(), see class doc).
 * @param commit true to queue the commands, false to discard them.
 */
public void end( int txId, boolean commit )
{
    Collection<Object[]> commands = toIndex.remove( txId );
    if ( commands == null )
    {
        // Nothing was buffered for this transaction.
        return;
    }
    if ( !commit )
    {
        // Rolled back: buffered commands are simply dropped.
        return;
    }
    for ( Object[] command : commands )
    {
        this.indexingQueue.add( command );
        this.indexingThread.hasItems = true;
    }
}
// Best-effort check: reflects the indexing thread's view of whether the
// queue is drained, without any synchronization.
public boolean queueIsEmpty()
{
return !this.indexingThread.hasItems;
}
/**
 * Shuts the index down: stops the indexing thread and closes the queue,
 * directory and searcher.
 */
public void shutDown()
{
internalShutDown();
}
// Halts the indexing thread and waits for it to flush its current batch,
// then closes queue, directory and searcher in that order. Also used by
// clear() before wiping the on-disk state.
private void internalShutDown()
{
indexingThread.halt();
try
{
indexingThread.join();
}
catch ( InterruptedException e )
{
e.printStackTrace();
}
indexingQueue.close();
try
{
directory.close();
}
catch ( IOException e )
{
throw new RuntimeException( e );
}
safeClose( this.indexSearcher );
this.indexSearcher = null;
}
/**
 * Background thread draining the persistent command queue: applies each
 * buffered add/remove to the lucene index, flushing the writer and
 * completing queue entries in batches of {@link #BATCH_SIZE} so a crash
 * in between simply replays the uncompleted entries.
 */
private class IndexingThread extends Thread
{
    // volatile: both flags cross threads without locking -- halted is set
    // by halt() from the shutdown path, and hasItems is written by end()
    // on transaction threads and read by queueIsEmpty(). Without volatile
    // this loop may never observe the updates.
    private volatile boolean halted;
    private volatile boolean hasItems;
    private IndexWriter writer;
    private Collection<Entry> entriesToComplete = new ArrayList<Entry>();
    private void halt()
    {
        this.halted = true;
    }
    @Override
    public void run()
    {
        while ( !halted )
        {
            try
            {
                hasItems = indexingQueue.hasNext();
                while ( !halted && hasItems )
                {
                    Entry entry = indexingQueue.next();
                    // Command layout: [ add?, nodeId, predicate, literal ].
                    Object[] data = entry.data();
                    ensureWriters();
                    if ( ( Boolean ) data[ 0 ] )
                    {
                        doIndex( writer, ( Long ) data[ 1 ],
                            ( String ) data[ 2 ], data[ 3 ] );
                    }
                    else
                    {
                        doRemoveIndex( writer, ( Long ) data[ 1 ],
                            ( String ) data[ 2 ], data[ 3 ] );
                    }
                    entriesToComplete.add( entry );
                    if ( entriesToComplete.size() >= BATCH_SIZE ||
                        !indexingQueue.hasNext() )
                    {
                        flushEntries();
                    }
                    hasItems = indexingQueue.hasNext();
                }
                // This is so that it flushes if the indexer gets halted.
                flushEntries();
                try
                {
                    // Idle poll: look for new work every 20ms, for up to
                    // 100ms, before re-entering the outer loop.
                    long time = System.currentTimeMillis();
                    while ( !halted &&
                        System.currentTimeMillis() - time < 100 )
                    {
                        hasItems = indexingQueue.hasNext();
                        Thread.sleep( 20 );
                    }
                }
                catch ( InterruptedException e )
                {
                    // Clear the flag: this loop is driven by 'halted',
                    // not by interruption.
                    Thread.interrupted();
                }
            }
            catch ( Throwable t )
            {
                // Keep the indexing thread alive no matter what; log and
                // go around again.
                t.printStackTrace();
            }
        }
    }
    /** Lazily opens the shared writer with batch-friendly buffering. */
    private void ensureWriters() throws Exception
    {
        if ( writer == null )
        {
            writer = getWriter( false );
            writer.setMaxBufferedDocs( BATCH_SIZE * 2 );
            writer.setMaxBufferedDeleteTerms( BATCH_SIZE * 2 );
        }
    }
    /**
     * Closes the writer (committing buffered work to disk) and only then
     * marks the queue entries as completed, preserving replay-on-crash.
     */
    private void flushEntries()
    {
        if ( writer == null )
        {
            return;
        }
        safeClose( writer );
        writer = null;
        indexingQueue.markAsCompleted( entriesToComplete.toArray(
            new Entry[ entriesToComplete.size() ] ) );
        entriesToComplete.clear();
    }
}
// Removes the index directory tree and the queue file from disk.
// NOTE(review): the queue file's delete() result is ignored -- TODO
// confirm a leftover queue file is harmless on the next startup.
private void delete()
{
deleteDir( new File( directoryPath ) );
new File( queuePath ).delete();
}
/**
 * Recursively deletes {@code dir} and everything beneath it. A missing
 * path is a no-op.
 */
protected void deleteDir( File dir )
{
    if ( !dir.exists() )
    {
        return;
    }
    File[] children = dir.listFiles();
    // listFiles() returns null when dir is actually a plain file or on an
    // I/O error; previously that caused an NPE in the for-loop. Fall
    // through and just attempt the delete itself in that case.
    if ( children != null )
    {
        for ( File child : children )
        {
            if ( child.isFile() )
            {
                child.delete();
            }
            else
            {
                deleteDir( child );
            }
        }
    }
    dir.delete();
}
}