/*
 * Hibernate Search, full-text search for your domain model
 *
 * License: GNU Lesser General Public License (LGPL), version 2.1 or later
 * See the lgpl.txt file in the root directory or <http://www.gnu.org/licenses/lgpl-2.1.html>.
 */
package org.hibernate.search.backend.impl.lucene;

import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.hibernate.search.backend.AddLuceneWork;
import org.hibernate.search.backend.DeleteLuceneWork;
import org.hibernate.search.backend.FlushLuceneWork;
import org.hibernate.search.backend.IndexWorkVisitor;
import org.hibernate.search.backend.LuceneWork;
import org.hibernate.search.backend.OptimizeLuceneWork;
import org.hibernate.search.backend.PurgeAllLuceneWork;
import org.hibernate.search.backend.UpdateLuceneWork;
import org.hibernate.search.backend.impl.CommitPolicy;
import org.hibernate.search.backend.spi.DeleteByQueryLuceneWork;
import org.hibernate.search.exception.AssertionFailure;
import org.hibernate.search.indexes.spi.DirectoryBasedIndexManager;
import org.hibernate.search.indexes.spi.DirectoryBasedReaderProvider;
import org.hibernate.search.spi.WorkerBuildContext;
import org.hibernate.search.util.logging.impl.Log;
import org.hibernate.search.util.logging.impl.LoggerFactory;

/**
 * A {@code Workspace} implementation taking advantage of NRT Lucene features.
 * {@code IndexReader} instances are obtained directly from the {@code IndexWriter}, which is not forced
 * to flush all pending changes to the {@code Directory} structure.
 * <p>
 * Lucene, in its current version, requires delete operations to be flushed, or the {@code IndexReader}s
 * retrieved via NRT will include deleted Document instances in queries; flushing delete operations
 * happens to be quite expensive, so this {@code Workspace} implementation attempts to detect when such
 * a flush operation is needed.
 * <p>
 * Applying write operations flags "indexReader requirements" with the need for either a normal flush
 * or a flush including deletes, but does not update {@code IndexReader} instances. The {@code IndexReader}s
 * are updated only if and when a fresh {@code IndexReader} is requested via {@link #openIndexReader()}.
 * This method checks whether it can return the last opened {@code IndexReader} or, if that reader is stale,
 * opens a fresh reader from the current {@code IndexWriter}.
 * <p>
 * Generation counters are used to track the need-at-least version versus the last-updated-at version:
 * shared state between index writer and reader threads is avoided to keep complexity low.
 * The method {@link #afterTransactionApplied(boolean, boolean)} might trigger multiple times, flagging
 * the index as dirty without triggering an actual {@code IndexReader} refresh, so the version counters
 * can have gaps: method {@link #refreshReaders()} will always jump to the latest seen version, as it will
 * refresh the index to satisfy both kinds of flush requirements (writes and deletes).
 * <p>
 * We keep a reference {@code IndexReader} in the {@link #currentReader} atomic reference as a fast path
 * for multiple read events when the index is not dirty.
 * <p>
 * This class implements both {@code Workspace} and {@code ReaderProvider}.
 *
 * @author Sanne Grinovero (C) 2011 Red Hat Inc.
 */
public class NRTWorkspaceImpl extends AbstractWorkspaceImpl implements DirectoryBasedReaderProvider {

	private static final Log log = LoggerFactory.make();

	private final ReentrantLock writeLock = new ReentrantLock();
	private final AtomicReference<DirectoryReader> currentReader = new AtomicReference<DirectoryReader>();
	private final CommitPolicy commitPolicy = new NRTCommitPolicy( writerHolder );

	/**
	 * Visits {@code LuceneWork} types and applies the required kind of index flushing.
	 */
	private final FlushStrategyExecutor flushStrategySelector = new FlushStrategyExecutor();

	/**
	 * Set to true when this service is shutdown (not revertible).
	 */
	private boolean shutdown = false;

	/**
	 * When true, a flush operation should make sure all write operations are flushed;
	 * otherwise a simpler flush strategy can be picked.
	 */
	private final AtomicBoolean needFlushWrites = new AtomicBoolean( true );

	/**
	 * Often, when flushing, deletes don't need to be applied. Some operation might have requested otherwise.
	 */
	private final AtomicBoolean needFlushDeletes = new AtomicBoolean( false );

	/**
	 * Internal counter used to mark different generations of IndexReaders. Monotonically incremented.
	 * Not expecting an overflow in this planet's lifetime.
	 */
	private final AtomicLong readerGeneration = new AtomicLong( 0 );

	/**
	 * When refreshing an {@code IndexReader} to achieve a fresh snapshot of a generation, we need to check this
	 * value to see if deletions need to be flushed. We try hard to not flush deletions as that is a
	 * very expensive operation.
	 * NOTE: concurrently accessed. Guarded by readerGenRequiringFlushWrites: read the other first, write it last.
	 */
	private long readerGenRequiringFlushDeletes = 0;

	/**
	 * As with {@link #readerGenRequiringFlushDeletes}, if this value is above the value of {@link #currentReaderGen}
	 * a new {@code IndexReader} should be opened as the current generation is stale.
	 */
	private volatile long readerGenRequiringFlushWrites = 0;

	/**
	 * Generation identifier of the current open {@code IndexReader} (the one stored in {@link #currentReader}).
	 */
	private volatile long currentReaderGen = 0;

	public NRTWorkspaceImpl(DirectoryBasedIndexManager indexManager, WorkerBuildContext buildContext, Properties cfg) {
		super( indexManager, buildContext, cfg );
	}

	@Override
	public void afterTransactionApplied(boolean someFailureHappened, boolean streaming) {
		commitPolicy.onChangeSetApplied( someFailureHappened, streaming );
		if ( ! streaming ) {
			setupNewReadersRequirements();
		}
	}

	/**
	 * Translates the {@code needFlushWrites} and {@code needFlushDeletes} fields into a set of requirements checked
	 * by reader threads. This is commonly invoked by a single thread (so no contention on this method
	 * is expected) but it needs to expose a consistent view of the written fields to {@link #refreshReaders()}.
	 * This is normally not invoked in parallel by multiple threads as the backend design allows a single working thread
	 * per index, but it could be invoked concurrently when streaming work is being applied (when a MassIndexer is
	 * running). Note that multiple threads invoking this in parallel might result in skipping some sequence numbers,
	 * but that's not a problem.
	 */
	private void setupNewReadersRequirements() {
		if ( needFlushDeletes.get() || needFlushWrites.get() ) {
			final long nextGenId = readerGeneration.incrementAndGet();
			if ( needFlushDeletes.get() ) {
				this.needFlushDeletes.lazySet( false ); //flushed by volatile write at end of method
				this.readerGenRequiringFlushDeletes = nextGenId; //flushed by volatile write at end of method
			}
			this.needFlushWrites.lazySet( false ); //flushed by volatile write at end of method
			this.readerGenRequiringFlushWrites = nextGenId;
		}
	}

	/**
	 * Invoked when a refresh of the current {@code IndexReader}s is detected as necessary.
	 *
	 * The implementation is blocking to maximize reuse of a single {@code IndexReader} (better for buffer usage,
	 * caching, ...) and to avoid multiple threads trying to open the same resources at the same time.
	 *
	 * @return the refreshed {@code IndexReader}
	 */
	private synchronized DirectoryReader refreshReaders() {
		//double-check for the case we no longer need to refresh
		if ( indexReaderIsFresh() ) {
			return currentReader.get();
		}
		//order of the following two reads DOES matter:
		final long readerGenRequiringFlushWrites = this.readerGenRequiringFlushWrites;
		final long readerGenRequiringFlushDeletes = this.readerGenRequiringFlushDeletes;

		final boolean flushDeletes = currentReaderGen < readerGenRequiringFlushDeletes;
		final long openingGen = Math.max( readerGenRequiringFlushDeletes, readerGenRequiringFlushWrites );

		final DirectoryReader newIndexReader = writerHolder.openNRTIndexReader( flushDeletes );
		final DirectoryReader oldReader = currentReader.getAndSet( newIndexReader );
		this.currentReaderGen = openingGen;
		try {
			if ( oldReader != null ) {
				oldReader.decRef();
			}
		}
		catch (IOException e) {
			log.unableToCloseLuceneIndexReader( e );
		}
		return newIndexReader;
	}

	private boolean indexReaderIsFresh() {
		final long currentReaderGen = this.currentReaderGen;
		//Note it reads the volatile first. These two longs are always updated in pairs.
		return currentReaderGen >= readerGenRequiringFlushWrites && currentReaderGen >= readerGenRequiringFlushDeletes;
	}

	@Override
	public DirectoryReader openIndexReader() {
		return openIndexReader( ! indexReaderIsFresh() );
	}

	/**
	 * @param needRefresh when {@code false} it won't guarantee the index reader to be affected by "latest" changes
	 * @return an {@code IndexReader} instance, either pooled or a new one
	 */
	private DirectoryReader openIndexReader(final boolean needRefresh) {
		DirectoryReader indexReader;
		if ( needRefresh ) {
			indexReader = refreshReaders();
		}
		else {
			indexReader = currentReader.get();
		}
		if ( indexReader == null ) {
			writeLock.lock();
			try {
				if ( shutdown ) {
					throw new AssertionFailure( "IndexReader requested after ReaderProvider is shutdown" );
				}
				indexReader = currentReader.get();
				if ( indexReader == null ) {
					indexReader = writerHolder.openDirectoryIndexReader();
					currentReader.set( indexReader );
				}
			}
			finally {
				writeLock.unlock();
			}
		}
		if ( indexReader.tryIncRef() ) {
			return indexReader;
		}
		else {
			//In this case we have a race: the chosen IndexReader was closed before we could increment its reference, so we need
			//to try again. Basically an optimistic lock as the race condition is very unlikely.
			//Changes should be tested at least with ReadWriteParallelismTest (in the performance tests module).
			//In case new writes happened there is no need to refresh again.
			return openIndexReader( false );
		}
	}

	@Override
	public void closeIndexReader(IndexReader reader) {
		if ( reader == null ) {
			return;
		}
		try {
			//don't use IndexReader#close as it prevents further counter decrements!
			reader.decRef();
		}
		catch (IOException e) {
			log.unableToCloseLuceneIndexReader( e );
		}
	}

	@Override
	public void initialize(DirectoryBasedIndexManager indexManager, Properties props) {
	}

	@Override
	public void stop() {
		writeLock.lock();
		try {
			final IndexReader oldReader = currentReader.getAndSet( null );
			closeIndexReader( oldReader );
			shutdown = true;
		}
		finally {
			writeLock.unlock();
		}
	}

	@Override
	public void flush() {
		//Even if this is the NRT workspace, Flush is implemented as a real Flush to make sure
		//MassIndexer output is committed to permanent storage
		commitPolicy.onFlush();
	}

	@Override
	public void notifyWorkApplied(LuceneWork work) {
		incrementModificationCounter();
		work.acceptIndexWorkVisitor( flushStrategySelector, this );
	}

	@Override
	public CommitPolicy getCommitPolicy() {
		return commitPolicy;
	}

	/**
	 * Visits each kind of {@code LuceneWork} we're processing and applies the correct flushing strategy to create
	 * consistent index readers.
	 */
	private static class FlushStrategyExecutor implements IndexWorkVisitor<NRTWorkspaceImpl, Void> {

		@Override
		public Void visitAddWork(AddLuceneWork addLuceneWork, NRTWorkspaceImpl p) {
			FlushStrategy.FLUSH_WRITES.apply( p );
			return null;
		}

		@Override
		public Void visitDeleteWork(DeleteLuceneWork deleteLuceneWork, NRTWorkspaceImpl p) {
			FlushStrategy.FLUSH_DELETIONS.apply( p );
			return null;
		}

		@Override
		public Void visitOptimizeWork(OptimizeLuceneWork optimizeLuceneWork, NRTWorkspaceImpl p) {
			FlushStrategy.NONE.apply( p );
			return null;
		}

		@Override
		public Void visitPurgeAllWork(PurgeAllLuceneWork purgeAllLuceneWork, NRTWorkspaceImpl p) {
			FlushStrategy.FLUSH_DELETIONS.apply( p );
			return null;
		}

		@Override
		public Void visitUpdateWork(UpdateLuceneWork updateLuceneWork, NRTWorkspaceImpl p) {
			FlushStrategy.FLUSH_WRITES_AND_DELETES.apply( p );
			return null;
		}

		@Override
		public Void visitFlushWork(FlushLuceneWork flushLuceneWork, NRTWorkspaceImpl p) {
			FlushStrategy.FLUSH_WRITES_AND_DELETES.apply( p );
			return null;
		}

		@Override
		public Void visitDeleteByQueryWork(DeleteByQueryLuceneWork deleteByQueryLuceneWork, NRTWorkspaceImpl p) {
			FlushStrategy.FLUSH_DELETIONS.apply( p );
			return null;
		}
	}

	private enum FlushStrategy {
		NONE {
			@Override
			void apply(final NRTWorkspaceImpl workspace) {
			}
		},
		FLUSH_DELETIONS {
			@Override
			void apply(final NRTWorkspaceImpl workspace) {
				// AtomicBoolean#lazySet is good enough as we only want to provide reads consistent with the state
				// the application is expecting. If for example no other flush is happening down the road
				// (which will eventually flush this write too) we're fine for other cores to "see"
				// IndexReader instances slightly stale.
				workspace.needFlushDeletes.lazySet( true );
			}
		},
		FLUSH_WRITES {
			@Override
			void apply(final NRTWorkspaceImpl workspace) {
				//See FLUSH_DELETIONS for why #lazySet is good enough.
				workspace.needFlushWrites.lazySet( true );
			}
		},
		FLUSH_WRITES_AND_DELETES {
			@Override
			void apply(NRTWorkspaceImpl workspace) {
				FLUSH_DELETIONS.apply( workspace );
				FLUSH_WRITES.apply( workspace );
			}
		};

		abstract void apply(NRTWorkspaceImpl workspace);
	}
}