package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.PrintStream;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMFile;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitVector;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.ThreadInterruptedException;
/**
* This class accepts multiple added documents and directly
* writes a single segment file. It does this more
* efficiently than creating a single segment per document
* (with DocumentWriter) and doing standard merges on those
* segments.
*
* Each added document is passed to the {@link DocConsumer},
* which in turn processes the document and interacts with
* other consumers in the indexing chain. Certain
* consumers, like {@link StoredFieldsWriter} and {@link
* TermVectorsTermsWriter}, digest a document and
* immediately write bytes to the "doc store" files (ie,
* they do not consume RAM per document, except while they
* are processing the document).
*
* Other consumers, eg {@link FreqProxTermsWriter} and
* {@link NormsWriter}, buffer bytes in RAM and flush only
* when a new segment is produced.
* Once we have used our allowed RAM buffer, or the number
* of added docs is large enough (in the case we are
* flushing by doc count instead of RAM usage), we create a
* real segment and flush it to the Directory.
*
* Threads:
*
* Multiple threads are allowed into addDocument at once.
* There is an initial synchronized call to getThreadState
* which allocates a ThreadState for this thread. The same
* thread will get the same ThreadState over time (thread
* affinity) so that if there are consistent patterns (for
* example each thread is indexing a different content
* source) then we make better use of RAM. Then
* processDocument is called on that ThreadState without
* synchronization (most of the "heavy lifting" is in this
* call). Finally the synchronized "finishDocument" is
* called to flush changes to the directory.
*
* When flush is called by IndexWriter we forcefully idle
* all threads and flush only once they are all idle. This
* means you can call flush with a given thread even while
* other threads are actively adding/deleting documents.
*
*
* Exceptions:
*
* Because this class directly updates in-memory posting
* lists, and flushes stored fields and term vectors
* directly to files in the directory, there are certain
* limited times when an exception can corrupt this state.
* For example, a disk full while flushing stored fields
* leaves this file in a corrupt state. Or, an OOM
* exception while appending to the in-memory posting lists
* can corrupt that posting list. We call such exceptions
* "aborting exceptions". In these cases we must call
* abort() to discard all docs added since the last flush.
*
* All other exceptions ("non-aborting exceptions") can
* still partially update the index structures. These
* updates are consistent, but, they represent only a part
* of the document seen up until the exception was hit.
* When this happens, we immediately mark the document as
* deleted so that the document is always atomically ("all
* or none") added to the index.
*/
final class DocumentsWriter {
final AtomicLong bytesUsed = new AtomicLong(0);
IndexWriter writer;
Directory directory;
String segment; // Current segment we are working on
private int nextDocID; // Next docID to be added
private int numDocs; // # of docs added, but not yet flushed
// Max # ThreadState instances; if there are more threads
// than this they share ThreadStates
private DocumentsWriterThreadState[] threadStates = new DocumentsWriterThreadState[0];
private final HashMap<Thread,DocumentsWriterThreadState> threadBindings = new HashMap<Thread,DocumentsWriterThreadState>();
boolean bufferIsFull; // True when it's time to write segment
private boolean aborting; // True if an abort is pending
PrintStream infoStream;
int maxFieldLength = IndexWriter.DEFAULT_MAX_FIELD_LENGTH;
Similarity similarity;
// max # simultaneous threads; if there are more than
// this, they wait for others to finish first
private final int maxThreadStates;
// Deletes for our still-in-RAM (to be flushed next) segment
private BufferedDeletes pendingDeletes = new BufferedDeletes();
static class DocState {
DocumentsWriter docWriter;
Analyzer analyzer;
int maxFieldLength;
PrintStream infoStream;
Similarity similarity;
int docID;
Document doc;
String maxTermPrefix;
// Only called by asserts
public boolean testPoint(String name) {
return docWriter.writer.testPoint(name);
}
public void clear() {
// don't hold onto doc nor analyzer, in case it is
// largish:
doc = null;
analyzer = null;
}
}
/** Consumer returns this on each doc. This holds any
* state that must be flushed synchronized "in docID
* order". We gather these and flush them in order. */
abstract static class DocWriter {
DocWriter next;
int docID;
abstract void finish() throws IOException;
abstract void abort();
abstract long sizeInBytes();
void setNext(DocWriter next) {
this.next = next;
}
}
/**
* Create and return a new DocWriterBuffer.
*/
PerDocBuffer newPerDocBuffer() {
return new PerDocBuffer();
}
/**
* RAMFile buffer for DocWriters.
*/
class PerDocBuffer extends RAMFile {
/**
* Allocate bytes used from shared pool.
*/
@Override
protected byte[] newBuffer(int size) {
assert size == PER_DOC_BLOCK_SIZE;
return perDocAllocator.getByteBlock();
}
/**
* Recycle the bytes used.
*/
synchronized void recycle() {
if (buffers.size() > 0) {
setLength(0);
// Recycle the blocks
perDocAllocator.recycleByteBlocks(buffers);
buffers.clear();
sizeInBytes = 0;
assert numBuffers() == 0;
}
}
}
/**
* The IndexingChain must define the {@link #getChain(DocumentsWriter)} method
* which returns the DocConsumer that the DocumentsWriter calls to process the
* documents.
*/
abstract static class IndexingChain {
abstract DocConsumer getChain(DocumentsWriter documentsWriter);
}
static final IndexingChain defaultIndexingChain = new IndexingChain() {
@Override
DocConsumer getChain(DocumentsWriter documentsWriter) {
/*
This is the current indexing chain:
DocConsumer / DocConsumerPerThread
--> code: DocFieldProcessor / DocFieldProcessorPerThread
--> DocFieldConsumer / DocFieldConsumerPerThread / DocFieldConsumerPerField
--> code: DocFieldConsumers / DocFieldConsumersPerThread / DocFieldConsumersPerField
--> code: DocInverter / DocInverterPerThread / DocInverterPerField
--> InvertedDocConsumer / InvertedDocConsumerPerThread / InvertedDocConsumerPerField
--> code: TermsHash / TermsHashPerThread / TermsHashPerField
--> TermsHashConsumer / TermsHashConsumerPerThread / TermsHashConsumerPerField
--> code: FreqProxTermsWriter / FreqProxTermsWriterPerThread / FreqProxTermsWriterPerField
--> code: TermVectorsTermsWriter / TermVectorsTermsWriterPerThread / TermVectorsTermsWriterPerField
--> InvertedDocEndConsumer / InvertedDocConsumerPerThread / InvertedDocConsumerPerField
--> code: NormsWriter / NormsWriterPerThread / NormsWriterPerField
--> code: StoredFieldsWriter / StoredFieldsWriterPerThread / StoredFieldsWriterPerField
*/
// Build up indexing chain:
final TermsHashConsumer termVectorsWriter = new TermVectorsTermsWriter(documentsWriter);
final TermsHashConsumer freqProxWriter = new FreqProxTermsWriter();
final InvertedDocConsumer termsHash = new TermsHash(documentsWriter, true, freqProxWriter,
new TermsHash(documentsWriter, false, termVectorsWriter, null));
final NormsWriter normsWriter = new NormsWriter();
final DocInverter docInverter = new DocInverter(termsHash, normsWriter);
return new DocFieldProcessor(documentsWriter, docInverter);
}
};
final DocConsumer consumer;
// How much RAM we can use before flushing. This is 0 if
// we are flushing by doc count instead.
private final IndexWriterConfig config;
private boolean closed;
private final FieldInfos fieldInfos;
private final BufferedDeletesStream bufferedDeletesStream;
private final IndexWriter.FlushControl flushControl;
DocumentsWriter(IndexWriterConfig config, Directory directory, IndexWriter writer, FieldInfos fieldInfos, BufferedDeletesStream bufferedDeletesStream) throws IOException {
this.directory = directory;
this.writer = writer;
this.similarity = config.getSimilarity();
this.maxThreadStates = config.getMaxThreadStates();
this.fieldInfos = fieldInfos;
this.bufferedDeletesStream = bufferedDeletesStream;
flushControl = writer.flushControl;
consumer = config.getIndexingChain().getChain(this);
this.config = config;
}
// Buffer a specific docID for deletion. Currently only
// used when we hit a exception when adding a document
synchronized void deleteDocID(int docIDUpto) {
pendingDeletes.addDocID(docIDUpto);
// NOTE: we do not trigger flush here. This is
// potentially a RAM leak, if you have an app that tries
// to add docs but every single doc always hits a
// non-aborting exception. Allowing a flush here gets
// very messy because we are only invoked when handling
// exceptions so to do this properly, while handling an
// exception we'd have to go off and flush new deletes
// which is risky (likely would hit some other
// confounding exception).
}
boolean deleteQueries(Query... queries) {
final boolean doFlush = flushControl.waitUpdate(0, queries.length);
synchronized(this) {
for (Query query : queries) {
pendingDeletes.addQuery(query, numDocs);
}
}
return doFlush;
}
boolean deleteQuery(Query query) {
final boolean doFlush = flushControl.waitUpdate(0, 1);
synchronized(this) {
pendingDeletes.addQuery(query, numDocs);
}
return doFlush;
}
boolean deleteTerms(Term... terms) {
final boolean doFlush = flushControl.waitUpdate(0, terms.length);
synchronized(this) {
for (Term term : terms) {
pendingDeletes.addTerm(term, numDocs);
}
}
return doFlush;
}
// TODO: we could check w/ FreqProxTermsWriter: if the
// term doesn't exist, don't bother buffering into the
// per-DWPT map (but still must go into the global map)
boolean deleteTerm(Term term, boolean skipWait) {
final boolean doFlush = flushControl.waitUpdate(0, 1, skipWait);
synchronized(this) {
pendingDeletes.addTerm(term, numDocs);
}
return doFlush;
}
public FieldInfos getFieldInfos() {
return fieldInfos;
}
/** If non-null, various details of indexing are printed
* here. */
synchronized void setInfoStream(PrintStream infoStream) {
this.infoStream = infoStream;
for(int i=0;i<threadStates.length;i++) {
threadStates[i].docState.infoStream = infoStream;
}
}
synchronized void setMaxFieldLength(int maxFieldLength) {
this.maxFieldLength = maxFieldLength;
for(int i=0;i<threadStates.length;i++) {
threadStates[i].docState.maxFieldLength = maxFieldLength;
}
}
synchronized void setSimilarity(Similarity similarity) {
this.similarity = similarity;
for(int i=0;i<threadStates.length;i++) {
threadStates[i].docState.similarity = similarity;
}
}
/** Get current segment name we are writing. */
synchronized String getSegment() {
return segment;
}
/** Returns how many docs are currently buffered in RAM. */
synchronized int getNumDocs() {
return numDocs;
}
void message(String message) {
if (infoStream != null) {
writer.message("DW: " + message);
}
}
synchronized void setAborting() {
if (infoStream != null) {
message("setAborting");
}
aborting = true;
}
/** Called if we hit an exception at a bad time (when
* updating the index files) and must discard all
* currently buffered docs. This resets our state,
* discarding any docs added since last flush. */
synchronized void abort() throws IOException {
if (infoStream != null) {
message("docWriter: abort");
}
boolean success = false;
try {
// Forcefully remove waiting ThreadStates from line
try {
waitQueue.abort();
} catch (Throwable t) {
}
// Wait for all other threads to finish with
// DocumentsWriter:
try {
waitIdle();
} finally {
if (infoStream != null) {
message("docWriter: abort waitIdle done");
}
assert 0 == waitQueue.numWaiting: "waitQueue.numWaiting=" + waitQueue.numWaiting;
waitQueue.waitingBytes = 0;
pendingDeletes.clear();
for (DocumentsWriterThreadState threadState : threadStates) {
try {
threadState.consumer.abort();
} catch (Throwable t) {
}
}
try {
consumer.abort();
} catch (Throwable t) {
}
// Reset all postings data
doAfterFlush();
}
success = true;
} finally {
aborting = false;
notifyAll();
if (infoStream != null) {
message("docWriter: done abort; success=" + success);
}
}
}
/** Reset after a flush */
private void doAfterFlush() throws IOException {
// All ThreadStates should be idle when we are called
assert allThreadsIdle();
threadBindings.clear();
waitQueue.reset();
segment = null;
numDocs = 0;
nextDocID = 0;
bufferIsFull = false;
for(int i=0;i<threadStates.length;i++) {
threadStates[i].doAfterFlush();
}
}
private synchronized boolean allThreadsIdle() {
for(int i=0;i<threadStates.length;i++) {
if (!threadStates[i].isIdle) {
return false;
}
}
return true;
}
synchronized boolean anyChanges() {
return numDocs != 0 || pendingDeletes.any();
}
// for testing
public BufferedDeletes getPendingDeletes() {
return pendingDeletes;
}
private void pushDeletes(SegmentInfo newSegment, SegmentInfos segmentInfos) {
// Lock order: DW -> BD
final long delGen = bufferedDeletesStream.getNextGen();
if (pendingDeletes.any()) {
if (segmentInfos.size() > 0 || newSegment != null) {
final FrozenBufferedDeletes packet = new FrozenBufferedDeletes(pendingDeletes, delGen);
if (infoStream != null) {
message("flush: push buffered deletes startSize=" + pendingDeletes.bytesUsed.get() + " frozenSize=" + packet.bytesUsed);
}
bufferedDeletesStream.push(packet);
if (infoStream != null) {
message("flush: delGen=" + packet.gen);
}
if (newSegment != null) {
newSegment.setBufferedDeletesGen(packet.gen);
}
} else {
if (infoStream != null) {
message("flush: drop buffered deletes: no segments");
}
// We can safely discard these deletes: since
// there are no segments, the deletions cannot
// affect anything.
}
pendingDeletes.clear();
} else if (newSegment != null) {
newSegment.setBufferedDeletesGen(delGen);
}
}
public boolean anyDeletions() {
return pendingDeletes.any();
}
/** Flush all pending docs to a new segment */
// Lock order: IW -> DW
synchronized SegmentInfo flush(IndexWriter writer, IndexFileDeleter deleter, MergePolicy mergePolicy, SegmentInfos segmentInfos) throws IOException {
final long startTime = System.currentTimeMillis();
// We change writer's segmentInfos:
assert Thread.holdsLock(writer);
waitIdle();
if (numDocs == 0) {
// nothing to do!
if (infoStream != null) {
message("flush: no docs; skipping");
}
// Lock order: IW -> DW -> BD
pushDeletes(null, segmentInfos);
return null;
}
if (aborting) {
if (infoStream != null) {
message("flush: skip because aborting is set");
}
return null;
}
boolean success = false;
SegmentInfo newSegment;
try {
//System.out.println(Thread.currentThread().getName() + ": nw=" + waitQueue.numWaiting);
assert nextDocID == numDocs: "nextDocID=" + nextDocID + " numDocs=" + numDocs;
assert waitQueue.numWaiting == 0: "numWaiting=" + waitQueue.numWaiting;
assert waitQueue.waitingBytes == 0;
if (infoStream != null) {
message("flush postings as segment " + segment + " numDocs=" + numDocs);
}
final SegmentWriteState flushState = new SegmentWriteState(infoStream, directory, segment, fieldInfos,
numDocs, writer.getConfig().getTermIndexInterval(),
pendingDeletes);
// Apply delete-by-docID now (delete-byDocID only
// happens when an exception is hit processing that
// doc, eg if analyzer has some problem w/ the text):
if (pendingDeletes.docIDs.size() > 0) {
flushState.deletedDocs = new BitVector(numDocs);
for(int delDocID : pendingDeletes.docIDs) {
flushState.deletedDocs.set(delDocID);
}
pendingDeletes.bytesUsed.addAndGet(-pendingDeletes.docIDs.size() * BufferedDeletes.BYTES_PER_DEL_DOCID);
pendingDeletes.docIDs.clear();
}
newSegment = new SegmentInfo(segment, numDocs, directory, false, true, fieldInfos.hasProx(), false);
Collection<DocConsumerPerThread> threads = new HashSet<DocConsumerPerThread>();
for (DocumentsWriterThreadState threadState : threadStates) {
threads.add(threadState.consumer);
}
double startMBUsed = bytesUsed()/1024./1024.;
consumer.flush(threads, flushState);
newSegment.setHasVectors(flushState.hasVectors);
if (infoStream != null) {
message("new segment has " + (flushState.hasVectors ? "vectors" : "no vectors"));
if (flushState.deletedDocs != null) {
message("new segment has " + flushState.deletedDocs.count() + " deleted docs");
}
message("flushedFiles=" + newSegment.files());
}
if (mergePolicy.useCompoundFile(segmentInfos, newSegment)) {
final String cfsFileName = IndexFileNames.segmentFileName(segment, IndexFileNames.COMPOUND_FILE_EXTENSION);
if (infoStream != null) {
message("flush: create compound file \"" + cfsFileName + "\"");
}
CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, cfsFileName);
for(String fileName : newSegment.files()) {
cfsWriter.addFile(fileName);
}
cfsWriter.close();
deleter.deleteNewFiles(newSegment.files());
newSegment.setUseCompoundFile(true);
}
// Must write deleted docs after the CFS so we don't
// slurp the del file into CFS:
if (flushState.deletedDocs != null) {
final int delCount = flushState.deletedDocs.count();
assert delCount > 0;
newSegment.setDelCount(delCount);
newSegment.advanceDelGen();
final String delFileName = newSegment.getDelFileName();
if (infoStream != null) {
message("flush: write " + delCount + " deletes to " + delFileName);
}
boolean success2 = false;
try {
// TODO: in the NRT case it'd be better to hand
// this del vector over to the
// shortly-to-be-opened SegmentReader and let it
// carry the changes; there's no reason to use
// filesystem as intermediary here.
flushState.deletedDocs.write(directory, delFileName);
success2 = true;
} finally {
if (!success2) {
try {
directory.deleteFile(delFileName);
} catch (Throwable t) {
// suppress this so we keep throwing the
// original exception
}
}
}
}
if (infoStream != null) {
message("flush: segment=" + newSegment);
final double newSegmentSizeNoStore = newSegment.sizeInBytes(false)/1024./1024.;
final double newSegmentSize = newSegment.sizeInBytes(true)/1024./1024.;
message(" ramUsed=" + nf.format(startMBUsed) + " MB" +
" newFlushedSize=" + nf.format(newSegmentSize) + " MB" +
" (" + nf.format(newSegmentSizeNoStore) + " MB w/o doc stores)" +
" docs/MB=" + nf.format(numDocs / newSegmentSize) +
" new/old=" + nf.format(100.0 * newSegmentSizeNoStore / startMBUsed) + "%");
}
success = true;
} finally {
notifyAll();
if (!success) {
if (segment != null) {
deleter.refresh(segment);
}
abort();
}
}
doAfterFlush();
// Lock order: IW -> DW -> BD
pushDeletes(newSegment, segmentInfos);
if (infoStream != null) {
message("flush time " + (System.currentTimeMillis()-startTime) + " msec");
}
return newSegment;
}
synchronized void close() {
closed = true;
notifyAll();
}
/** Returns a free (idle) ThreadState that may be used for
* indexing this one document. This call also pauses if a
* flush is pending. If delTerm is non-null then we
* buffer this deleted term after the thread state has
* been acquired. */
synchronized DocumentsWriterThreadState getThreadState(Term delTerm, int docCount) throws IOException {
final Thread currentThread = Thread.currentThread();
assert !Thread.holdsLock(writer);
// First, find a thread state. If this thread already
// has affinity to a specific ThreadState, use that one
// again.
DocumentsWriterThreadState state = threadBindings.get(currentThread);
if (state == null) {
// First time this thread has called us since last
// flush. Find the least loaded thread state:
DocumentsWriterThreadState minThreadState = null;
for(int i=0;i<threadStates.length;i++) {
DocumentsWriterThreadState ts = threadStates[i];
if (minThreadState == null || ts.numThreads < minThreadState.numThreads) {
minThreadState = ts;
}
}
if (minThreadState != null && (minThreadState.numThreads == 0 || threadStates.length >= maxThreadStates)) {
state = minThreadState;
state.numThreads++;
} else {
// Just create a new "private" thread state
DocumentsWriterThreadState[] newArray = new DocumentsWriterThreadState[1+threadStates.length];
if (threadStates.length > 0) {
System.arraycopy(threadStates, 0, newArray, 0, threadStates.length);
}
state = newArray[threadStates.length] = new DocumentsWriterThreadState(this);
threadStates = newArray;
}
threadBindings.put(currentThread, state);
}
// Next, wait until my thread state is idle (in case
// it's shared with other threads), and no flush/abort
// pending
waitReady(state);
// Allocate segment name if this is the first doc since
// last flush:
if (segment == null) {
segment = writer.newSegmentName();
assert numDocs == 0;
}
state.docState.docID = nextDocID;
nextDocID += docCount;
if (delTerm != null) {
pendingDeletes.addTerm(delTerm, state.docState.docID);
}
numDocs += docCount;
state.isIdle = false;
return state;
}
boolean addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
return updateDocument(doc, analyzer, null);
}
boolean updateDocument(Document doc, Analyzer analyzer, Term delTerm)
throws CorruptIndexException, IOException {
// Possibly trigger a flush, or wait until any running flush completes:
boolean doFlush = flushControl.waitUpdate(1, delTerm != null ? 1 : 0);
// This call is synchronized but fast
final DocumentsWriterThreadState state = getThreadState(delTerm, 1);
final DocState docState = state.docState;
docState.doc = doc;
docState.analyzer = analyzer;
boolean success = false;
try {
// This call is not synchronized and does all the
// work
final DocWriter perDoc;
try {
perDoc = state.consumer.processDocument();
} finally {
docState.clear();
}
// This call is synchronized but fast
finishDocument(state, perDoc);
success = true;
} finally {
if (!success) {
// If this thread state had decided to flush, we
// must clear it so another thread can flush
if (doFlush) {
flushControl.clearFlushPending();
}
if (infoStream != null) {
message("exception in updateDocument aborting=" + aborting);
}
synchronized(this) {
state.isIdle = true;
notifyAll();
if (aborting) {
abort();
} else {
skipDocWriter.docID = docState.docID;
boolean success2 = false;
try {
waitQueue.add(skipDocWriter);
success2 = true;
} finally {
if (!success2) {
abort();
return false;
}
}
// Immediately mark this document as deleted
// since likely it was partially added. This
// keeps indexing as "all or none" (atomic) when
// adding a document:
deleteDocID(state.docState.docID);
}
}
}
}
doFlush |= flushControl.flushByRAMUsage("new document");
return doFlush;
}
boolean updateDocuments(Collection<Document> docs, Analyzer analyzer, Term delTerm)
throws CorruptIndexException, IOException {
// Possibly trigger a flush, or wait until any running flush completes:
boolean doFlush = flushControl.waitUpdate(docs.size(), delTerm != null ? 1 : 0);
final int docCount = docs.size();
// This call is synchronized but fast -- we allocate the
// N docIDs up front:
final DocumentsWriterThreadState state = getThreadState(null, docCount);
final DocState docState = state.docState;
final int startDocID = docState.docID;
int docID = startDocID;
//System.out.println(Thread.currentThread().getName() + ": A " + docCount);
for(Document doc : docs) {
docState.doc = doc;
docState.analyzer = analyzer;
// Assign next docID from our block:
docState.docID = docID++;
boolean success = false;
try {
// This call is not synchronized and does all the
// work
final DocWriter perDoc;
try {
perDoc = state.consumer.processDocument();
} finally {
docState.clear();
}
// Must call this w/o holding synchronized(this) else
// we'll hit deadlock:
balanceRAM();
// Synchronized but fast
synchronized(this) {
if (aborting) {
break;
}
assert perDoc == null || perDoc.docID == docState.docID;
final boolean doPause;
if (perDoc != null) {
waitQueue.add(perDoc);
} else {
skipDocWriter.docID = docState.docID;
waitQueue.add(skipDocWriter);
}
}
success = true;
} finally {
if (!success) {
//System.out.println(Thread.currentThread().getName() + ": E");
// If this thread state had decided to flush, we
// must clear it so another thread can flush
if (doFlush) {
message("clearFlushPending!");
flushControl.clearFlushPending();
}
if (infoStream != null) {
message("exception in updateDocuments aborting=" + aborting);
}
synchronized(this) {
state.isIdle = true;
notifyAll();
if (aborting) {
abort();
} else {
// Fill hole in the doc stores for all
// docIDs we pre-allocated
//System.out.println(Thread.currentThread().getName() + ": F " + docCount);
final int endDocID = startDocID + docCount;
docID = docState.docID;
while(docID < endDocID) {
skipDocWriter.docID = docID++;
boolean success2 = false;
try {
waitQueue.add(skipDocWriter);
success2 = true;
} finally {
if (!success2) {
abort();
return false;
}
}
}
//System.out.println(Thread.currentThread().getName() + ": F " + docCount + " done");
// Mark all pre-allocated docIDs as deleted:
docID = startDocID;
while(docID < startDocID + docs.size()) {
deleteDocID(docID++);
}
}
}
}
}
}
synchronized(this) {
// We must delay pausing until the full doc block is
// added, else we can hit deadlock if more than one
// thread is adding a block and we need to pause when
// both are only part way done:
if (waitQueue.doPause()) {
waitForWaitQueue();
}
//System.out.println(Thread.currentThread().getName() + ": A " + docCount);
if (aborting) {
// We are currently aborting, and another thread is
// waiting for me to become idle. We just forcefully
// idle this threadState; it will be fully reset by
// abort()
state.isIdle = true;
// wakes up any threads waiting on the wait queue
notifyAll();
abort();
if (doFlush) {
message("clearFlushPending!");
flushControl.clearFlushPending();
}
return false;
}
// Apply delTerm only after all indexing has
// succeeded, but apply it only to docs prior to when
// this batch started:
if (delTerm != null) {
pendingDeletes.addTerm(delTerm, startDocID);
}
state.isIdle = true;
// wakes up any threads waiting on the wait queue
notifyAll();
}
doFlush |= flushControl.flushByRAMUsage("new document");
//System.out.println(Thread.currentThread().getName() + ": B " + docCount);
return doFlush;
}
public synchronized void waitIdle() {
while (!allThreadsIdle()) {
try {
wait();
} catch (InterruptedException ie) {
throw new ThreadInterruptedException(ie);
}
}
}
synchronized void waitReady(DocumentsWriterThreadState state) {
while (!closed && (!state.isIdle || aborting)) {
try {
wait();
} catch (InterruptedException ie) {
throw new ThreadInterruptedException(ie);
}
}
if (closed) {
throw new AlreadyClosedException("this IndexWriter is closed");
}
}
/** Does the synchronized work to finish/flush the
* inverted document. */
private void finishDocument(DocumentsWriterThreadState perThread, DocWriter docWriter) throws IOException {
// Must call this w/o holding synchronized(this) else
// we'll hit deadlock:
balanceRAM();
synchronized(this) {
assert docWriter == null || docWriter.docID == perThread.docState.docID;
if (aborting) {
// We are currently aborting, and another thread is
// waiting for me to become idle. We just forcefully
// idle this threadState; it will be fully reset by
// abort()
if (docWriter != null) {
try {
docWriter.abort();
} catch (Throwable t) {
}
}
perThread.isIdle = true;
// wakes up any threads waiting on the wait queue
notifyAll();
return;
}
final boolean doPause;
if (docWriter != null) {
doPause = waitQueue.add(docWriter);
} else {
skipDocWriter.docID = perThread.docState.docID;
doPause = waitQueue.add(skipDocWriter);
}
if (doPause) {
waitForWaitQueue();
}
perThread.isIdle = true;
// wakes up any threads waiting on the wait queue
notifyAll();
}
}
synchronized void waitForWaitQueue() {
do {
try {
wait();
} catch (InterruptedException ie) {
throw new ThreadInterruptedException(ie);
}
} while (!waitQueue.doResume());
}
private static class SkipDocWriter extends DocWriter {
@Override
void finish() {
}
@Override
void abort() {
}
@Override
long sizeInBytes() {
return 0;
}
}
final SkipDocWriter skipDocWriter = new SkipDocWriter();
NumberFormat nf = NumberFormat.getInstance();
/* Initial chunks size of the shared byte[] blocks used to
store postings data */
final static int BYTE_BLOCK_SHIFT = 15;
final static int BYTE_BLOCK_SIZE = 1 << BYTE_BLOCK_SHIFT;
final static int BYTE_BLOCK_MASK = BYTE_BLOCK_SIZE - 1;
final static int BYTE_BLOCK_NOT_MASK = ~BYTE_BLOCK_MASK;
private class ByteBlockAllocator extends ByteBlockPool.Allocator {
final int blockSize;
ByteBlockAllocator(int blockSize) {
this.blockSize = blockSize;
}
ArrayList<byte[]> freeByteBlocks = new ArrayList<byte[]>();
/* Allocate another byte[] from the shared pool */
@Override
byte[] getByteBlock() {
synchronized(DocumentsWriter.this) {
final int size = freeByteBlocks.size();
final byte[] b;
if (0 == size) {
b = new byte[blockSize];
bytesUsed.addAndGet(blockSize);
} else
b = freeByteBlocks.remove(size-1);
return b;
}
}
/* Return byte[]'s to the pool */
@Override
void recycleByteBlocks(byte[][] blocks, int start, int end) {
synchronized(DocumentsWriter.this) {
for(int i=start;i<end;i++) {
freeByteBlocks.add(blocks[i]);
blocks[i] = null;
}
}
}
@Override
void recycleByteBlocks(List<byte[]> blocks) {
synchronized(DocumentsWriter.this) {
final int size = blocks.size();
for(int i=0;i<size;i++) {
freeByteBlocks.add(blocks.get(i));
blocks.set(i, null);
}
}
}
}
/* Initial chunks size of the shared int[] blocks used to
store postings data */
final static int INT_BLOCK_SHIFT = 13;
final static int INT_BLOCK_SIZE = 1 << INT_BLOCK_SHIFT;
final static int INT_BLOCK_MASK = INT_BLOCK_SIZE - 1;
private List<int[]> freeIntBlocks = new ArrayList<int[]>();
/* Allocate another int[] from the shared pool */
synchronized int[] getIntBlock() {
final int size = freeIntBlocks.size();
final int[] b;
if (0 == size) {
b = new int[INT_BLOCK_SIZE];
bytesUsed.addAndGet(INT_BLOCK_SIZE*RamUsageEstimator.NUM_BYTES_INT);
} else {
b = freeIntBlocks.remove(size-1);
}
return b;
}
synchronized void bytesUsed(long numBytes) {
bytesUsed.addAndGet(numBytes);
}
long bytesUsed() {
return bytesUsed.get() + pendingDeletes.bytesUsed.get();
}
/* Return int[]s to the pool */
synchronized void recycleIntBlocks(int[][] blocks, int start, int end) {
for(int i=start;i<end;i++) {
freeIntBlocks.add(blocks[i]);
blocks[i] = null;
}
}
ByteBlockAllocator byteBlockAllocator = new ByteBlockAllocator(BYTE_BLOCK_SIZE);
final static int PER_DOC_BLOCK_SIZE = 1024;
final ByteBlockAllocator perDocAllocator = new ByteBlockAllocator(PER_DOC_BLOCK_SIZE);
/* Initial chunk size of the shared char[] blocks used to
store term text */
final static int CHAR_BLOCK_SHIFT = 14;
final static int CHAR_BLOCK_SIZE = 1 << CHAR_BLOCK_SHIFT;
final static int CHAR_BLOCK_MASK = CHAR_BLOCK_SIZE - 1;
final static int MAX_TERM_LENGTH = CHAR_BLOCK_SIZE-1;
private ArrayList<char[]> freeCharBlocks = new ArrayList<char[]>();
/* Allocate another char[] from the shared pool */
synchronized char[] getCharBlock() {
final int size = freeCharBlocks.size();
final char[] c;
if (0 == size) {
bytesUsed.addAndGet(CHAR_BLOCK_SIZE * RamUsageEstimator.NUM_BYTES_CHAR);
c = new char[CHAR_BLOCK_SIZE];
} else
c = freeCharBlocks.remove(size-1);
// We always track allocations of char blocks, for now,
// because nothing that skips allocation tracking
// (currently only term vectors) uses its own char
// blocks.
return c;
}
/* Return char[]s to the pool */
synchronized void recycleCharBlocks(char[][] blocks, int numBlocks) {
for(int i=0;i<numBlocks;i++) {
freeCharBlocks.add(blocks[i]);
blocks[i] = null;
}
}
String toMB(long v) {
return nf.format(v/1024./1024.);
}
/* We have four pools of RAM: Postings, byte blocks
* (holds freq/prox posting data), char blocks (holds
* characters in the term) and per-doc buffers (stored fields/term vectors).
* Different docs require varying amount of storage from
* these four classes.
*
* For example, docs with many unique single-occurrence
* short terms will use up the Postings RAM and hardly any
* of the other two. Whereas docs with very large terms
* will use alot of char blocks RAM and relatively less of
* the other two. This method just frees allocations from
* the pools once we are over-budget, which balances the
* pools to match the current docs. */
void balanceRAM() {
final boolean doBalance;
final long deletesRAMUsed;
deletesRAMUsed = bufferedDeletesStream.bytesUsed();
final long ramBufferSize;
final double mb = config.getRAMBufferSizeMB();
if (mb == IndexWriterConfig.DISABLE_AUTO_FLUSH) {
ramBufferSize = IndexWriterConfig.DISABLE_AUTO_FLUSH;
} else {
ramBufferSize = (long) (mb*1024*1024);
}
synchronized(this) {
if (ramBufferSize == IndexWriterConfig.DISABLE_AUTO_FLUSH || bufferIsFull) {
return;
}
doBalance = bytesUsed() + deletesRAMUsed >= ramBufferSize;
}
if (doBalance) {
if (infoStream != null) {
message(" RAM: balance allocations: usedMB=" + toMB(bytesUsed()) +
" vs trigger=" + toMB(ramBufferSize) +
" deletesMB=" + toMB(deletesRAMUsed) +
" byteBlockFree=" + toMB(byteBlockAllocator.freeByteBlocks.size()*BYTE_BLOCK_SIZE) +
" perDocFree=" + toMB(perDocAllocator.freeByteBlocks.size()*PER_DOC_BLOCK_SIZE) +
" charBlockFree=" + toMB(freeCharBlocks.size()*CHAR_BLOCK_SIZE*RamUsageEstimator.NUM_BYTES_CHAR));
}
final long startBytesUsed = bytesUsed() + deletesRAMUsed;
int iter = 0;
// We free equally from each pool in 32 KB
// chunks until we are below our threshold
// (freeLevel)
boolean any = true;
final long freeLevel = (long) (0.95 * ramBufferSize);
while(bytesUsed()+deletesRAMUsed > freeLevel) {
synchronized(this) {
if (0 == perDocAllocator.freeByteBlocks.size()
&& 0 == byteBlockAllocator.freeByteBlocks.size()
&& 0 == freeCharBlocks.size()
&& 0 == freeIntBlocks.size()
&& !any) {
// Nothing else to free -- must flush now.
bufferIsFull = bytesUsed()+deletesRAMUsed > ramBufferSize;
if (infoStream != null) {
if (bytesUsed()+deletesRAMUsed > ramBufferSize) {
message(" nothing to free; set bufferIsFull");
} else {
message(" nothing to free");
}
}
break;
}
if ((0 == iter % 5) && byteBlockAllocator.freeByteBlocks.size() > 0) {
byteBlockAllocator.freeByteBlocks.remove(byteBlockAllocator.freeByteBlocks.size()-1);
bytesUsed.addAndGet(-BYTE_BLOCK_SIZE);
}
if ((1 == iter % 5) && freeCharBlocks.size() > 0) {
freeCharBlocks.remove(freeCharBlocks.size()-1);
bytesUsed.addAndGet(-CHAR_BLOCK_SIZE * RamUsageEstimator.NUM_BYTES_CHAR);
}
if ((2 == iter % 5) && freeIntBlocks.size() > 0) {
freeIntBlocks.remove(freeIntBlocks.size()-1);
bytesUsed.addAndGet(-INT_BLOCK_SIZE * RamUsageEstimator.NUM_BYTES_INT);
}
if ((3 == iter % 5) && perDocAllocator.freeByteBlocks.size() > 0) {
// Remove upwards of 32 blocks (each block is 1K)
for (int i = 0; i < 32; ++i) {
perDocAllocator.freeByteBlocks.remove(perDocAllocator.freeByteBlocks.size() - 1);
bytesUsed.addAndGet(-PER_DOC_BLOCK_SIZE);
if (perDocAllocator.freeByteBlocks.size() == 0) {
break;
}
}
}
}
if ((4 == iter % 5) && any) {
// Ask consumer to free any recycled state
any = consumer.freeRAM();
}
iter++;
}
if (infoStream != null) {
message(" after free: freedMB=" + nf.format((startBytesUsed-bytesUsed()-deletesRAMUsed)/1024./1024.) + " usedMB=" + nf.format((bytesUsed()+deletesRAMUsed)/1024./1024.));
}
}
}
final WaitQueue waitQueue = new WaitQueue();
private class WaitQueue {
DocWriter[] waiting;
int nextWriteDocID;
int nextWriteLoc;
int numWaiting;
long waitingBytes;
public WaitQueue() {
waiting = new DocWriter[10];
}
synchronized void reset() {
// NOTE: nextWriteLoc doesn't need to be reset
assert numWaiting == 0;
assert waitingBytes == 0;
nextWriteDocID = 0;
}
synchronized boolean doResume() {
final double mb = config.getRAMBufferSizeMB();
final long waitQueueResumeBytes;
if (mb == IndexWriterConfig.DISABLE_AUTO_FLUSH) {
waitQueueResumeBytes = 2*1024*1024;
} else {
waitQueueResumeBytes = (long) (mb*1024*1024*0.05);
}
return waitingBytes <= waitQueueResumeBytes;
}
synchronized boolean doPause() {
final double mb = config.getRAMBufferSizeMB();
final long waitQueuePauseBytes;
if (mb == IndexWriterConfig.DISABLE_AUTO_FLUSH) {
waitQueuePauseBytes = 4*1024*1024;
} else {
waitQueuePauseBytes = (long) (mb*1024*1024*0.1);
}
return waitingBytes > waitQueuePauseBytes;
}
synchronized void abort() {
int count = 0;
for(int i=0;i<waiting.length;i++) {
final DocWriter doc = waiting[i];
if (doc != null) {
doc.abort();
waiting[i] = null;
count++;
}
}
waitingBytes = 0;
assert count == numWaiting;
numWaiting = 0;
}
private void writeDocument(DocWriter doc) throws IOException {
assert doc == skipDocWriter || nextWriteDocID == doc.docID;
boolean success = false;
try {
doc.finish();
nextWriteDocID++;
nextWriteLoc++;
assert nextWriteLoc <= waiting.length;
if (nextWriteLoc == waiting.length) {
nextWriteLoc = 0;
}
success = true;
} finally {
if (!success) {
setAborting();
}
}
}
synchronized public boolean add(DocWriter doc) throws IOException {
assert doc.docID >= nextWriteDocID;
if (doc.docID == nextWriteDocID) {
writeDocument(doc);
while(true) {
doc = waiting[nextWriteLoc];
if (doc != null) {
numWaiting--;
waiting[nextWriteLoc] = null;
waitingBytes -= doc.sizeInBytes();
writeDocument(doc);
} else {
break;
}
}
} else {
// I finished before documents that were added
// before me. This can easily happen when I am a
// small doc and the docs before me were large, or,
// just due to luck in the thread scheduling. Just
// add myself to the queue and when that large doc
// finishes, it will flush me:
int gap = doc.docID - nextWriteDocID;
if (gap >= waiting.length) {
// Grow queue
DocWriter[] newArray = new DocWriter[ArrayUtil.oversize(gap, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
assert nextWriteLoc >= 0;
System.arraycopy(waiting, nextWriteLoc, newArray, 0, waiting.length-nextWriteLoc);
System.arraycopy(waiting, 0, newArray, waiting.length-nextWriteLoc, nextWriteLoc);
nextWriteLoc = 0;
waiting = newArray;
gap = doc.docID - nextWriteDocID;
}
int loc = nextWriteLoc + gap;
if (loc >= waiting.length) {
loc -= waiting.length;
}
// We should only wrap one time
assert loc < waiting.length;
// Nobody should be in my spot!
assert waiting[loc] == null;
waiting[loc] = doc;
numWaiting++;
waitingBytes += doc.sizeInBytes();
}
return doPause();
}
}
}