package com.bigdata.search;
import java.util.Arrays;
import java.util.Map;
import org.apache.log4j.Logger;
import com.bigdata.btree.ITupleSerializer;
import com.bigdata.btree.keys.KV;
import com.bigdata.btree.proc.LongAggregator;
/**
* A buffer holding tokens extracted from one or more documents / fields.
* Each entry in the buffer corresponds to the {@link TermFrequencyData}
* extracted from a field of some document. When the buffer overflows, it is
* {@link #flush() flushed}, writing on the indices.
*
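* <p>
* A minimal usage sketch (illustrative only; the tokenizer loop and the
* identifiers <code>docId</code>, <code>fieldId</code> and
* <code>textIndex</code> are hypothetical):
* <pre>
* final TokenBuffer&lt;Long&gt; buf = new TokenBuffer&lt;Long&gt;(1000, textIndex);
* for (String token : tokenize(field)) { // hypothetical tokenizer
*     buf.add(docId, fieldId, token);
* }
* buf.flush(); // write any data remaining in the buffer on the indices.
* </pre>
*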
* @param <V> The generic type of the document identifier.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
public class TokenBuffer<V extends Comparable<V>> {
private static final Logger log = Logger.getLogger(TokenBuffer.class);
/** The object on which the buffer will write when it overflows. */
private final FullTextIndex<V> textIndexer;
/** The capacity of the {@link #buffer}. */
private final int capacity;
/** Each entry models a field of some document. */
private final TermFrequencyData<V>[] buffer;
/** #of entries in the {@link #buffer}. */
private int count = 0;
/** #of distinct {docId} values in the {@link #buffer}. */
private int ndocs;
/** #of distinct {docId,fieldId} values in the {@link #buffer}. */
private int nfields;
/** #of distinct {docId,fieldId,termText} tuples.*/
private int nterms;
/** The last observed docId and <code>null</code> if none observed. */
private V lastDocId;
/** The last observed fieldId and -1 if none observed. */
private int lastFieldId;
/**
* Ctor.
* @param capacity
* The #of distinct {document,field} tuples that can be held in
* the buffer before it will overflow. The buffer will NOT
* overflow until you exceed this capacity.
*
* @param textIndexer
* The object on which the buffer will write when it overflows or
* is {@link #flush()}ed.
*/
@SuppressWarnings("unchecked")
public TokenBuffer(final int capacity, final FullTextIndex<V> textIndexer) {
if (capacity <= 0)
throw new IllegalArgumentException("capacity must be positive");
if (textIndexer == null)
throw new IllegalArgumentException("textIndexer must not be null");
this.capacity = capacity;
this.textIndexer = textIndexer;
buffer = new TermFrequencyData[capacity];
reset();
}
/**
* Discards all data in the buffer and resets it to a clean state.
*/
public void reset() {
for (int i = 0; i < count; i++) {
buffer[i] = null;
}
count = 0;
ndocs = 0;
nfields = 0;
nterms = 0;
lastDocId = null;
lastFieldId = -1;
}
/**
* The #of entries in the buffer.
*/
public int size() {
return count;
}
/**
* Return the {@link TermFrequencyData} for the specified index.
*
* @param index
* The index in [0:<i>count</i>).
*
* @return The {@link TermFrequencyData} at that index.
*
* @throws IndexOutOfBoundsException
* if the index is not in [0:<i>count</i>).
*/
public TermFrequencyData<V> get(final int index) {
if (index < 0 || index >= count)
throw new IndexOutOfBoundsException();
return buffer[index];
}
/**
* Adds another token to the current field of the current document. If
* either the field or the document identifier changes, then a new field
* (and possibly a new document) is begun. If the buffer is full, then it
* will be {@link #flush()}ed before beginning a new field.
* <p>
* Note: This method is NOT thread-safe.
* <p>
* Note: There is an assumption that the caller will process all tokens
* for a given field in the same document at once. Failure to do this
* will lead to only part of the term-frequency distribution for the
* field being captured by the indices.
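* <p>
* For example (an illustrative sketch; the token sources and field
* identifiers below are hypothetical):
* <pre>
* for (String t : titleTokens)
*     buf.add(docId, 1, t); // all tokens for fieldId 1 (title) ...
* for (String t : bodyTokens)
*     buf.add(docId, 2, t); // ... before any for fieldId 2 (body).
* </pre>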
*
* @param docId
* The document identifier.
* @param fieldId
* The field identifier.
* @param token
* The token.
*/
public void add(final V docId, final int fieldId, final String token) {
if (log.isDebugEnabled()) {
log.debug("docId=" + docId + ", fieldId=" + fieldId + ", token="
+ token);
}
final boolean newField;
if (count == 0) {
// first tuple in clean buffer.
ndocs++;
nfields++;
lastDocId = docId;
lastFieldId = fieldId;
newField = true;
} else {
if (!lastDocId.equals(docId)) { // Note: equals(), not reference identity.
// start of new document
ndocs++;
// also start of new field.
nfields++;
newField = true;
// normalize the last term-frequency distribution.
buffer[count-1].normalize();
} else if (lastFieldId != fieldId) {
// start of new field in same document.
nfields++;
newField = true;
// normalize the last term-frequency distribution.
buffer[count-1].normalize();
} else {
newField = false;
}
}
if (newField && count == capacity) {
/*
* Overflow: flush() writes the buffered data and resets all of the
* counters, so re-count the document and field whose arrival
* triggered the overflow.
*/
flush();
ndocs++;
nfields++;
}
if (newField) {
buffer[count++] = new TermFrequencyData<V>(docId, fieldId, token);
nterms++;
} else {
if (buffer[count - 1].add(token)) {
nterms++;
}
}
lastDocId = docId;
lastFieldId = fieldId;
}
/**
* Write any buffered data on the indices.
* <p>
* Note: The writes on the terms index are scattered since the key for the
* index is {term, docId, fieldId}. This method will batch up and then apply
* a set of updates, but the total operation is not atomic. Therefore search
* results which are concurrent with indexing may not have access to the
* full data for concurrently indexed documents. This issue may be resolved
* by allowing the indexer to write ahead and using a historical commit time
* for the search.
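* <p>
* Conceptually the tuples are in term-major order, so the postings for a
* single document scatter across the index while the postings for a given
* term cluster together. An illustrative sketch (the actual encoding is
* determined by the configured {@link ITupleSerializer}):
* <pre>
* {brown, doc12, field1} -&gt; {termWeight}
* {brown, doc57, field1} -&gt; {termWeight}
* {fox,   doc12, field1} -&gt; {termWeight}
* </pre>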
* <p>
* Note: If a document is pre-existing, then the existing data for that
* document MUST be removed unless you know that the fields to be found in
* the new version of the document will not have changed (they may have
* different contents, but the same fields exist in the old and new versions
* of the document).
*/
public void flush() {
if (nterms == 0) {
reset();
return;
}
if (log.isInfoEnabled())
log.info("count=" + count + ", ndocs=" + ndocs + ", nfields="
+ nfields + ", nterms=" + nterms);
// Normalize the last document/field in the buffer
buffer[count - 1].normalize();
/*
* Generate keys[] and vals[].
*/
// array of correlated key/value tuples.
final KV[] a = new KV[nterms];
// Knows how to encode and decode the keys and values.
@SuppressWarnings("unchecked")
final ITupleSerializer<ITermDocKey<V>, ITermDocVal> tupleSer = textIndexer
.getIndex().getIndexMetadata().getTupleSerializer();
// #of {token,docId,fieldId} tuples generated
int n = 0;
// for each document in the buffer.
for (int i = 0; i < count; i++) {
final TermFrequencyData<V> termFreq = buffer[i];
final V docId = termFreq.docId;
final int fieldId = termFreq.fieldId;
// emit {token,docId,fieldId} tuples.
for(Map.Entry<String, ITermMetadata> e : termFreq.terms.entrySet()) {
final String termText = e.getKey();
final ITermMetadata termMetadata = e.getValue();
/*
* Note: This wraps both sides of the record together and passes
* them into the tupleSerializer so it can examine both pieces
* when making its decision on how to encode the information
* into the key/val of the index.
*/
final ITermDocRecord<V> rec = new ReadOnlyTermDocRecord<V>(
termText, docId, fieldId,
termMetadata.getLocalTermWeight());
final byte[] key = tupleSer.serializeKey(rec);
final byte[] val = tupleSer.serializeVal(rec);
if (log.isDebugEnabled()) {
log.debug("{" + termText + "," + docId + "," + fieldId
+ "}: #occurences=" + termMetadata.termFreq()
+ ", termWeight="
+ termMetadata.getLocalTermWeight());
}
// save in the correlated array.
a[n++] = new KV(key, val);
}
}
assert n == nterms : "nterms=" + nterms + ", n=" + n;
// Sort {term,docId,fieldId}:{value} tuples into total index order.
Arrays.sort(a);
/*
* Copy the correlated key:val data into keys[] and vals[] arrays.
*/
final byte[][] keys = new byte[nterms][];
final byte[][] vals = new byte[nterms][];
for (int i = 0; i < nterms; i++) {
keys[i] = a[i].key;
vals[i] = a[i].val;
}
// Batch write on the index.
writeOnIndex(n, keys, vals);
// Clear the buffer.
reset();
}
/**
* Writes on the index.
*
* @param n
* The #of tuples to be written.
* @param keys
* The keys, in index order.
* @param vals
* The values, correlated with the keys.
*
* @return The #of pre-existing records that were updated.
*/
protected long writeOnIndex(final int n, final byte[][] keys,
final byte[][] vals) {
final LongAggregator resultHandler = new LongAggregator();
textIndexer.getIndex().submit(0, //fromIndex
n, // toIndex
keys,//
vals,//
(textIndexer.isOverwrite() //
? TextIndexWriteProc.IndexWriteProcConstructor.OVERWRITE
: TextIndexWriteProc.IndexWriteProcConstructor.NO_OVERWRITE//
),//
resultHandler//
);
return resultHandler.getResult();
}
/**
* Deletes from the index.
*
* @param n
* The #of tuples to be deleted.
* @param keys
* The keys, in index order.
* @param vals
* The values, correlated with the keys.
*
* @return The #of pre-existing records that were deleted.
*/
protected long deleteFromIndex(final int n, final byte[][] keys,
final byte[][] vals) {
// TODO Not implemented yet.
throw new UnsupportedOperationException();
}
}