package com.bigdata.search;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.log4j.Logger;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ISimpleSplitHandler;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.util.BytesUtil;
/**
 * Procedure that reads on the terms index, aggregating data on a
 * per-{@link Hit} basis.
 * <p>
 * The procedure uses an {@link IRangeQuery#rangeIterator(byte[], byte[])} to
 * perform a key-range scan for a specific term. The range iterator
 * transparently issues queries against the index, obtaining a "chunk" of
 * results at a time. Those results are aggregated into the {@link Hit}
 * collection, which is maintained in a thread-safe hash map.
 * <p>
 * Note: An {@link ISimpleSplitHandler} imposes the constraint that index
 * partition boundaries may only fall on a term boundary, hence all tuples for
 * any given term will always be found on the same index partition.
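 * <p>
 * A minimal usage sketch (how the {@link FullTextIndex} and the
 * {@link IHitCollector} are obtained is assumed and not shown):
 *
 * <pre>{@code
 * final FullTextIndex<Long> ndx = ...;  // the full text index (assumed)
 * final IHitCollector<Long> hits = ...; // thread-safe hit collection (assumed)
 * // One task per query term; this task scans the key range for "brown",
 * // which is term 0 of 2, using prefix match and a query term weight of 0.5.
 * final ReadIndexTask<Long> task = new ReadIndexTask<Long>("brown", 0, 2,
 *         true, 0.5, ndx, hits);
 * final Long nhits = task.call();
 * }</pre>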
*
* @param <V>
* The generic type of the document identifier.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
public class ReadIndexTask<V extends Comparable<V>> extends AbstractIndexTask<V>
implements Callable<Object> {
final private static Logger log = Logger.getLogger(ReadIndexTask.class);
private final IHitCollector<V> hits;
private final ITupleIterator<?> itr;
/**
* This instance is reused until it is consumed by a successful insertion
* into {@link #hits} using
* {@link ConcurrentHashMap#putIfAbsent(Object, Object)}. Once successfully
     * inserted, the docId is set on the {@link Hit} and a new instance is
     * assigned to {@link #tmp}.
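     * <p>
     * A sketch of the reuse protocol (mirroring the logic in {@link #run()}):
     *
     * <pre>{@code
     * final Hit<V> oldValue = hits.putIfAbsent(docId, tmp);
     * if (oldValue == null) {
     *     // tmp was installed for this docId: claim it, then allocate anew.
     *     tmp.setDocId(docId);
     *     tmp = new Hit<V>(numQueryTerms);
     * }
     * }</pre>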
*/
    private Hit<V> tmp;
/**
     * Set up a task that will perform a range scan for entries matching the
* search term.
*
* @param termText
* The term text for the search term.
* @param termNdx
* The index of this term within the overall search.
* @param numTerms
* The overall number of search terms.
* @param prefixMatch
     *            When <code>true</code>, any term having <i>termText</i> as a
     *            prefix will be matched. Otherwise the term must be an exact
     *            match for <i>termText</i>.
* @param queryTermWeight
* The weight for the search term.
* @param searchEngine
* The search engine.
* @param hits
* The map where the hits are being aggregated.
*/
public ReadIndexTask(final String termText,
final int termNdx, final int numTerms,
final boolean prefixMatch, final double queryTermWeight,
final FullTextIndex<V> searchEngine, final IHitCollector<V> hits) {
super(termText, termNdx, numTerms, prefixMatch, queryTermWeight, searchEngine);
if (hits == null)
throw new IllegalArgumentException();
this.hits = hits;
if (log.isDebugEnabled())
log.debug("termText=[" + termText + "], prefixMatch=" + prefixMatch
+ ", queryTermWeight=" + queryTermWeight + "\nfromKey="
+ BytesUtil.toString(fromKey) + "\n toKey="
+ BytesUtil.toString(toKey));
        /*
         * TODO Filter by document and/or field. All we need to do is pass in
         * an array of the fields that should be accepted and formulate and
         * pass along an ITupleFilter which accepts only those fields. We can
         * also filter by document in the same manner.
         */
itr = searchEngine.getIndex()
.rangeIterator(fromKey, toKey, 0/* capacity */,
IRangeQuery.KEYS | IRangeQuery.VALS, null/* filter */);
tmp = new Hit<V>(numTerms);
}
/**
     * @return The #of fields with a hit on the search term, as a
     *         {@link Long}.
*/
public Long call() throws Exception {
try {
final long nhits = run();
return nhits;
} catch (Throwable t) {
throw launderThrowable(t);
}
}
private long run() {
long nhits = 0;
if (log.isDebugEnabled())
log.debug("queryTerm=" + queryTerm + ", termWeight="
+ queryTermWeight);
while (itr.hasNext()) {
            // Testing for interrupts on each result is too much work, so
            // only poll the interrupt status once per 1000 hits.
            if (nhits % 1000 == 0 && Thread.interrupted()) {
                log.warn("Interrupted: queryTerm=" + queryTerm + ", nhits="
                        + nhits);
                // Restore the interrupt status cleared by Thread.interrupted().
                Thread.currentThread().interrupt();
                return nhits;
            }
            // The next entry.
            final ITuple<?> tuple = itr.next();
            // Decode the tuple.
            @SuppressWarnings("unchecked")
            final ITermDocRecord<V> rec = (ITermDocRecord<V>) tuple.getObject();
            // The document identifier.
            final V docId = rec.getDocId();
            /*
             * Extract the normalized term-frequency (term weight) for this
             * document.
             */
            final double termWeight = rec.getLocalTermWeight();
            /*
             * Note: Per-hit debug logging was removed here because even the
             * log-level test shows up in the profiler.
             */
            /*
             * Use putIfAbsent() so that each docId enters the hit set exactly
             * once, without race conditions among the per-term tasks.
             */
            final Hit<V> hit;
            {
                final Hit<V> oldValue = hits.putIfAbsent(docId, tmp);
                if (oldValue == null) {
                    // Won the race: our instance was installed for this docId.
                    hit = tmp;
                    hit.setDocId(docId);
                    // Allocate a new instance for the next insertion attempt.
                    tmp = new Hit<V>(numQueryTerms);
                } else {
                    // Lost the race: aggregate into the existing Hit.
                    hit = oldValue;
                }
            }
            hit.add(queryTermNdx, queryTermWeight * termWeight);
nhits++;
}
return nhits;
}
/**
* Log an error and wrap the exception iff necessary.
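     * <p>
     * The expected usage, as in {@link #call()}:
     *
     * <pre>{@code
     * try {
     *     return run();
     * } catch (Throwable t) {
     *     throw launderThrowable(t);
     * }
     * }</pre>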
*
* @param t
* The thrown error.
*
* @return The laundered exception.
*
     * @throws Exception
     *             if the throwable is a checked exception (it is rethrown).
*/
private RuntimeException launderThrowable(final Throwable t)
throws Exception {
        try {
            // Log the error.
            log.error(t, t);
        } catch (Throwable ignored) {
            // Ignore any problems arising from the logging itself.
        }
if (t instanceof RuntimeException) {
return (RuntimeException) t;
} else if (t instanceof Error) {
throw (Error) t;
} else if (t instanceof Exception) {
throw (Exception) t;
        } else {
            throw new RuntimeException(t);
        }
}
}