package com.bigdata.search;

import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.log4j.Logger;

import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ISimpleSplitHandler;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.util.BytesUtil;

/**
 * Procedure reads on the terms index, aggregating data on a per-{@link Hit}
 * basis.
 * <p>
 * The procedure uses an {@link IRangeQuery#rangeIterator(byte[], byte[])} to
 * perform a key range scan for a specific term. The range iterator will
 * automatically issue queries, obtaining a "chunk" of results at a time. Those
 * results are aggregated on the {@link Hit} collection, which is maintained in
 * a thread-safe hash map.
 * <p>
 * Note: An {@link ISimpleSplitHandler} imposes the constraint that index
 * partitions may only fall on a term boundary, hence all tuples for any given
 * term will always be found on the same index partition.
 * 
 * @param <V>
 *            The generic type of the document identifier.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class ReadIndexTask<V extends Comparable<V>> extends
        AbstractIndexTask<V> implements Callable<Object> {

    private static final Logger log = Logger.getLogger(ReadIndexTask.class);

    private final IHitCollector<V> hits;

    private final ITupleIterator<?> itr;

    /**
     * This instance is reused until it is consumed by a successful insertion
     * into {@link #hits} using
     * {@link ConcurrentHashMap#putIfAbsent(Object, Object)}. Once successfully
     * inserted, the {@link Hit#setDocId(long) docId} is set on the {@link Hit}
     * and a new instance is assigned to {@link #tmp}.
     */
    private Hit<V> tmp;

    /**
     * Set up a task that will perform a range scan for entries matching the
     * search term.
     * 
     * @param termText
     *            The term text for the search term.
     * @param termNdx
     *            The index of this term within the overall search.
     * @param numTerms
     *            The overall number of search terms.
     * @param prefixMatch
     *            When <code>true</code>, any term having <i>termText</i> as a
     *            prefix will be matched. Otherwise the term must be an exact
     *            match for the <i>termText</i>.
     * @param queryTermWeight
     *            The weight for the search term.
     * @param searchEngine
     *            The search engine.
     * @param hits
     *            The map where the hits are being aggregated.
     */
    public ReadIndexTask(final String termText, final int termNdx,
            final int numTerms, final boolean prefixMatch,
            final double queryTermWeight, final FullTextIndex<V> searchEngine,
            final IHitCollector<V> hits) {

        super(termText, termNdx, numTerms, prefixMatch, queryTermWeight,
                searchEngine);

        if (hits == null)
            throw new IllegalArgumentException();

        this.hits = hits;

        if (log.isDebugEnabled())
            log.debug("termText=[" + termText + "], prefixMatch=" + prefixMatch
                    + ", queryTermWeight=" + queryTermWeight + "\nfromKey="
                    + BytesUtil.toString(fromKey) + "\n  toKey="
                    + BytesUtil.toString(toKey));

        /*
         * TODO Filter by document and/or field. All we need to do is pass in
         * an array of the fields that should be accepted and formulate and
         * pass along an ITupleFilter which accepts only those fields. We can
         * also filter by document in the same manner.
         */
        itr = searchEngine.getIndex().rangeIterator(fromKey, toKey,
                0/* capacity */, IRangeQuery.KEYS | IRangeQuery.VALS,
                null/* filter */);

        tmp = new Hit<V>(numTerms);

    }
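    /*
     * Note: run() relies on the standard lock-free putIfAbsent() idiom to
     * bind exactly one Hit to each document identifier while several
     * per-term tasks race on the shared map. A minimal sketch of the idiom
     * (the variable names here are illustrative only):
     *
     *   final Hit<V> oldValue = hits.putIfAbsent(docId, tmp);
     *   final Hit<V> hit = (oldValue == null) ? tmp : oldValue;
     *   // The winner claims [tmp] and allocates a fresh instance; the
     *   // loser simply updates the Hit installed by the other thread.
     *
     * This presumes that Hit#add(int, double) is safe for concurrent
     * callers, since multiple tasks may update the same Hit instance.
     */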
    /**
     * @return The #of fields with a hit on the search term as a {@link Long}.
     */
    public Long call() throws Exception {

        try {

            final long nhits = run();

            return nhits;

        } catch (Throwable t) {

            throw launderThrowable(t);

        }

    }

    private long run() {

        long nhits = 0;

        if (log.isDebugEnabled())
            log.debug("queryTerm=" + queryTerm + ", termWeight="
                    + queryTermWeight);

        while (itr.hasNext()) {

            /*
             * Don't test for interrupted on each result -- that is too much
             * work. Test once per 1000 results instead.
             */
            if (nhits % 1000 == 0 && Thread.interrupted()) {

                log.warn("Interrupted: queryTerm=" + queryTerm + ", nhits="
                        + nhits);

                return nhits;

            }

            // Next entry.
            final ITuple<?> tuple = itr.next();

            // Decode the tuple.
            @SuppressWarnings("unchecked")
            final ITermDocRecord<V> rec = (ITermDocRecord<V>) tuple.getObject();

            // The document identifier.
            final V docId = rec.getDocId();

            /*
             * Extract the normalized term-frequency (term weight) for this
             * document.
             */
            final double termWeight = rec.getLocalTermWeight();

            /*
             * Note: Per-hit debug logging was removed here since the log
             * level test showed up in the profiler.
             */

            /*
             * Play a little magic to get the docId into the hit set without
             * race conditions.
             */
            final Hit<V> hit;
            {

                final Hit<V> oldValue = hits.putIfAbsent(docId, tmp);

                if (oldValue == null) {

                    // Our instance was installed; claim it and allocate a
                    // replacement for the next new document.
                    hit = tmp;

                    hit.setDocId(docId);

                    tmp = new Hit<V>(numQueryTerms);

                } else {

                    // Another task already installed a Hit for this docId.
                    hit = oldValue;

                }

            }

            hit.add(queryTermNdx, queryTermWeight * termWeight);

            nhits++;

        }

        return nhits;

    }

    /**
     * Log an error and wrap the exception iff necessary.
     * 
     * @param t
     *            The thrown error.
     * 
     * @return The laundered exception.
     * 
     * @throws Exception
     */
    private RuntimeException launderThrowable(final Throwable t)
            throws Exception {

        try {

            // Log an error.
            log.error(t, t);

        } catch (Throwable ignored) {

            // Ignore any problems during logging.

        }

        if (t instanceof RuntimeException) {

            return (RuntimeException) t;

        } else if (t instanceof Error) {

            throw (Error) t;

        } else if (t instanceof Exception) {

            throw (Exception) t;

        } else {

            throw new RuntimeException(t);

        }

    }

}
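/*
 * Usage sketch (illustrative only; the actual driver lives elsewhere in this
 * package, and the names [executorService], [queryTerms], and [weights] are
 * assumptions). One ReadIndexTask is created per distinct query term, and all
 * of the tasks share the same IHitCollector, so per-document scores
 * accumulate across the query terms:
 *
 *   final List<Callable<Object>> tasks = new ArrayList<Callable<Object>>();
 *   int ndx = 0;
 *   for (String termText : queryTerms) {
 *       tasks.add(new ReadIndexTask<Long>(termText, ndx++, queryTerms.size(),
 *               prefixMatch, weights.get(termText), searchEngine, hits));
 *   }
 *   executorService.invokeAll(tasks); // Each call() returns the #of hits read.
 */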