/**
 * Copyright 2009 T Jake Luciani
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package lucandra;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.cassandra.thrift.Cassandra;
import org.apache.cassandra.thrift.ColumnOrSuperColumn;
import org.apache.cassandra.thrift.ColumnParent;
import org.apache.cassandra.thrift.ConsistencyLevel;
import org.apache.cassandra.thrift.InvalidRequestException;
import org.apache.cassandra.thrift.KeySlice;
import org.apache.cassandra.thrift.SlicePredicate;
import org.apache.cassandra.thrift.SliceRange;
import org.apache.cassandra.thrift.TimedOutException;
import org.apache.cassandra.thrift.UnavailableException;
import org.apache.log4j.Logger;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.thrift.TException;

/**
 * A Lucene TermEnum backed by Cassandra: terms are read lazily, in chunks,
 * from the term-vector column family and cached via the owning IndexReader.
 *
 * @author jake
 */
public class LucandraTermEnum extends TermEnum {

    private final IndexReader indexReader;
    private final String indexName;

    private int termPosition;
    private Term[] termBuffer;
    private SortedMap<Term, List<ColumnOrSuperColumn>> termDocFreqBuffer;
    private SortedMap<Term, List<ColumnOrSuperColumn>> termCache;

    // number of sequential terms to read initially
    private final int maxInitSize = 2;
    private final int maxChunkSize = 1024;
    private int actualInitSize = -1;
    private Term initTerm = null;
    private Term chunkBoundaryTerm;
    private String currentField = null;
    private int chunkCount = 0;

    private final Cassandra.Iface client;
    private final Term finalTerm = new Term(CassandraUtils.delimeter, CassandraUtils.finalToken);

    private static final Logger logger = Logger.getLogger(LucandraTermEnum.class);

    public LucandraTermEnum(IndexReader indexReader) {
        this.indexReader = indexReader;
        this.indexName = indexReader.getIndexName();
        this.client = indexReader.getClient();
        this.termPosition = 0;
    }
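    /**
     * Positions the enum at the first term equal to or greater than the
     * supplied term, loading a chunk of terms from Cassandra if needed.
     * Returns false when no such term exists.
     */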
    public boolean skipTo(Term term) throws IOException {

        if (term == null)
            return false;

        loadTerms(term);

        currentField = term.field();

        return termBuffer.length != 0;
    }

    @Override
    public void close() throws IOException {

    }

    @Override
    public int docFreq() {
        return termDocFreqBuffer.size();
    }

    @Override
    public boolean next() throws IOException {

        if (termBuffer == null) {
            skipTo(new Term(""));
        }

        termPosition++;

        boolean hasNext = termPosition < termBuffer.length;

        // skip the synthetic final term appended by loadTerms()
        if (hasNext && termBuffer[termPosition].equals(finalTerm)) {
            termPosition++;
            hasNext = termPosition < termBuffer.length;
        }

        if (!hasNext) {

            // if we've already done the initial load, try grabbing more
            if ((chunkCount == 1 && actualInitSize == maxInitSize) || (chunkCount > 1 && actualInitSize == maxChunkSize)) {
                loadTerms(chunkBoundaryTerm);
                hasNext = termBuffer.length > 0;
            } else if ((chunkCount == 1 && actualInitSize < maxInitSize) || (chunkCount > 1 && actualInitSize < maxChunkSize)) {
                hasNext = false;
                loadTerms(initTerm); // start over at the top (for facets)
            }

            termPosition = 0;
        }

        return hasNext;
    }

    @Override
    public Term term() {
        return termBuffer[termPosition];
    }

    private void loadTerms(Term skipTo) {

        if (initTerm == null)
            initTerm = skipTo;

        // choose the starting term
        String startTerm = CassandraUtils.hashKey(indexName + CassandraUtils.delimeter + CassandraUtils.createColumnName(skipTo));

        // ending term. For the initial query we don't care, since we only
        // pull maxInitSize terms.
        String endTerm = "";

        // the boundary condition for this search: currently the end of the field
        String boundaryTerm = CassandraUtils.hashKey(indexName + CassandraUtils.delimeter
                + CassandraUtils.createColumnName(skipTo.field(), CassandraUtils.finalToken));

        if ((!skipTo.equals(chunkBoundaryTerm) || termPosition == 0) && termCache != null) {
            termDocFreqBuffer = termCache.subMap(skipTo, termCache.lastKey());
        } else {
            termDocFreqBuffer = null;
        }

        if (termDocFreqBuffer != null) {

            termBuffer = termDocFreqBuffer.keySet().toArray(new Term[] {});
            termPosition = 0;

            logger.debug("Found " + startTerm + " in cache");
            return;
        } else if (chunkCount > 1 && actualInitSize < maxChunkSize) {

            // include the last term
            if (skipTo.equals(chunkBoundaryTerm) && termCache.containsKey(skipTo)) {
                termBuffer = new Term[] { skipTo };
                termDocFreqBuffer = termCache.subMap(skipTo, termCache.lastKey());
            } else {
                termBuffer = new Term[] {};
            }

            termPosition = 0;
            return; // done!
        }
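        // Cache miss: fetch the next chunk of term rows from Cassandra. The
        // first pass reads only maxInitSize keys; subsequent passes resume at
        // chunkBoundaryTerm and read up to maxChunkSize keys, bounded by the
        // end-of-field marker (this relies on the order-preserving partitioner).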
        chunkCount++;

        // the first time through we grab just a few keys
        int count = maxInitSize;

        // otherwise we grab all the rest of the keys
        if (chunkBoundaryTerm != null) {
            count = maxChunkSize;

            startTerm = CassandraUtils.hashKey(indexName + CassandraUtils.delimeter + CassandraUtils.createColumnName(chunkBoundaryTerm));

            // after the first pass use the boundary term, since we know on
            // pass 2 we are using the OPP
            endTerm = boundaryTerm;
        }

        long start = System.currentTimeMillis();

        termDocFreqBuffer = new TreeMap<Term, List<ColumnOrSuperColumn>>();

        ColumnParent columnParent = new ColumnParent(CassandraUtils.termVecColumnFamily);
        SlicePredicate slicePredicate = new SlicePredicate();

        // get all columns
        SliceRange sliceRange = new SliceRange(new byte[] {}, new byte[] {}, true, Integer.MAX_VALUE);
        slicePredicate.setSlice_range(sliceRange);

        List<KeySlice> columns;

        try {
            columns = client.get_range_slice(CassandraUtils.keySpace, columnParent, slicePredicate, startTerm, endTerm, count,
                    ConsistencyLevel.ONE);
        } catch (InvalidRequestException e) {
            throw new RuntimeException(e);
        } catch (TException e) {
            throw new RuntimeException(e);
        } catch (UnavailableException e) {
            throw new RuntimeException(e);
        } catch (TimedOutException e) {
            throw new RuntimeException(e);
        }

        // term to start with next time
        actualInitSize = columns.size();
        logger.debug("Found " + columns.size() + " keys in range: " + startTerm + " to " + endTerm + " in "
                + (System.currentTimeMillis() - start) + "ms");

        if (actualInitSize > 0) {
            for (KeySlice entry : columns) {

                // term keys look like wikipedia/body/wiki
                String termStr = entry.getKey().substring(
                        entry.getKey().indexOf(CassandraUtils.delimeter) + CassandraUtils.delimeter.length());
                Term term = CassandraUtils.parseTerm(termStr);

                logger.debug(termStr + " has " + entry.getColumns().size());

                // check for tombstoned keys and for keys from another field or
                // index (possible with the random partitioner)
                if (entry.getColumns().size() > 0
                        && term.field().equals(skipTo.field())
                        && entry.getKey().equals(
                                CassandraUtils.hashKey(indexName + CassandraUtils.delimeter + term.field()
                                        + CassandraUtils.delimeter + term.text())))
                    termDocFreqBuffer.put(term, entry.getColumns());
            }

            if (!termDocFreqBuffer.isEmpty()) {
                chunkBoundaryTerm = termDocFreqBuffer.lastKey();
            }
        }

        // add a final key (excluded by the subMap calls when served from cache)
        termDocFreqBuffer.put(finalTerm, null);

        // populate the cache
        for (Term termKey : termDocFreqBuffer.keySet()) {

            if (termCache == null) {
                termCache = termDocFreqBuffer;
            } else {
                termCache.putAll(termDocFreqBuffer);
            }

            indexReader.addTermEnumCache(termKey, this);
        }

        // cache the initial term too
        indexReader.addTermEnumCache(skipTo, this);

        termBuffer = termDocFreqBuffer.keySet().toArray(new Term[] {});
        termPosition = 0;

        long end = System.currentTimeMillis();

        logger.debug("loadTerms: " + startTerm + " (" + termBuffer.length + ") took " + (end - start) + "ms");
    }
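    /**
     * Loads the postings for a single term, restricted to the given document
     * numbers (each docNum names a column in the term's row), bypassing the
     * chunked range scan above.
     */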
    void loadFilteredTerms(Term term, List<String> docNums) {

        long start = System.currentTimeMillis();

        ColumnParent parent = new ColumnParent();
        parent.setColumn_family(CassandraUtils.termVecColumnFamily);

        String key = CassandraUtils.hashKey(indexName + CassandraUtils.delimeter + CassandraUtils.createColumnName(term));

        SlicePredicate slicePredicate = new SlicePredicate();

        for (String docNum : docNums) {
            slicePredicate.addToColumn_names(docNum.getBytes());
        }

        List<ColumnOrSuperColumn> columnsList = null;

        try {
            columnsList = client.get_slice(CassandraUtils.keySpace, key, parent, slicePredicate, ConsistencyLevel.ONE);
        } catch (InvalidRequestException e) {
            throw new RuntimeException(e);
        } catch (UnavailableException e) {
            throw new RuntimeException(e);
        } catch (TimedOutException e) {
            throw new RuntimeException(e);
        } catch (TException e) {
            throw new RuntimeException(e);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }

        termBuffer = new Term[0];

        if (columnsList != null && columnsList.size() > 0) {
            termBuffer = new Term[1];
            termBuffer[0] = term;

            termDocFreqBuffer = new TreeMap<Term, List<ColumnOrSuperColumn>>();
            termDocFreqBuffer.put(term, columnsList);
        }

        long end = System.currentTimeMillis();

        logger.debug("loadFilteredTerms: " + term + " (" + termBuffer.length + ") took " + (end - start) + "ms");
    }

    public final List<ColumnOrSuperColumn> getTermDocFreq() {

        if (termBuffer.length == 0)
            return null;

        List<ColumnOrSuperColumn> termDocs = termDocFreqBuffer.get(termBuffer[termPosition]);

        int size = termDocs.size();

        // Create proper docIds. Make sure these ids are sorted in ascending
        // order, since Lucene requires this.
        int[] docIds = new int[size];
        int idx = 0;
        List<ColumnOrSuperColumn> sortedTermDocs = new ArrayList<ColumnOrSuperColumn>(termDocs.size());
        Map<Integer, ColumnOrSuperColumn> termDocMap = new HashMap<Integer, ColumnOrSuperColumn>();

        for (ColumnOrSuperColumn col : termDocs) {
            int docId = indexReader.addDocument(col.getSuper_column(), currentField);
            termDocMap.put(docId, col);
            docIds[idx++] = docId;
        }

        // sort the ids, then emit the columns in that order
        Arrays.sort(docIds);

        for (idx = 0; idx < size; idx++) {
            sortedTermDocs.add(termDocMap.get(docIds[idx]));
        }

        return sortedTermDocs;
    }

    public Set<Term> getCachedTerms() {
        return termCache.keySet();
    }
}
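/*
 * A minimal usage sketch (illustrative only; obtaining a wired-up
 * lucandra.IndexReader named "reader" is outside the scope of this file).
 * skipTo() positions the enum at the first matching term, so the classic
 * do/while iteration applies:
 *
 *   LucandraTermEnum terms = new LucandraTermEnum(reader);
 *   if (terms.skipTo(new Term("body", ""))) {
 *       do {
 *           Term t = terms.term();
 *           // ... consume t ...
 *       } while (terms.next());
 *   }
 */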