/**
 * Copyright 2009 T Jake Luciani
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package lucandra;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.cassandra.thrift.Cassandra;
import org.apache.cassandra.thrift.ColumnOrSuperColumn;
import org.apache.cassandra.thrift.ColumnParent;
import org.apache.cassandra.thrift.ConsistencyLevel;
import org.apache.cassandra.thrift.InvalidRequestException;
import org.apache.cassandra.thrift.KeySlice;
import org.apache.cassandra.thrift.SlicePredicate;
import org.apache.cassandra.thrift.SliceRange;
import org.apache.cassandra.thrift.TimedOutException;
import org.apache.cassandra.thrift.UnavailableException;
import org.apache.log4j.Logger;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.thrift.TException;

/**
 * A Lucene TermEnum backed by Cassandra: terms are read lazily, in chunks,
 * from the term-vector column family and cached via the owning IndexReader.
 *
 * @author jake
 */
public class LucandraTermEnum extends TermEnum {

    private final IndexReader indexReader;
    private final String indexName;

    private int termPosition;
    private Term[] termBuffer;
    private SortedMap<Term, List<ColumnOrSuperColumn>> termDocFreqBuffer;
    private SortedMap<Term, List<ColumnOrSuperColumn>> termCache;

    // number of sequential terms to read initially
    private final int maxInitSize = 2;
    private final int maxChunkSize = 1024;
    private int actualInitSize = -1;
    private Term initTerm = null;
    private Term chunkBoundaryTerm;
    private String currentField = null;
    private int chunkCount = 0;

    private final Cassandra.Iface client;
    private final Term finalTerm = new Term(CassandraUtils.delimeter, CassandraUtils.finalToken);

    private static final Logger logger = Logger.getLogger(LucandraTermEnum.class);

    public LucandraTermEnum(IndexReader indexReader) {
        this.indexReader = indexReader;
        this.indexName = indexReader.getIndexName();
        this.client = indexReader.getClient();
        this.termPosition = 0;
    }
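    /**
     * Positions the enum at the first term equal to or greater than the
     * supplied term, loading a chunk of terms from Cassandra if needed.
     * Returns false when no such term exists.
     */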
    public boolean skipTo(Term term) throws IOException {

        if (term == null)
            return false;

        loadTerms(term);

        currentField = term.field();

        return termBuffer.length != 0;
    }

    @Override
    public void close() throws IOException {

    }

    @Override
    public int docFreq() {
        return termDocFreqBuffer.size();
    }

    @Override
    public boolean next() throws IOException {

        if (termBuffer == null) {
            skipTo(new Term(""));
        }

        termPosition++;

        boolean hasNext = termPosition < termBuffer.length;

        // skip the synthetic final term appended by loadTerms()
        if (hasNext && termBuffer[termPosition].equals(finalTerm)) {
            termPosition++;
            hasNext = termPosition < termBuffer.length;
        }

        if (!hasNext) {

            // if we've already done the initial load, try grabbing more
            if ((chunkCount == 1 && actualInitSize == maxInitSize) || (chunkCount > 1 && actualInitSize == maxChunkSize)) {
                loadTerms(chunkBoundaryTerm);
                hasNext = termBuffer.length > 0;
            } else if ((chunkCount == 1 && actualInitSize < maxInitSize) || (chunkCount > 1 && actualInitSize < maxChunkSize)) {
                hasNext = false;
                loadTerms(initTerm); // start over at the top (for facets)
            }

            termPosition = 0;
        }

        return hasNext;
    }

    @Override
    public Term term() {
        return termBuffer[termPosition];
    }

    private void loadTerms(Term skipTo) {

        if (initTerm == null)
            initTerm = skipTo;

        // choose the starting term
        String startTerm = CassandraUtils.hashKey(indexName + CassandraUtils.delimeter + CassandraUtils.createColumnName(skipTo));

        // ending term. For the initial query we don't care, since we only
        // pull maxInitSize terms.
        String endTerm = "";

        // the boundary condition for this search: currently the end of the field
        String boundaryTerm = CassandraUtils.hashKey(indexName + CassandraUtils.delimeter
                + CassandraUtils.createColumnName(skipTo.field(), CassandraUtils.finalToken));

        if ((!skipTo.equals(chunkBoundaryTerm) || termPosition == 0) && termCache != null) {
            termDocFreqBuffer = termCache.subMap(skipTo, termCache.lastKey());
        } else {
            termDocFreqBuffer = null;
        }

        if (termDocFreqBuffer != null) {

            termBuffer = termDocFreqBuffer.keySet().toArray(new Term[] {});
            termPosition = 0;

            logger.debug("Found " + startTerm + " in cache");
            return;
        } else if (chunkCount > 1 && actualInitSize < maxChunkSize) {

            // include the last term
            if (skipTo.equals(chunkBoundaryTerm) && termCache.containsKey(skipTo)) {
                termBuffer = new Term[] { skipTo };
                termDocFreqBuffer = termCache.subMap(skipTo, termCache.lastKey());
            } else {
                termBuffer = new Term[] {};
            }

            termPosition = 0;
            return; // done!
        }
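        // Cache miss: fetch the next chunk of term rows from Cassandra. The
        // first pass reads only maxInitSize keys; subsequent passes resume at
        // chunkBoundaryTerm and read up to maxChunkSize keys, bounded by the
        // end-of-field marker (this relies on the order-preserving partitioner).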
        chunkCount++;

        // the first time through we grab just a few keys
        int count = maxInitSize;

        // otherwise we grab all the rest of the keys
        if (chunkBoundaryTerm != null) {
            count = maxChunkSize;

            startTerm = CassandraUtils.hashKey(indexName + CassandraUtils.delimeter + CassandraUtils.createColumnName(chunkBoundaryTerm));

            // after the first pass use the boundary term, since we know on
            // pass 2 we are using the OPP
            endTerm = boundaryTerm;
        }

        long start = System.currentTimeMillis();

        termDocFreqBuffer = new TreeMap<Term, List<ColumnOrSuperColumn>>();

        ColumnParent columnParent = new ColumnParent(CassandraUtils.termVecColumnFamily);
        SlicePredicate slicePredicate = new SlicePredicate();

        // get all columns
        SliceRange sliceRange = new SliceRange(new byte[] {}, new byte[] {}, true, Integer.MAX_VALUE);
        slicePredicate.setSlice_range(sliceRange);

        List<KeySlice> columns;

        try {
            columns = client.get_range_slice(CassandraUtils.keySpace, columnParent, slicePredicate, startTerm, endTerm, count,
                    ConsistencyLevel.ONE);
        } catch (InvalidRequestException e) {
            throw new RuntimeException(e);
        } catch (TException e) {
            throw new RuntimeException(e);
        } catch (UnavailableException e) {
            throw new RuntimeException(e);
        } catch (TimedOutException e) {
            throw new RuntimeException(e);
        }

        // term to start with next time
        actualInitSize = columns.size();
        logger.debug("Found " + columns.size() + " keys in range: " + startTerm + " to " + endTerm + " in "
                + (System.currentTimeMillis() - start) + "ms");

        if (actualInitSize > 0) {
            for (KeySlice entry : columns) {

                // term keys look like wikipedia/body/wiki
                String termStr = entry.getKey().substring(
                        entry.getKey().indexOf(CassandraUtils.delimeter) + CassandraUtils.delimeter.length());
                Term term = CassandraUtils.parseTerm(termStr);

                logger.debug(termStr + " has " + entry.getColumns().size());

                // check for tombstoned keys and for keys from another field or
                // index (possible with the random partitioner)
                if (entry.getColumns().size() > 0
                        && term.field().equals(skipTo.field())
                        && entry.getKey().equals(
                                CassandraUtils.hashKey(indexName + CassandraUtils.delimeter + term.field()
                                        + CassandraUtils.delimeter + term.text())))
                    termDocFreqBuffer.put(term, entry.getColumns());
            }

            if (!termDocFreqBuffer.isEmpty()) {
                chunkBoundaryTerm = termDocFreqBuffer.lastKey();
            }
        }

        // add a final key (excluded by the subMap calls when served from cache)
        termDocFreqBuffer.put(finalTerm, null);

        // populate the cache
        for (Term termKey : termDocFreqBuffer.keySet()) {

            if (termCache == null) {
                termCache = termDocFreqBuffer;
            } else {
                termCache.putAll(termDocFreqBuffer);
            }

            indexReader.addTermEnumCache(termKey, this);
        }

        // cache the initial term too
        indexReader.addTermEnumCache(skipTo, this);

        termBuffer = termDocFreqBuffer.keySet().toArray(new Term[] {});
        termPosition = 0;

        long end = System.currentTimeMillis();

        logger.debug("loadTerms: " + startTerm + " (" + termBuffer.length + ") took " + (end - start) + "ms");
    }
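    /**
     * Loads the postings for a single term, restricted to the given document
     * numbers (each docNum names a column in the term's row), bypassing the
     * chunked range scan above.
     */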
    void loadFilteredTerms(Term term, List<String> docNums) {

        long start = System.currentTimeMillis();

        ColumnParent parent = new ColumnParent();
        parent.setColumn_family(CassandraUtils.termVecColumnFamily);

        String key = CassandraUtils.hashKey(indexName + CassandraUtils.delimeter + CassandraUtils.createColumnName(term));

        SlicePredicate slicePredicate = new SlicePredicate();

        for (String docNum : docNums) {
            slicePredicate.addToColumn_names(docNum.getBytes());
        }

        List<ColumnOrSuperColumn> columnsList = null;

        try {
            columnsList = client.get_slice(CassandraUtils.keySpace, key, parent, slicePredicate, ConsistencyLevel.ONE);
        } catch (InvalidRequestException e) {
            throw new RuntimeException(e);
        } catch (UnavailableException e) {
            throw new RuntimeException(e);
        } catch (TimedOutException e) {
            throw new RuntimeException(e);
        } catch (TException e) {
            throw new RuntimeException(e);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }

        termBuffer = new Term[0];

        if (columnsList != null && columnsList.size() > 0) {
            termBuffer = new Term[1];
            termBuffer[0] = term;

            termDocFreqBuffer = new TreeMap<Term, List<ColumnOrSuperColumn>>();
            termDocFreqBuffer.put(term, columnsList);
        }

        long end = System.currentTimeMillis();

        logger.debug("loadFilteredTerms: " + term + " (" + termBuffer.length + ") took " + (end - start) + "ms");
    }

    public final List<ColumnOrSuperColumn> getTermDocFreq() {

        if (termBuffer.length == 0)
            return null;

        List<ColumnOrSuperColumn> termDocs = termDocFreqBuffer.get(termBuffer[termPosition]);

        int size = termDocs.size();

        // Create proper docIds. Make sure these ids are sorted in ascending
        // order, since Lucene requires this.
        int[] docIds = new int[size];
        int idx = 0;
        List<ColumnOrSuperColumn> sortedTermDocs = new ArrayList<ColumnOrSuperColumn>(termDocs.size());
        Map<Integer, ColumnOrSuperColumn> termDocMap = new HashMap<Integer, ColumnOrSuperColumn>();

        for (ColumnOrSuperColumn col : termDocs) {
            int docId = indexReader.addDocument(col.getSuper_column(), currentField);
            termDocMap.put(docId, col);
            docIds[idx++] = docId;
        }

        // sort the ids, then emit the columns in that order
        Arrays.sort(docIds);

        for (idx = 0; idx < size; idx++) {
            sortedTermDocs.add(termDocMap.get(docIds[idx]));
        }

        return sortedTermDocs;
    }

    public Set<Term> getCachedTerms() {
        return termCache.keySet();
    }
}
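/*
 * A minimal usage sketch (illustrative only; obtaining a wired-up
 * lucandra.IndexReader named "reader" is outside the scope of this file).
 * skipTo() positions the enum at the first matching term, so the classic
 * do/while iteration applies:
 *
 *   LucandraTermEnum terms = new LucandraTermEnum(reader);
 *   if (terms.skipTo(new Term("body", ""))) {
 *       do {
 *           Term t = terms.term();
 *           // ... consume t ...
 *       } while (terms.next());
 *   }
 */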