package org.apache.solr.request;

import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.packed.Direct16;
import org.apache.lucene.util.packed.Direct32;
import org.apache.lucene.util.packed.Direct8;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.schema.FieldType;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrIndexReader;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.BoundedTreeSet;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.*;

/**
 * Computes term facet counts for a single-valued field one index segment
 * at a time (optionally in parallel, if the supplied Executor has more than
 * one thread), then merges the per-segment counts.
 */
class PerSegmentSingleValuedFaceting {

  // input params
  SolrIndexSearcher searcher;
  DocSet docs;
  String fieldName;
  int offset;
  int limit;
  int mincount;
  boolean missing;
  String sort;
  String prefix;

  Filter baseSet;

  int nThreads;

  public PerSegmentSingleValuedFaceting(SolrIndexSearcher searcher,
                                        DocSet docs,
                                        String fieldName,
                                        int offset,
                                        int limit,
                                        int mincount,
                                        boolean missing,
                                        String sort,
                                        String prefix) {
    this.searcher = searcher;
    this.docs = docs;
    this.fieldName = fieldName;
    this.offset = offset;
    this.limit = limit;
    this.mincount = mincount;
    this.missing = missing;
    this.sort = sort;
    this.prefix = prefix;
  }

  public void setNumThreads(int threads) {
    nThreads = threads;
  }


  NamedList getFacetCounts(Executor executor) throws IOException {

    CompletionService<SegFacet> completionService = new ExecutorCompletionService<SegFacet>(executor);

    // reuse the translation logic to go from top level set to per-segment set
    baseSet = docs.getTopFilter();

    SolrIndexReader topReader = searcher.getReader();
    final SolrIndexReader[] leafReaders = topReader.getLeafReaders();
    int[] offsets = topReader.getLeafOffsets();

    // The list of pending tasks that aren't immediately submitted
    // TODO: Is there a completion service, or a delegating executor that can
    // limit the number of concurrent tasks submitted to a bigger executor?
    LinkedList<Callable<SegFacet>> pending = new LinkedList<Callable<SegFacet>>();

    int threads = nThreads <= 0 ? Integer.MAX_VALUE : nThreads;

    for (int i=0; i<leafReaders.length; i++) {
      final SegFacet segFacet = new SegFacet(leafReaders[i], offsets[i]);

      Callable<SegFacet> task = new Callable<SegFacet>() {
        public SegFacet call() throws Exception {
          segFacet.countTerms();
          return segFacet;
        }
      };

      // TODO: if limiting threads, submit by largest segment first?
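      // Only the first `threads` tasks are handed to the executor right away;
      // the rest wait in `pending` and are submitted one-for-one as earlier
      // segments complete (see the merge loop below).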
      if (--threads >= 0) {
        completionService.submit(task);
      } else {
        pending.add(task);
      }
    }


    // now merge the per-segment results
    PriorityQueue<SegFacet> queue = new PriorityQueue<SegFacet>() {
      {
        initialize(leafReaders.length);
      }
      @Override
      protected boolean lessThan(SegFacet a, SegFacet b) {
        return a.tempBR.compareTo(b.tempBR) < 0;
      }
    };

    boolean hasMissingCount=false;
    int missingCount=0;
    for (int i=0; i<leafReaders.length; i++) {
      SegFacet seg = null;

      try {
        Future<SegFacet> future = completionService.take();
        seg = future.get();
        if (!pending.isEmpty()) {
          completionService.submit(pending.removeFirst());
        }
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      } catch (ExecutionException e) {
        Throwable cause = e.getCause();
        if (cause instanceof RuntimeException) {
          throw (RuntimeException)cause;
        } else {
          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
              "Error in per-segment faceting on field: " + fieldName, cause);
        }
      }

      if (seg.startTermIndex < seg.endTermIndex) {
        if (seg.startTermIndex==0) {
          hasMissingCount=true;
          missingCount += seg.counts[0];
          seg.pos = 1;
        } else {
          seg.pos = seg.startTermIndex;
        }
        if (seg.pos < seg.endTermIndex) {
          seg.tenum = seg.si.getTermsEnum();
          seg.tenum.seek(seg.pos);
          seg.tempBR = seg.tenum.term();
          queue.add(seg);
        }
      }
    }

    FacetCollector collector;
    if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
      collector = new CountSortedFacetCollector(offset, limit, mincount);
    } else {
      collector = new IndexSortedFacetCollector(offset, limit, mincount);
    }

    BytesRef val = new BytesRef();
    while (queue.size() > 0) {
      SegFacet seg = queue.top();

      // make a shallow copy
      val.bytes = seg.tempBR.bytes;
      val.offset = seg.tempBR.offset;
      val.length = seg.tempBR.length;

      int count = 0;
      do {
        count += seg.counts[seg.pos - seg.startTermIndex];

        // TODO: OPTIMIZATION...
        // if mincount>0 then seg.pos++ can skip ahead to the next non-zero entry.
        seg.pos++;
        if (seg.pos >= seg.endTermIndex) {
          queue.pop();
          seg = queue.top();
        } else {
          seg.tempBR = seg.tenum.next();
          seg = queue.updateTop();
        }
      } while (seg != null && val.compareTo(seg.tempBR) == 0);

      boolean stop = collector.collect(val, count);
      if (stop) break;
    }

    NamedList res = collector.getFacetCounts();

    // convert labels to readable form
    FieldType ft = searcher.getSchema().getFieldType(fieldName);
    int sz = res.size();
    for (int i=0; i<sz; i++) {
      res.setName(i, ft.indexedToReadable(res.getName(i)));
    }

    if (missing) {
      if (!hasMissingCount) {
        missingCount = SimpleFacets.getFieldMissingCount(searcher, docs, fieldName);
      }
      res.add(null, missingCount);
    }

    return res;
  }


  class SegFacet {
    SolrIndexReader reader;
    int readerOffset;

    SegFacet(SolrIndexReader reader, int readerOffset) {
      this.reader = reader;
      this.readerOffset = readerOffset;
    }

    FieldCache.DocTermsIndex si;
    int startTermIndex;
    int endTermIndex;
    int[] counts;

    int pos;          // only used when merging
    TermsEnum tenum;  // only used when merging
    BytesRef tempBR = new BytesRef();

    void countTerms() throws IOException {
      si = FieldCache.DEFAULT.getTermsIndex(reader, fieldName);
      // SolrCore.log.info("reader= " + reader + " FC=" + System.identityHashCode(si));

      if (prefix!=null) {
        startTermIndex = si.binarySearchLookup(new BytesRef(prefix), tempBR);
        if (startTermIndex<0) startTermIndex=-startTermIndex-1;
        // find the end term.  \uffff isn't a legal unicode char, but only compareTo
        // is used, so it should be fine, and is guaranteed to be bigger than legal chars.
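        // For example, with prefix "ab" the two lookups bracket the range:
        // every indexed term starting with "ab" sorts at or after "ab" and
        // before "ab\uffff\uffff\uffff\uffff", so the ords in
        // [startTermIndex, endTermIndex) are exactly the prefix matches.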
        // TODO: switch to binarySearch version that takes start/end in Java6
        endTermIndex = si.binarySearchLookup(new BytesRef(prefix+"\uffff\uffff\uffff\uffff"), tempBR);
        assert endTermIndex < 0;
        endTermIndex = -endTermIndex-1;
      } else {
        startTermIndex=0;
        endTermIndex=si.numOrd();
      }

      final int nTerms=endTermIndex-startTermIndex;
      if (nTerms>0) {
        // count collection array only needs to be as big as the number of terms we are
        // going to collect counts for.
        final int[] counts = this.counts = new int[nTerms];

        DocIdSet idSet = baseSet.getDocIdSet(reader);
        DocIdSetIterator iter = idSet.iterator();

        // Specialize the counting loop when the FieldCache ords are backed by
        // a plain array (Direct8/16/32), avoiding a virtual getOrd() per doc.
        PackedInts.Reader ordReader = si.getDocToOrd();
        int doc;

        if (ordReader instanceof Direct32) {
          int[] ords = ((Direct32)ordReader).getArray();
          if (prefix==null) {
            while ((doc = iter.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) {
              counts[ords[doc]]++;
            }
          } else {
            while ((doc = iter.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) {
              int term = ords[doc];
              int arrIdx = term-startTermIndex;
              if (arrIdx>=0 && arrIdx<nTerms) counts[arrIdx]++;
            }
          }
        } else if (ordReader instanceof Direct16) {
          short[] ords = ((Direct16)ordReader).getArray();
          if (prefix==null) {
            while ((doc = iter.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) {
              counts[ords[doc] & 0xffff]++;
            }
          } else {
            while ((doc = iter.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) {
              int term = ords[doc] & 0xffff;
              int arrIdx = term-startTermIndex;
              if (arrIdx>=0 && arrIdx<nTerms) counts[arrIdx]++;
            }
          }
        } else if (ordReader instanceof Direct8) {
          byte[] ords = ((Direct8)ordReader).getArray();
          if (prefix==null) {
            while ((doc = iter.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) {
              counts[ords[doc] & 0xff]++;
            }
          } else {
            while ((doc = iter.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) {
              int term = ords[doc] & 0xff;
              int arrIdx = term-startTermIndex;
              if (arrIdx>=0 && arrIdx<nTerms) counts[arrIdx]++;
            }
          }
        } else {
          if (prefix==null) {
            // specialized version when collecting counts for all terms
            while ((doc = iter.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) {
              counts[si.getOrd(doc)]++;
            }
          } else {
            // version that adjusts term numbers because we aren't collecting the full range
            while ((doc = iter.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) {
              int term = si.getOrd(doc);
              int arrIdx = term-startTermIndex;
              if (arrIdx>=0 && arrIdx<nTerms) counts[arrIdx]++;
            }
          }
        }
      }
    }
  }
}


abstract class FacetCollector {
  /*** return true to stop collection */
  public abstract boolean collect(BytesRef term, int count);
  public abstract NamedList getFacetCounts();
}


// This collector expects facets to be collected in index order
class CountSortedFacetCollector extends FacetCollector {
  final int offset;
  final int limit;
  final int maxsize;
  final BoundedTreeSet<SimpleFacets.CountPair<String,Integer>> queue;

  int min;  // the smallest value in the top 'N' values

  public CountSortedFacetCollector(int offset, int limit, int mincount) {
    this.offset = offset;
    this.limit = limit;
    maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
    queue = new BoundedTreeSet<SimpleFacets.CountPair<String,Integer>>(maxsize);
    min=mincount-1;  // the smallest value in the top 'N' values
  }

  @Override
  public boolean collect(BytesRef term, int count) {
    if (count > min) {
      // NOTE: we use c>min rather than c>=min as an optimization because we are going in
      // index order, so we already know that the keys are ordered.  This can be very
      // important if a lot of the counts are repeated (like zero counts would be).
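      // For example, with mincount=1 min starts at 0, so zero-count terms are
      // never enqueued; once the queue reaches maxsize, min becomes the
      // smallest retained count and only strictly larger counts are added.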
      queue.add(new SimpleFacets.CountPair<String,Integer>(term.utf8ToString(), count));
      if (queue.size()>=maxsize) min=queue.last().val;
    }
    return false;
  }

  @Override
  public NamedList getFacetCounts() {
    NamedList res = new NamedList();
    int off=offset;
    int lim=limit>=0 ? limit : Integer.MAX_VALUE;
    // now select the right page from the results
    for (SimpleFacets.CountPair<String,Integer> p : queue) {
      if (--off>=0) continue;
      if (--lim<0) break;
      res.add(p.key, p.val);
    }
    return res;
  }
}


// This collector expects facets to be collected in index order
class IndexSortedFacetCollector extends FacetCollector {
  int offset;
  int limit;
  final int mincount;
  final NamedList res = new NamedList();

  public IndexSortedFacetCollector(int offset, int limit, int mincount) {
    this.offset = offset;
    this.limit = limit>0 ? limit : Integer.MAX_VALUE;
    this.mincount = mincount;
  }

  @Override
  public boolean collect(BytesRef term, int count) {
    if (count < mincount) {
      return false;
    }

    if (offset > 0) {
      offset--;
      return false;
    }

    if (limit > 0) {
      res.add(term.utf8ToString(), count);
      limit--;
    }

    return limit <= 0;
  }

  @Override
  public NamedList getFacetCounts() {
    return res;
  }
}
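
// Usage sketch (illustrative only -- a caller such as SimpleFacets drives this
// class roughly as follows; the executor construction here is an assumption,
// not part of this file, and any appropriately sized Executor will do):
//
//   Executor executor = Executors.newFixedThreadPool(4);
//   PerSegmentSingleValuedFaceting faceter =
//       new PerSegmentSingleValuedFaceting(searcher, docs, fieldName, offset,
//           limit, mincount, missing, sort, prefix);
//   faceter.setNumThreads(threads);  // <= 0 means no limit on concurrent segments
//   NamedList counts = faceter.getFacetCounts(executor);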