/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.search.facet;

import java.io.Closeable;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiPostingsEnum;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.HashDocSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SortedIntDocSet;

/**
 * Enumerates indexed terms in order in a streaming fashion.
 * It's able to stream since no data needs to be accumulated so long as it's in index order.
 */
class FacetFieldProcessorByEnumTermsStream extends FacetFieldProcessor implements Closeable {
  long bucketsToSkip;
  long bucketsReturned;

  boolean closed;
  boolean countOnly;
  boolean hasSubFacets;  // true if there are subfacets
  int minDfFilterCache;
  DocSet docs;
  DocSet fastForRandomSet;
  TermsEnum termsEnum = null;
  SolrIndexSearcher.DocsEnumState deState = null;
  PostingsEnum postingsEnum;
  BytesRef startTermBytes;
  BytesRef term;
  LeafReaderContext[] leaves;

  FacetFieldProcessorByEnumTermsStream(FacetContext fcontext, FacetField freq, SchemaField sf) {
    super(fcontext, freq, sf);
  }

  @Override
  public void close() throws IOException {
    if (!closed) {
      closed = true;
      // fcontext.base.decref();  // OFF-HEAP
    }
  }

  @Override
  public void process() throws IOException {
    super.process();

    // We need to keep the fcontext open after processing is done (since we will be streaming in the response writer).
    // But if the connection is broken, we want to clean up.
    // fcontext.base.incref();  // OFF-HEAP
    fcontext.qcontext.addCloseHook(this);

    setup();
    response = new SimpleOrderedMap<>();
    response.add("buckets", new Iterator() {
      boolean retrieveNext = true;
      Object val;

      @Override
      public boolean hasNext() {
        if (retrieveNext) {
          val = nextBucket();
        }
        retrieveNext = false;
        return val != null;
      }

      @Override
      public Object next() {
        if (retrieveNext) {
          val = nextBucket();
        }
        retrieveNext = true;
        if (val == null) {
          // Last value, so clean up.  In the case that we are doing streaming facets within streaming facets,
          // the number of close hooks could grow very large, so we want to remove ourselves.
          boolean removed = fcontext.qcontext.removeCloseHook(FacetFieldProcessorByEnumTermsStream.this);
          assert removed;
          try {
            close();
          } catch (IOException e) {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Error during facet streaming close", e);
          }
        }
        return val;
      }

      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }
    });
  }

  private void setup() throws IOException {
    countOnly = freq.facetStats.size() == 0 || freq.facetStats.values().iterator().next() instanceof CountAgg;
    hasSubFacets = freq.subFacets.size() > 0;
    bucketsToSkip = freq.offset;

    createAccs(-1, 1);

    // Minimum term docFreq in order to use the filterCache for that term.
    if (freq.cacheDf == -1) { // -1 means never cache
      minDfFilterCache = Integer.MAX_VALUE;
    } else if (freq.cacheDf == 0) { // default; compute as fraction of maxDoc
      minDfFilterCache = Math.max(fcontext.searcher.maxDoc() >> 4, 3);  // (minimum of 3 is for test coverage purposes)
    } else {
      minDfFilterCache = freq.cacheDf;
    }

    docs = fcontext.base;
    fastForRandomSet = null;

    if (freq.prefix != null) {
      String indexedPrefix = sf.getType().toInternal(freq.prefix);
      startTermBytes = new BytesRef(indexedPrefix);
    } else if (sf.getType().getNumberType() != null) {
      String triePrefix = TrieField.getMainValuePrefix(sf.getType());
      if (triePrefix != null) {
        startTermBytes = new BytesRef(triePrefix);
      }
    }

    Fields fields = fcontext.searcher.getSlowAtomicReader().fields();
    Terms terms = fields == null ? null : fields.terms(sf.getName());

    termsEnum = null;
    deState = null;
    term = null;

    if (terms != null) {
      termsEnum = terms.iterator();

      // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
      // facet.offset when sorting by index order.

      if (startTermBytes != null) {
        if (termsEnum.seekCeil(startTermBytes) == TermsEnum.SeekStatus.END) {
          termsEnum = null;
        } else {
          term = termsEnum.term();
        }
      } else {
        // position termsEnum on first term
        term = termsEnum.next();
      }
    }

    List<LeafReaderContext> leafList = fcontext.searcher.getTopReaderContext().leaves();
    leaves = leafList.toArray(new LeafReaderContext[leafList.size()]);
  }

  private SimpleOrderedMap<Object> nextBucket() {
    try {
      return _nextBucket();
    } catch (Exception e) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Error during facet streaming", e);
    }
  }

  private SimpleOrderedMap<Object> _nextBucket() throws IOException {
    DocSet termSet = null;

    try {
      while (term != null) {

        if (startTermBytes != null && !StringHelper.startsWith(term, startTermBytes)) {
          break;
        }

        int df = termsEnum.docFreq();
        if (df < effectiveMincount) {
          term = termsEnum.next();
          continue;
        }

        if (termSet != null) {
          // termSet.decref(); // OFF-HEAP
          termSet = null;
        }

        int c = 0;

        if (hasSubFacets || df >= minDfFilterCache) {
          // use the filter cache

          if (deState == null) {
            deState = new SolrIndexSearcher.DocsEnumState();
            deState.fieldName = sf.getName();
            deState.liveDocs = fcontext.searcher.getSlowAtomicReader().getLiveDocs();
            deState.termsEnum = termsEnum;
            deState.postingsEnum = postingsEnum;
            deState.minSetSizeCached = minDfFilterCache;
          }

          if (hasSubFacets || !countOnly) {
            DocSet termsAll = fcontext.searcher.getDocSet(deState);
            termSet = docs.intersection(termsAll);
            // termsAll.decref(); // OFF-HEAP
            c = termSet.size();
          } else {
            c = fcontext.searcher.numDocs(docs, deState);
          }
          postingsEnum = deState.postingsEnum;

          resetStats();

          if (!countOnly) {
            collect(termSet, 0);
          }

        } else {
          // We don't need the docset here (meaning no sub-facets).
          // if countOnly, then we are calculating some other stats...
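          // This term's docFreq is below minDfFilterCache and there are no sub-facets,
          // so iterate the postings directly and test each doc against fastForRandomSet
          // rather than building a DocSet through the filterCache.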
          resetStats();

          // lazy convert to fastForRandomSet
          if (fastForRandomSet == null) {
            fastForRandomSet = docs;
            if (docs instanceof SortedIntDocSet) {  // OFF-HEAP todo: also check for native version
              SortedIntDocSet sset = (SortedIntDocSet) docs;
              fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
            }
          }

          // iterate over TermDocs to calculate the intersection
          postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);

          if (postingsEnum instanceof MultiPostingsEnum) {
            MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs();
            int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
            for (int subindex = 0; subindex < numSubs; subindex++) {
              MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
              if (sub.postingsEnum == null) continue;
              int base = sub.slice.start;
              int docid;

              if (countOnly) {
                while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                  if (fastForRandomSet.exists(docid + base)) c++;
                }
              } else {
                setNextReader(leaves[sub.slice.readerIndex]);
                while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                  if (fastForRandomSet.exists(docid + base)) {
                    c++;
                    collect(docid, 0);
                  }
                }
              }
            }
          } else {
            int docid;
            if (countOnly) {
              while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                if (fastForRandomSet.exists(docid)) c++;
              }
            } else {
              setNextReader(leaves[0]);
              while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                if (fastForRandomSet.exists(docid)) {
                  c++;
                  collect(docid, 0);
                }
              }
            }
          }
        }

        if (c < effectiveMincount) {
          term = termsEnum.next();
          continue;
        }

        // handle offset and limit
        if (bucketsToSkip > 0) {
          bucketsToSkip--;
          term = termsEnum.next();
          continue;
        }

        if (freq.limit >= 0 && ++bucketsReturned > freq.limit) {
          return null;
        }

        // set count in case other stats depend on it
        countAcc.incrementCount(0, c);

        // OK, we have a good bucket to return... first get bucket value before moving to next term
        Object bucketVal = sf.getType().toObject(sf, term);
        TermQuery bucketQuery = hasSubFacets ? new TermQuery(new Term(freq.field, term)) : null;
        term = termsEnum.next();

        SimpleOrderedMap<Object> bucket = new SimpleOrderedMap<>();
        bucket.add("val", bucketVal);
        addStats(bucket, 0);
        if (hasSubFacets) {
          processSubs(bucket, bucketQuery, termSet, false, null);
        }

        // TODO... termSet needs to stick around for streaming sub-facets?

        return bucket;
      }

    } finally {
      if (termSet != null) {
        // termSet.decref();  // OFF-HEAP
        termSet = null;
      }
    }

    // end of the iteration
    return null;
  }

}