/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.search.facet; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.function.BiPredicate; import java.util.function.Function; import java.util.function.IntFunction; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.Query; import org.apache.lucene.util.PriorityQueue; import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.schema.FieldType; import org.apache.solr.schema.SchemaField; import org.apache.solr.search.DocSet; import static org.apache.solr.search.facet.FacetContext.SKIP_FACET; /** * Facet processing based on field values. (not range nor by query) * @see FacetField */ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> { SchemaField sf; SlotAcc indexOrderAcc; int effectiveMincount; Map<String,AggValueSource> deferredAggs; // null if none // TODO: push any of this down to base class? // // For sort="x desc", collectAcc would point to "x", and sortAcc would also point to "x". // collectAcc would be used to accumulate all buckets, and sortAcc would be used to sort those buckets. // SlotAcc collectAcc; // Accumulator to collect across entire domain (in addition to the countAcc). May be null. SlotAcc sortAcc; // Accumulator to use for sorting *only* (i.e. not used for collection). May be an alias of countAcc, collectAcc, or indexOrderAcc SlotAcc[] otherAccs; // Accumulators that do not need to be calculated across all buckets. SpecialSlotAcc allBucketsAcc; // this can internally refer to otherAccs and/or collectAcc. setNextReader should be called on otherAccs directly if they exist. FacetFieldProcessor(FacetContext fcontext, FacetField freq, SchemaField sf) { super(fcontext, freq); this.sf = sf; this.effectiveMincount = (int)(fcontext.isShard() ? Math.min(1 , freq.mincount) : freq.mincount); } /** This is used to create accs for second phase (or to create accs for all aggs) */ @Override protected void createAccs(int docCount, int slotCount) throws IOException { if (accMap == null) { accMap = new LinkedHashMap<>(); } // allow a custom count acc to be used if (countAcc == null) { countAcc = new CountSlotArrAcc(fcontext, slotCount); countAcc.key = "count"; } if (accs != null) { // reuse these accs, but reset them first for (SlotAcc acc : accs) { acc.reset(); } return; } else { accs = new SlotAcc[ freq.getFacetStats().size() ]; } int accIdx = 0; for (Map.Entry<String,AggValueSource> entry : freq.getFacetStats().entrySet()) { SlotAcc acc = null; if (slotCount == 1) { acc = accMap.get(entry.getKey()); if (acc != null) { acc.reset(); } } if (acc == null) { acc = entry.getValue().createSlotAcc(fcontext, docCount, slotCount); acc.key = entry.getKey(); accMap.put(acc.key, acc); } accs[accIdx++] = acc; } } void createCollectAcc(int numDocs, int numSlots) throws IOException { accMap = new LinkedHashMap<>(); // we always count... // allow a subclass to set a custom counter. if (countAcc == null) { countAcc = new CountSlotArrAcc(fcontext, numSlots); } if ("count".equals(freq.sortVariable)) { sortAcc = countAcc; deferredAggs = freq.getFacetStats(); } else if ("index".equals(freq.sortVariable)) { // allow subclass to set indexOrderAcc first if (indexOrderAcc == null) { // This sorting accumulator just goes by the slot number, so does not need to be collected // and hence does not need to find it's way into the accMap or accs array. indexOrderAcc = new SortSlotAcc(fcontext); } sortAcc = indexOrderAcc; deferredAggs = freq.getFacetStats(); } else { AggValueSource sortAgg = freq.getFacetStats().get(freq.sortVariable); if (sortAgg != null) { collectAcc = sortAgg.createSlotAcc(fcontext, numDocs, numSlots); collectAcc.key = freq.sortVariable; // TODO: improve this } sortAcc = collectAcc; deferredAggs = new HashMap<>(freq.getFacetStats()); deferredAggs.remove(freq.sortVariable); } if (deferredAggs.size() == 0) { deferredAggs = null; } boolean needOtherAccs = freq.allBuckets; // TODO: use for missing too... if (!needOtherAccs) { // we may need them later, but we don't want to create them now // otherwise we won't know if we need to call setNextReader on them. return; } // create the deferred aggs up front for use by allBuckets createOtherAccs(numDocs, 1); } private void createOtherAccs(int numDocs, int numSlots) throws IOException { if (otherAccs != null) { // reuse existing accumulators for (SlotAcc acc : otherAccs) { acc.reset(); // todo - make reset take numDocs and numSlots? } return; } int numDeferred = deferredAggs == null ? 0 : deferredAggs.size(); if (numDeferred <= 0) return; otherAccs = new SlotAcc[ numDeferred ]; int otherAccIdx = 0; for (Map.Entry<String,AggValueSource> entry : deferredAggs.entrySet()) { AggValueSource agg = entry.getValue(); SlotAcc acc = agg.createSlotAcc(fcontext, numDocs, numSlots); acc.key = entry.getKey(); accMap.put(acc.key, acc); otherAccs[otherAccIdx++] = acc; } if (numDeferred == freq.getFacetStats().size()) { // accs and otherAccs are the same... accs = otherAccs; } } int collectFirstPhase(DocSet docs, int slot) throws IOException { int num = -1; if (collectAcc != null) { num = collectAcc.collect(docs, slot); } if (allBucketsAcc != null) { num = allBucketsAcc.collect(docs, slot); } return num >= 0 ? num : docs.size(); } void collectFirstPhase(int segDoc, int slot) throws IOException { if (collectAcc != null) { collectAcc.collect(segDoc, slot); } if (allBucketsAcc != null) { allBucketsAcc.collect(segDoc, slot); } } /** Processes the collected data to finds the top slots, and composes it in the response NamedList. */ SimpleOrderedMap<Object> findTopSlots(final int numSlots, final int slotCardinality, IntFunction<Comparable> bucketValFromSlotNumFunc, Function<Comparable, String> fieldQueryValFunc) throws IOException { int numBuckets = 0; final int off = fcontext.isShard() ? 0 : (int) freq.offset; long effectiveLimit = Integer.MAX_VALUE; // use max-int instead of max-long to avoid overflow if (freq.limit >= 0) { effectiveLimit = freq.limit; if (fcontext.isShard()) { if (freq.overrequest == -1) { // add over-request if this is a shard request and if we have a small offset (large offsets will already be gathering many more buckets than needed) if (freq.offset < 10) { effectiveLimit = (long) (effectiveLimit * 1.1 + 4); // default: add 10% plus 4 (to overrequest for very small limits) } } else { effectiveLimit += freq.overrequest; } } } final int sortMul = freq.sortDirection.getMultiplier(); int maxTopVals = (int) (effectiveLimit >= 0 ? Math.min(freq.offset + effectiveLimit, Integer.MAX_VALUE - 1) : Integer.MAX_VALUE - 1); maxTopVals = Math.min(maxTopVals, slotCardinality); final SlotAcc sortAcc = this.sortAcc, indexOrderAcc = this.indexOrderAcc; final BiPredicate<Slot,Slot> orderPredicate; if (indexOrderAcc != null && indexOrderAcc != sortAcc) { orderPredicate = (a, b) -> { int cmp = sortAcc.compare(a.slot, b.slot) * sortMul; return cmp == 0 ? (indexOrderAcc.compare(a.slot, b.slot) > 0) : cmp < 0; }; } else { orderPredicate = (a, b) -> { int cmp = sortAcc.compare(a.slot, b.slot) * sortMul; return cmp == 0 ? b.slot < a.slot : cmp < 0; }; } final PriorityQueue<Slot> queue = new PriorityQueue<Slot>(maxTopVals) { @Override protected boolean lessThan(Slot a, Slot b) { return orderPredicate.test(a, b); } }; // note: We avoid object allocation by having a Slot and re-using the 'bottom'. Slot bottom = null; Slot scratchSlot = new Slot(); for (int slotNum = 0; slotNum < numSlots; slotNum++) { // screen out buckets not matching mincount if (effectiveMincount > 0) { int count = countAcc.getCount(slotNum); if (count < effectiveMincount) { if (count > 0) numBuckets++; // Still increment numBuckets as long as we have some count. This is for consistency between distrib and non-distrib mode. continue; } } numBuckets++; if (bottom != null) { scratchSlot.slot = slotNum; // scratchSlot is only used to hold this slotNum for the following line if (orderPredicate.test(bottom, scratchSlot)) { bottom.slot = slotNum; bottom = queue.updateTop(); } } else if (effectiveLimit > 0) { // queue not full Slot s = new Slot(); s.slot = slotNum; queue.add(s); if (queue.size() >= maxTopVals) { bottom = queue.top(); } } } assert queue.size() <= numBuckets; SimpleOrderedMap<Object> res = new SimpleOrderedMap<>(); if (freq.numBuckets) { if (!fcontext.isShard()) { res.add("numBuckets", numBuckets); } else { calculateNumBuckets(res); } } FacetDebugInfo fdebug = fcontext.getDebugInfo(); if (fdebug != null) fdebug.putInfoItem("numBuckets", (long) numBuckets); if (freq.allBuckets) { SimpleOrderedMap<Object> allBuckets = new SimpleOrderedMap<>(); // countAcc.setValues(allBuckets, allBucketsSlot); allBuckets.add("count", allBucketsAcc.getSpecialCount()); allBucketsAcc.setValues(allBuckets, -1); // -1 slotNum is unused for SpecialSlotAcc // allBuckets currently doesn't execute sub-facets (because it doesn't change the domain?) res.add("allBuckets", allBuckets); } if (freq.missing) { // TODO: it would be more efficient to build up a missing DocSet if we need it here anyway. SimpleOrderedMap<Object> missingBucket = new SimpleOrderedMap<>(); fillBucket(missingBucket, getFieldMissingQuery(fcontext.searcher, freq.field), null, false, null); res.add("missing", missingBucket); } // if we are deep paging, we don't have to order the highest "offset" counts. int collectCount = Math.max(0, queue.size() - off); assert collectCount <= maxTopVals; int[] sortedSlots = new int[collectCount]; for (int i = collectCount - 1; i >= 0; i--) { sortedSlots[i] = queue.pop().slot; } ArrayList<SimpleOrderedMap> bucketList = new ArrayList<>(collectCount); res.add("buckets", bucketList); boolean needFilter = deferredAggs != null || freq.getSubFacets().size() > 0; for (int slotNum : sortedSlots) { SimpleOrderedMap<Object> bucket = new SimpleOrderedMap<>(); Comparable val = bucketValFromSlotNumFunc.apply(slotNum); bucket.add("val", val); Query filter = needFilter ? sf.getType().getFieldQuery(null, sf, fieldQueryValFunc.apply(val)) : null; fillBucket(bucket, countAcc.getCount(slotNum), slotNum, null, filter); bucketList.add(bucket); } return res; } private void calculateNumBuckets(SimpleOrderedMap<Object> target) throws IOException { DocSet domain = fcontext.base; if (freq.prefix != null) { Query prefixFilter = sf.getType().getPrefixQuery(null, sf, freq.prefix); domain = fcontext.searcher.getDocSet(prefixFilter, domain); } HLLAgg agg = new HLLAgg(freq.field); SlotAcc acc = agg.createSlotAcc(fcontext, domain.size(), 1); acc.collect(domain, 0); acc.key = "numBuckets"; acc.setValues(target, 0); } private static class Slot { int slot; } private void fillBucket(SimpleOrderedMap<Object> target, int count, int slotNum, DocSet subDomain, Query filter) throws IOException { target.add("count", count); if (count <= 0 && !freq.processEmpty) return; if (collectAcc != null && slotNum >= 0) { collectAcc.setValues(target, slotNum); } createOtherAccs(-1, 1); if (otherAccs == null && freq.subFacets.isEmpty()) return; if (subDomain == null) { subDomain = fcontext.searcher.getDocSet(filter, fcontext.base); } // if no subFacets, we only need a DocSet // otherwise we need more? // TODO: save something generic like "slotNum" in the context and use that to implement things like filter exclusion if necessary? // Hmmm, but we need to look up some stuff anyway (for the label?) // have a method like "DocSet applyConstraint(facet context, DocSet parent)" // that's needed for domain changing things like joins anyway??? if (otherAccs != null) { // do acc at a time (traversing domain each time) or do all accs for each doc? for (SlotAcc acc : otherAccs) { acc.reset(); // TODO: only needed if we previously used for allBuckets or missing acc.collect(subDomain, 0); acc.setValues(target, 0); } } processSubs(target, filter, subDomain, false, null); } @Override protected void processStats(SimpleOrderedMap<Object> bucket, DocSet docs, int docCount) throws IOException { if (docCount == 0 && !freq.processEmpty || freq.getFacetStats().size() == 0) { bucket.add("count", docCount); return; } createAccs(docCount, 1); int collected = collect(docs, 0); // countAcc.incrementCount(0, collected); // should we set the counton the acc instead of just passing it? assert collected == docCount; addStats(bucket, collected, 0); } // overrides but with different signature! private void addStats(SimpleOrderedMap<Object> target, int count, int slotNum) throws IOException { target.add("count", count); if (count > 0 || freq.processEmpty) { for (SlotAcc acc : accs) { acc.setValues(target, slotNum); } } } @Override void setNextReader(LeafReaderContext ctx) throws IOException { // base class calls this (for missing bucket...) ... go over accs[] in that case super.setNextReader(ctx); } void setNextReaderFirstPhase(LeafReaderContext ctx) throws IOException { if (collectAcc != null) { collectAcc.setNextReader(ctx); } if (otherAccs != null) { for (SlotAcc acc : otherAccs) { acc.setNextReader(ctx); } } } static class SpecialSlotAcc extends SlotAcc { SlotAcc collectAcc; SlotAcc[] otherAccs; int collectAccSlot; int otherAccsSlot; long count; SpecialSlotAcc(FacetContext fcontext, SlotAcc collectAcc, int collectAccSlot, SlotAcc[] otherAccs, int otherAccsSlot) { super(fcontext); this.collectAcc = collectAcc; this.collectAccSlot = collectAccSlot; this.otherAccs = otherAccs; this.otherAccsSlot = otherAccsSlot; } public int getCollectAccSlot() { return collectAccSlot; } public int getOtherAccSlot() { return otherAccsSlot; } long getSpecialCount() { return count; } @Override public void collect(int doc, int slot) throws IOException { assert slot != collectAccSlot || slot < 0; count++; if (collectAcc != null) { collectAcc.collect(doc, collectAccSlot); } if (otherAccs != null) { for (SlotAcc otherAcc : otherAccs) { otherAcc.collect(doc, otherAccsSlot); } } } @Override public void setNextReader(LeafReaderContext readerContext) throws IOException { // collectAcc and otherAccs will normally have setNextReader called directly on them. // This, however, will be used when collect(DocSet,slot) variant is used on this Acc. if (collectAcc != null) { collectAcc.setNextReader(readerContext); } if (otherAccs != null) { for (SlotAcc otherAcc : otherAccs) { otherAcc.setNextReader(readerContext); } } } @Override public int compare(int slotA, int slotB) { throw new UnsupportedOperationException(); } @Override public Object getValue(int slotNum) throws IOException { throw new UnsupportedOperationException(); } @Override public void setValues(SimpleOrderedMap<Object> bucket, int slotNum) throws IOException { if (collectAcc != null) { collectAcc.setValues(bucket, collectAccSlot); } if (otherAccs != null) { for (SlotAcc otherAcc : otherAccs) { otherAcc.setValues(bucket, otherAccsSlot); } } } @Override public void reset() { // reset should be called on underlying accs // TODO: but in case something does need to be done here, should we require this method to be called but do nothing for now? throw new UnsupportedOperationException(); } @Override public void resize(Resizer resizer) { // someone else will call resize on collectAcc directly if (collectAccSlot >= 0) { collectAccSlot = resizer.getNewSlot(collectAccSlot); } } } /* "qfacet":{"cat2":{"_l":["A"]}}, "all":{"_s":[[ "all", {"cat3":{"_l":["A"]}}]]}, "cat1":{"_l":["A"]}}} */ static <T> List<T> asList(Object list) { return list != null ? (List<T>)list : Collections.EMPTY_LIST; } protected SimpleOrderedMap<Object> refineFacets() throws IOException { boolean skipThisFacet = (fcontext.flags & SKIP_FACET) != 0; List leaves = asList(fcontext.facetInfo.get("_l")); // We have not seen this bucket: do full faceting for this bucket, including all sub-facets List<List> skip = asList(fcontext.facetInfo.get("_s")); // We have seen this bucket, so skip stats on it, and skip sub-facets except for the specified sub-facets that should calculate specified buckets. List<List> partial = asList(fcontext.facetInfo.get("_p")); // We have not seen this bucket, do full faceting for this bucket, and most sub-facets... but some sub-facets are partial and should only visit specified buckets. // For leaf refinements, we do full faceting for each leaf bucket. Any sub-facets of these buckets will be fully evaluated. Because of this, we should never // encounter leaf refinements that have sub-facets that return partial results. SimpleOrderedMap<Object> res = new SimpleOrderedMap<>(); List<SimpleOrderedMap> bucketList = new ArrayList<>( leaves.size() + skip.size() + partial.size() ); res.add("buckets", bucketList); // TODO: an alternate implementations can fill all accs at once createAccs(-1, 1); for (Object bucketVal : leaves) { bucketList.add( refineBucket(bucketVal, false, null) ); } for (List bucketAndFacetInfo : skip) { assert bucketAndFacetInfo.size() == 2; Object bucketVal = bucketAndFacetInfo.get(0); Map<String,Object> facetInfo = (Map<String, Object>) bucketAndFacetInfo.get(1); bucketList.add( refineBucket(bucketVal, true, facetInfo ) ); } // The only difference between skip and missing is the value of "skip" passed to refineBucket for (List bucketAndFacetInfo : partial) { assert bucketAndFacetInfo.size() == 2; Object bucketVal = bucketAndFacetInfo.get(0); Map<String,Object> facetInfo = (Map<String, Object>) bucketAndFacetInfo.get(1); bucketList.add( refineBucket(bucketVal, false, facetInfo ) ); } if (freq.missing) { Map<String,Object> bucketFacetInfo = (Map<String,Object>)fcontext.facetInfo.get("missing"); if (bucketFacetInfo != null || !skipThisFacet) { SimpleOrderedMap<Object> missingBucket = new SimpleOrderedMap<>(); fillBucket(missingBucket, getFieldMissingQuery(fcontext.searcher, freq.field), null, skipThisFacet, bucketFacetInfo); res.add("missing", missingBucket); } } if (freq.numBuckets && !skipThisFacet) { calculateNumBuckets(res); } // If there are just a couple of leaves, and if the domain is large, then // going by term is likely the most efficient? // If the domain is small, or if the number of leaves is large, then doing // the normal collection method may be best. return res; } private SimpleOrderedMap<Object> refineBucket(Object bucketVal, boolean skip, Map<String,Object> facetInfo) throws IOException { SimpleOrderedMap<Object> bucket = new SimpleOrderedMap<>(); FieldType ft = sf.getType(); bucket.add("val", bucketVal); // String internal = ft.toInternal( tobj.toString() ); // TODO - we need a better way to get from object to query... Query domainQ = ft.getFieldQuery(null, sf, bucketVal.toString()); fillBucket(bucket, domainQ, null, skip, facetInfo); return bucket; } }