/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.DiversifiedTopDocsCollector.ScoreDocKey;
import org.apache.lucene.util.PriorityQueue;

/**
 * A {@link TopDocsCollector} that controls diversity in results by ensuring no
 * more than maxHitsPerKey results from a common source are collected in the
 * final results.
 *
 * An example application might be a product search in a marketplace where no
 * more than 3 results per retailer are permitted in search results.
 *
 * <p>
 * To compare behaviour with other forms of collector, a useful analogy might be
 * the problem of making a compilation album of 1967's top hit records:
 * <ol>
 * <li>A vanilla query's results might look like a "Best of the Beatles" album -
 * high quality but not much diversity</li>
 * <li>A GroupingSearch would produce the equivalent of "The 10 top-selling
 * artists of 1967 - some killer and quite a lot of filler"</li>
 * <li>A "diversified" query would be the top 20 hit records of that year - with
 * a max of 3 Beatles hits in order to maintain diversity</li>
 * </ol>
 * This collector improves on the "GroupingSearch" type queries by
 * <ul>
 * <li>Working in one pass over the data</li>
 * <li>Not requiring the client to guess how many groups are required</li>
 * <li>Removing low-scoring "filler" which sits at the end of each group's
 * hits</li>
 * </ul>
 *
 * This is an abstract class; subclasses must provide a source of keys for
 * documents, which is then used to help identify duplicate sources. An
 * illustrative sketch of one possible subclass appears at the end of this
 * file.
 *
 * @lucene.experimental
 */
public abstract class DiversifiedTopDocsCollector extends
    TopDocsCollector<ScoreDocKey> {
  ScoreDocKey spare;
  private ScoreDocKeyQueue globalQueue;
  private int numHits;
  private Map<Long, ScoreDocKeyQueue> perKeyQueues;
  protected int maxNumPerKey;
  private Stack<ScoreDocKeyQueue> sparePerKeyQueues = new Stack<>();

  public DiversifiedTopDocsCollector(int numHits, int maxHitsPerKey) {
    super(new ScoreDocKeyQueue(numHits));
    // Need to access pq.lessThan() which is protected, so have to cast here...
    this.globalQueue = (ScoreDocKeyQueue) pq;
    perKeyQueues = new HashMap<Long, ScoreDocKeyQueue>();
    this.numHits = numHits;
    this.maxNumPerKey = maxHitsPerKey;
  }

  /**
   * Get a source of values used for grouping keys
   */
  protected abstract NumericDocValues getKeys(LeafReaderContext context);

  @Override
  public boolean needsScores() {
    return true;
  }

  @Override
  protected TopDocs newTopDocs(ScoreDoc[] results, int start) {
    if (results == null) {
      return EMPTY_TOPDOCS;
    }

    // We need to compute maxScore in order to set it in TopDocs.
    // If start == 0, it means the largest element is already in results, so
    // use its score as maxScore. Otherwise pop everything else until the
    // largest element is extracted and use its score as maxScore.
    float maxScore = Float.NaN;
    if (start == 0) {
      maxScore = results[0].score;
    } else {
      for (int i = globalQueue.size(); i > 1; i--) {
        globalQueue.pop();
      }
      maxScore = globalQueue.pop().score;
    }

    return new TopDocs(totalHits, results, maxScore);
  }

  protected ScoreDocKey insert(ScoreDocKey addition, int docBase,
      NumericDocValues keys) throws IOException {
    if ((globalQueue.size() >= numHits)
        && (globalQueue.lessThan(addition, globalQueue.top()))) {
      // Queue is full and proposed addition is not a globally
      // competitive score
      return addition;
    }
    // The addition stands a chance of being entered - check the
    // key-specific restrictions.
    // We delay fetching the key until we are certain the score is globally
    // competitive. We need to adjust the ScoreDoc's global doc value to be
    // a leaf reader value when looking up keys.
    int leafDocID = addition.doc - docBase;
    long value;
    if (keys.advanceExact(leafDocID)) {
      value = keys.longValue();
    } else {
      value = 0;
    }
    addition.key = value;

    // For this to work the choice of key class needs to implement
    // hashCode() and equals().
    ScoreDocKeyQueue thisKeyQ = perKeyQueues.get(addition.key);

    if (thisKeyQ == null) {
      if (sparePerKeyQueues.size() == 0) {
        thisKeyQ = new ScoreDocKeyQueue(maxNumPerKey);
      } else {
        thisKeyQ = sparePerKeyQueues.pop();
      }
      perKeyQueues.put(addition.key, thisKeyQ);
    }
    ScoreDocKey perKeyOverflow = thisKeyQ.insertWithOverflow(addition);
    if (perKeyOverflow == addition) {
      // This key group has reached capacity and our proposed addition
      // was not competitive in the group - do not insert into the
      // main PQ or the key will be overly-populated in final results.
      return addition;
    }
    if (perKeyOverflow == null) {
      // This proposed addition is also locally competitive within the
      // key group - make a global entry and return
      ScoreDocKey globalOverflow = globalQueue.insertWithOverflow(addition);
      perKeyGroupRemove(globalOverflow);
      return globalOverflow;
    }
    // For the given key, we have reached max capacity but the new addition
    // is better than a prior entry that still exists in the global results
    // - request the weaker-scoring entry to be removed from the global
    // queue.
    globalQueue.remove(perKeyOverflow);
    // Add the locally-competitive addition into the global queue
    globalQueue.add(addition);
    return perKeyOverflow;
  }

  private void perKeyGroupRemove(ScoreDocKey globalOverflow) {
    if (globalOverflow == null) {
      return;
    }
    ScoreDocKeyQueue q = perKeyQueues.get(globalOverflow.key);
    ScoreDocKey perKeyLowest = q.pop();
    // The least globally-competitive item should also always be the least
    // key-local item
    assert (globalOverflow == perKeyLowest);
    if (q.size() == 0) {
      perKeyQueues.remove(globalOverflow.key);
      sparePerKeyQueues.push(q);
    }
  }

  @Override
  public LeafCollector getLeafCollector(LeafReaderContext context)
      throws IOException {
    final int base = context.docBase;
    final NumericDocValues keySource = getKeys(context);
    return new LeafCollector() {

      Scorer scorer;

      @Override
      public void setScorer(Scorer scorer) throws IOException {
        this.scorer = scorer;
      }

      @Override
      public void collect(int doc) throws IOException {
        float score = scorer.score();

        // This collector cannot handle NaN
        assert !Float.isNaN(score);

        totalHits++;

        doc += base;
        if (spare == null) {
          spare = new ScoreDocKey(doc, score);
        } else {
          spare.doc = doc;
          spare.score = score;
        }
        spare = insert(spare, base, keySource);
      }
    };
  }

  static class ScoreDocKeyQueue extends PriorityQueue<ScoreDocKey> {

    ScoreDocKeyQueue(int size) {
      super(size);
    }

    @Override
    protected final boolean lessThan(ScoreDocKey hitA, ScoreDocKey hitB) {
      if (hitA.score == hitB.score) {
        // Tie-break on docID: for equal scores the higher docID is treated
        // as less competitive
        return hitA.doc > hitB.doc;
      } else {
        return hitA.score < hitB.score;
      }
    }
  }

  /**
   * An extension to ScoreDoc that includes a key used for grouping purposes
   */
  public static class ScoreDocKey extends ScoreDoc {
    Long key;

    protected ScoreDocKey(int doc, float score) {
      super(doc, score);
    }

    public Long getKey() {
      return key;
    }

    @Override
    public String toString() {
      return "key:" + key + " doc=" + doc + " s=" + score;
    }
  }
}
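
// ---------------------------------------------------------------------------
// Illustrative sketch only - not part of the Lucene API. It shows one way a
// concrete subclass might supply grouping keys, namely by reading a numeric
// doc-values field. The class name and the "retailerId" field used in the
// usage note below are hypothetical examples.
// ---------------------------------------------------------------------------
class ExampleFieldDiversifiedCollector extends DiversifiedTopDocsCollector {

  private final String keyField;

  ExampleFieldDiversifiedCollector(String keyField, int numHits,
      int maxHitsPerKey) {
    super(numHits, maxHitsPerKey);
    this.keyField = keyField;
  }

  @Override
  protected NumericDocValues getKeys(LeafReaderContext context) {
    try {
      // Each document's numeric value becomes its grouping key; documents
      // with no value for the field fall back to key 0 in insert() above.
      return org.apache.lucene.index.DocValues.getNumeric(context.reader(),
          keyField);
    } catch (IOException e) {
      throw new java.io.UncheckedIOException(e);
    }
  }
}

// Typical usage (hypothetical index field and searcher):
//
//   DiversifiedTopDocsCollector collector =
//       new ExampleFieldDiversifiedCollector("retailerId", 20, 3);
//   searcher.search(query, collector);
//   TopDocs diversified = collector.topDocs();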