package org.apache.lucene.search; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; import java.util.List; import org.apache.lucene.util.ArrayUtil; /** * A Scorer for OR like queries, counterpart of <code>ConjunctionScorer</code>. * This Scorer implements {@link Scorer#advance(int)} and uses advance() on the given Scorers. * * This implementation uses the minimumMatch constraint actively to efficiently * prune the number of candidates, it is hence a mixture between a pure DisjunctionScorer * and a ConjunctionScorer. */ class MinShouldMatchSumScorer extends Scorer { /** The overall number of non-finalized scorers */ private int numScorers; /** The minimum number of scorers that should match */ private final int mm; /** A static array of all subscorers sorted by decreasing cost */ private final Scorer sortedSubScorers[]; /** A monotonically increasing index into the array pointing to the next subscorer that is to be excluded */ private int sortedSubScorersIdx = 0; private final Scorer subScorers[]; // the first numScorers-(mm-1) entries are valid private int nrInHeap; // 0..(numScorers-(mm-1)-1) /** mmStack is supposed to contain the most costly subScorers that still did * not run out of docs, sorted by increasing sparsity of docs returned by that subScorer. * For now, the cost of subscorers is assumed to be inversely correlated with sparsity. */ private final Scorer mmStack[]; // of size mm-1: 0..mm-2, always full /** The document number of the current match. */ private int doc = -1; /** The number of subscorers that provide the current match. */ protected int nrMatchers = -1; private double score = Float.NaN; /** * Construct a <code>MinShouldMatchSumScorer</code>. * * @param weight The weight to be used. * @param subScorers A collection of at least two subscorers. * @param minimumNrMatchers The positive minimum number of subscorers that should * match to match this query. * <br>When <code>minimumNrMatchers</code> is bigger than * the number of <code>subScorers</code>, no matches will be produced. * <br>When minimumNrMatchers equals the number of subScorers, * it is more efficient to use <code>ConjunctionScorer</code>. */ public MinShouldMatchSumScorer(Weight weight, List<Scorer> subScorers, int minimumNrMatchers) throws IOException { super(weight); this.nrInHeap = this.numScorers = subScorers.size(); if (minimumNrMatchers <= 0) { throw new IllegalArgumentException("Minimum nr of matchers must be positive"); } if (numScorers <= 1) { throw new IllegalArgumentException("There must be at least 2 subScorers"); } this.mm = minimumNrMatchers; this.sortedSubScorers = subScorers.toArray(new Scorer[this.numScorers]); // sorting by decreasing subscorer cost should be inversely correlated with // next docid (assuming costs are due to generating many postings) ArrayUtil.timSort(sortedSubScorers, new Comparator<Scorer>() { @Override public int compare(Scorer o1, Scorer o2) { return Long.signum(o2.cost() - o1.cost()); } }); // take mm-1 most costly subscorers aside this.mmStack = new Scorer[mm-1]; for (int i = 0; i < mm-1; i++) { mmStack[i] = sortedSubScorers[i]; } nrInHeap -= mm-1; this.sortedSubScorersIdx = mm-1; // take remaining into heap, if any, and heapify this.subScorers = new Scorer[nrInHeap]; for (int i = 0; i < nrInHeap; i++) { this.subScorers[i] = this.sortedSubScorers[mm-1+i]; } minheapHeapify(); assert minheapCheck(); } /** * Construct a <code>DisjunctionScorer</code>, using one as the minimum number * of matching subscorers. */ public MinShouldMatchSumScorer(Weight weight, List<Scorer> subScorers) throws IOException { this(weight, subScorers, 1); } @Override public final Collection<ChildScorer> getChildren() { ArrayList<ChildScorer> children = new ArrayList<>(numScorers); for (int i = 0; i < numScorers; i++) { children.add(new ChildScorer(subScorers[i], "SHOULD")); } return children; } @Override public int nextDoc() throws IOException { assert doc != NO_MORE_DOCS; while (true) { // to remove current doc, call next() on all subScorers on current doc within heap while (subScorers[0].docID() == doc) { if (subScorers[0].nextDoc() != NO_MORE_DOCS) { minheapSiftDown(0); } else { minheapRemoveRoot(); numScorers--; if (numScorers < mm) { return doc = NO_MORE_DOCS; } } //assert minheapCheck(); } evaluateSmallestDocInHeap(); if (nrMatchers >= mm) { // doc satisfies mm constraint break; } } return doc; } private void evaluateSmallestDocInHeap() throws IOException { // within heap, subScorer[0] now contains the next candidate doc doc = subScorers[0].docID(); if (doc == NO_MORE_DOCS) { nrMatchers = Integer.MAX_VALUE; // stop looping return; } // 1. score and count number of matching subScorers within heap score = subScorers[0].score(); nrMatchers = 1; countMatches(1); countMatches(2); // 2. score and count number of matching subScorers within stack, // short-circuit: stop when mm can't be reached for current doc, then perform on heap next() // TODO instead advance() might be possible, but complicates things for (int i = mm-2; i >= 0; i--) { // first advance sparsest subScorer if (mmStack[i].docID() >= doc || mmStack[i].advance(doc) != NO_MORE_DOCS) { if (mmStack[i].docID() == doc) { // either it was already on doc, or got there via advance() nrMatchers++; score += mmStack[i].score(); } else { // scorer advanced to next after doc, check if enough scorers left for current doc if (nrMatchers + i < mm) { // too few subScorers left, abort advancing return; // continue looping TODO consider advance() here } } } else { // subScorer exhausted numScorers--; if (numScorers < mm) { // too few subScorers left doc = NO_MORE_DOCS; nrMatchers = Integer.MAX_VALUE; // stop looping return; } if (mm-2-i > 0) { // shift RHS of array left System.arraycopy(mmStack, i+1, mmStack, i, mm-2-i); } // find next most costly subScorer within heap TODO can this be done better? while (!minheapRemove(sortedSubScorers[sortedSubScorersIdx++])) { //assert minheapCheck(); } // add the subScorer removed from heap to stack mmStack[mm-2] = sortedSubScorers[sortedSubScorersIdx-1]; if (nrMatchers + i < mm) { // too few subScorers left, abort advancing return; // continue looping TODO consider advance() here } } } } // TODO: this currently scores, but so did the previous impl // TODO: remove recursion. // TODO: consider separating scoring out of here, then modify this // and afterNext() to terminate when nrMatchers == minimumNrMatchers // then also change freq() to just always compute it from scratch private void countMatches(int root) throws IOException { if (root < nrInHeap && subScorers[root].docID() == doc) { nrMatchers++; score += subScorers[root].score(); countMatches((root<<1)+1); countMatches((root<<1)+2); } } /** * Returns the score of the current document matching the query. Initially * invalid, until {@link #nextDoc()} is called the first time. */ @Override public float score() throws IOException { return (float) score; } @Override public int docID() { return doc; } @Override public int freq() throws IOException { return nrMatchers; } /** * Advances to the first match beyond the current whose document number is * greater than or equal to a given target. <br> * The implementation uses the advance() method on the subscorers. * * @param target the target document number. * @return the document whose number is greater than or equal to the given * target, or -1 if none exist. */ @Override public int advance(int target) throws IOException { if (numScorers < mm) return doc = NO_MORE_DOCS; // advance all Scorers in heap at smaller docs to at least target while (subScorers[0].docID() < target) { if (subScorers[0].advance(target) != NO_MORE_DOCS) { minheapSiftDown(0); } else { minheapRemoveRoot(); numScorers--; if (numScorers < mm) { return doc = NO_MORE_DOCS; } } //assert minheapCheck(); } evaluateSmallestDocInHeap(); if (nrMatchers >= mm) { return doc; } else { return nextDoc(); } } @Override public long cost() { // cost for merging of lists analog to DisjunctionSumScorer long costCandidateGeneration = 0; for (int i = 0; i < nrInHeap; i++) costCandidateGeneration += subScorers[i].cost(); // TODO is cost for advance() different to cost for iteration + heap merge // and how do they compare overall to pure disjunctions? final float c1 = 1.0f, c2 = 1.0f; // maybe a constant, maybe a proportion between costCandidateGeneration and sum(subScorer_to_be_advanced.cost())? return (long) ( c1 * costCandidateGeneration + // heap-merge cost c2 * costCandidateGeneration * (mm-1) // advance() cost ); } /** * Organize subScorers into a min heap with scorers generating the earliest document on top. */ protected final void minheapHeapify() { for (int i = (nrInHeap >> 1) - 1; i >= 0; i--) { minheapSiftDown(i); } } /** * The subtree of subScorers at root is a min heap except possibly for its root element. * Bubble the root down as required to make the subtree a heap. */ protected final void minheapSiftDown(int root) { // TODO could this implementation also move rather than swapping neighbours? Scorer scorer = subScorers[root]; int doc = scorer.docID(); int i = root; while (i <= (nrInHeap >> 1) - 1) { int lchild = (i << 1) + 1; Scorer lscorer = subScorers[lchild]; int ldoc = lscorer.docID(); int rdoc = Integer.MAX_VALUE, rchild = (i << 1) + 2; Scorer rscorer = null; if (rchild < nrInHeap) { rscorer = subScorers[rchild]; rdoc = rscorer.docID(); } if (ldoc < doc) { if (rdoc < ldoc) { subScorers[i] = rscorer; subScorers[rchild] = scorer; i = rchild; } else { subScorers[i] = lscorer; subScorers[lchild] = scorer; i = lchild; } } else if (rdoc < doc) { subScorers[i] = rscorer; subScorers[rchild] = scorer; i = rchild; } else { return; } } } protected final void minheapSiftUp(int i) { Scorer scorer = subScorers[i]; final int doc = scorer.docID(); // find right place for scorer while (i > 0) { int parent = (i - 1) >> 1; Scorer pscorer = subScorers[parent]; int pdoc = pscorer.docID(); if (pdoc > doc) { // move root down, make space subScorers[i] = subScorers[parent]; i = parent; } else { // done, found right place break; } } subScorers[i] = scorer; } /** * Remove the root Scorer from subScorers and re-establish it as a heap */ protected final void minheapRemoveRoot() { if (nrInHeap == 1) { //subScorers[0] = null; // not necessary nrInHeap = 0; } else { nrInHeap--; subScorers[0] = subScorers[nrInHeap]; //subScorers[nrInHeap] = null; // not necessary minheapSiftDown(0); } } /** * Removes a given Scorer from the heap by placing end of heap at that * position and bubbling it either up or down */ protected final boolean minheapRemove(Scorer scorer) { // find scorer: O(nrInHeap) for (int i = 0; i < nrInHeap; i++) { if (subScorers[i] == scorer) { // remove scorer subScorers[i] = subScorers[--nrInHeap]; //if (i != nrInHeap) subScorers[nrInHeap] = null; // not necessary minheapSiftUp(i); minheapSiftDown(i); return true; } } return false; // scorer already exhausted } boolean minheapCheck() { return minheapCheck(0); } private boolean minheapCheck(int root) { if (root >= nrInHeap) return true; int lchild = (root << 1) + 1; int rchild = (root << 1) + 2; if (lchild < nrInHeap && subScorers[root].docID() > subScorers[lchild].docID()) return false; if (rchild < nrInHeap && subScorers[root].docID() > subScorers[rchild].docID()) return false; return minheapCheck(lchild) && minheapCheck(rchild); } }