/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.search; import java.io.IOException; import java.util.Map; import org.apache.lucene.search.TermAutomatonQuery.EnumAndScorer; import org.apache.lucene.search.TermAutomatonQuery.TermAutomatonWeight; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.RunAutomaton; // TODO: add two-phase and needsScores support. maybe use conjunctionDISI internally? class TermAutomatonScorer extends Scorer { private final EnumAndScorer[] subs; private final EnumAndScorer[] subsOnDoc; private final PriorityQueue<EnumAndScorer> docIDQueue; private final PriorityQueue<EnumAndScorer> posQueue; private final RunAutomaton runAutomaton; private final Map<Integer,BytesRef> idToTerm; // We reuse this array to check for matches starting from an initial // position; we increase posShift every time we move to a new possible // start: private PosState[] positions; int posShift; // This is -1 if wildcard (null) terms were not used, else it's the id // of the wildcard term: private final int anyTermID; private final Similarity.SimScorer docScorer; private int numSubsOnDoc; private final long cost; private int docID = -1; private int freq; public TermAutomatonScorer(TermAutomatonWeight weight, EnumAndScorer[] subs, int anyTermID, Map<Integer,BytesRef> idToTerm, Similarity.SimScorer docScorer) throws IOException { super(weight); //System.out.println(" automaton:\n" + weight.automaton.toDot()); this.runAutomaton = new TermRunAutomaton(weight.automaton, subs.length); this.docScorer = docScorer; this.idToTerm = idToTerm; this.subs = subs; this.docIDQueue = new DocIDQueue(subs.length); this.posQueue = new PositionQueue(subs.length); this.anyTermID = anyTermID; this.subsOnDoc = new EnumAndScorer[subs.length]; this.positions = new PosState[4]; for(int i=0;i<this.positions.length;i++) { this.positions[i] = new PosState(); } long cost = 0; // Init docIDQueue: for(EnumAndScorer sub : subs) { if (sub != null) { cost += sub.posEnum.cost(); subsOnDoc[numSubsOnDoc++] = sub; } } this.cost = cost; } /** Sorts by docID so we can quickly pull out all scorers that are on * the same (lowest) docID. */ private static class DocIDQueue extends PriorityQueue<EnumAndScorer> { public DocIDQueue(int maxSize) { super(maxSize, false); } @Override protected boolean lessThan(EnumAndScorer a, EnumAndScorer b) { return a.posEnum.docID() < b.posEnum.docID(); } } /** Sorts by position so we can visit all scorers on one doc, by * position. */ private static class PositionQueue extends PriorityQueue<EnumAndScorer> { public PositionQueue(int maxSize) { super(maxSize, false); } @Override protected boolean lessThan(EnumAndScorer a, EnumAndScorer b) { return a.pos < b.pos; } } /** Pops all enums positioned on the current (minimum) doc */ private void popCurrentDoc() { assert numSubsOnDoc == 0; assert docIDQueue.size() > 0; subsOnDoc[numSubsOnDoc++] = docIDQueue.pop(); docID = subsOnDoc[0].posEnum.docID(); while (docIDQueue.size() > 0 && docIDQueue.top().posEnum.docID() == docID) { subsOnDoc[numSubsOnDoc++] = docIDQueue.pop(); } } /** Pushes all previously pop'd enums back into the docIDQueue */ private void pushCurrentDoc() { for(int i=0;i<numSubsOnDoc;i++) { docIDQueue.add(subsOnDoc[i]); } numSubsOnDoc = 0; } @Override public DocIdSetIterator iterator() { return new DocIdSetIterator() { @Override public int docID() { return docID; } @Override public long cost() { return cost; } @Override public int nextDoc() throws IOException { // we only need to advance docs that are positioned since all docs in the // pq are guaranteed to be beyond the current doc already for(int i=0;i<numSubsOnDoc;i++) { EnumAndScorer sub = subsOnDoc[i]; if (sub.posEnum.nextDoc() != NO_MORE_DOCS) { sub.posLeft = sub.posEnum.freq()-1; sub.pos = sub.posEnum.nextPosition(); } } pushCurrentDoc(); return doNext(); } @Override public int advance(int target) throws IOException { // Both positioned docs and docs in the pq might be behind target // 1. Advance the PQ if (docIDQueue.size() > 0) { EnumAndScorer top = docIDQueue.top(); while (top.posEnum.docID() < target) { if (top.posEnum.advance(target) != NO_MORE_DOCS) { top.posLeft = top.posEnum.freq()-1; top.pos = top.posEnum.nextPosition(); } top = docIDQueue.updateTop(); } } // 2. Advance subsOnDoc for(int i=0;i<numSubsOnDoc;i++) { EnumAndScorer sub = subsOnDoc[i]; if (sub.posEnum.advance(target) != NO_MORE_DOCS) { sub.posLeft = sub.posEnum.freq()-1; sub.pos = sub.posEnum.nextPosition(); } } pushCurrentDoc(); return doNext(); } private int doNext() throws IOException { assert numSubsOnDoc == 0; assert docIDQueue.top().posEnum.docID() > docID; while (true) { //System.out.println(" doNext: cycle"); popCurrentDoc(); //System.out.println(" docID=" + docID); if (docID == NO_MORE_DOCS) { return docID; } countMatches(); if (freq > 0) { return docID; } for(int i=0;i<numSubsOnDoc;i++) { EnumAndScorer sub = subsOnDoc[i]; if (sub.posEnum.nextDoc() != NO_MORE_DOCS) { sub.posLeft = sub.posEnum.freq()-1; sub.pos = sub.posEnum.nextPosition(); } } pushCurrentDoc(); } } }; } private PosState getPosition(int pos) { return positions[pos-posShift]; } private void shift(int pos) { int limit = pos-posShift; for(int i=0;i<limit;i++) { positions[i].count = 0; } posShift = pos; } private void countMatches() throws IOException { freq = 0; for(int i=0;i<numSubsOnDoc;i++) { posQueue.add(subsOnDoc[i]); } // System.out.println("\ncountMatches: " + numSubsOnDoc + " terms in doc=" + docID + " anyTermID=" + anyTermID + " id=" + reader.document(docID).get("id")); // System.out.println("\ncountMatches: " + numSubsOnDoc + " terms in doc=" + docID + " anyTermID=" + anyTermID); int lastPos = -1; posShift = -1; while (posQueue.size() != 0) { EnumAndScorer sub = posQueue.pop(); // This is a graph intersection, and pos is the state this token // leaves from. Until index stores posLength (which we could // stuff into a payload using a simple TokenFilter), this token // always transitions from state=pos to state=pos+1: final int pos = sub.pos; if (posShift == -1) { posShift = pos; } if (pos+1-posShift >= positions.length) { PosState[] newPositions = new PosState[ArrayUtil.oversize(pos+1-posShift, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(positions, 0, newPositions, 0, positions.length); for(int i=positions.length;i<newPositions.length;i++) { newPositions[i] = new PosState(); } positions = newPositions; } // System.out.println(" term=" + idToTerm.get(sub.termID).utf8ToString() + " pos=" + pos + " (count=" + getPosition(pos).count + " lastPos=" + lastPos + ") posQueue.size=" + posQueue.size() + " posShift=" + posShift); PosState posState; PosState nextPosState; // Maybe advance ANY matches: if (lastPos != -1) { if (anyTermID != -1) { int startLastPos = lastPos; while (lastPos < pos) { posState = getPosition(lastPos); if (posState.count == 0 && lastPos > startLastPos) { // Petered out... lastPos = pos; break; } // System.out.println(" iter lastPos=" + lastPos + " count=" + posState.count); nextPosState = getPosition(lastPos+1); // Advance all states from lastPos -> pos, if they had an any arc: for(int i=0;i<posState.count;i++) { int state = runAutomaton.step(posState.states[i], anyTermID); if (state != -1) { // System.out.println(" add pos=" + (lastPos+1) + " state=" + state); nextPosState.add(state); } } lastPos++; } } } posState = getPosition(pos); nextPosState = getPosition(pos+1); // If there are no pending matches at neither this position or the // next position, then it's safe to shift back to positions[0]: if (posState.count == 0 && nextPosState.count == 0) { shift(pos); posState = getPosition(pos); nextPosState = getPosition(pos+1); } // Match current token: for(int i=0;i<posState.count;i++) { // System.out.println(" check cur state=" + posState.states[i]); int state = runAutomaton.step(posState.states[i], sub.termID); if (state != -1) { // System.out.println(" --> " + state); nextPosState.add(state); if (runAutomaton.isAccept(state)) { // System.out.println(" *** (1)"); freq++; } } } // Also consider starting a new match from this position: int state = runAutomaton.step(0, sub.termID); if (state != -1) { // System.out.println(" add init state=" + state); nextPosState.add(state); if (runAutomaton.isAccept(state)) { // System.out.println(" *** (2)"); freq++; } } if (sub.posLeft > 0) { // Put this sub back into the posQueue: sub.pos = sub.posEnum.nextPosition(); sub.posLeft--; posQueue.add(sub); } lastPos = pos; } int limit = lastPos+1-posShift; // reset for(int i=0;i<=limit;i++) { positions[i].count = 0; } } @Override public String toString() { return "TermAutomatonScorer(" + weight + ")"; } @Override public int freq() { return freq; } @Override public int docID() { return docID; } @Override public float score() throws IOException { // TODO: we could probably do better here, e.g. look @ freqs of actual terms involved in this doc and score differently return docScorer.score(docID, freq); } static class TermRunAutomaton extends RunAutomaton { public TermRunAutomaton(Automaton a, int termCount) { super(a, termCount); } } private static class PosState { // Which automaton states we are in at this position int[] states = new int[2]; // How many states int count; public void add(int state) { if (states.length == count) { states = ArrayUtil.grow(states); } states[count++] = state; } } }