package org.apache.lucene.search.spans; /** * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * Acknowledgements: * * A significant amount of new and/or modified code in this module * was made possible by a grant from the Andrew W. Mellon Foundation, * as part of the Melvyl Recommender Project. */ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.Similarity; /** Calculates spans that match several queries "near" each other. In-order * matches score higher than out-of-order matches. */ class OrNearSpans implements Spans { private SpanOrNearQuery query; private Similarity similarity; private int nClauses; // Number of original clauses private ArrayList cells; // Spans in position order private int slop; // from query private boolean penalizeOutOfOrder; // from query private boolean more = true; // true iff not done private boolean firstTime = true; // true before first next() private int matchDist; // Slop for current match private float matchTotalScore; // Sum of scores for current match private int matchEndCell; // Last cell # in current match private int matchNumCells; // Number of matching cells private class SpansCell { Spans spans; int index; SpansCell(Spans spans, int index) { this.spans = spans; this.index = index; } final int doc() { return spans.doc(); } final int start() { return spans.start(); } final int end() { return spans.end(); } final float score() { return spans.score(); } final boolean next() throws IOException { return spans.next(); } final boolean skipTo(int doc) throws IOException { return spans.skipTo(doc); } int compareTo(SpansCell other) { if (doc() == other.doc()) { if (start() == other.start()) { if (end() == other.end()) return index - other.index; else return end() - other.end(); } else { return start() - other.start(); } } else return doc() - other.doc(); } } // class SpansCell private static final Comparator cellComparator = new Comparator() { public int compare(Object o1, Object o2) { return ((SpansCell)o1).compareTo((SpansCell)o2); } }; public OrNearSpans(SpanOrNearQuery query, IndexReader reader, Searcher searcher) throws IOException { this.query = query; this.slop = query.getSlop(); this.penalizeOutOfOrder = query.penalizeOutOfOrder(); SpanQuery[] clauses = query.getClauses(); nClauses = clauses.length; cells = new ArrayList(nClauses); for (int i = 0; i < nClauses; i++) cells.add(new SpansCell(clauses[i].getSpans(reader, searcher), i)); similarity = searcher.getSimilarity(); } public boolean next() throws IOException { if (firstTime) { more = initCells(-1); firstTime = false; } else if (more) { if (!nextCell()) more = advance(-1); } // All done. return more; } private boolean initCells(int skipTo) throws IOException { // Init each cell. If it runs off the end, delete it. for (int i = 0; i < cells.size(); i++) { SpansCell cell = (SpansCell)cells.get(i); boolean cellMore = (skipTo >= 0) ? cell.skipTo(skipTo) : cell.next(); // If it ran off the end, remove it from the list. if (!cellMore) { cells.remove(i); --i; } } Collections.sort(cells, cellComparator); if (cells.isEmpty()) return false; // Init scoring parameters. matchEndCell = 0; matchNumCells = 0; matchTotalScore = ((SpansCell)cells.get(0)).score(); matchDist = 0; return true; } // initCells() private boolean advance(int skipTo) throws IOException { // Advance the first cell. If it runs out, just remove it (the other cells // remain in their proper order.) // SpansCell cell = (SpansCell)cells.get(0); boolean cellMore = (skipTo >= 0) ? cell.skipTo(skipTo) : cell.next(); if (cellMore) { // Now put it in the right place in the position ordered array. int i; for (i = 0; i < cells.size() - 1; i++) { SpansCell next = (SpansCell)cells.get(i + 1); if (cell.compareTo(next) < 0) break; cells.set(i, next); } cells.set(i, cell); } else { cells.remove(0); if (cells.isEmpty()) return false; } // Init scoring parameters. matchEndCell = 0; matchNumCells = 0; matchTotalScore = ((SpansCell)cells.get(0)).score(); matchDist = 0; // All done. return true; } // advance() // Attempt to extend the match by one more cell. private boolean nextCell() { SpansCell prevCell = (SpansCell)cells.get(matchEndCell); while (true) { // If we run out of cells, we can't extend. if ((matchEndCell + 1) == cells.size()) return false; // Okay, get the next cell. SpansCell curCell = (SpansCell)cells.get(matchEndCell + 1); // If the cells are in different docs, they can't be connected. if (curCell.doc() != prevCell.doc()) return false; // If the cells overlap, skip the new one. if (curCell.start() < prevCell.end()) { ++matchEndCell; continue; } // Add up the edit distance (accounting for out-of-order if requested). assert curCell.compareTo(prevCell) >= 0; int curDist; if (penalizeOutOfOrder && curCell.index < prevCell.index + 1) curDist = curCell.end() - prevCell.start(); else curDist = curCell.start() - prevCell.end(); if (curDist < 0) curDist = -curDist; matchDist += curDist; // If we're beyond the maximum allowable slop, stop here. if (matchDist > slop) return false; // Cool. This looks like the place to be. matchTotalScore += curCell.score(); ++matchEndCell; ++matchNumCells; return true; } } // nextCell() public boolean skipTo(int target) throws IOException { if (firstTime) { more = initCells(target); firstTime = false; } else if (more) { // Skip as needed while (more && ((SpansCell)cells.get(0)).doc() < target) more = advance(target); } return more; } public int doc() { return ((SpansCell)cells.get(0)).doc(); } public int start() { return ((SpansCell)cells.get(0)).start(); } public int end() { return ((SpansCell)cells.get(matchEndCell)).end(); } public float score() { // Calculate the score for this span. The bulk of the score comes from // the sum of the matched sub-spans. This guarantees (assuming all term // scores are equal) that matches with more terms will always score // higher than those with fewer terms. // float coordFactor = (float)(matchNumCells + 1) / (nClauses + 1); // But a bit of the score comes from the edit distance involved in this // match. // float distFactor = similarity.sloppyFreq(matchDist) / (nClauses + 1); // Combine these together with the boost to make the final score. return matchTotalScore * (coordFactor + distFactor) * query.getBoost(); } public String toString() { return "spans(" + query.toString() + ")@" + (firstTime ? "START" : (more ? (doc() + ":" + start() + "-" + end()) : "END")); } public Explanation explain() throws IOException { Explanation result = new Explanation(0, "weight(" + toString() + "), product of:"); // Explain the total of the matches (simplify if only one match) Explanation totalExpl; if (matchEndCell == 0) totalExpl = ((SpansCell)cells.get(0)).spans.explain(); else { float totalScore = 0.0f; totalExpl = new Explanation(0, "totalMatchScore, sum of:"); SpansCell prevCell = null; for (int i = 0; i <= matchEndCell; i++) { SpansCell cell = (SpansCell)cells.get(i); if (prevCell != null && cell.start() < prevCell.end()) continue; totalScore += cell.score(); totalExpl.addDetail(cell.spans.explain()); prevCell = cell; } totalExpl.setValue(totalScore); } result.addDetail(totalExpl); // Explain the boost, if any. Explanation boostExpl = new Explanation(query.getBoost(), "boost"); if (boostExpl.getValue() != 1.0f) result.addDetail(boostExpl); // Explain the distance factor Explanation distExpl = new Explanation(0, "distFactor(sloppyFreq/" + (nClauses + 1) + ")"); Explanation slopExpl = new Explanation(similarity.sloppyFreq(matchDist), "sloppyFreq(slop=" + matchDist + ")"); distExpl.addDetail(slopExpl); distExpl.setValue(slopExpl.getValue() / (nClauses + 1)); // Explain the coordination factor Explanation coordExpl = new Explanation( (float)(matchNumCells + 1) / (nClauses + 1), "coordFactor(" + (matchNumCells + 1) + "/" + (nClauses + 1) + ")"); // Explain the combined factors. Explanation combinedFactorsExpl = new Explanation( distExpl.getValue() + coordExpl.getValue(), "combinedFactors = sum of"); combinedFactorsExpl.addDetail(distExpl); combinedFactorsExpl.addDetail(coordExpl); result.addDetail(combinedFactorsExpl); result.setValue( totalExpl.getValue() * boostExpl.getValue() * combinedFactorsExpl.getValue()); return result; } }