package org.apache.lucene.search; /* * Copyright (c) 2005, Regents of the University of California * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the University of California nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Set; import org.apache.lucene.search.spans.FieldSpans; import org.apache.lucene.search.spans.Span; import org.apache.lucene.search.spans.SpanPosComparator; import org.apache.lucene.search.spans.SpanRecordingScorer; /** * This class, an instance of which is passed to a SpanHitCollector for each * hit, retrieves FieldSpans when requested. This is performed lazily so that * it can be avoided for hits that don't make the grade. */ public class FieldSpanSource { String[] fields; SpanRecordingScorer[][] scorersPerField; ScoreOrder[] scoreOrder = new ScoreOrder[0]; int curDoc = -1; /** * Package-private on purpose. Should only be created by RecordingSearcher. */ FieldSpanSource(SpanRecordingScorer[] scorers) { // Make a list of scorers per field HashMap map = new HashMap(); for (int i = 0; i < scorers.length; i++) { String field = scorers[i].getField(); ArrayList list = (ArrayList)map.get(field); if (list == null) { list = new ArrayList(); map.put(field, list); } list.add(scorers[i]); } // Convert the map to handy arrays. fields = (String[])map.keySet().toArray(new String[map.size()]); scorersPerField = new SpanRecordingScorer[fields.length][]; for (int i = 0; i < fields.length; i++) { ArrayList list = (ArrayList)map.get(fields[i]); scorersPerField[i] = (SpanRecordingScorer[])list.toArray( new SpanRecordingScorer[list.size()]); } } // constructor /** * Retrieve the spans for the given document. * * @param doc Document to get spans for. Typically, the FieldSpanSource can * only get spans for the most recent document collected. * @return Recorded spans for the document. */ public synchronized FieldSpans getSpans(int doc) { if (doc != curDoc) throw new UnsupportedOperationException( "FieldSpanSource can only retrieve spans for current document"); // Process the spans for each field FieldSpans ret = new FieldSpans(); for (int i = 0; i < fields.length; i++) addSpans(doc, fields[i], scorersPerField[i], ret); return ret; } /** * For the given field and list of scorers, calculate (and deduplicate if * necessary) the spans for that field. * * @param doc Document for which spans are being recorded * @param field The field being considered * @param scorers All scorers for that field * @param out Where to store the resulting spans. */ private void addSpans(int doc, String field, SpanRecordingScorer[] scorers, FieldSpans out) { // Figure out how many spans total there are for this field. Also // accumulate the set of terms. // int nToDedupe = 0; int maxSpans = 0; Set terms = null; for (int i = 0; i < scorers.length; i++) { // Skip scorers that didn't record for this doc. if (scorers[i].getSpanDoc() != doc) continue; // Count spans. nToDedupe += scorers[i].getSpanCount(); // Track the max # of spans to actually record (which may be less than // the number of spans for the doc.) // maxSpans = Math.max(maxSpans, scorers[i].getMaxSpans()); // Accumulate the set of terms matched by the queries if (terms == null) terms = scorers[i].getTerms(); else { Set newTerms = new HashSet(); newTerms.addAll(terms); newTerms.addAll(scorers[i].getTerms()); terms = newTerms; } } // No spans? No work to do. if (nToDedupe == 0) return; // Collect the raw spans together. Span[] toDedupe = new Span[nToDedupe]; int start = 0; for (int i = 0; i < scorers.length; i++) { if (scorers[i].getSpanDoc() != doc) continue; int count = scorers[i].getSpanCount(); System.arraycopy(scorers[i].getSpans(), 0, toDedupe, start, count); start += count; } assert start == nToDedupe : "internal error: mis-counted spans"; // Sort the spans in ascending order by start/end. Arrays.sort(toDedupe, SpanPosComparator.theInstance); // For reference during overlap checks, determine the length of the // longest span. // int longestSpan = 0; for (int i = 0; i < nToDedupe; i++) longestSpan = Math.max(longestSpan, toDedupe[i].end - toDedupe[i].start); // Expand the score order array if we need to. if (scoreOrder.length < nToDedupe) { ScoreOrder[] newScoreOrder = new ScoreOrder[nToDedupe + 5]; System.arraycopy(scoreOrder, 0, newScoreOrder, 0, scoreOrder.length); for (int i = scoreOrder.length; i < newScoreOrder.length; i++) newScoreOrder[i] = new ScoreOrder(); scoreOrder = newScoreOrder; } // Record the links in start/end order. for (int i = 0; i < nToDedupe; i++) { scoreOrder[i].span = toDedupe[i]; scoreOrder[i].posOrder = i; scoreOrder[i].cancelled = false; scoreOrder[i].prevInPosOrder = ((i - 1) >= 0) ? scoreOrder[i - 1] : null; scoreOrder[i].nextInPosOrder = ((i + 1) < nToDedupe) ? scoreOrder[i + 1] : null; scoreOrder[i].nextDeduped = null; } // Now make a second sort, this time by descending score. Arrays.sort(scoreOrder, 0, nToDedupe, theScoreComparator); // De-duplicate the score array, starting with the high scores first. int nDeduped = 0; int totalDeduped = 0; ScoreOrder firstDeduped = null; ScoreOrder lastDeduped = null; ScoreOrder o; for (int i = 0; i < nToDedupe; i++) { // Skip entries that have already been cancelled. if (scoreOrder[i].cancelled) continue; // We found an entry we want to keep. Link it into our list. totalDeduped++; if (nDeduped < maxSpans) { if (firstDeduped == null) firstDeduped = scoreOrder[i]; else lastDeduped.nextDeduped = scoreOrder[i]; lastDeduped = scoreOrder[i]; nDeduped++; } // Cancel any overlapping entries before this one, stopping at // one that can't overlap because it's beyond the length of the // longest span. // // You might ask, "why not just stop when you hit the first // non-overlapping span?" Good question grasshopper. The answer // is that, since the spans are sorted by ascending start, // there may be a big span further back that overlaps, and we // have no way of knowing. // final Span scoreSpan = scoreOrder[i].span; o = scoreOrder[i].prevInPosOrder; while (o != null && (o.span.start + longestSpan) > scoreSpan.start) { assert o.span.start <= scoreSpan.start; if (o.span.end > scoreSpan.start) o.cancelled = true; assert o.posOrder == 0 || o.prevInPosOrder.posOrder == o.posOrder - 1; o = o.prevInPosOrder; } // Cancel overlapping entries after this one. Since the spans // are sorted by ascending start pos, we can stop at the first // non-overlapping span. // o = scoreOrder[i].nextInPosOrder; while (o != null && o.span.start < scoreSpan.end) { o.cancelled = true; assert o.posOrder == nToDedupe - 1 || o.nextInPosOrder.posOrder == o.posOrder + 1; o = o.nextInPosOrder; } } // Build the final result array. Span[] outSpans = new Span[nDeduped]; int rank = 0; float prevScore = Float.MAX_VALUE; int i = 0; for (o = firstDeduped; o != null; o = o.nextDeduped) { assert !o.cancelled : "kept span was cancelled"; Span s = (Span)o.span.clone(); assert s.score <= prevScore : "incorrect dedupe list linking"; if (rank == nDeduped - 1) assert o == lastDeduped; prevScore = s.score; s.rank = rank++; outSpans[i++] = s; } assert rank == nDeduped : "incorrect dedupe list linking"; // Apply a final sort by position. Arrays.sort(outSpans, SpanPosComparator.theInstance); // And output it. out.recordSpans(field, totalDeduped, outSpans, terms); } // deduplicate() /** Keeps track of the next and previous spans, in score order */ private class ScoreOrder { Span span; int posOrder; boolean cancelled; ScoreOrder nextInPosOrder; ScoreOrder prevInPosOrder; ScoreOrder nextDeduped; } /** Used to sort spans by descending score, then by position */ private static class ScoreComparator implements Comparator { public int compare(Object o1, Object o2) { Span s1 = ((ScoreOrder)o1).span; Span s2 = ((ScoreOrder)o2).span; if (s1.score < s2.score) return 1; if (s1.score > s2.score) return -1; if (s1.start == s2.start) return s2.end - s1.end; // If overlapping, sort longer spans first. return s1.start - s2.start; } } private static ScoreComparator theScoreComparator = new ScoreComparator(); } // class FieldSpanSource