package org.apache.lucene.search.spans; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Collection; import java.util.Set; /** A Spans that is formed from the ordered subspans of a SpanNearQuery * where the subspans do not overlap and have a maximum slop between them. * <p> * The formed spans only contains minimum slop matches.<br> * The matching slop is computed from the distance(s) between * the non overlapping matching Spans.<br> * Successive matches are always formed from the successive Spans * of the SpanNearQuery. * <p> * The formed spans may contain overlaps when the slop is at least 1. * For example, when querying using * <pre>t1 t2 t3</pre> * with slop at least 1, the fragment: * <pre>t1 t2 t1 t3 t2 t3</pre> * matches twice: * <pre>t1 t2 .. t3 </pre> * <pre> t1 .. t2 t3</pre> * * * Expert: * Only public for subclassing. Most implementations should not need this class */ public class NearSpansOrdered extends Spans { private final int allowedSlop; private boolean firstTime = true; private boolean more = false; /** The spans in the same order as the SpanNearQuery */ private final Spans[] subSpans; /** Indicates that all subSpans have same doc() */ private boolean inSameDoc = false; private int matchDoc = -1; private int matchStart = -1; private int matchEnd = -1; private List<byte[]> matchPayload; private final Spans[] subSpansByDoc; private final Comparator<Spans> spanDocComparator = new Comparator<Spans>() { public int compare(Spans o1, Spans o2) { return o1.doc() - o2.doc(); } }; private SpanNearQuery query; private boolean collectPayloads = true; public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader) throws IOException { this(spanNearQuery, reader, true); } public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader, boolean collectPayloads) throws IOException { if (spanNearQuery.getClauses().length < 2) { throw new IllegalArgumentException("Less than 2 clauses: " + spanNearQuery); } this.collectPayloads = collectPayloads; allowedSlop = spanNearQuery.getSlop(); SpanQuery[] clauses = spanNearQuery.getClauses(); subSpans = new Spans[clauses.length]; matchPayload = new LinkedList<byte[]>(); subSpansByDoc = new Spans[clauses.length]; for (int i = 0; i < clauses.length; i++) { subSpans[i] = clauses[i].getSpans(reader); subSpansByDoc[i] = subSpans[i]; // used in toSameDoc() } query = spanNearQuery; // kept for toString() only. } // inherit javadocs @Override public int doc() { return matchDoc; } // inherit javadocs @Override public int start() { return matchStart; } // inherit javadocs @Override public int end() { return matchEnd; } public Spans[] getSubSpans() { return subSpans; } // TODO: Remove warning after API has been finalized // TODO: Would be nice to be able to lazy load payloads @Override public Collection<byte[]> getPayload() throws IOException { return matchPayload; } // TODO: Remove warning after API has been finalized @Override public boolean isPayloadAvailable() { return matchPayload.isEmpty() == false; } // inherit javadocs @Override public boolean next() throws IOException { if (firstTime) { firstTime = false; for (int i = 0; i < subSpans.length; i++) { if (! subSpans[i].next()) { more = false; return false; } } more = true; } if(collectPayloads) { matchPayload.clear(); } return advanceAfterOrdered(); } // inherit javadocs @Override public boolean skipTo(int target) throws IOException { if (firstTime) { firstTime = false; for (int i = 0; i < subSpans.length; i++) { if (! subSpans[i].skipTo(target)) { more = false; return false; } } more = true; } else if (more && (subSpans[0].doc() < target)) { if (subSpans[0].skipTo(target)) { inSameDoc = false; } else { more = false; return false; } } if(collectPayloads) { matchPayload.clear(); } return advanceAfterOrdered(); } /** Advances the subSpans to just after an ordered match with a minimum slop * that is smaller than the slop allowed by the SpanNearQuery. * @return true iff there is such a match. */ private boolean advanceAfterOrdered() throws IOException { while (more && (inSameDoc || toSameDoc())) { if (stretchToOrder() && shrinkToAfterShortestMatch()) { return true; } } return false; // no more matches } /** Advance the subSpans to the same document */ private boolean toSameDoc() throws IOException { Arrays.sort(subSpansByDoc, spanDocComparator); int firstIndex = 0; int maxDoc = subSpansByDoc[subSpansByDoc.length - 1].doc(); while (subSpansByDoc[firstIndex].doc() != maxDoc) { if (! subSpansByDoc[firstIndex].skipTo(maxDoc)) { more = false; inSameDoc = false; return false; } maxDoc = subSpansByDoc[firstIndex].doc(); if (++firstIndex == subSpansByDoc.length) { firstIndex = 0; } } for (int i = 0; i < subSpansByDoc.length; i++) { assert (subSpansByDoc[i].doc() == maxDoc) : " NearSpansOrdered.toSameDoc() spans " + subSpansByDoc[0] + "\n at doc " + subSpansByDoc[i].doc() + ", but should be at " + maxDoc; } inSameDoc = true; return true; } /** Check whether two Spans in the same document are ordered. * @param spans1 * @param spans2 * @return true iff spans1 starts before spans2 * or the spans start at the same position, * and spans1 ends before spans2. */ static final boolean docSpansOrdered(Spans spans1, Spans spans2) { assert spans1.doc() == spans2.doc() : "doc1 " + spans1.doc() + " != doc2 " + spans2.doc(); int start1 = spans1.start(); int start2 = spans2.start(); /* Do not call docSpansOrdered(int,int,int,int) to avoid invoking .end() : */ return (start1 == start2) ? (spans1.end() < spans2.end()) : (start1 < start2); } /** Like {@link #docSpansOrdered(Spans,Spans)}, but use the spans * starts and ends as parameters. */ private static final boolean docSpansOrdered(int start1, int end1, int start2, int end2) { return (start1 == start2) ? (end1 < end2) : (start1 < start2); } /** Order the subSpans within the same document by advancing all later spans * after the previous one. */ private boolean stretchToOrder() throws IOException { matchDoc = subSpans[0].doc(); for (int i = 1; inSameDoc && (i < subSpans.length); i++) { while (! docSpansOrdered(subSpans[i-1], subSpans[i])) { if (! subSpans[i].next()) { inSameDoc = false; more = false; break; } else if (matchDoc != subSpans[i].doc()) { inSameDoc = false; break; } } } return inSameDoc; } /** The subSpans are ordered in the same doc, so there is a possible match. * Compute the slop while making the match as short as possible by advancing * all subSpans except the last one in reverse order. */ private boolean shrinkToAfterShortestMatch() throws IOException { matchStart = subSpans[subSpans.length - 1].start(); matchEnd = subSpans[subSpans.length - 1].end(); Set<byte[]> possibleMatchPayloads = new HashSet<byte[]>(); if (subSpans[subSpans.length - 1].isPayloadAvailable()) { possibleMatchPayloads.addAll(subSpans[subSpans.length - 1].getPayload()); } Collection<byte[]> possiblePayload = null; int matchSlop = 0; int lastStart = matchStart; int lastEnd = matchEnd; for (int i = subSpans.length - 2; i >= 0; i--) { Spans prevSpans = subSpans[i]; if (collectPayloads && prevSpans.isPayloadAvailable()) { Collection<byte[]> payload = prevSpans.getPayload(); possiblePayload = new ArrayList<byte[]>(payload.size()); possiblePayload.addAll(payload); } int prevStart = prevSpans.start(); int prevEnd = prevSpans.end(); while (true) { // Advance prevSpans until after (lastStart, lastEnd) if (! prevSpans.next()) { inSameDoc = false; more = false; break; // Check remaining subSpans for final match. } else if (matchDoc != prevSpans.doc()) { inSameDoc = false; // The last subSpans is not advanced here. break; // Check remaining subSpans for last match in this document. } else { int ppStart = prevSpans.start(); int ppEnd = prevSpans.end(); // Cannot avoid invoking .end() if (! docSpansOrdered(ppStart, ppEnd, lastStart, lastEnd)) { break; // Check remaining subSpans. } else { // prevSpans still before (lastStart, lastEnd) prevStart = ppStart; prevEnd = ppEnd; if (collectPayloads && prevSpans.isPayloadAvailable()) { Collection<byte[]> payload = prevSpans.getPayload(); possiblePayload = new ArrayList<byte[]>(payload.size()); possiblePayload.addAll(payload); } } } } if (collectPayloads && possiblePayload != null) { possibleMatchPayloads.addAll(possiblePayload); } assert prevStart <= matchStart; if (matchStart > prevEnd) { // Only non overlapping spans add to slop. matchSlop += (matchStart - prevEnd); } /* Do not break on (matchSlop > allowedSlop) here to make sure * that subSpans[0] is advanced after the match, if any. */ matchStart = prevStart; lastStart = prevStart; lastEnd = prevEnd; } boolean match = matchSlop <= allowedSlop; if(collectPayloads && match && possibleMatchPayloads.size() > 0) { matchPayload.addAll(possibleMatchPayloads); } return match; // ordered and allowed slop } @Override public String toString() { return getClass().getName() + "("+query.toString()+")@"+ (firstTime?"START":(more?(doc()+":"+start()+"-"+end()):"END")); } }