package org.apache.lucene.chunk;

/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Collection;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.Spans;

/**
 * Wraps a SpanQuery, converting chunk spans to look like they're all part
 * of the main document. Uses a {@link DocNumMap} to map chunks back to the
 * main document. The start and end offsets in each span are then increased
 * by the chunk's index within the document times the non-overlapping word
 * count per chunk.<p>
 *
 * For instance, say that main document 812 has chunks 813-945, and that
 * the chunk size is 125 words with an overlap of 25 (so each chunk
 * contributes 100 new words). Spans in chunk 813 will have their offsets
 * unchanged; those in 814 will be shifted up by 100, those in 815 by 200,
 * and so on.<p>
 *
 * <b>Warning:</b> The spans that result from this query might not be in
 * strict start/end order. Thus, this query is only suitable as a top-level
 * span query, and <u>never</u> as part of another span query.
 */
public class SpanDechunkingQuery extends SpanQuery {
  private SpanQuery wrapped;
  private DocNumMap docNumMap;

  /** Construct a query that will mark the results of a normal span query. */
  public SpanDechunkingQuery(SpanQuery wrap) {
    this.wrapped = wrap;
  }

  /** Establish a document number map. Must be called before getSpans(). */
  public void setDocNumMap(DocNumMap docNumMap) {
    this.docNumMap = docNumMap;
  }

  /** Retrieve the SpanQuery being wrapped. */
  public SpanQuery getWrapped() {
    return wrapped;
  }

  // inherit javadoc
  public String getField() {
    return wrapped.getField();
  }

  // inherit javadoc
  public Collection getTerms() {
    return wrapped.getTerms();
  }

  // inherit javadoc
  public Query[] getSubQueries() {
    Query[] result = new Query[1];
    result[0] = wrapped;
    return result;
  }

  // inherit javadoc
  public Query rewrite(IndexReader reader) throws IOException {
    SpanQuery rewrittenWrapped = (SpanQuery)wrapped.rewrite(reader);
    if (rewrittenWrapped == wrapped)
      return this;
    SpanDechunkingQuery clone = (SpanDechunkingQuery)this.clone();
    clone.wrapped = rewrittenWrapped;
    return clone;
  }

  // inherit javadoc
  public String toString(String field) {
    StringBuffer buffer = new StringBuffer();
    buffer.append("spanDechunk(");
    buffer.append(wrapped.toString(field));
    buffer.append(", spanRecording=");
    buffer.append(getSpanRecording());
    buffer.append(")");
    return buffer.toString();
  }
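  // A minimal usage sketch (illustrative only): it assumes an existing
  // DocNumMap implementation over a chunked index, plus an open IndexReader
  // and Searcher. "docNumMap", "reader" and "searcher" are placeholder
  // names, not members of this class.
  //
  //   SpanQuery base = new SpanTermQuery(new Term("text", "apache"));
  //   SpanDechunkingQuery dechunk = new SpanDechunkingQuery(base);
  //   dechunk.setDocNumMap(docNumMap); // required before getSpans()
  //   Spans spans = dechunk.getSpans(reader, searcher);
  //   while (spans.next()) {
  //     // doc() is the main document; start()/end() are dechunked offsets.
  //     System.out.println(spans.doc() + ": " +
  //                        spans.start() + "-" + spans.end());
  //   }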
  // inherit javadoc
  public Spans getSpans(final IndexReader reader, final Searcher searcher)
    throws IOException
  {
    // Record some parameters for handy access.
    final int chunkSize = docNumMap.getChunkSize();
    final int chunkBump = chunkSize - docNumMap.getChunkOverlap();
    final Similarity similarity = wrapped.getSimilarity(searcher);

    return new Spans() {
      private Spans spans = wrapped.getSpans(reader, searcher);
      private int   firstChunk = -1;
      private int   lastChunk  = -1;
      private int   mainDoc    = -1;
      private float lengthNorm = 0.0f;
      private int   chunkOffset;

      public boolean next() throws IOException {
        if (!spans.next())
          return false;
        update();
        return true;
      }

      private void update() {
        // See if we have started a new main document.
        int chunk = spans.doc();
        if (chunk > lastChunk) {
          // Get params for the new main doc.
          mainDoc = docNumMap.getDocNum(chunk);
          if (mainDoc >= 0) {
            firstChunk = docNumMap.getFirstChunk(mainDoc);
            lastChunk  = docNumMap.getLastChunk(mainDoc);
            lengthNorm = similarity.lengthNorm(wrapped.getField(),
                                               (lastChunk + 1) - firstChunk);
          }
          else {
            // This can occur if we start reading an index that contains a
            // partially complete document (especially likely with large
            // documents). Just suppress the spans by setting lengthNorm to
            // 0, which will result in zero scores.
            //
            lengthNorm = 0.0f;
            firstChunk = lastChunk = -1;
          }
        }

        // Now calculate an appropriate offset for the current chunk. For
        // example, with chunkSize=125 and overlap=25 (chunkBump=100), the
        // third chunk of a document gets offset 2 * 100 = 200.
        chunkOffset = (chunk - firstChunk) * chunkBump;
      }

      public boolean skipTo(int target) throws IOException {
        // Translate the target document to its first chunk, falling back
        // to the target itself if no mapping is found.
        int first = docNumMap.getFirstChunk(target);
        if (first < 0 || target < first)
          first = target;
        if (!spans.skipTo(first))
          return false;
        update();
        return true;
      }

      public int doc() {
        return mainDoc;
      }

      public int start() {
        int start = spans.start();
        assert start >= 0 && start < chunkSize;
        return start + chunkOffset;
      }

      public int end() {
        int end = spans.end();
        assert end > 0 && end <= chunkSize;
        return end + chunkOffset;
      }

      public float score() {
        return spans.score() * lengthNorm * getBoost();
      }

      public String toString() {
        return "spans(" + SpanDechunkingQuery.this.toString() + ")";
      }

      public Explanation explain() throws IOException {
        Explanation result = new Explanation(0,
          "weight(" + toString() + "), product of:");

        Explanation boostExpl = new Explanation(getBoost(), "boost");
        if (boostExpl.getValue() != 1.0f)
          result.addDetail(boostExpl);

        Explanation lengthExpl = new Explanation(lengthNorm, "lengthNorm");
        result.addDetail(lengthExpl);

        Explanation inclExpl = spans.explain();
        result.addDetail(inclExpl);

        result.setValue(boostExpl.getValue() *
                        lengthExpl.getValue() *
                        inclExpl.getValue());
        return result;
      }
    };
  }
}