package org.cdlib.xtf.textEngine;
/**
* Copyright (c) 2004, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
/**
 * Just like a SpanNearQuery with slop set to zero, except that it also looks
 * for the special 'start-of-field' and 'end-of-field' tokens inserted by the
 * text indexer. Thus, it will match either the entire field, or none of it.
 *
 * The start and end markers may be indexed either attached to the first/last
 * word of the field or as separate tokens (when punctuation intervenes), so
 * the first and last clauses are expanded to cover both possibilities when
 * the spans are generated.
 *
 * @author Martin Haye
 */
public class SpanExactQuery extends SpanQuery
{
  /** Marker state: the marker does not take part in the clause at all. */
  private static final int MARKER_ABSENT = 0;

  /** Marker state: the marker is matched as a separate, adjacent term. */
  private static final int MARKER_DETACHED = 1;

  /** Marker state: the marker is glued onto the text of the term itself. */
  private static final int MARKER_ATTACHED = 2;

  /** The clauses to match (not including the special start and end tokens) */
  private SpanQuery[] clauses;

  /**
   * Construct an exact query on a set of clauses.
   *
   * @param clauses   Clauses to match. The array is stored as-is (not
   *                  copied), and must contain at least one clause.
   * @throws IllegalArgumentException if <code>clauses</code> is null or empty
   */
  public SpanExactQuery(SpanQuery[] clauses)
  {
    // Must have at least one clause to work.
    if (clauses == null || clauses.length == 0) {
      throw new IllegalArgumentException(
        "SpanExactQuery requires at least one clause");
    }

    // Record the input parms
    this.clauses = clauses;
  } // constructor

  /**
   * Rewrite each clause against the given reader.
   *
   * @param reader    Reader passed along to each clause's rewrite() method
   * @return          This very query if no clause changed when rewritten;
   *                  otherwise a clone of it holding the rewritten clauses.
   */
  public Query rewrite(IndexReader reader)
    throws IOException
  {
    SpanQuery[] rewritten = new SpanQuery[clauses.length];
    boolean anyChanged = false;
    for (int i = 0; i < clauses.length; i++) {
      rewritten[i] = (SpanQuery)clauses[i].rewrite(reader);
      if (rewritten[i] != clauses[i])
        anyChanged = true;
    }

    // Identity matters here: callers can tell "nothing changed" by getting
    // the same object back.
    //
    if (!anyChanged)
      return this;

    SpanExactQuery clone = (SpanExactQuery)this.clone();
    clone.clauses = rewritten;
    return clone;
  }

  /** Return the clauses whose spans are matched. */
  public SpanQuery[] getClauses() {
    return clauses;
  }

  /** Return all the sub-queries (clauses in our case) */
  public Query[] getSubQueries() {
    return clauses;
  }

  /**
   * Iterate all the spans where the clauses occur as an exact, in-order
   * phrase bounded by the start-of-field and end-of-field markers.
   *
   * The first and last clauses are expanded into alternatives covering
   * attached and detached markers; clauses in the middle are used unchanged.
   * Everything is then combined into a zero-slop SpanNearQuery, which
   * receives this query's span recording setting and supplies the spans.
   *
   * @param reader    Reader handed to the generated near query
   * @param searcher  Searcher handed to the generated near query
   * @return          Spans produced by the generated near query
   * @throws UnsupportedOperationException if any clause is not a
   *                  SpanTermQuery
   */
  public Spans getSpans(final IndexReader reader, final Searcher searcher)
    throws IOException
  {
    SpanQuery[] expanded = new SpanQuery[clauses.length];
    for (int i = 0; i < clauses.length; i++)
    {
      if (!(clauses[i] instanceof SpanTermQuery)) {
        throw new UnsupportedOperationException(
          "Exact queries only support plain terms");
      }

      // We only fool with the first and last clauses (which may be one and
      // the same, for a single-term query.)
      //
      boolean isFirst = (i == 0);
      boolean isLast = (i == clauses.length - 1);
      if (isFirst || isLast) {
        expanded[i] = expandBoundaryClause((SpanTermQuery)clauses[i],
                                           isFirst,
                                           isLast);
      }
      else
        expanded[i] = clauses[i];
    } // for i

    // And make a near query out of the whole thing.
    SpanQuery q = new SpanNearQuery(expanded, 0, true);
    q.setSpanRecording(getSpanRecording());

    // Return the spans from the expanded query.
    return q.getSpans(reader, searcher);
  }

  /**
   * Expand a first and/or last clause into every legal combination of
   * attached and detached field markers.
   *
   * We may need to OR up to four clauses together. Why? Consider this data:
   *
   *    &lt;subject&gt;(foo)&lt;/subject&gt;
   *
   * This will be indexed with start/end markers like this:
   *
   *    ?(foo)?     (where ? represents the markers)
   *
   * Normally our exact query would just look for ?foo?, but that won't match
   * the above data because the parentheses force the markers to be indexed
   * as individual terms rather than as part of the word "foo". But we really
   * want to be able to get an exact match on data that has punctuation (and
   * on data that doesn't.) So we form a query that looks like this:
   *
   *    OR("?foo?", "? foo?", "?foo ?", "? foo ?")
   *
   * Note the spacing. This basically queries for all combinations of
   * attached or detached markers. In the case of a one-term exact query, you
   * end up with four clauses in the OR as above. But in the case of multiple
   * terms, like a query for "cat bar flu" you would get:
   *
   *    PHRASE(OR("?cat", "? cat"), "bar", OR("flu?", "flu ?"))
   *
   * @param oldClause   The original first and/or last clause
   * @param isFirst     true if the clause is the first one in the query
   * @param isLast      true if the clause is the last one in the query
   * @return            An OR of all the alternatives (or the lone
   *                    alternative itself, should there be only one)
   */
  private static SpanQuery expandBoundaryClause(SpanTermQuery oldClause,
                                                boolean isFirst,
                                                boolean isLast)
  {
    String field = oldClause.getTerm().field();

    // Stand-alone marker terms, shared by every alternative that needs one.
    SpanQuery detachedStartQuery = new SpanTermQuery(new Term(
                                                       field,
                                                       "" +
                                                       Constants.FIELD_START_MARKER));
    SpanQuery detachedEndQuery = new SpanTermQuery(new Term(
                                                     field,
                                                     "" +
                                                     Constants.FIELD_END_MARKER));

    List orClauses = new ArrayList();

    // We may need the start marker: absent, detached, or attached.
    for (int startAtt = MARKER_ABSENT; startAtt <= MARKER_ATTACHED; ++startAtt)
    {
      // It can only be absent if this isn't the first token.
      // Likewise, it can only be present if this is the first.
      //
      boolean startPresent = (startAtt != MARKER_ABSENT);
      if (startPresent != isFirst)
        continue;

      // Okay, we have the same three choices for the end marker.
      for (int endAtt = MARKER_ABSENT; endAtt <= MARKER_ATTACHED; ++endAtt)
      {
        // It can only be absent if this isn't the last token.
        // Likewise, it can only be present if this is the last.
        //
        boolean endPresent = (endAtt != MARKER_ABSENT);
        if (endPresent != isLast)
          continue;

        orClauses.add(
          buildMarkerVariant(oldClause,
                             startAtt,
                             endAtt,
                             detachedStartQuery,
                             detachedEndQuery));
      } // for endAtt
    } // for startAtt

    // If only one clause, skip the OR
    if (orClauses.size() == 1)
      return (SpanQuery)orClauses.get(0);

    // Make an OR to stick them together
    return new SpanOrQuery(
      (SpanQuery[])orClauses.toArray(new SpanQuery[orClauses.size()]));
  }

  /**
   * Build one alternative of a boundary clause: the term (possibly with
   * markers attached to its text), optionally preceded and/or followed by a
   * detached marker term. The new term keeps the original clause's term
   * length and boost.
   *
   * @param oldClause           The original clause
   * @param startAtt            State of the start marker (one of the
   *                            MARKER_ constants)
   * @param endAtt              State of the end marker (one of the MARKER_
   *                            constants)
   * @param detachedStartQuery  Query to use for a detached start marker
   * @param detachedEndQuery    Query to use for a detached end marker
   * @return                    The term query alone if no detached markers
   *                            are involved; otherwise an in-order,
   *                            zero-slop phrase of two or three terms.
   */
  private static SpanQuery buildMarkerVariant(SpanTermQuery oldClause,
                                              int startAtt,
                                              int endAtt,
                                              SpanQuery detachedStartQuery,
                                              SpanQuery detachedEndQuery)
  {
    // We'll form a phrase of up to three terms.
    List phraseClauses = new ArrayList(3);

    // First, a detached start marker.
    if (startAtt == MARKER_DETACHED)
      phraseClauses.add(detachedStartQuery);

    // Next, the term itself with or without attached markers.
    String newTerm = oldClause.getTerm().text();
    if (startAtt == MARKER_ATTACHED)
      newTerm = Constants.FIELD_START_MARKER + newTerm;
    if (endAtt == MARKER_ATTACHED)
      newTerm = newTerm + Constants.FIELD_END_MARKER;
    SpanQuery newClause = new SpanTermQuery(
      new Term(oldClause.getTerm().field(), newTerm),
      oldClause.getTermLength());
    newClause.setBoost(oldClause.getBoost());
    phraseClauses.add(newClause);

    // Finally, a detached end marker.
    if (endAtt == MARKER_DETACHED)
      phraseClauses.add(detachedEndQuery);

    // If only one term, skip the phrase query.
    int nTerms = phraseClauses.size();
    if (nTerms == 1)
      return newClause;

    return new SpanNearQuery(
      (SpanQuery[])phraseClauses.toArray(new SpanQuery[nTerms]),
      0,
      true);
  }

  /** Return the field being queried, as reported by the first clause. */
  public String getField() {
    return clauses[0].getField();
  }

  /**
   * Return the terms of all the clauses, gathered in clause order. Terms
   * for the start and end markers are not included.
   */
  public Collection getTerms()
  {
    Collection terms = new ArrayList();
    for (int i = 0; i < clauses.length; i++)
      terms.addAll(clauses[i].getTerms());
    return terms;
  }

  /**
   * Form a string of the form "spanExact(clause1, clause2, ...)".
   *
   * @param field   Field name passed through to each clause's toString()
   */
  public String toString(String field)
  {
    StringBuffer buffer = new StringBuffer();
    buffer.append("spanExact(");
    for (int i = 0; i < clauses.length; i++) {
      if (i > 0)
        buffer.append(", ");
      buffer.append(clauses[i].toString(field));
    }
    buffer.append(")");
    return buffer.toString();
  }
} // class SpanExactQuery