/**
* Copyright 2014 National University of Ireland, Galway.
*
* This file is part of the SIREn project. Project and contact information:
*
* https://github.com/rdelbru/SIREn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sindice.siren.search.node;
import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.AttributeSource;
import org.sindice.siren.search.node.NodeBooleanClause.Occur;
/**
* An abstract {@link NodePrimitiveQuery} that matches documents
* containing a subset of terms provided by a {@link
* FilteredTermsEnum} enumeration.
*
* <p>This query cannot be used directly; you must subclass
* it and define {@link #getTermsEnum(Terms,AttributeSource)} to provide a {@link
* FilteredTermsEnum} that iterates through the terms to be
* matched.
*
* <p><b>NOTE</b>: since {@link #CONSTANT_SCORE_FILTER_REWRITE} is deactivated
* in SIREn, the {@link #setRewriteMethod} will always be either
* {@link #CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE} or {@link
* #SCORING_BOOLEAN_QUERY_REWRITE}. Therefore, you may encounter a
* {@link NodeBooleanQuery.TooManyClauses} exception during
* searching, which happens when the number of terms to be
* searched exceeds {@link NodeBooleanQuery#getMaxClauseCount()}.
*
* <p>The recommended rewrite method is {@link
* #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}: it doesn't spend CPU
* computing unhelpful scores, and it tries to pick the most
* performant rewrite method given the query. If you
* need scoring (like {@link NodeFuzzyQuery}), use
* {@link TopTermsScoringNodeBooleanQueryRewrite}, which uses
* a priority queue to only collect competitive terms
* and does not hit this limitation.
*
* Note that {@link QueryParser} produces
* {@link MultiNodeTermQuery} instances using
* {@link #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} by default.
*
* <p>
* Code taken from {@link MultiTermQuery} and adapted for SIREn.
*/
public abstract class MultiNodeTermQuery extends NodePrimitiveQuery {

  /** Name of the field this query is executed against. */
  protected final String field;

  /**
   * Strategy used by {@link #rewrite(IndexReader)} to expand this query.
   * Defaults to {@link #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}.
   */
  protected RewriteMethod rewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;

  /** Abstract class that defines how the query is rewritten. */
  public static abstract class RewriteMethod {

    /**
     * Rewrites the given multi-term query into another query.
     *
     * @param reader the index reader the query is rewritten against
     * @param query the query to rewrite
     * @return the rewritten query
     * @throws IOException if the rewrite fails while accessing the index
     */
    public abstract Query rewrite(IndexReader reader, MultiNodeTermQuery query)
    throws IOException;

    /**
     * Returns the {@link MultiNodeTermQuery}s {@link TermsEnum}
     * @see MultiNodeTermQuery#getTermsEnum(Terms, AttributeSource)
     */
    protected TermsEnum getTermsEnum(final MultiNodeTermQuery query, final Terms terms,
                                     final AttributeSource atts)
    throws IOException {
      // allow RewriteMethod subclasses to pull a TermsEnum from the MTQ
      return query.getTermsEnum(terms, atts);
    }

  }

  /**
   * Rewrite method currently deactivated in SIREn: calling
   * {@link RewriteMethod#rewrite} on it always throws an
   * {@link UnsupportedOperationException}.
   *
   * <p> A rewrite method that first creates a private Filter,
   * by visiting each term in sequence and marking all docs
   * for that term. Matching documents are assigned a
   * constant score equal to the query's boost.
   *
   * <p> This method is faster than the BooleanQuery
   * rewrite methods when the number of matched terms or
   * matched documents is non-trivial. Also, it will never
   * hit an errant {@link NodeBooleanQuery.TooManyClauses}
   * exception.
   *
   * @see #setRewriteMethod
   **/
  public static final RewriteMethod CONSTANT_SCORE_FILTER_REWRITE = new RewriteMethod() {
    @Override
    public Query rewrite(final IndexReader reader, final MultiNodeTermQuery query) {
      // TODO: Reactivate filter-based approach when a correct implementation of
      // SirenMultiTermQueryWrapperFilter is found.
      // Query result = new ConstantScoreQuery(new SirenMultiTermQueryWrapperFilter<MultiTermQuery>(query));
      // result.setBoost(query.getBoost());
      // return result;
      throw new UnsupportedOperationException("Filter-based rewrite method " +
                                              "is currently deactivated");
    }
  };

  /**
   * A rewrite method that first translates each term into
   * {@link NodeBooleanClause.Occur#SHOULD} clause in a
   * {@link NodeBooleanQuery}, and keeps the scores as computed by the
   * query. Note that typically such scores are
   * meaningless to the user, and require non-trivial CPU
   * to compute, so it's almost always better to use {@link
   * #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} instead.
   *
   * <p><b>NOTE</b>: This rewrite method will hit {@link
   * NodeBooleanQuery.TooManyClauses} if the number of terms
   * exceeds {@link NodeBooleanQuery#getMaxClauseCount}.
   *
   * @see #setRewriteMethod
   **/
  public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = NodeScoringRewrite.SCORING_BOOLEAN_QUERY_REWRITE;

  /**
   * Like {@link #SCORING_BOOLEAN_QUERY_REWRITE} except
   * scores are not computed. Instead, each matching
   * document receives a constant score equal to the
   * query's boost.
   *
   * <p><b>NOTE</b>: This rewrite method will hit {@link
   * NodeBooleanQuery.TooManyClauses} if the number of terms
   * exceeds {@link NodeBooleanQuery#getMaxClauseCount}.
   *
   * @see #setRewriteMethod
   **/
  public final static RewriteMethod CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE = NodeScoringRewrite.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE;

  /**
   * A rewrite method that first translates each term into
   * {@link NodeBooleanClause.Occur#SHOULD} clause in a {@link NodeBooleanQuery},
   * and keeps the scores as computed by the query.
   *
   * <p>
   * This rewrite method only uses the top scoring terms so it will not overflow
   * the boolean max clause count. It is the default rewrite method for
   * {@link FuzzyQuery}.
   *
   * @see #setRewriteMethod
   */
  static final class TopTermsScoringNodeBooleanQueryRewrite extends TopNodeTermsRewrite<NodeBooleanQuery> {

    /**
     * Create a {@link TopTermsScoringNodeBooleanQueryRewrite} for
     * at most <code>size</code> terms.
     * <p>
     * NOTE: if {@link NodeBooleanQuery#getMaxClauseCount} is smaller than
     * <code>size</code>, then it will be used instead.
     */
    public TopTermsScoringNodeBooleanQueryRewrite(final int size) {
      super(size);
    }

    /** Returns {@link NodeBooleanQuery#getMaxClauseCount()}. */
    @Override
    protected int getMaxSize() {
      return NodeBooleanQuery.getMaxClauseCount();
    }

    /** Returns a new, empty {@link NodeBooleanQuery} to which clauses are added. */
    @Override
    protected NodeBooleanQuery getTopLevelQuery() {
      return new NodeBooleanQuery();
    }

    /**
     * Adds the term as a boosted {@link NodeTermQuery} with
     * {@link NodeBooleanClause.Occur#SHOULD} to the top level query.
     * The <code>docCount</code> argument is not used by this implementation.
     */
    @Override
    protected void addClause(final NodeBooleanQuery topLevel, final Term term,
                             final int docCount, final float boost,
                             final TermContext states) {
      final NodeTermQuery tq = new NodeTermQuery(term, states);
      tq.setBoost(boost);
      topLevel.add(tq, Occur.SHOULD);
    }

  }

  /**
   * A rewrite method that first translates each term into
   * {@link NodeBooleanClause.Occur#SHOULD} clause in a {@link NodeBooleanQuery},
   * but the scores are only computed as the boost.
   * <p>
   * This rewrite method only uses the top scoring terms so it will not overflow
   * the boolean max clause count.
   *
   * @see #setRewriteMethod
   */
  static final class TopTermsBoostOnlyNodeBooleanQueryRewrite extends TopNodeTermsRewrite<NodeBooleanQuery> {

    /**
     * Create a {@link TopTermsBoostOnlyNodeBooleanQueryRewrite} for
     * at most <code>size</code> terms.
     * <p>
     * NOTE: if {@link NodeBooleanQuery#getMaxClauseCount} is smaller than
     * <code>size</code>, then it will be used instead.
     */
    public TopTermsBoostOnlyNodeBooleanQueryRewrite(final int size) {
      super(size);
    }

    /** Returns {@link NodeBooleanQuery#getMaxClauseCount()}. */
    @Override
    protected int getMaxSize() {
      return NodeBooleanQuery.getMaxClauseCount();
    }

    /** Returns a new, empty {@link NodeBooleanQuery} to which clauses are added. */
    @Override
    protected NodeBooleanQuery getTopLevelQuery() {
      return new NodeBooleanQuery();
    }

    /**
     * Adds the term, wrapped into a boosted {@link NodeConstantScoreQuery}, with
     * {@link NodeBooleanClause.Occur#SHOULD} to the top level query.
     * The <code>docFreq</code> argument is not used by this implementation.
     */
    @Override
    protected void addClause(final NodeBooleanQuery topLevel, final Term term,
                             final int docFreq, final float boost,
                             final TermContext states) {
      final NodePrimitiveQuery q = new NodeConstantScoreQuery(new NodeTermQuery(term, states));
      q.setBoost(boost);
      topLevel.add(q, Occur.SHOULD);
    }

  }

  /**
   * A rewrite method that tries to pick the best
   * constant-score rewrite method based on term and
   * document counts from the query. If both the number of
   * terms and documents is small enough, then {@link
   * #CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE} is used.
   * Otherwise, {@link #CONSTANT_SCORE_FILTER_REWRITE} is
   * used.
   *
   * <p> The {@link #CONSTANT_SCORE_FILTER_REWRITE} method is currently
   * deactivated in SIREn.
   */
  static class NodeConstantScoreAutoRewrite extends org.sindice.siren.search.node.NodeConstantScoreAutoRewrite {}

  /**
   * Read-only default instance of {@link
   * NodeConstantScoreAutoRewrite}, with {@link
   * NodeConstantScoreAutoRewrite#setTermCountCutoff} set to
   * {@link
   * NodeConstantScoreAutoRewrite#DEFAULT_TERM_COUNT_CUTOFF}
   * and {@link
   * NodeConstantScoreAutoRewrite#setDocCountPercent} set to
   * {@link
   * NodeConstantScoreAutoRewrite#DEFAULT_DOC_COUNT_PERCENT}.
   * Note that you cannot alter the configuration of this
   * instance; you'll need to create a private instance
   * instead.
   **/
  public final static RewriteMethod CONSTANT_SCORE_AUTO_REWRITE_DEFAULT = new NodeConstantScoreAutoRewrite() {
    @Override
    public void setTermCountCutoff(final int count) {
      throw new UnsupportedOperationException("Please create a private instance");
    }

    @Override
    public void setDocCountPercent(final double percent) {
      throw new UnsupportedOperationException("Please create a private instance");
    }
  };

  /**
   * Constructs a query matching terms that cannot be represented with a single
   * Term.
   *
   * @param field the name of the field to query; expected to be non-null
   *              (only enforced when assertions are enabled)
   */
  public MultiNodeTermQuery(final String field) {
    this.field = field;
    assert field != null;
  }

  /** Returns the field name for this query */
  public final String getField() { return field; }

  /**
   * Construct the enumeration to be used, expanding the
   * pattern term. This method should only be called if
   * the field exists (ie, implementations can assume the
   * field does exist). This method should not return null
   * (should instead return {@link TermsEnum#EMPTY} if no
   * terms match). The TermsEnum must already be
   * positioned to the first matching term.
   * The given {@link AttributeSource} is passed by the {@link RewriteMethod} to
   * provide attributes, the rewrite method uses to inform about e.g. maximum competitive boosts.
   * This is currently only used by {@link TopNodeTermsRewrite}
   */
  protected abstract TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException;

  /**
   * Convenience method, if no attributes are needed:
   * This simply passes empty attributes and is equal to:
   * <code>getTermsEnum(terms, new AttributeSource())</code>
   */
  protected final TermsEnum getTermsEnum(final Terms terms) throws IOException {
    return this.getTermsEnum(terms, new AttributeSource());
  }

  /**
   * Rewrites this query by delegating to the configured {@link RewriteMethod}.
   *
   * @see #setRewriteMethod
   */
  @Override
  public Query rewrite(final IndexReader reader) throws IOException {
    return rewriteMethod.rewrite(reader, this);
  }

  /**
   * @see #setRewriteMethod
   */
  public RewriteMethod getRewriteMethod() {
    return rewriteMethod;
  }

  /**
   * Sets the rewrite method to be used when executing the
   * query. You can use one of the four core methods, or
   * implement your own subclass of {@link RewriteMethod}. */
  public void setRewriteMethod(final RewriteMethod method) {
    rewriteMethod = method;
  }

  /**
   * Computes a hash code from the boost, the rewrite method, the lower and
   * upper bounds, the level constraint, the datatype and the field, i.e., the
   * same state that {@link #equals(Object)} compares.
   */
  @Override
  public int hashCode() {
    final int prime = 31;
    int result = 1;
    result = prime * result + Float.floatToIntBits(this.getBoost());
    result = prime * result + rewriteMethod.hashCode();
    result = prime * result + lowerBound;
    result = prime * result + upperBound;
    result = prime * result + levelConstraint;
    // equals() distinguishes queries by datatype, so the hash must as well;
    // otherwise queries differing only by datatype always collide
    if (this.datatype != null) result = prime * result + this.datatype.hashCode();
    if (field != null) result = prime * result + field.hashCode();
    return result;
  }

  /**
   * Two queries are equal if they are of the exact same class and have the
   * same boost, rewrite method, lower and upper bounds, level constraint,
   * datatype and field.
   */
  @Override
  public boolean equals(final Object obj) {
    if (this == obj)
      return true;
    if (obj == null)
      return false;
    if (this.getClass() != obj.getClass())
      return false;
    final MultiNodeTermQuery other = (MultiNodeTermQuery) obj;
    if (Float.floatToIntBits(this.getBoost()) != Float.floatToIntBits(other.getBoost()))
      return false;
    if (!rewriteMethod.equals(other.rewriteMethod)) {
      return false;
    }
    if (!(this.lowerBound == other.lowerBound &&
          this.upperBound == other.upperBound &&
          this.levelConstraint == other.levelConstraint &&
          StringUtils.equals(this.datatype, other.datatype))) {
      return false;
    }
    return (other.field == null ? field == null : other.field.equals(field));
  }

}