/**
* Copyright 2014 National University of Ireland, Galway.
*
* This file is part of the SIREn project. Project and contact information:
*
* https://github.com/rdelbru/SIREn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sindice.siren.search.node;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.ComplexExplanation;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SloppySimScorer;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.ToStringUtils;
import org.sindice.siren.index.DocsNodesAndPositionsEnum;
/**
* A {@link NodePrimitiveQuery} that matches nodes containing a particular
* sequence of terms. A {@link NodePhraseQuery} is built for input like
* <code>"new york"</code>.
* <p>
* This query may be combined with other queries with a {@link NodeBooleanQuery}
* or a {@link TwigQuery}.
* <p>
* Code taken from {@link PhraseQuery} and adapted for the Siren use case.
*/
public class NodePhraseQuery extends NodePrimitiveQuery {

  /** Field shared by all the terms of the phrase; assigned when the first term is added. */
  private String field;

  /** Terms of the phrase, in the order they were added. */
  private final ArrayList<Term> terms = new ArrayList<Term>(4);

  /** Relative positions; the entry at index {@code i} belongs to the term at index {@code i}. */
  private final ArrayList<Integer> positions = new ArrayList<Integer>(4);

  /** Largest relative position added so far; sizes the slot array built in {@link #toString(String)}. */
  private int maxPosition = 0;

  /** Constructs an empty phrase query. */
  public NodePhraseQuery() {}

  /**
   * Adds a term to the end of the query phrase. The relative position of the
   * term is the one immediately after the last term added, or 0 if this is the
   * first term.
   *
   * @param term the term to append
   * @throws IllegalArgumentException if the term is not in the same field as
   *         the terms previously added
   */
  public void add(final Term term) {
    int position = 0;
    if (positions.size() > 0) {
      position = positions.get(positions.size() - 1).intValue() + 1;
    }
    this.add(term, position);
  }

  /**
   * Adds a term to the end of the query phrase. The relative position of the
   * term within the phrase is specified explicitly. This allows e.g. phrases
   * with more than one term at the same position or phrases with gaps (e.g. in
   * connection with stopwords).
   *
   * @param term the term to append
   * @param position the relative position of the term within the phrase
   * @throws IllegalArgumentException if the term is not in the same field as
   *         the terms previously added
   */
  public void add(final Term term, final int position) {
    if (terms.size() == 0) {
      field = term.field();
    }
    // Compare field names by content: two terms of the same field may carry
    // distinct String instances, so a reference comparison would wrongly
    // reject them.
    else if (!term.field().equals(field)) {
      throw new IllegalArgumentException(
        "All phrase terms must be in the same field: " + term);
    }
    terms.add(term);
    positions.add(Integer.valueOf(position));
    if (position > maxPosition) {
      maxPosition = position;
    }
  }

  /**
   * Returns the terms in this phrase, in insertion order.
   *
   * @return a newly allocated array of the phrase terms
   */
  public Term[] getTerms() {
    return terms.toArray(new Term[0]);
  }

  /**
   * Returns the relative positions of terms in this phrase.
   *
   * @return a newly allocated array where the entry at index {@code i} is the
   *         position of the i-th term returned by {@link #getTerms()}
   */
  public int[] getPositions() {
    final int[] result = new int[positions.size()];
    for (int i = 0; i < result.length; i++) {
      result[i] = positions.get(i).intValue();
    }
    return result;
  }

  /**
   * Simplifies degenerate phrases: an empty phrase is rewritten into an empty
   * {@link NodeBooleanQuery}, a single-term phrase into a
   * {@link NodeTermQuery}. In both cases the boost of this query is carried
   * over. Any other phrase is delegated to the superclass.
   */
  @Override
  public Query rewrite(final IndexReader reader) throws IOException {
    // NOTE(review): only the boost is copied onto the replacement queries
    // below; the node constraints and datatype compared in equals() are not
    // transferred here -- confirm whether this is intended.
    if (terms.isEmpty()) {
      final NodeBooleanQuery bq = new NodeBooleanQuery();
      bq.setBoost(this.getBoost());
      return bq;
    }
    else if (terms.size() == 1) {
      final NodeTermQuery tq = new NodeTermQuery(terms.get(0));
      tq.setBoost(this.getBoost());
      return tq;
    }
    else {
      return super.rewrite(reader);
    }
  }

  /**
   * Weight of a {@link NodePhraseQuery}. It gathers the per-term contexts and
   * statistics when it is created and builds a {@link NodeExactPhraseScorer}
   * for each index segment.
   */
  private class NodePhraseWeight extends Weight {

    private final Similarity similarity;
    private final Similarity.SimWeight stats;

    /** Term contexts, one per phrase term, in the same order as {@code terms}. */
    private transient TermContext states[];

    /**
     * Builds the term contexts and statistics of every phrase term and
     * computes the similarity weight from them.
     *
     * @param searcher the searcher providing the similarity, the top-level
     *        reader context and the statistics
     * @throws IOException propagated from the underlying index access
     */
    public NodePhraseWeight(final IndexSearcher searcher) throws IOException {
      this.similarity = searcher.getSimilarity();
      final IndexReaderContext context = searcher.getTopReaderContext();
      states = new TermContext[terms.size()];
      final TermStatistics termStats[] = new TermStatistics[terms.size()];
      for (int i = 0; i < terms.size(); i++) {
        final Term term = terms.get(i);
        states[i] = TermContext.build(context, term, true);
        termStats[i] = searcher.termStatistics(term, states[i]);
      }
      stats = similarity.computeWeight(NodePhraseQuery.this.getBoost(),
        searcher.collectionStatistics(field), termStats);
    }

    @Override
    public String toString() {
      return "weight(" + NodePhraseQuery.this + ")";
    }

    @Override
    public Query getQuery() {
      return NodePhraseQuery.this;
    }

    @Override
    public float getValueForNormalization() {
      return stats.getValueForNormalization();
    }

    @Override
    public void normalize(final float queryNorm, final float topLevelBoost) {
      stats.normalize(queryNorm, topLevelBoost);
    }

    /**
     * Creates the scorer of the phrase for one segment.
     *
     * @return a {@link NodeExactPhraseScorer}, or {@code null} if the segment
     *         has no terms for the field or lacks one of the phrase terms
     * @throws IllegalStateException if no positional postings can be obtained
     *         for one of the terms
     */
    @Override
    public Scorer scorer(final AtomicReaderContext context, final boolean scoreDocsInOrder,
                         final boolean topScorer, final Bits acceptDocs)
    throws IOException {
      assert !terms.isEmpty();
      final AtomicReader reader = context.reader();
      final Bits liveDocs = acceptDocs;
      final PostingsAndPosition[] postings = new PostingsAndPosition[terms.size()];

      final Terms fieldTerms = reader.terms(field);
      if (fieldTerms == null) {
        return null;
      }

      // Reuse single TermsEnum below:
      final TermsEnum te = fieldTerms.iterator(null);

      for (int i = 0; i < terms.size(); i++) {
        final Term t = terms.get(i);
        final TermState state = states[i].get(context.ord);
        if (state == null) { /* term doesnt exist in this segment */
          assert this.termNotInReader(reader, t): "no termstate found but term exists in reader";
          return null;
        }
        te.seekExact(t.bytes(), state);
        final DocsNodesAndPositionsEnum postingsEnum =
          NodePhraseQuery.this.getDocsNodesAndPositionsEnum(te.docsAndPositions(liveDocs, null));

        // PhraseQuery on a field that did not index positions (maybe not a siren field)
        if (postingsEnum == null) {
          assert te.seekExact(t.bytes(), false) : "termstate found but no term exists in reader";
          // term does exist, but has no positions
          throw new IllegalStateException("field \"" + t.field() + "\" was " +
            "indexed without position data; cannot run NodePhraseQuery " +
            "(term=" + t.text() + ")");
        }
        postings[i] = new PostingsAndPosition(postingsEnum, positions.get(i).intValue());
      }

      return new NodeExactPhraseScorer(this, postings,
        similarity.sloppySimScorer(stats, context),
        similarity.exactSimScorer(stats, context));
    }

    /**
     * Explains the match of the given document: one detail is added per
     * matching node, each wrapping the similarity explanation computed from
     * the phrase frequency within that node.
     */
    // TODO: Review this explanation for node match
    @Override
    public Explanation explain(final AtomicReaderContext context, final int doc)
    throws IOException {
      final NodeScorer scorer = (NodeScorer) this.scorer(context, true, false,
        context.reader().getLiveDocs());

      if (scorer != null) {
        if (scorer.skipToCandidate(doc) && scorer.doc() == doc) {
          final SloppySimScorer docScorer = similarity.sloppySimScorer(stats, context);
          final ComplexExplanation result = new ComplexExplanation();
          result.setDescription("weight("+this.getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");

          while (scorer.nextNode()) {
            final ComplexExplanation nodeMatch = new ComplexExplanation();
            nodeMatch.setDescription("in "+scorer.node()+"), result of:");
            final float freq = scorer.freqInNode();
            final Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
            nodeMatch.setValue(scoreExplanation.getValue());
            nodeMatch.setMatch(true);
            nodeMatch.addDetail(scoreExplanation);
            result.addDetail(nodeMatch);
          }
          // NOTE(review): no value is ever set on the top-level explanation;
          // only the per-node details carry a score.
          result.setMatch(true);
          return result;
        }
      }

      return new ComplexExplanation(false, 0.0f, "no matching term");
    }

    // only called from assert
    private boolean termNotInReader(final AtomicReader reader, final Term term)
    throws IOException {
      return reader.docFreq(term) == 0;
    }

  }

  @Override
  public Weight createWeight(final IndexSearcher searcher)
  throws IOException {
    return new NodePhraseWeight(searcher);
  }

  /**
   * Adds all the terms of this phrase to the given set.
   *
   * @see org.apache.lucene.search.Query#extractTerms(java.util.Set)
   */
  @Override
  public void extractTerms(final Set<Term> queryTerms) {
    queryTerms.addAll(terms);
  }

  /**
   * Renders the phrase between double quotes, one space-separated slot per
   * relative position from 0 to the largest position added. Terms sharing a
   * position are joined with {@code |}, a position without any term is
   * rendered as {@code ?}. The boost is appended after the closing quote and
   * the result is passed through {@code wrapToStringWithDatatype}.
   */
  @Override
  public String toString(final String f) {
    final StringBuffer buffer = new StringBuffer();
    buffer.append("\"");

    // Group the text of the terms by relative position
    final String[] pieces = new String[maxPosition + 1];
    for (int i = 0; i < terms.size(); i++) {
      final int pos = positions.get(i).intValue();
      String s = pieces[pos];
      if (s == null) {
        s = terms.get(i).text();
      }
      else {
        s = s + "|" + terms.get(i).text();
      }
      pieces[pos] = s;
    }

    for (int i = 0; i < pieces.length; i++) {
      if (i > 0) {
        buffer.append(' ');
      }
      final String s = pieces[i];
      if (s == null) {
        buffer.append('?');
      }
      else {
        this.escapeDoubleQuote(buffer, s);
      }
    }

    buffer.append("\"");
    buffer.append(ToStringUtils.boost(this.getBoost()));
    return this.wrapToStringWithDatatype(buffer).toString();
  }

  /**
   * Appends {@code s} to the buffer, prefixing with a backslash any double
   * quote that is not already preceded by one.
   *
   * @param buffer the buffer to append to
   * @param s the text to append
   */
  private void escapeDoubleQuote(final StringBuffer buffer, final String s) {
    int index = 0;
    int prevIndex = 0;

    while ((index = s.indexOf('"', prevIndex)) != -1) {
      buffer.append(s, prevIndex, index);
      // The preceding character is read from the buffer, not from s; guard
      // against an empty buffer so that a leading quote cannot trigger an
      // out-of-bounds access.
      if (buffer.length() == 0 || buffer.charAt(buffer.length() - 1) != '\\') {
        buffer.append('\\');
      }
      buffer.append('"');
      prevIndex = index + 1;
    }
    buffer.append(s, prevIndex, s.length());
  }

  /**
   * Two phrase queries are equal when they have the same boost, terms,
   * positions, level constraint, node bounds and datatype.
   */
  @Override
  public boolean equals(final Object o) {
    if (!(o instanceof NodePhraseQuery)) {
      return false;
    }
    final NodePhraseQuery other = (NodePhraseQuery) o;
    return (this.getBoost() == other.getBoost()) &&
           this.terms.equals(other.terms) &&
           this.positions.equals(other.positions) &&
           this.levelConstraint == other.levelConstraint &&
           this.lowerBound == other.lowerBound &&
           this.upperBound == other.upperBound &&
           StringUtils.equals(this.datatype, other.datatype);
  }

  /**
   * Combines the boost, terms, positions, level constraint and node bounds.
   * The datatype is not part of the hash, which remains consistent with
   * {@link #equals(Object)} since equal queries agree on all hashed parts.
   */
  @Override
  public int hashCode() {
    return Float.floatToIntBits(this.getBoost())
      ^ terms.hashCode()
      ^ positions.hashCode()
      ^ levelConstraint
      ^ upperBound
      ^ lowerBound;
  }

  /**
   * Pairs the postings enumeration of a phrase term with the relative
   * position of that term within the phrase.
   */
  static class PostingsAndPosition {

    final DocsNodesAndPositionsEnum postings;
    final int position;

    public PostingsAndPosition(final DocsNodesAndPositionsEnum postings, final int position) {
      this.postings = postings;
      this.position = position;
    }

  }

}