/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.queryparser.spans;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParserConstants;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.SpanBoostQuery;
import org.apache.lucene.search.spans.SpanQuery;
/**
* This parser leverages the power of SpanQuery and can combine them with
* traditional boolean logic and multiple field information.
* This parser includes functionality from:
* <ul>
* <li> {@link org.apache.lucene.queryparser.classic.QueryParser classic QueryParser}: most of its syntax</li>
* <li> {@link org.apache.lucene.queryparser.surround.parser.QueryParser SurroundQueryParser}: recursive parsing for "near" and "not" clauses.</li>
* <li> {@link org.apache.lucene.queryparser.complexPhrase.ComplexPhraseQueryParser}:
* can handle "near" queries that include multiterms ({@link org.apache.lucene.search.WildcardQuery},
* {@link org.apache.lucene.search.FuzzyQuery}, {@link org.apache.lucene.search.RegexpQuery}).</li>
* <li> AnalyzingQueryParser: has an option to analyze multiterms.</li>
* </ul>
*
* <p>
* <b>Background</b>
* This parser is designed to expose as much of the sophistication as is available within the Query/SpanQuery components.
* The basic approach of this parser is to build BooleanQueries comprised of SpanQueries. The parser recursively works
* through boolean/fielded chunks and then recursively works through SpanQueries.
*
* <p>
* Goals for this parser:
* <ul>
* <li>Expose as much of the underlying capabilities as possible.</li>
* <li>Keep the syntax as close to Lucene's classic
* {@link org.apache.lucene.queryparser.classic.QueryParser} as possible.</li>
* <li>Make analysis of multiterms a fundamental part of the parser
* {@link AnalyzingQueryParserBase}.</li>
* </ul>
* <p><b>Similarities and Differences</b>
*
* <p> Same as classic syntax:
* <ul>
* <li> term: test </li>
* <li> fuzzy: roam~0.8, roam~2</li>
* <li> wildcard: te?t, test*, t*st</li>
* <li> regex: <code>/[mb]oat/</code></li>
* <li> phrase: "jakarta apache"</li>
* <li> phrase with slop: "jakarta apache"~3</li>
* <li> "or" clauses: jakarta apache</li>
* <li> grouping clauses: (jakarta apache)</li>
* <li> field: author:hatcher title:lucene</li>
* <li> boolean operators: (lucene AND apache) NOT jakarta
* <li> required/not required operators: +lucene +apache -jakarta</li>
* <li> boolean with field:(author:hatcher AND author:gospodnetic) AND title:lucene</li>
* </ul>
* <p> Main additions in SpanQueryParser syntax vs. classic:
* <ul>
* <li> Can require "in order" for phrases with slop with the ~> defaultOperator: "jakarta apache"~>3</li>
* <li> Can specify "not near" "bieber fever"!~3,10 ::
* find "bieber" but not if "fever" appears within 3 words before or
* 10 words after it.</li>
* <li> Fully recursive phrasal queries with [ and ]; as in: [[jakarta apache]~3 lucene]~>4 ::
* find "jakarta" within 3 words of "apache", and that hit has to be within four
* words before "lucene".</li>
* <li> Can also use [] for single level phrasal queries instead of "" as in: [jakarta apache]</li>
* <li> Can use "or" clauses in phrasal queries: "apache (lucene solr)"~3 ::
* find "apache" and then either "lucene" or "solr" within three words.
* </li>
* <li> Can use multiterms in phrasal queries: "jakarta~1 ap*che"~2</li>
* <li> Did I mention recursion: [[jakarta~1 ap*che]~2 (solr~ /l[ou]+[cs][en]+/)]~10 ::
* Find something like "jakarta" within two words of "ap*che" and that hit
* has to be within ten words of something like "solr" or that lucene regex.</li>
* <li> How about: "fever (travlota~2 disco "saturday night" beeber~1)"!~3,10 :: find fever but not if something like
* travlota or disco or "saturday night" or something like beeber appears within 3 words before or 10 words after.</li>
* <li> Can require at least x number of hits at boolean level: "apache AND (lucene solr tika)~2</li>
* <li> Can have a negative query: -jakarta will return all documents that do not contain jakarta</li>
* </ul>
* <p>
* Trivial additions:
* <ul>
* <li> Can specify prefix length in fuzzy queries: jakarta~1,2 (edit distance=1, prefix=2)</li>
* <li> Can specify prefix Optimal String Alignment (OSA) vs Levenshtein
* in fuzzy queries: jakarta~1 (OSA) vs jakarta~>1 (Levenshtein)</li>
* </ul>
*
* <p> <b>Analysis</b>
* You can specify different analyzers
* to handle whole term versus multiterm components.
*
* <p>
* <b>Using quotes for a single term</b>
* Unlike the Classic QueryParser which uses double quotes around a single term
* to effectively escape operators, the SpanQueryParser uses single quotes.
* 'abc~2' will be treated as a single term 'abc~2' not as a fuzzy term.
* Remember to use quotes for anything with backslashes or hyphens:
* 12/02/04 (is broken into a term "12", a regex "/02/" and a term "04")
* '12/02/04' is treated a a single token.
* <p>
* If a single term (according to whitespace) is found within double quotes or square brackets,
* and the Analyzer returns one term "cat", that will be treated as a single term.
* If a single term (according to whitespace) is found within double quotes or square brackets,
* and the Analyzer returns more than one term (e.g. non-whitespace language), that
* will be treated as a SpanNear query.
*
*
* <p> <b>Stop word handling</b>
* <p>The parser tries to replicate the behavior of the Classic QueryParser. Stop words
* are generally ignored.
* <p> However, in a "near" query, extra slop is added for each stop word that
* occurs after the first non-stop word and before the last non-stop word
* (or, initial and trailing stop words are ignored in the additions to slop).
* For example, "walked the dog" is converted to "walked dog"~>1 behind the scenes. Like the Classic
* QueryParser this will lead to false positives with any word between "walked" and "dog". Unlike
* Classic QueryParser, this will also lead to false positives of "walked dog".
* <p>
* Examples
* <ul>
* <li>Term: "the" will return an empty SpanQuery (similar to classic queryparser)</li>
* <li>BooleanOr: (the apache jakarta) will drop the stop word and return a
* {@link org.apache.lucene.search.spans.SpanOrQuery} for "apache"
* or "jakarta"
* <li>SpanNear: "apache and jakarta" will drop the "and", add one to the slop and match on
* any occurrence of "apache" followed by "jakarta" with zero or one word intervening.<li>
* </ul>
* <p> Expert: Other subtle differences between SpanQueryParser and classic QueryParser.
* <ul>
* <li>Fuzzy queries with slop > 2 are handled by SlowFuzzyQuery. The developer can set the minFuzzySim to limit
* the maximum edit distance (i.e. turn off SlowFuzzyQuery by setting fuzzyMinSim = 2.0f.</li>
* <li>Fuzzy queries with edit distance >=1 are rounded so that an exception is not thrown.</li>
* </ul>
* <p> Truly Expert: there are a few other very subtle differences that are documented in comments
* in the sourcecode in the header of SpanQueryParser.
* <p>
* <b>NOTE</b> You must add the sandbox jar to your class path to include
* the currently deprecated {@link org.apache.lucene.sandbox.queries.SlowFuzzyQuery}.
* <p> Limitations of SpanQueryParser compared with classic QueryParser:
* <ol>
* <li> There is some learning curve to figure out the subtle differences in syntax between
* when one is within a phrase and when not. Including:
* <ol>
* <li>Boolean operators are not allowed within phrases: "solr (apache AND lucene)".
* Consider rewriting:[solr [apache lucene]]</li>
* <li>Field information is not allowed within phrases.</li>
* <li>Minimum hit counts for boolean "or" queries are not allowed within phrases: [apache (lucene solr tika)~2]</li>
* </ol>
* <li> This parser is not built with .jj or the antlr parser framework.
* Regrettably, because it is generating a {@link org.apache.lucene.search.spans.SpanQuery},
* it can't use all of the generalizable queryparser infrastructure that was added with Lucene 4.+.</li>
* </ol>
*/
public class SpanQueryParser extends AbstractSpanQueryParser implements QueryParserConstants {
/*
* Some subtle differences between classic QueryParser and SpanQueryParser
*
* 1) In a range query, this parser is not escaping terms. So [12-02-03 TO 12-04-03] and
* [12/02/03 TO 12/04/03] need to be single-quoted: ['12-02-03' TO '12-04-03'].
*
* 2) The SpanQueryParser does not recognize quotes as a way to escape non-regexes.
* In classic syntax a path string of "/abc/def/ghi" is denoted by the double quotes; in
* SpanQueryParser, the user has to escape the / as in \/abc\/def\/ghi or use single quotes:
* '/abc/def/ghi'
*
* 3) "term^3~" is not handled. Boosts must currently come after fuzzy mods in SpanQueryParser.
*
* 4) SpanQueryParser rounds fuzzy sims that are > 1.0. This test fails: assertParseException("term~1.1")
*
* 5) SpanQueryParser adds a small amount to its own floatToEdits calculation
* so that near exact percentages (e.g. 80% of a 5 char word should yield 1)
* aren't floored and therefore miss.
*
* For SpanQueryParser, brwon~0.80 hits on "brown".
*
* 6) By using single-quote escaping, SpanQueryParser will pass issue raised
* by LUCENE-1189, which is a token with an odd number of \ ending in a phrasal boundary.
*
* The test case that was to prove a fix for LUCENE-1189 is slightly different than the original
* issue: \"(name:[///mike\\\\\\\") or (name:\"alphonse\")";
*
* 8) SpanQueryParser does not convert regexes to lowercase as a default. There is a
* separate parameter for whether or not to do this.
*/
public SpanQueryParser(String f, Analyzer a, Analyzer multitermAnalyzer) {
super(f, a, multitermAnalyzer);
}
@Override
public Query parse(String s) throws ParseException {
Query q = _parse(s);
q = rewriteAllNegative(q);
return q;
}
private Query _parse(String queryString) throws ParseException {
if (queryString == null || queryString.equals("")) {
return getEmptySpanQuery();
}
SpanQueryLexer lexer = new SpanQueryLexer();
List<SQPToken> tokens = lexer.getTokens(queryString);
SQPClause overallClause = new SQPOrClause(0, tokens.size());
return parseRecursively(tokens, getField(), overallClause);
}
private Query parseRecursively(final List<SQPToken> tokens,
String field, SQPClause clause)
throws ParseException {
int start = clause.getTokenOffsetStart();
int end = clause.getTokenOffsetEnd();
testStartEnd(tokens, start, end);
//if this is a positionRange query, it needs to be handled
//by the span parser
if (clause.getStartPosition() != null || clause.getEndPosition() != null) {
return _parsePureSpanClause(tokens, field, clause);
}
List<BooleanClause> clauses = new ArrayList<>();
int conj = CONJ_NONE;
int mods = MOD_NONE;
String currField = field;
int i = start;
while (i < end) {
Query q = null;
SQPToken token = tokens.get(i);
//if boolean defaultOperator or field, update local buffers and continue
if (token instanceof SQPBooleanOpToken) {
SQPBooleanOpToken t = (SQPBooleanOpToken)token;
if (t.isConj()) {
conj = t.getType();
mods = MOD_NONE;
} else {
mods = t.getType();
}
i++;
continue;
} else if (token instanceof SQPField) {
currField = ((SQPField)token).getField();
i++;
continue;
}
//if or clause, recurse through tokens
if (token instanceof SQPOrClause) {
//recurse!
SQPOrClause tmpOr = (SQPOrClause)token;
q = parseRecursively(tokens, currField, tmpOr);
//if it isn't already boosted, apply the boost from the token
if (!(q instanceof BoostQuery) && !(q instanceof SpanBoostQuery) &&
tmpOr.getBoost() != null) {
if (q instanceof SpanQuery) {
q = new SpanBoostQuery((SpanQuery)q, tmpOr.getBoost());
} else {
q = new BoostQuery(q, tmpOr.getBoost());
}
}
i = tmpOr.getTokenOffsetEnd();
} else if (token instanceof SQPNearClause) {
SQPNearClause tmpNear = (SQPNearClause)token;
q = _parsePureSpanClause(tokens, currField, tmpNear);
i = tmpNear.getTokenOffsetEnd();
} else if (token instanceof SQPNotNearClause) {
SQPNotNearClause tmpNotNear = (SQPNotNearClause)token;
q = _parsePureSpanClause(tokens, currField, tmpNotNear);
i = tmpNotNear.getTokenOffsetEnd();
} else if (token instanceof SQPTerminal) {
SQPTerminal tmpTerm = (SQPTerminal)token;
if (tmpTerm.getStartPosition() != null || tmpTerm.getEndPosition() != null) {
q = buildSpanTerminal(currField, tmpTerm);
} else {
q = buildTerminal(currField, tmpTerm);
}
i++;
} else {
//throw exception because this could lead to an infinite loop
//if a new token type is added but not properly accounted for.
throw new IllegalArgumentException("Don't know how to process token of this type: " + token.getClass());
}
if (!isEmptyQuery(q)) {
addClause(clauses, conj, mods, q);
}
//reset mods and conj and field
mods = MOD_NONE;
conj = CONJ_NONE;
currField = field;
}
if (clauses.size() == 0) {
return getEmptySpanQuery();
}
if (clauses.size() == 1 &&
clauses.get(0).getOccur() != Occur.MUST_NOT) {
return clauses.get(0).getQuery();
}
BooleanQuery.Builder bq = new BooleanQuery.Builder();
try {
for (BooleanClause bc : clauses) {
bq.add(bc);
}
} catch (BooleanQuery.TooManyClauses e) {
throw new ParseException(e.getMessage());
}
if (clause instanceof SQPOrClause) {
SQPOrClause orClause = (SQPOrClause)clause;
if (orClause.getMinimumNumberShouldMatch() != null) {
bq.setMinimumNumberShouldMatch(orClause.getMinimumNumberShouldMatch());
}
}
return bq.build();
}
private Query testAllDocs(String tmpField, SQPTerminal tmpTerm) {
if (tmpField.equals("*") &&
tmpTerm instanceof SQPTerm &&
((SQPTerm)tmpTerm).getString().equals("*")) {
Query q = new MatchAllDocsQuery();
if (tmpTerm.getBoost() != null) {
q = new BoostQuery(q, tmpTerm.getBoost());
}
return q;
}
return null;
}
private void testStartEnd(List<SQPToken> tokens, int start, int end)
throws ParseException {
SQPToken s = tokens.get(start);
if (s instanceof SQPBooleanOpToken) {
int type = ((SQPBooleanOpToken)s).getType();
if ( type == CONJ_AND || type == CONJ_OR) {
throw new ParseException("Can't start clause with AND or OR");
}
}
SQPToken e = tokens.get(end-1);
if (e instanceof SQPField) {
throw new ParseException("Can't end clause with a field token");
}
if (e instanceof SQPBooleanOpToken) {
throw new ParseException("Can't end clause with a boolean defaultOperator");
}
}
/**
* If the query contains only Occur.MUST_NOT clauses,
* this will add a MatchAllDocsQuery.
* @return query
*/
private Query rewriteAllNegative(Query q) {
if (q instanceof BooleanQuery) {
BooleanQuery bq = (BooleanQuery)q;
List<BooleanClause> clauses = bq.clauses();
if (clauses.size() == 0) {
return q;
}
for (BooleanClause clause : clauses) {
if (! clause.getOccur().equals(Occur.MUST_NOT)) {
//something other than must_not exists, stop here and return q
return q;
}
}
BooleanQuery.Builder b = new BooleanQuery.Builder();
for (BooleanClause clause : bq.clauses()) {
b.add(clause);
}
b.add(new MatchAllDocsQuery(), Occur.MUST);
return b.build();
}
return q;
}
/**
* Argh! Copied directly from QueryParserBase. Preferred to
* get rid of parts that don't belong with the SpanQueryParser
* in favor of this duplication of code. Could we add this
* to QueryBuilder?
*
* @param clauses clauses
* @param conj conjunction flag
* @param mods mods flag
* @param q query
*/
private void addClause(List<BooleanClause> clauses, int conj, int mods, Query q) {
boolean required, prohibited;
// If this term is introduced by AND, make the preceding term required,
// unless it's already prohibited
if (clauses.size() > 0 && conj == CONJ_AND) {
BooleanClause c = clauses.get(clauses.size()-1);
if (!c.isProhibited())
clauses.set(clauses.size()-1, new BooleanClause(c.getQuery(), Occur.MUST));
}
if (clauses.size() > 0 && defaultOperator == QueryParser.Operator.AND && conj == CONJ_OR) {
// If this term is introduced by OR, make the preceding term optional,
// unless it's prohibited (that means we leave -a OR b but +a OR b-->a OR b)
// notice if the input is a OR b, first term is parsed as required; without
// this modification a OR b would parsed as +a OR b
BooleanClause c = clauses.get(clauses.size()-1);
if (!c.isProhibited())
clauses.set(clauses.size()-1, new BooleanClause(c.getQuery(), Occur.SHOULD));
}
// We might have been passed a null query; the term might have been
// filtered away by the analyzer.
if (q == null)
return;
if (defaultOperator == QueryParser.Operator.OR) {
// We set REQUIRED if we're introduced by AND or +; PROHIBITED if
// introduced by NOT or -; make sure not to set both.
prohibited = (mods == MOD_NOT);
required = (mods == MOD_REQ);
if (conj == CONJ_AND && !prohibited) {
required = true;
}
} else {
// We set PROHIBITED if we're introduced by NOT or -; We set REQUIRED
// if not PROHIBITED and not introduced by OR
prohibited = (mods == MOD_NOT);
required = (!prohibited && conj != CONJ_OR);
}
if (required && !prohibited)
clauses.add(new BooleanClause(q, BooleanClause.Occur.MUST));
else if (!required && !prohibited)
clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD));
else if (!required && prohibited)
clauses.add(new BooleanClause(q, BooleanClause.Occur.MUST_NOT));
else
throw new RuntimeException("Clause cannot be both required and prohibited");
}
}