package org.apache.lucene.queryparser.spans;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.EmptyStackException;
import java.util.List;
import java.util.Stack;
import org.apache.lucene.queryparser.classic.ParseException;
class SpanQueryLexer {
private enum TOKEN_TYPE {
UNSPECIFIED, //nothing special, for a token, this is EXACT
QUOTED,
REGEX,
}
private static final int DEFAULT_MIN_REQUIRED_IN_OR = 2;
private static final String AND = "AND";
private static final String NOT = "NOT";
private static final String OR = "OR"; //silently removed from queries...beware!
private static final String ESCAPED_AND = "\\AND";
private static final String ESCAPED_NOT = "\\NOT";
private static final String ESCAPED_OR = "\\OR";
private static final int DQUOTE = (int) '"';
private static final int SQUOTE = (int) '\'';
private static final int OPEN_PAREN = (int) '(';
private static final int CLOSE_PAREN = (int) ')';
private static final int OPEN_SQUARE = (int) '[';
private static final int CLOSE_SQUARE = (int) ']';
private static final int OPEN_CURLY = (int) '{';
private static final int CLOSE_CURLY = (int) '}';
private static final int TILDE = (int) '~';
private static final int U = (int) 'u';
private static final int CARET = (int) '^';
private static final int FORWARD_SLASH = (int) '/';
private static final int BACK_SLASH = (int) '\\';
private static final int COLON = (int) ':';
private static final int PLUS = (int) '+';
private static final int MINUS = (int) '-';
private static final int EXCLAMATION = (int) '!';
private static final int AMPERSAND = '&';
private static final int PIPE = '|';
private static final int COMMA = (int) ',';
private static final int GREATER_THAN = (int) '>';
private static final int LESS_THAN = (int) '<';
private static final int DECIMAL_POINT = (int)'.';
private static final int STAR = (int)'*';
private static final int QMARK = (int)'?';
private static final int AT = (int)'@';
private static final int PERIOD = (int)'.';
private static final int CHAR_O = (int)'O';
private static final int CHAR_T = (int)'T';
private static final int CHAR_N = (int)'N';
private static final int CHAR_D = (int)'D';
private static final int CHAR_R = (int)'R';
private static final int CHAR_A = (int)'A';
private boolean inDQuote = false;
private int wildcardChars = 0;
private int wildcardQuestionMarks = 0;
private TOKEN_TYPE type = TOKEN_TYPE.UNSPECIFIED;
private int nearDepth = 0;
private PushbackReader reader;
private final StringBuilder tokenBuffer = new StringBuilder();
private final List<SQPToken> tokens = new ArrayList<>();
private final Stack<SQPOpenClause> stack = new Stack<>();
public List<SQPToken> getTokens(String s) throws ParseException {
if (s.trim().length() == 0) {
return tokens;
}
//TODO: no need to init if initialize lexer w string and change to getTokens()
tokens.clear();
stack.clear();
resetTokenBuffer();
nearDepth = 0;
inDQuote = false;
reader = new PushbackReader(new StringReader(s), 11);//need to be able to push back string representation of integer
try {
while (nextToken()) {
//do nothing;
}
} catch (IOException e) {
throw new ParseException(e.getMessage());
}
if (! stack.isEmpty()) {
throw new ParseException("Couldn't find matching end to: "+stack.pop().getType());
}
return tokens;
}
private boolean nextToken() throws IOException, ParseException {
int c = reader.read();
//slurp leading whitespace
while (Character.isWhitespace(c)) {
c = reader.read();
}
while (true) {
if (Character.isWhitespace(c)) {
flushBuffer();
return true;
}
switch (c) {
case -1:
flushBuffer();
return false;
case STAR :
wildcardChars++;
break;
case QMARK :
wildcardChars++;
wildcardQuestionMarks++;
break;
case TILDE :
if (tokenBuffer.length() > 0) {
handleFuzzyTerm();
c = reader.read();
continue;
}
break;
case AT :
if (tokenBuffer.length() > 0) {
tryToUnread(AT); //unread so that we can tryToRead...
BoostPositionRange boostPositionRange = tryToReadBoostOrPositionRange(false);
//if a position was found, great, if not continue on
if (boostPositionRange != null && boostPositionRange.hasPosition()) {
flushBuffer(boostPositionRange);
return true;
}
}
break;
case CARET : // hit boost marker
if (tokenBuffer.length() > 0) {
tryToUnread(CARET); //unread so that we can tryToRead...
BoostPositionRange boostPositionRange = tryToReadBoostOrPositionRange(false);
if (boostPositionRange != null) {
flushBuffer(boostPositionRange);
return true;
}
}
break;
case SQUOTE : //single quote
flushBuffer();
type = TOKEN_TYPE.QUOTED;
return readToMatchingEndToken(SQUOTE);
case FORWARD_SLASH : //regex
flushBuffer();
type = TOKEN_TYPE.REGEX;
return readToMatchingEndToken(FORWARD_SLASH);
case BACK_SLASH:
int next = reader.read();
if (next == -1) {
throw new ParseException("Can't end string with \\");
} else if (next == U) {
tryToReadEscapedUnicode();
} else {
//need to append it for now
tokenBuffer.appendCodePoint(BACK_SLASH);
tokenBuffer.appendCodePoint(next);
}
c = reader.read();
continue;
case COLON:
String fieldName = tokenBuffer.toString();
resetTokenBuffer();
tryToAddField(fieldName);
return true;
case OPEN_PAREN:
flushBuffer();
handleOpenClause(SQPClause.TYPE.PAREN);
return true;
case OPEN_SQUARE:
flushBuffer();
handleOpenClause(SQPClause.TYPE.BRACKET);
return true;
case OPEN_CURLY:
flushBuffer();
handleOpenClause(SQPClause.TYPE.CURLY);
return true;
case DQUOTE:
handleDQuote();
return true;
case CLOSE_PAREN :
handleCloseClause(SQPClause.TYPE.PAREN);
return true;
case CLOSE_CURLY :
handleCloseClause(SQPClause.TYPE.CURLY);
return true;
case CLOSE_SQUARE :
handleCloseClause(SQPClause.TYPE.BRACKET);
return true;
case PLUS :
if (tokenBuffer.length() == 0 && isNotNextWhitespaceOrEnd()) {
flushBuffer();
SQPBooleanOpToken plusToken = new SQPBooleanOpToken(SpanQueryParserBase.MOD_REQ);
testBooleanTokens(tokens, plusToken);
tokens.add(plusToken);
return true;
}
break;
case MINUS :
if (tokenBuffer.length() == 0 && isNotNextWhitespaceOrEnd()) {
flushBuffer();
SQPBooleanOpToken minusToken = new SQPBooleanOpToken(SpanQueryParserBase.MOD_NOT);
testBooleanTokens(tokens, minusToken);
tokens.add(minusToken);
return true;
}
break;
case EXCLAMATION :
if (tokenBuffer.length() == 0 && ! isNextBreak() && nearDepth == 0) {
flushBuffer();
SQPBooleanOpToken notToken = new SQPBooleanOpToken(SpanQueryParserBase.MOD_NOT);
testBooleanTokens(tokens, notToken);
tokens.add(notToken);
return true;
}
break;
case AMPERSAND :
if (tokenBuffer.length() == 0 && nearDepth == 0) {
int n = reader.read();
if (n == AMPERSAND && isNextBreak()) {
flushBuffer();
SQPBooleanOpToken andToken = new SQPBooleanOpToken(SpanQueryParserBase.CONJ_AND);
testBooleanTokens(tokens, andToken);
tokens.add(andToken);
return true;
} else {
tryToUnread(n);
}
}
break;
case PIPE :
if (tokenBuffer.length() == 0) {
int n = reader.read();
if (n == PIPE && isNextBreak()) {
flushBuffer();
SQPBooleanOpToken orToken = new SQPBooleanOpToken(SpanQueryParserBase.CONJ_OR);
testBooleanTokens(tokens, orToken);
tokens.add(orToken);
return true;
} else {
tryToUnread(n);
}
}
break;
}
tokenBuffer.appendCodePoint(c);
c = reader.read();
}
}
private void handleFuzzyTerm() throws ParseException, IOException {
SQPFuzzyTerm term = new SQPFuzzyTerm(stripEscapes(tokenBuffer.toString()));
if (wildcardChars > 0) {
throw new ParseException("Need to escape wildcards in fuzzy terms.");
}
int c = reader.read();
if (c == GREATER_THAN) {
term.setTranspositions(false);
} else {
tryToUnread(c);
}
Float maxEdits = tryToReadUnsignedFloat();
if (maxEdits != null) {
float maxEditsFloat = maxEdits.floatValue();
int maxEditsInt = (int)maxEditsFloat;
if (maxEditsInt != maxEditsFloat) {
throw new ParseException("Can't use a float value on a fuzzy term any more: "+maxEdits);
}
term.setMaxEdits(maxEditsInt);
}
c = reader.read();
if (c == COMMA) {
Integer prefixLen = tryToReadInteger();
if (prefixLen == null) {
tryToUnread(COMMA);
} else {
term.setPrefixLength(prefixLen);
}
} else {
tryToUnread(c);
}
tryToReadBoostOrPositionRange(term);
tokens.add(term);
resetTokenBuffer();
}
private void handleDQuote() throws IOException, ParseException {
if (inDQuote) {
inDQuote = false;
handleCloseClause(SQPClause.TYPE.QUOTE);
} else {
inDQuote = true;
handleOpenClause(SQPClause.TYPE.QUOTE);
}
}
private void handleCloseClause(SQPClause.TYPE closeType) throws IOException, ParseException {
//flush the token buffer
flushBuffer();
SQPOpenClause open = null;
try {
open = stack.pop();
} catch (EmptyStackException e) {
throw new ParseException("Unable to find starting clause marker for this end: " + closeType.name());
}
testMatchingOpenClose(open, closeType);
SQPClause newClause = null;
if (closeType == SQPClause.TYPE.PAREN) {
int next = reader.read();
Integer minMatch = -1;
if (next == TILDE) {
if (nearDepth > 0) {
throw new ParseException("Can't specify minimumNumberShouldMatch within a \"near\" clause");
}
minMatch = tryToReadInteger();
if (minMatch == null) {
minMatch = DEFAULT_MIN_REQUIRED_IN_OR;
}
} else {
tryToUnread(next);
}
newClause = new SQPOrClause(open.getTokenOffsetStart() + 1, tokens.size());
if (minMatch > -1) {
((SQPOrClause) newClause).setMinimumNumberShouldMatch(minMatch);
}
} else { //has to be a span near or span not
nearDepth--;
//next0
int n0 = reader.read();
if (n0 == EXCLAMATION) { //span not
int n1 = reader.read();
if (n1 == TILDE) {
Integer notPost = null;
Integer notPre = tryToReadInteger();
int n2 = reader.read();
if (n2 == COMMA) {
notPost = tryToReadInteger();
} else {
tryToUnread(n2);
}
if (notPost == null) {
notPost = notPre;
}
newClause = new SQPNotNearClause(open.getTokenOffsetStart()+1, tokens.size(), closeType, notPre, notPost);
} else {
throw new ParseException("Not near query must be followed by ~");
}
} else if (n0 == TILDE) { //span with slop
Boolean inOrder = false;
int n1 = reader.read();
if (n1 == GREATER_THAN) {
inOrder = true;
} else {
tryToUnread(n1);
}
Integer slop = tryToReadInteger();
newClause = new SQPNearClause(open.getTokenOffsetStart() + 1, tokens.size(),
open.getType(), inOrder, slop);
} else { // no special marker at end of near phrase
tryToUnread(n0);
/* if (open.getTokenOffsetStart() + 2 == tokens.size()) {
testBadRange(open.getType() == SQPClause.TYPE.CURLY || closeType == SQPClause.TYPE.CURLY);
System.out.println("SINGLE TERM:"+tokens.get(tokens.size()-1));
//if single child between double quotes or brackets, treat it as a quoted SQPTerm
SQPTerm t = new SQPTerm(((SQPTerminal) tokens.get(tokens.size() - 1)).getString(), true);
Float boost = tryToReadBoost();
if (boost != null) {
t.setBoost(boost);
}
tokens.remove(tokens.size()-1);//remove original single term
tokens.remove(tokens.size()-1);//remove opening clause marker
tokens.add(t);
return;
} else {*/
newClause = new SQPNearClause(open.getTokenOffsetStart() + 1, tokens.size(),
open.getType(), null, null);
//}
}
}
tryToReadBoostOrPositionRange(newClause);
if (newClause instanceof SQPOrClause) {
SQPOrClause sprOr = (SQPOrClause)newClause;
if (sprOr.getMinimumNumberShouldMatch() != null &&
(sprOr.getStartPosition() != null || sprOr.getEndPosition() != null)) {
throw new ParseException("Can't have both minShouldMatch and positionRange parameters on an 'or' clause");
}
}
if (testForRangeQuery(open, newClause, closeType)) {
return;
}
tokens.set(open.getTokenOffsetStart(), newClause);
}
private boolean isNotNextWhitespaceOrEnd() throws IOException {
int n1 = reader.read();
if (n1 == -1) {
return false;
}
reader.unread(n1);
return !Character.isWhitespace(n1);
}
private boolean isNextBreak() throws IOException {
int n1 = reader.read();
if (Character.isWhitespace(n1)){
reader.unread(n1);
return true;
}
boolean response = false;
switch (n1) {
case -1 :
response = true;
break;
case OPEN_CURLY :
response = true;
break;
case CLOSE_CURLY :
response = true;
break;
case OPEN_PAREN :
response = true;
break;
case CLOSE_PAREN :
response = true;
break;
case OPEN_SQUARE :
response = true;
break;
case CLOSE_SQUARE :
response = true;
break;
case CARET :
response = true;
break;
}
tryToUnread(n1);
return response;
}
private SQPBoostableOrPositionRangeToken tryToReadBoostOrPositionRange(
SQPBoostableOrPositionRangeToken term)
throws ParseException, IOException {
BoostPositionRange bpr = tryToReadBoostOrPositionRange(true);
if (bpr == null) {
return term;
}
term.setBoost(bpr.boost);
term.setStartPosition(bpr.start);
term.setEndPosition(bpr.end);
return term;
}
private BoostPositionRange tryToReadBoostOrPositionRange(boolean throwExceptionOnPartialRead)
throws ParseException, IOException {
//^1.2@10..20 or @..10^1.2
//try to read boost, then position range
//if you have a position range, try to read boost after it
//if you didn't already find a first boost
Float boost = tryToReadBoost();
BoostPositionRange positionRange = tryToReadPositionRange(throwExceptionOnPartialRead);
if (positionRange != null && boost == null) {
boost = tryToReadBoost();
}
if (boost != null) {
if (positionRange == null) {
positionRange = new BoostPositionRange(boost);
} else {
positionRange.setBoost(boost);
}
}
return positionRange;
}
//tries to read a boost if it is there
//returns null if no parseable boost
private Float tryToReadBoost() throws ParseException, IOException {
int c = reader.read();
if (c == CARET) {
Float boost = tryToReadUnsignedFloat();
if (boost == null) {
return boost;
}
int next = reader.read();
if (next == CARET) {
throw new ParseException("Can't end boost with caret");
}
tryToUnread(next);
return boost;
} else {
tryToUnread(c);
}
return null;
}
/**
* tries to read @2..10 or @..10 or @2..
*
* returns null if no position range was found
*/
private BoostPositionRange tryToReadPositionRange(boolean throwExceptionOnPartialRead)
throws ParseException, IOException {
int chr = reader.read();
if (chr != AT) {
tryToUnread(chr);
return null;
}
//we have @, look for period or integer
chr = reader.read();
Integer start = null;
Integer end = null;
if (chr != PERIOD) { //maybe it's an integer @20.. ?
tryToUnread(chr);
start = tryToReadInteger();
if (start != null) {
end = tryToReadEndPositionRange(start);
} else {
return handlePartialRangeRead(throwExceptionOnPartialRead);
}
} else {//@. now try to read another period and maybe an integer
tryToUnread(chr);//put the . back on and try to read endpositionrange
end = tryToReadEndPositionRange(start);
}
//if there wasn't a ..
if (end != null && end == -1) {
if (start != null) {
tryToUnread(Integer.toString(start));
}
return handlePartialRangeRead(throwExceptionOnPartialRead);
}
if ((start != null || end != null) && ! isNextBreak()) {
throw new ParseException(
"Found the end of an apparent SpanPositionRangeQuery in the middle of a term." +
" Need to escape it or add a space.");
}
if (start == null && end == null) {
if (! isNextBreak()) { //couldn't find start or end and there's no break...must be middle of token
return handlePartialRangeRead(throwExceptionOnPartialRead);
}
throw new ParseException("Must specify a start or end value for a position range query:" +
"@2.. @..10 @2..10");
}
return new BoostPositionRange(start, end);
}
private BoostPositionRange handlePartialRangeRead(boolean throwExceptionOnPartialRead) throws ParseException{
if (throwExceptionOnPartialRead) {
throw new ParseException("Read partial range position after clause; needs to be of format @2..10 or @..10 or @2..");
}
return null;
}
//returns -1 if there wasn't a ".." or if start was null and there was no end
//returns null if there was a "..\b" with no value
private Integer tryToReadEndPositionRange(Integer start) throws IOException, ParseException {
int chr = reader.read();
if (chr != PERIOD) {
tryToUnread(chr);
return -1;
}
int chr2 = reader.read();
if (chr2 != PERIOD) {
//@.x -- not a range
tryToUnread(chr2);
tryToUnread(chr);
return -1;
}
//so far, we have ..
Integer end = tryToReadInteger();
if (start == null && end == null) {
tryToUnread(PERIOD);
tryToUnread(PERIOD);
return -1;
}
return end;
}
//After a closeClause is built, this tests to see if it is
//actually a range query. If it is, then this replaces the clause
//with a SQPRangeTerm and returns true
private boolean testForRangeQuery(SQPOpenClause openClause, SQPClause closeClause, SQPClause.TYPE closeType) throws ParseException {
//if paren or quote clause, not a range query
if (openClause.getType() == SQPClause.TYPE.PAREN ||
openClause.getType() == SQPClause.TYPE.QUOTE) {
return false;
}
if (closeClause instanceof SQPNotNearClause) {
return false;
}
SQPNearClause clause = (SQPNearClause) closeClause;
//test to see if this looks like a range
//does it contain three items; are they all terms, is the middle one "TO"
//if it is, handle it; if there are problems throw an exception, otherwise return false
//if there's a curly bracket to start or end, then it
//must be a compliant range query or else throw parse exception
boolean hasCurly = (openClause.getType() == SQPClause.TYPE.CURLY ||
closeType == SQPClause.TYPE.CURLY);
//if there are any modifiers on the close bracket
if (clause.getInOrder() != null || clause.getSlop() != null) {
if (hasCurly) {
throw new ParseException("Can't have modifiers on a range query. " +
"Or, you can't use curly brackets for a phrase/near query");
}
return false;
}
//now check from the end of the list, and see if this
//has <term> TO <term> <start clause>
//if there aren't three tokens since the start token offset, return
if (openClause.getTokenOffsetStart() != tokens.size() - 4) {
return testBadRange(hasCurly);
}
//check for "TO"
SQPToken t = tokens.get(tokens.size() - 2);
if (t instanceof SQPTerm &&
((SQPTerm) t).getString().equals("TO")) {
} else {
return testBadRange(hasCurly);
}
SQPToken candStart = tokens.get(tokens.size()-3);
SQPToken candEnd = tokens.get(tokens.size()-1);
if (candStart instanceof SQPTerminal &&
candEnd instanceof SQPTerminal) {
//great
} else {
return testBadRange(hasCurly);
}
String endString = getCandidateRangeTermString((SQPTerminal)candEnd);
String startString = getCandidateRangeTermString((SQPTerminal)candStart);
//
boolean startInclusive = (openClause.getType() == SQPClause.TYPE.BRACKET);
boolean endInclusive = (closeType == SQPClause.TYPE.BRACKET);
SQPToken range = new SQPRangeTerm(startString, endString, startInclusive, endInclusive);
//remove last term
tokens.remove(tokens.size() - 1);
//remove TO
tokens.remove(tokens.size() - 1);
//remove first term
tokens.remove(tokens.size() - 1);
//remove start clause marker
tokens.remove(tokens.size() - 1);
tokens.add(range);
Float boost = closeClause.getBoost();
if (boost != null) {
((SQPBoostableOrPositionRangeToken)range).setBoost(boost);
}
return true;
}
private String getCandidateRangeTermString(SQPTerminal t) throws ParseException {
if (t instanceof SQPRangeTerm) {
throw new ParseException("Can't include a range term within a range query. Make sure to escape the range characters.");
} else if (t instanceof SQPFuzzyTerm) {
throw new ParseException("Can't include a fuzzy term within a range query. Make sure to escape the fuzzy term characters.");
} else if (t instanceof SQPRegexTerm) {
throw new ParseException("Can't include a regex term within a range query. Make sure to escape the regex term characters.");
} else if (t instanceof SQPPrefixTerm) {
throw new ParseException("Can't include a prefix term within a range query. Make sure to escape the *.");
} else if (t instanceof SQPWildcardTerm) {
String wc = ((SQPWildcardTerm)t).getString();
if (wc.equals("*")) {
return null;
}
throw new ParseException("Can't include a wildcard term within a range query. Make sure to escape the * and ?.");
} else if (t instanceof SQPTerm) {
return ((SQPTerm)t).getString();
}
throw new IllegalArgumentException("Unrecognizable class in getCandidateRangeTermString: "+t.getClass());
}
private boolean testBadRange(boolean hasCurly) throws ParseException {
if (hasCurly) {
throw new ParseException("Curly brackets should only be used in range queries");
}
return false;
}
//tests that open and closing clause markers match
//throws parse exception if not
private void testMatchingOpenClose(SQPOpenClause open, SQPClause.TYPE closeType)
throws ParseException {
SQPClause.TYPE openType = open.getType();
if (openType == closeType) {
//no op
} else if (openType == SQPClause.TYPE.BRACKET) {
if (closeType != SQPClause.TYPE.BRACKET &&
closeType != SQPClause.TYPE.CURLY) {
throw new ParseException("Mismatching phrasal elements:" +
openType.name() + " and " + closeType.name());
}
} else if (openType == SQPOpenClause.TYPE.CURLY) {
if (closeType != SQPClause.TYPE.BRACKET &&
closeType != SQPClause.TYPE.CURLY) {
throw new ParseException("Mismatching phrasal elements:" +
openType.name() + " and " + closeType.name());
}
}
}
private void handleOpenClause(SQPClause.TYPE type) {
SQPOpenClause open = new SQPOpenClause(tokens.size(), type);
stack.push(open);
tokens.add(open);
if (type != SQPClause.TYPE.PAREN) {
nearDepth++;
}
}
//this reads everything to a matching end token, e.g. ' or /.
//the targChar token is escaped by being doubled.
//This unescapes the targChar
private boolean readToMatchingEndToken(int targChar) throws ParseException, IOException {
int c = reader.read();
boolean hitEndOfString = false;
while (true) {
if (c == -1) {
//won't work with bmp targChar!
throw new ParseException("Didn't find matching: " + (char) targChar);
} else if (c == targChar) {
int next = reader.read();
if (next == -1) {
hitEndOfString = true;
break;
} else if (next == targChar) {
tokenBuffer.appendCodePoint(targChar);
c = reader.read();
continue;
} else {
reader.unread(next);
break;
}
}
tokenBuffer.appendCodePoint(c);
c = reader.read();
}
if (tokenBuffer.length() == 0) {
throw new ParseException("must have some content between " + (char) targChar + "s");
}
String contents = tokenBuffer.toString();
SQPToken token = null;
if (type == TOKEN_TYPE.REGEX) {
token = new SQPRegexTerm(contents);
} else if (type == TOKEN_TYPE.QUOTED) {
token = new SQPTerm(contents, true);
} else {
throw new IllegalArgumentException("Don't know how to handle: "+targChar+" while building tokens");
}
tryToReadBoostOrPositionRange((SQPBoostableOrPositionRangeToken)token);
tokens.add(token);
resetTokenBuffer();
return !hitEndOfString;
}
private void flushBuffer() throws ParseException, IOException {
flushBuffer(null);
}
private void flushBuffer(BoostPositionRange bpr) throws ParseException, IOException {
if (tokenBuffer.length() == 0) {
return;
}
String term = tokenBuffer.toString();
//test for AllDocs
if (term.length() == 1 && term.codePointAt(0) == STAR) {
if (tokens.size() > 0) {
SQPToken t = tokens.get(tokens.size()-1);
if (t instanceof SQPField &&
((SQPField)t).getField().equals("*")){
SQPAllDocsTerm allDocs = new SQPAllDocsTerm();
if (bpr != null) {
allDocs.setBoost(bpr.boost);
}
tokens.set(tokens.size()-1, allDocs);
resetTokenBuffer();
return;
}
}
}
//The regex over-captures on a term...Term could be:
//AND or NOT boolean defaultOperator; and term could have boost
boolean checkForEscapedOperators = false;
//does the term == AND or NOT or OR
if (nearDepth == 0 && type == TOKEN_TYPE.UNSPECIFIED) {
SQPToken token = null;
if (term.equals(AND)) {
token = new SQPBooleanOpToken(SpanQueryParserBase.CONJ_AND);
} else if (term.equals(NOT)) {
token = new SQPBooleanOpToken(SpanQueryParserBase.MOD_NOT);
} else if (term.equals(OR)) {
token = new SQPBooleanOpToken(SpanQueryParserBase.CONJ_OR);
}
if (token != null) {
if (bpr != null && bpr.hasAValue()) {
throw new ParseException("Can't have boost on a boolean operator (AND|NOT|OR)");
}
testBooleanTokens(tokens, (SQPBooleanOpToken)token);
tokens.add(token);
resetTokenBuffer();
return;
} else {
//only check for escaped operators if they're where you'd need to escape them
checkForEscapedOperators = true;
}
}
//now trim escapes if there are no wildcard characters
if (wildcardQuestionMarks == 0) {
term = stripEscapes(tokenBuffer.toString());
}
SQPToken token = null;
if (type == TOKEN_TYPE.REGEX) {
token = new SQPRegexTerm(term);
} else if (wildcardChars > 0) {
token = buildWildcard(term);
} else if (type == TOKEN_TYPE.QUOTED) {
token = new SQPTerm(term, true);
} else if (checkForEscapedOperators) {
if (term.equals(ESCAPED_AND)) {
token = new SQPTerm(AND, false);
} else if (term.equals(ESCAPED_NOT)) {
token = new SQPTerm(NOT, false);
} else if (term.equals(ESCAPED_OR)) {
token = new SQPTerm(OR, false);
} else {
token = new SQPTerm(term, false);
}
} else {
token = new SQPTerm(term, false);
}
if (bpr != null) {
((SQPBoostableOrPositionRangeToken)token).setBoost(bpr.boost);
((SQPBoostableOrPositionRangeToken)token).setStartPosition(bpr.start);
((SQPBoostableOrPositionRangeToken)token).setEndPosition(bpr.end);
}
tokens.add(token);
resetTokenBuffer();
}
private String stripEscapes(String term) throws IOException {
Reader r = new StringReader(term);
StringBuilder sb = new StringBuilder();
int c = r.read();
while (c != -1) {
if (c == BACK_SLASH) {
c = r.read();
if (c == -1) {
break;
}
}
sb.appendCodePoint(c);
c = r.read();
}
return sb.toString();
}
private SQPTerminal buildWildcard(String term) {
if (term.equals("*")) {
return new SQPWildcardTerm("*");
}
if (wildcardChars == 1 && term.endsWith("*")) {
String prefix = term.substring(0,term.length()-1);
return new SQPPrefixTerm(prefix);
}
return new SQPWildcardTerm(term);
}
private void resetTokenBuffer() {
tokenBuffer.setLength(0);
wildcardChars = 0;
wildcardQuestionMarks = 0;
type = TOKEN_TYPE.UNSPECIFIED;
}
private void tryToAddField(String term) throws ParseException, IOException {
if (term.length() == 0) {
throw new ParseException("Field name must have length > 0");
}
if (nearDepth != 0) {
throw new ParseException("Can't specify a field within a \"Near\" clause");
}
if (tokens.size() > 0 && tokens.get(tokens.size()-1) instanceof SQPField) {
throw new ParseException("A field must contain a terminal");
}
SQPToken token = new SQPField(stripEscapes(term));
tokens.add(token);
}
/**
* Test whether this token can be added to the list of tokens
* based on classic queryparser rules
*/
private void testBooleanTokens(List<SQPToken> tokens, SQPBooleanOpToken token)
throws ParseException {
//there are possible exceptions with tokens.size()==0, but they
//are the same exceptions as at clause beginning.
//Need to test elsewhere for start of clause issues.
if (tokens.size() == 0) {
return;
}
SQPToken t = tokens.get(tokens.size()-1);
if (t instanceof SQPBooleanOpToken) {
int curr = ((SQPBooleanOpToken)t).getType();
int nxt = token.getType();
boolean ex = false;
if (SQPBooleanOpToken.isMod(curr)) {
ex = true;
} else if (curr == SpanQueryParser.CONJ_AND &&
nxt == SpanQueryParser.CONJ_AND) {
ex = true;
} else if (curr == SpanQueryParser.CONJ_OR &&
! SQPBooleanOpToken.isMod(nxt) ) {
ex = true;
} else if (curr == SpanQueryParser.MOD_NOT) {
ex = true;
}
if (ex) {
throw new ParseException("Illegal combination of boolean conjunctions and modifiers");
}
}
}
private void tryToUnread(String s) throws IOException {
final int length = s.length();
List<Integer> ints = new ArrayList<>();
for (int offset = 0; offset < length; ) {
final int codepoint = s.codePointAt(offset);
ints.add(codepoint);
offset += Character.charCount(codepoint);
}
for (int i = ints.size()-1; i > -1; i--) {
tryToUnread(ints.get(i));
}
}
private void tryToUnread(int c) throws IOException {
if (c != -1) {
reader.unread(c);
}
}
//returns null if reading an integer fails
private Integer tryToReadInteger() throws IOException {
StringBuilder sb = new StringBuilder();
while (true) {
int c = reader.read();
int val = c-48;
if (val >= 0 && val <= 9) {
sb.append(val);
} else {
tryToUnread(c);
break;
}
}
if (sb.length() == 0) {
return null;
}
return Integer.parseInt(sb.toString());
}
private Float tryToReadUnsignedFloat() throws ParseException, IOException {
StringBuilder sb = new StringBuilder();
boolean seenDecimalPoint = false;
int c = reader.read();
if (c == MINUS) {
throw new ParseException("Negative values not allowed.");
} else if (c == PLUS) {
throw new ParseException("Plus sign not allowed.");
} else {
tryToUnread(c);
}
while (true) {
c = reader.read();
int val = c-48;
if (c == DECIMAL_POINT) {
if (seenDecimalPoint) {
tryToUnread(c);
break;
} else {
seenDecimalPoint = true;
sb.appendCodePoint(DECIMAL_POINT);
}
} else if (val >= 0 && val <= 9) {
sb.append(val);
} else {
tryToUnread(c);
break;
}
}
String tmpFloatString = sb.toString();
if (tmpFloatString.length() == 0) {
return null;
} else if (tmpFloatString.equals(".")) {
//or do we want to unread and move on?
//tryToUnread(DECIMAL);
//return null;
throw new ParseException("Single \".\" appears where there should be a float!");
}
return Float.parseFloat(tmpFloatString);
}
private void tryToReadEscapedUnicode() throws ParseException, IOException {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < 4; i++) {
int c = reader.read();
if (c == -1) {
throw new ParseException("Invalid escaped unicode character. >"+sb.toString()+"< and the end of the query string");
}
if (isHex(c)){
sb.appendCodePoint(c);
} else {
throw new ParseException("Invalid escaped unicode character. >"+sb.toString()+"< and " +new String(Character.toChars(c)));
}
}
tokenBuffer.appendCodePoint(Integer.parseInt(sb.toString(), 16));
}
private boolean isHex(int c) {
if (c >= 48 && c <= 57) {
return true;
} else if (c >= 65 && c <= 70) {
return true;
} else if (c >= 97 && c <= 102) {
return true;
}
return false;
}
private static class BoostPositionRange {
Float boost;
Integer start;
Integer end;
BoostPositionRange(Float boost) {
this.boost = boost;
}
BoostPositionRange(Integer start, Integer end) {
setStartEnd(start, end);
}
void setBoost(Float boost) {
this.boost = boost;
}
void setStartEnd(Integer start, Integer end) {
if (start != null && end != null && start > end) {
this.start = end;
this.end = start;
} else {
this.start = start;
this.end = end;
}
}
boolean hasAValue() {
return (boost == null &&
start == null &&
end == null);
}
public boolean hasPosition() {
return (start != null || end != null);
}
}
}