/*
* (C) Copyright 2014 Nuxeo SA (http://nuxeo.com/) and others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Contributors:
* Florent Guillaume
*/
package org.nuxeo.ecm.core.storage;
import static java.lang.Boolean.FALSE;
import static java.lang.Boolean.TRUE;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collections;
import java.util.Deque;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang.CharUtils;
import org.apache.commons.lang.StringUtils;
import org.nuxeo.ecm.core.query.QueryParseException;
import org.nuxeo.ecm.core.query.sql.NXQL;
import org.nuxeo.ecm.core.query.sql.model.BooleanLiteral;
import org.nuxeo.ecm.core.query.sql.model.DateLiteral;
import org.nuxeo.ecm.core.query.sql.model.DoubleLiteral;
import org.nuxeo.ecm.core.query.sql.model.Expression;
import org.nuxeo.ecm.core.query.sql.model.Function;
import org.nuxeo.ecm.core.query.sql.model.IntegerLiteral;
import org.nuxeo.ecm.core.query.sql.model.Literal;
import org.nuxeo.ecm.core.query.sql.model.LiteralList;
import org.nuxeo.ecm.core.query.sql.model.MultiExpression;
import org.nuxeo.ecm.core.query.sql.model.Operand;
import org.nuxeo.ecm.core.query.sql.model.Operator;
import org.nuxeo.ecm.core.query.sql.model.Predicate;
import org.nuxeo.ecm.core.query.sql.model.Reference;
import org.nuxeo.ecm.core.query.sql.model.StringLiteral;
import com.google.common.collect.Iterators;
import com.google.common.collect.PeekingIterator;
/**
* Evaluator for an {@link Expression}.
*
* @since 5.9.4
*/
public abstract class ExpressionEvaluator {
/** pseudo NXQL to resolve ancestor ids. */
public static final String NXQL_ECM_ANCESTOR_IDS = "ecm:__ancestorIds";
/** pseudo NXQL to resolve internal path. */
public static final String NXQL_ECM_PATH = "ecm:__path";
/** pseudo NXQL to resolve read acls. */
public static final String NXQL_ECM_READ_ACL = "ecm:__read_acl";
public static final String NXQL_ECM_FULLTEXT_SIMPLE = "ecm:__fulltextSimple";
public static final String NXQL_ECM_FULLTEXT_BINARY = "ecm:__fulltextBinary";
protected static final String DATE_CAST = "DATE";
protected static final String PHRASE_QUOTE = "\"";
protected static final String NEG_PHRASE_QUOTE = "-\"";
protected static final String OR = "or";
/**
* Interface for a class that knows how to resolve a path into an id.
*/
public interface PathResolver {
/**
* Returns the id for a given path.
*
* @param path the path
* @return the id, or {@code null} if not found
*/
String getIdForPath(String path);
}
public final PathResolver pathResolver;
public final Set<String> principals;
public final boolean fulltextSearchDisabled;
public boolean hasFulltext;
public ExpressionEvaluator(PathResolver pathResolver, String[] principals, boolean fulltextSearchDisabled) {
this.pathResolver = pathResolver;
this.principals = principals == null ? null : new HashSet<String>(Arrays.asList(principals));
this.fulltextSearchDisabled = fulltextSearchDisabled;
}
public Object walkExpression(Expression expr) {
Operator op = expr.operator;
Operand lvalue = expr.lvalue;
Operand rvalue = expr.rvalue;
Reference ref = lvalue instanceof Reference ? (Reference) lvalue : null;
String name = ref != null ? ref.name : null;
String cast = ref != null ? ref.cast : null;
if (DATE_CAST.equals(cast)) {
checkDateLiteralForCast(rvalue, name);
}
if (op == Operator.STARTSWITH) {
return walkStartsWith(lvalue, rvalue);
} else if (NXQL.ECM_PATH.equals(name)) {
return walkEcmPath(op, rvalue);
} else if (NXQL.ECM_ANCESTORID.equals(name)) {
return walkAncestorId(op, rvalue);
} else if (name != null && name.startsWith(NXQL.ECM_FULLTEXT) && !NXQL.ECM_FULLTEXT_JOBID.equals(name)) {
return walkEcmFulltext(name, op, rvalue);
} else if (op == Operator.SUM) {
throw new UnsupportedOperationException("SUM");
} else if (op == Operator.SUB) {
throw new UnsupportedOperationException("SUB");
} else if (op == Operator.MUL) {
throw new UnsupportedOperationException("MUL");
} else if (op == Operator.DIV) {
throw new UnsupportedOperationException("DIV");
} else if (op == Operator.LT) {
return walkLt(lvalue, rvalue);
} else if (op == Operator.GT) {
return walkGt(lvalue, rvalue);
} else if (op == Operator.EQ) {
return walkEq(lvalue, rvalue);
} else if (op == Operator.NOTEQ) {
return walkNotEq(lvalue, rvalue);
} else if (op == Operator.LTEQ) {
return walkLtEq(lvalue, rvalue);
} else if (op == Operator.GTEQ) {
return walkGtEq(lvalue, rvalue);
} else if (op == Operator.AND) {
if (expr instanceof MultiExpression) {
return walkMultiExpression((MultiExpression) expr);
} else {
return walkAnd(lvalue, rvalue);
}
} else if (op == Operator.NOT) {
return walkNot(lvalue);
} else if (op == Operator.OR) {
return walkOr(lvalue, rvalue);
} else if (op == Operator.LIKE) {
return walkLike(lvalue, rvalue, true, false);
} else if (op == Operator.ILIKE) {
return walkLike(lvalue, rvalue, true, true);
} else if (op == Operator.NOTLIKE) {
return walkLike(lvalue, rvalue, false, false);
} else if (op == Operator.NOTILIKE) {
return walkLike(lvalue, rvalue, false, true);
} else if (op == Operator.IN) {
return walkIn(lvalue, rvalue, true);
} else if (op == Operator.NOTIN) {
return walkIn(lvalue, rvalue, false);
} else if (op == Operator.ISNULL) {
return walkIsNull(lvalue);
} else if (op == Operator.ISNOTNULL) {
return walkIsNotNull(lvalue);
} else if (op == Operator.BETWEEN) {
return walkBetween(lvalue, rvalue, true);
} else if (op == Operator.NOTBETWEEN) {
return walkBetween(lvalue, rvalue, false);
} else {
throw new QueryParseException("Unknown operator: " + op);
}
}
protected void checkDateLiteralForCast(Operand value, String name) {
if (value instanceof DateLiteral && !((DateLiteral) value).onlyDate) {
throw new QueryParseException("DATE() cast must be used with DATE literal, not TIMESTAMP: " + name);
}
}
protected Boolean walkEcmPath(Operator op, Operand rvalue) {
if (op != Operator.EQ && op != Operator.NOTEQ) {
throw new QueryParseException(NXQL.ECM_PATH + " requires = or <> operator");
}
if (!(rvalue instanceof StringLiteral)) {
throw new QueryParseException(NXQL.ECM_PATH + " requires literal path as right argument");
}
String path = ((StringLiteral) rvalue).value;
if (path.length() > 1 && path.endsWith("/")) {
path = path.substring(0, path.length() - 1);
}
String id = pathResolver.getIdForPath(path);
Object right = walkReference(new Reference(NXQL.ECM_UUID));
if (id == null) {
return FALSE;
}
Boolean eq = eq(id, right);
return op == Operator.EQ ? eq : not(eq);
}
protected Boolean walkAncestorId(Operator op, Operand rvalue) {
if (op != Operator.EQ && op != Operator.NOTEQ) {
throw new QueryParseException(NXQL.ECM_ANCESTORID + " requires = or <> operator");
}
if (!(rvalue instanceof StringLiteral)) {
throw new QueryParseException(NXQL.ECM_ANCESTORID + " requires literal id as right argument");
}
String ancestorId = ((StringLiteral) rvalue).value;
Object[] ancestorIds = (Object[]) walkReference(new Reference(NXQL_ECM_ANCESTOR_IDS));
boolean eq = op == Operator.EQ ? true : false;
if (ancestorIds == null) {
// placeless
return eq ? FALSE : TRUE;
}
for (Object id : ancestorIds) {
if (ancestorId.equals(id)) {
return eq ? TRUE : FALSE;
}
}
return eq ? FALSE : TRUE;
}
protected Boolean walkEcmFulltext(String name, Operator op, Operand rvalue) {
if (op != Operator.EQ && op != Operator.LIKE) {
throw new QueryParseException(NXQL.ECM_FULLTEXT + " requires = or LIKE operator");
}
if (!(rvalue instanceof StringLiteral)) {
throw new QueryParseException(NXQL.ECM_FULLTEXT + " requires literal string as right argument");
}
if (fulltextSearchDisabled) {
throw new QueryParseException("Fulltext search disabled by configuration");
}
String query = ((StringLiteral) rvalue).value;
if (name.equals(NXQL.ECM_FULLTEXT)) {
// standard fulltext query
hasFulltext = true;
String simple = (String) walkReference(new Reference(NXQL_ECM_FULLTEXT_SIMPLE));
String binary = (String) walkReference(new Reference(NXQL_ECM_FULLTEXT_BINARY));
return fulltext(simple, binary, query);
} else {
// secondary index match with explicit field
// do a regexp on the field
if (name.charAt(NXQL.ECM_FULLTEXT.length()) != '.') {
throw new QueryParseException(name + " has incorrect syntax for a secondary fulltext index");
}
String prop = name.substring(NXQL.ECM_FULLTEXT.length() + 1);
String ft = query.replace(" ", "%");
rvalue = new StringLiteral(ft);
return walkLike(new Reference(prop), rvalue, true, true);
}
}
public Boolean walkNot(Operand value) {
return not(bool(walkOperand(value)));
}
public Boolean walkIsNull(Operand value) {
return Boolean.valueOf(walkOperand(value) == null);
}
public Boolean walkIsNotNull(Operand value) {
return Boolean.valueOf(walkOperand(value) != null);
}
// ternary logic
public Boolean walkMultiExpression(MultiExpression expr) {
Boolean res = TRUE;
for (Operand value : expr.values) {
Boolean bool = bool(walkOperand(value));
// don't short-circuit on null, we want to walk all references deterministically
res = and(res, bool);
}
return res;
}
public Boolean walkAnd(Operand lvalue, Operand rvalue) {
Boolean left = bool(walkOperand(lvalue));
Boolean right = bool(walkOperand(rvalue));
return and(left, right);
}
public Boolean walkOr(Operand lvalue, Operand rvalue) {
Boolean left = bool(walkOperand(lvalue));
Boolean right = bool(walkOperand(rvalue));
return or(left, right);
}
public Boolean walkEq(Operand lvalue, Operand rvalue) {
Object right = walkOperand(rvalue);
if (isMixinTypes(lvalue)) {
if (!(right instanceof String)) {
throw new QueryParseException("Invalid EQ rhs: " + rvalue);
}
return walkMixinTypes(Collections.singletonList((String) right), true);
}
Object left = walkOperand(lvalue);
return eqMaybeList(left, right);
}
public Boolean walkNotEq(Operand lvalue, Operand rvalue) {
if (isMixinTypes(lvalue)) {
Object right = walkOperand(rvalue);
if (!(right instanceof String)) {
throw new QueryParseException("Invalid NE rhs: " + rvalue);
}
return walkMixinTypes(Collections.singletonList((String) right), false);
}
return not(walkEq(lvalue, rvalue));
}
public Boolean walkLt(Operand lvalue, Operand rvalue) {
Integer cmp = cmp(lvalue, rvalue);
return cmp == null ? null : cmp < 0;
}
public Boolean walkGt(Operand lvalue, Operand rvalue) {
Integer cmp = cmp(lvalue, rvalue);
return cmp == null ? null : cmp > 0;
}
public Boolean walkLtEq(Operand lvalue, Operand rvalue) {
Integer cmp = cmp(lvalue, rvalue);
return cmp == null ? null : cmp <= 0;
}
public Boolean walkGtEq(Operand lvalue, Operand rvalue) {
Integer cmp = cmp(lvalue, rvalue);
return cmp == null ? null : cmp >= 0;
}
public Object walkBetween(Operand lvalue, Operand rvalue, boolean positive) {
LiteralList l = (LiteralList) rvalue;
Predicate va = new Predicate(lvalue, Operator.GTEQ, l.get(0));
Predicate vb = new Predicate(lvalue, Operator.LTEQ, l.get(1));
Predicate pred = new Predicate(va, Operator.AND, vb);
if (!positive) {
pred = new Predicate(pred, Operator.NOT, null);
}
return walkExpression(pred);
}
public Boolean walkIn(Operand lvalue, Operand rvalue, boolean positive) {
Object right = walkOperand(rvalue);
if (!(right instanceof List)) {
throw new QueryParseException("Invalid IN rhs: " + rvalue);
}
if (isMixinTypes(lvalue)) {
return walkMixinTypes((List<String>) right, positive);
}
Object left = walkOperand(lvalue);
Boolean in = inMaybeList(left, (List<Object>) right);
return positive ? in : not(in);
}
public Object walkOperand(Operand op) {
if (op instanceof Literal) {
return walkLiteral((Literal) op);
} else if (op instanceof LiteralList) {
return walkLiteralList((LiteralList) op);
} else if (op instanceof Function) {
return walkFunction((Function) op);
} else if (op instanceof Expression) {
return walkExpression((Expression) op);
} else if (op instanceof Reference) {
return walkReference((Reference) op);
} else {
throw new QueryParseException("Unknown operand: " + op);
}
}
public Object walkLiteral(Literal lit) {
if (lit instanceof BooleanLiteral) {
return walkBooleanLiteral((BooleanLiteral) lit);
} else if (lit instanceof DateLiteral) {
return walkDateLiteral((DateLiteral) lit);
} else if (lit instanceof DoubleLiteral) {
return walkDoubleLiteral((DoubleLiteral) lit);
} else if (lit instanceof IntegerLiteral) {
return walkIntegerLiteral((IntegerLiteral) lit);
} else if (lit instanceof StringLiteral) {
return walkStringLiteral((StringLiteral) lit);
} else {
throw new QueryParseException("Unknown literal: " + lit);
}
}
public Boolean walkBooleanLiteral(BooleanLiteral lit) {
return Boolean.valueOf(lit.value);
}
public Calendar walkDateLiteral(DateLiteral lit) {
if (lit.onlyDate) {
Calendar date = lit.toCalendar();
if (date != null) {
date.set(Calendar.HOUR_OF_DAY, 0);
date.set(Calendar.MINUTE, 0);
date.set(Calendar.SECOND, 0);
date.set(Calendar.MILLISECOND, 0);
}
return date;
} else {
return lit.toCalendar();
}
}
public Double walkDoubleLiteral(DoubleLiteral lit) {
return Double.valueOf(lit.value);
}
public Long walkIntegerLiteral(IntegerLiteral lit) {
return Long.valueOf(lit.value);
}
public String walkStringLiteral(StringLiteral lit) {
return lit.value;
}
public List<Object> walkLiteralList(LiteralList litList) {
List<Object> list = new ArrayList<Object>(litList.size());
for (Literal lit : litList) {
list.add(walkLiteral(lit));
}
return list;
}
public Boolean walkLike(Operand lvalue, Operand rvalue, boolean positive, boolean caseInsensitive) {
Object left = walkOperand(lvalue);
Object right = walkOperand(rvalue);
if (!(right instanceof String)) {
throw new QueryParseException("Invalid LIKE rhs: " + rvalue);
}
return likeMaybeList(left, (String) right, positive, caseInsensitive);
}
public Object walkFunction(Function func) {
throw new UnsupportedOperationException("Function");
}
public Boolean walkStartsWith(Operand lvalue, Operand rvalue) {
if (!(lvalue instanceof Reference)) {
throw new QueryParseException("Invalid STARTSWITH query, left hand side must be a property: " + lvalue);
}
String name = ((Reference) lvalue).name;
if (!(rvalue instanceof StringLiteral)) {
throw new QueryParseException(
"Invalid STARTSWITH query, right hand side must be a literal path: " + rvalue);
}
String path = ((StringLiteral) rvalue).value;
if (path.length() > 1 && path.endsWith("/")) {
path = path.substring(0, path.length() - 1);
}
if (NXQL.ECM_PATH.equals(name)) {
return walkStartsWithPath(path);
} else {
return walkStartsWithNonPath(lvalue, path);
}
}
protected Boolean walkStartsWithPath(String path) {
// resolve path
String ancestorId = pathResolver.getIdForPath(path);
// don't return early on null ancestorId, we want to walk all references deterministically
Object[] ancestorIds = (Object[]) walkReference(new Reference(NXQL_ECM_ANCESTOR_IDS));
if (ancestorId == null) {
// no such path
return FALSE;
}
if (ancestorIds == null) {
// placeless
return FALSE;
}
for (Object id : ancestorIds) {
if (ancestorId.equals(id)) {
return TRUE;
}
}
return FALSE;
}
protected Boolean walkStartsWithNonPath(Operand lvalue, String path) {
Object left = walkReference((Reference) lvalue);
// exact match
Boolean bool = eqMaybeList(left, path);
if (TRUE.equals(bool)) {
return TRUE;
}
// prefix match TODO escape % chars
String pattern = path + "/%";
return likeMaybeList(left, pattern, true, false);
}
/**
* Evaluates a reference over the context state.
*
* @param ref the reference
*/
public abstract Object walkReference(Reference ref);
protected boolean isMixinTypes(Operand op) {
if (!(op instanceof Reference)) {
return false;
}
return ((Reference) op).name.equals(NXQL.ECM_MIXINTYPE);
}
protected Boolean bool(Object value) {
if (value == null) {
return null;
}
if (!(value instanceof Boolean)) {
throw new QueryParseException("Not a boolean: " + value);
}
return (Boolean) value;
}
// ternary logic
protected Boolean not(Boolean value) {
if (value == null) {
return null;
}
return !value;
}
// ternary logic
protected Boolean and(Boolean left, Boolean right) {
if (TRUE.equals(left)) {
return right;
} else {
return left;
}
}
// ternary logic
protected Boolean or(Boolean left, Boolean right) {
if (TRUE.equals(left)) {
return left;
} else {
return right;
}
}
// ternary logic
protected Boolean eq(Object left, Object right) {
if (left == null || right == null) {
return null;
}
if (left instanceof Calendar && right instanceof Calendar) {
// avoid timezone issues (NXP-20260)
return ((Calendar) left).getTimeInMillis() == ((Calendar) right).getTimeInMillis();
}
return left.equals(right);
}
// ternary logic
protected Boolean in(Object left, List<Object> right) {
if (left == null) {
return null;
}
boolean hasNull = false;
for (Object r : right) {
if (r == null) {
hasNull = true;
} else if (left.equals(r)) {
return TRUE;
}
}
return hasNull ? null : FALSE;
}
protected Integer cmp(Operand lvalue, Operand rvalue) {
Object left = walkOperand(lvalue);
Object right = walkOperand(rvalue);
return cmp(left, right);
}
// ternary logic
protected Integer cmp(Object left, Object right) {
if (left == null || right == null) {
return null;
}
if (!(left instanceof Comparable)) {
throw new QueryParseException("Not a comparable: " + left);
}
return ((Comparable<Object>) left).compareTo(right);
}
// ternary logic
protected Boolean like(Object left, String right, boolean caseInsensitive) {
if (left == null || right == null) {
return null;
}
if (!(left instanceof String)) {
throw new QueryParseException("Invalid LIKE lhs: " + left);
}
String value = (String) left;
if (caseInsensitive) {
value = value.toLowerCase();
right = right.toLowerCase();
}
String regex = likeToRegex(right);
boolean match = Pattern.matches(regex.toString(), value);
return match;
}
/**
* Turns a NXQL LIKE pattern into a regex.
* <p>
* % and _ are standard wildcards, and \ escapes them.
*
* @since 7.4
*/
public static String likeToRegex(String like) {
StringBuilder regex = new StringBuilder();
char[] chars = like.toCharArray();
boolean escape = false;
for (int i = 0; i < chars.length; i++) {
char c = chars[i];
boolean escapeNext = false;
switch (c) {
case '%':
if (escape) {
regex.append(c);
} else {
regex.append(".*");
}
break;
case '_':
if (escape) {
regex.append(c);
} else {
regex.append(".");
}
break;
case '\\':
if (escape) {
regex.append("\\\\"); // backslash escaped for regexp
} else {
escapeNext = true;
}
break;
default:
// escape mostly everything just in case
if (!CharUtils.isAsciiAlphanumeric(c)) {
regex.append("\\");
}
regex.append(c);
break;
}
escape = escapeNext;
}
if (escape) {
// invalid string terminated by escape character, ignore
}
return regex.toString();
}
// if list, use EXIST (SELECT 1 FROM left WHERE left.item = right)
protected Boolean eqMaybeList(Object left, Object right) {
if (left instanceof Object[]) {
for (Object l : ((Object[]) left)) {
Boolean eq = eq(l, right);
if (TRUE.equals(eq)) {
return TRUE;
}
}
return FALSE;
} else {
return eq(left, right);
}
}
// if list, use EXIST (SELECT 1 FROM left WHERE left.item IN right)
protected Boolean inMaybeList(Object left, List<Object> right) {
if (left instanceof Object[]) {
for (Object l : ((Object[]) left)) {
Boolean in = in(l, right);
if (TRUE.equals(in)) {
return TRUE;
}
}
return FALSE;
} else {
return in(left, right);
}
}
protected Boolean likeMaybeList(Object left, String right, boolean positive, boolean caseInsensitive) {
if (left instanceof Object[]) {
for (Object l : ((Object[]) left)) {
Boolean like = like(l, right, caseInsensitive);
if (TRUE.equals(like)) {
return Boolean.valueOf(positive);
}
}
return Boolean.valueOf(!positive);
} else {
Boolean like = like(left, right, caseInsensitive);
return positive ? like : not(like);
}
}
/**
* Matches the mixin types against a list of values.
* <p>
* Used for:
* <ul>
* <li>ecm:mixinTypes = 'foo'
* <li>ecm:mixinTypes != 'foo'
* <li>ecm:mixinTypes IN ('foo', 'bar')
* <li>ecm:mixinTypes NOT IN ('foo', 'bar')
* </ul>
*
* @param mixins the mixin(s) to match
* @param include {@code true} for = and IN
* @since 7.4
*/
public abstract Boolean walkMixinTypes(List<String> mixins, boolean include);
/*
* ----- simple parsing, don't try to be exhaustive -----
*/
private static final Pattern WORD_PATTERN = Pattern.compile("[\\s\\p{Punct}]+");
private static final String UNACCENTED = "aaaaaaaceeeeiiii\u00f0nooooo\u00f7ouuuuy\u00fey";
private static final String STOP_WORDS_STR = "a an are and as at be by for from how " //
+ "i in is it of on or that the this to was what when where who will with " //
+ "car donc est il ils je la le les mais ni nous or ou pour tu un une vous " //
+ "www com net org";
private static final Set<String> STOP_WORDS = new HashSet<>(Arrays.asList(StringUtils.split(STOP_WORDS_STR, ' ')));
/**
* Checks if the fulltext combination of string1 and string2 matches the query expression.
*/
protected static Boolean fulltext(String string1, String string2, String queryString) {
if (queryString == null || (string1 == null && string2 == null)) {
return null;
}
// query
List<String> query = new ArrayList<String>();
String phrase = null;
int phraseWordCount = 1;
int maxPhraseWordCount = 1; // maximum number of words in a phrase
for (String word : StringUtils.split(queryString.toLowerCase(), ' ')) {
if (WORD_PATTERN.matcher(word).matches()) {
continue;
}
if (phrase != null) {
if (word.endsWith(PHRASE_QUOTE)) {
phrase += " " + word.substring(0, word.length() - 1);
query.add(phrase);
phraseWordCount++;
if (maxPhraseWordCount < phraseWordCount) {
maxPhraseWordCount = phraseWordCount;
}
phrase = null;
phraseWordCount = 1;
} else {
phrase += " " + word;
phraseWordCount++;
}
} else {
if (word.startsWith(PHRASE_QUOTE)) {
phrase = word.substring(1);
} else if (word.startsWith(NEG_PHRASE_QUOTE)) {
phrase = "-" + word.substring(2);
} else {
if (word.startsWith("+")) {
word = word.substring(1);
}
query.add(word);
}
}
}
if (query.isEmpty()) {
return FALSE;
}
// fulltext
Set<String> fulltext = new HashSet<String>();
fulltext.addAll(parseFullText(string1, maxPhraseWordCount));
fulltext.addAll(parseFullText(string2, maxPhraseWordCount));
return Boolean.valueOf(fulltext(fulltext, query));
}
private static Set<String> parseFullText(String string, int phraseSize) {
if (string == null) {
return Collections.emptySet();
}
Set<String> set = new HashSet<String>();
Deque<String> phraseWords = new LinkedList<>();
for (String word : WORD_PATTERN.split(string)) {
word = parseWord(word);
if (word != null) {
word = word.toLowerCase();
set.add(word);
if (phraseSize > 1) {
phraseWords.addLast(word);
if (phraseWords.size() > 1) {
if (phraseWords.size() > phraseSize) {
phraseWords.removeFirst();
}
addPhraseWords(set, phraseWords);
}
}
}
}
while (phraseWords.size() > 2) {
phraseWords.removeFirst();
addPhraseWords(set, phraseWords);
}
return set;
}
/**
* Adds to the set all the sub-phrases from the start of the phraseWords.
*/
private static void addPhraseWords(Set<String> set, Deque<String> phraseWords) {
String[] array = phraseWords.toArray(new String[0]);
for (int len = 2; len <= array.length; len++) {
String phrase = StringUtils.join(array, ' ', 0, len);
set.add(phrase);
}
}
private static String parseWord(String string) {
int len = string.length();
if (len < 3) {
return null;
}
StringBuilder buf = new StringBuilder(len);
for (int i = 0; i < len; i++) {
char c = Character.toLowerCase(string.charAt(i));
if (c == '\u00e6') {
buf.append("ae");
} else if (c >= '\u00e0' && c <= '\u00ff') {
buf.append(UNACCENTED.charAt((c) - 0xe0));
} else if (c == '\u0153') {
buf.append("oe");
} else {
buf.append(c);
}
}
// simple heuristic to remove plurals
int l = buf.length();
if (l > 3 && buf.charAt(l - 1) == 's') {
buf.setLength(l - 1);
}
String word = buf.toString();
if (STOP_WORDS.contains(word)) {
return null;
}
return word;
}
// matches "foo OR bar baz" as "foo OR (bar AND baz)"
protected static boolean fulltext(Set<String> fulltext, List<String> query) {
boolean andMatch = true;
for (PeekingIterator<String> it = Iterators.peekingIterator(query.iterator()); it.hasNext(); ) {
String word = it.next();
boolean match;
if (word.endsWith("*") || word.endsWith("%")) {
// prefix match
match = false;
String prefix = word.substring(0, word.length() - 2);
for (String candidate : fulltext) {
if (candidate.startsWith(prefix)) {
match = true;
break;
}
}
} else {
if (word.startsWith("-")) {
word = word.substring(1);//
match = !fulltext.contains(word);
} else {
match = fulltext.contains(word);
}
}
if (!match) {
andMatch = false;
}
if (it.hasNext() && it.peek().equals(OR)) {
// end of AND group
// swallow OR
it.next();
// return if the previous AND group matched
if (andMatch) {
return true;
}
// else start next AND group
andMatch = true;
}
}
return andMatch;
}
// matches "foo OR bar baz" as "(foo OR bar) AND baz"
protected static boolean fulltext1(Set<String> fulltext, List<String> query) {
boolean inOr = false; // if we're in a OR group
boolean orMatch = false; // value of the OR group
for (PeekingIterator<String> it = Iterators.peekingIterator(query.iterator()); it.hasNext(); ) {
String word = it.next();
if (it.hasNext() && it.peek().equals(OR)) {
inOr = true;
orMatch = false;
}
boolean match;
if (word.endsWith("*") || word.endsWith("%")) {
// prefix match
match = false;
String prefix = word.substring(0, word.length() - 2);
for (String candidate : fulltext) {
if (candidate.startsWith(prefix)) {
match = true;
break;
}
}
} else {
if (word.startsWith("-")) {
word = word.substring(1);//
match = !fulltext.contains(word);
} else {
match = fulltext.contains(word);
}
}
if (inOr) {
if (match) {
orMatch = true;
}
if (it.hasNext() && it.peek().equals(OR)) {
// swallow OR and keep going in OR group
it.next();
continue;
}
// finish OR group
match = orMatch;
inOr = false;
}
if (!match) {
return false;
}
}
if (inOr) {
// trailing OR, ignore and finish previous group
if (!orMatch) {
return false;
}
}
return true;
}
}