/*
* (C) Copyright 2006-2014 Nuxeo SA (http://nuxeo.com/) and others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Contributors:
* Florent Guillaume
*/
package org.nuxeo.ecm.core.storage;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.nuxeo.ecm.core.query.QueryParseException;
/**
* Structured fulltext query analyzer.
*/
public class FulltextQueryAnalyzer {
protected static final String SPACE = " ";
protected static final String PLUS = "+";
protected static final String MINUS = "-";
protected static final char CSPACE = ' ';
protected static final String DOUBLE_QUOTES = "\"";
protected static final String OR = "OR";
protected static final Pattern SEPARATOR = Pattern.compile("[ ]");
protected static final Pattern IGNORED = Pattern.compile("\\p{Punct}+");
/**
* Structured fulltext query operator.
*/
public enum Op {
OR, AND, WORD, NOTWORD
}
/**
* Structured fulltext query.
*/
public static class FulltextQuery {
public Op op;
/** The list of terms, if op is OR or AND */
public List<FulltextQuery> terms;
/** The word, if op is WORD or NOTWORD */
public String word;
/**
* Checks if the word is a phrase.
*/
public boolean isPhrase() {
return word != null && word.contains(SPACE);
}
}
protected FulltextQuery ft = new FulltextQuery();
protected List<FulltextQuery> terms = new LinkedList<FulltextQuery>();
protected FulltextQuery analyze(String query) {
query = query.replaceAll(" +", " ").trim();
if (query.trim().length() == 0) {
return null;
}
ft.op = Op.OR;
ft.terms = new LinkedList<FulltextQuery>();
// current sequence of ANDed terms
boolean wasOr = false;
String[] words = split(query);
for (Iterator<String> it = Arrays.asList(words).iterator(); it.hasNext();) {
boolean plus = false;
boolean minus = false;
String word = it.next();
if (ignored(word)) {
continue;
}
if (word.startsWith(PLUS)) {
plus = true;
word = word.substring(1);
} else if (word.startsWith(MINUS)) {
minus = true;
word = word.substring(1);
}
if (word.startsWith(DOUBLE_QUOTES)) {
// read phrase
word = word.substring(1);
StringBuilder phrase = null;
while (true) {
boolean end = word.endsWith(DOUBLE_QUOTES);
if (end) {
word = word.substring(0, word.length() - 1).trim();
}
if (word.contains(DOUBLE_QUOTES)) {
throw new QueryParseException("Invalid fulltext query (double quotes in word): " + query);
}
if (word.length() != 0) {
if (phrase == null) {
phrase = new StringBuilder();
} else {
phrase.append(CSPACE);
}
phrase.append(word);
}
if (end) {
break;
}
if (!it.hasNext()) {
throw new QueryParseException("Invalid fulltext query (unterminated phrase): " + query);
}
word = it.next();
}
if (phrase == null) {
continue;
}
word = phrase.toString();
} else if (word.equalsIgnoreCase(OR)) {
if (wasOr) {
throw new QueryParseException("Invalid fulltext query (OR OR): " + query);
}
if (terms.isEmpty()) {
throw new QueryParseException("Invalid fulltext query (standalone OR): " + query);
}
wasOr = true;
continue;
}
FulltextQuery w = new FulltextQuery();
if (minus) {
if (word.length() == 0) {
throw new QueryParseException("Invalid fulltext query (standalone -): " + query);
}
w.op = Op.NOTWORD;
} else {
if (plus) {
if (word.length() == 0) {
throw new QueryParseException("Invalid fulltext query (standalone +): " + query);
}
}
w.op = Op.WORD;
}
if (wasOr) {
endAnd();
wasOr = false;
}
w.word = word;
terms.add(w);
}
if (wasOr) {
throw new QueryParseException("Invalid fulltext query (final OR): " + query);
}
// final terms
endAnd();
int size = ft.terms.size();
if (size == 0) {
// all terms were negative
return null;
} else if (size == 1) {
// simplify when no OR
ft = ft.terms.get(0);
}
return ft;
}
protected String[] split(String query) {
return SEPARATOR.split(query);
}
protected boolean ignored(String word) {
if ("-".equals(word) || "+".equals(word) || word.contains("\"")) {
return false; // dealt with later, different error
}
return IGNORED.matcher(word).matches();
}
// add current ANDed terms to global OR
protected void endAnd() {
// put negative words at the end
List<FulltextQuery> pos = new LinkedList<FulltextQuery>();
List<FulltextQuery> neg = new LinkedList<FulltextQuery>();
for (FulltextQuery term : terms) {
if (term.op == Op.NOTWORD) {
neg.add(term);
} else {
pos.add(term);
}
}
if (!pos.isEmpty()) {
terms = pos;
terms.addAll(neg);
if (terms.size() == 1) {
ft.terms.add(terms.get(0));
} else {
FulltextQuery a = new FulltextQuery();
a.op = Op.AND;
a.terms = terms;
ft.terms.add(a);
}
}
terms = new LinkedList<FulltextQuery>();
}
public static void translate(FulltextQuery ft, StringBuilder buf, String or, String and, String andNot,
String wordStart, String wordEnd, Set<Character> wordCharsReserved, String phraseStart, String phraseEnd,
boolean quotePhraseWords) {
if (ft.op == Op.AND || ft.op == Op.OR) {
buf.append('(');
for (int i = 0; i < ft.terms.size(); i++) {
FulltextQuery term = ft.terms.get(i);
if (i > 0) {
buf.append(' ');
if (ft.op == Op.OR) {
buf.append(or);
} else { // Op.AND
if (term.op == Op.NOTWORD) {
buf.append(andNot);
} else {
buf.append(and);
}
}
buf.append(' ');
}
translate(term, buf, or, and, andNot, wordStart, wordEnd, wordCharsReserved, phraseStart, phraseEnd,
quotePhraseWords);
}
buf.append(')');
return;
} else {
String word = ft.word;
if (ft.isPhrase()) {
if (quotePhraseWords) {
boolean first = true;
for (String w : word.split(" ")) {
if (!first) {
buf.append(" ");
}
first = false;
appendWord(w, buf, wordStart, wordEnd, wordCharsReserved);
}
} else {
buf.append(phraseStart);
buf.append(word);
buf.append(phraseEnd);
}
} else {
appendWord(word, buf, wordStart, wordEnd, wordCharsReserved);
}
}
}
protected static void appendWord(String word, StringBuilder buf, String start, String end, Set<Character> reserved) {
boolean quote = true;
if (!reserved.isEmpty()) {
for (char c : word.toCharArray()) {
if (reserved.contains(Character.valueOf(c))) {
quote = false;
break;
}
}
}
if (quote) {
buf.append(start);
}
buf.append(word);
if (quote) {
buf.append(end);
}
}
public static boolean hasPhrase(FulltextQuery ft) {
if (ft.op == Op.AND || ft.op == Op.OR) {
for (FulltextQuery term : ft.terms) {
if (hasPhrase(term)) {
return true;
}
}
return false;
} else {
return ft.isPhrase();
}
}
/**
* Analyzes a fulltext query into a generic datastructure that can be used for each specific database.
* <p>
* List of terms containing only negative words are suppressed. Otherwise negative words are put at the end of the
* lists of terms.
*/
public static FulltextQuery analyzeFulltextQuery(String query) {
return new FulltextQueryAnalyzer().analyze(query);
}
/**
* Translate fulltext into a common pattern used by many servers.
*/
public static String translateFulltext(FulltextQuery ft, String or, String and, String andNot, String phraseQuote) {
StringBuilder buf = new StringBuilder();
translate(ft, buf, or, and, andNot, "", "", Collections.<Character> emptySet(), phraseQuote, phraseQuote, false);
return buf.toString();
}
/**
* Translate fulltext into a common pattern used by many servers.
*/
public static String translateFulltext(FulltextQuery ft, String or, String and, String andNot, String wordStart,
String wordEnd, Set<Character> wordCharsReserved, String phraseStart, String phraseEnd,
boolean quotePhraseWords) {
StringBuilder buf = new StringBuilder();
translate(ft, buf, or, and, andNot, wordStart, wordEnd, wordCharsReserved, phraseStart, phraseEnd,
quotePhraseWords);
return buf.toString();
}
}