//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.uima.utils.select;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.Validate;
/**
 * Parses a CSS selector into an Evaluator tree.
 *
 * <p>Use the static {@link #parse(String)} entry point; it creates a fresh parser for each query
 * (and for each sub query such as the argument of {@code :has(...)} or {@code :not(...)}).
 *
 * @param <T> type parameter passed through to every {@link Evaluator} this parser creates
 */
public class QueryParser<T> {

  /** Combining tokens */
  private static final String[] COMBINATORS = {",", ">", "+", "~", " "};

  /** Attribute evaluation tokens */
  private static final String[] ATTRIBUTE_EVALS = {"=", "!=", "^=", "$=", "*=", "~="};

  /**
   * Pattern for the {@code an+b} argument form of the nth-* pseudo selectors (e.g. {@code 2n+1},
   * {@code -n + 3}). Group 1 is the signed step, group 2 its sign, group 3 its digits and group 4
   * the signed offset (which may contain whitespace).
   */
  private static final Pattern NTH_AB =
      Pattern.compile("((\\+|-)?(\\d+)?)n(\\s*(\\+|-)?\\s*\\d+)?", Pattern.CASE_INSENSITIVE);

  /** Pattern for the plain integer argument form of the nth-* pseudo selectors (e.g. {@code 3}) */
  private static final Pattern NTH_B = Pattern.compile("(\\+|-)?(\\d+)");

  /** Whitespace that may surround the sign of the offset in an {@code an+b} expression */
  private static final Pattern WHITESPACE = Pattern.compile("\\s+");

  /** the token queue */
  private final TokenQueue tq;

  /** the query being parsed */
  private final String query;

  /** the list of evaluators constructed on calling parse */
  private final List<Evaluator<T>> evals = new ArrayList<>();

  /**
   * Create a new QueryParser.
   *
   * @param query CSS query
   */
  private QueryParser(String query) {
    this.query = query;
    tq = new TokenQueue(query);
  }

  /**
   * Parse a CSS query into an Evaluator.
   *
   * @param query CSS query
   * @param <T> type parameter of the returned evaluator
   * @return Evaluator
   * @throws Selector.SelectorParseException if the query can not be parsed
   */
  public static <T> Evaluator<T> parse(String query) {
    try {
      QueryParser<T> p = new QueryParser<>(query);
      return p.parse();
    } catch (IllegalArgumentException e) {
      // The exception constructor is used with a format string and arguments elsewhere in this
      // class, so the message is handed over as an argument: a '%' inside it (for example from a
      // regex echoed by a PatternSyntaxException) must not be interpreted as a format specifier.
      // NOTE(review): the cause is not chained as no cause-accepting constructor is visible here.
      throw new Selector.SelectorParseException("%s", e.getMessage());
    }
  }

  /**
   * Parse the query
   *
   * @return a single evaluator, or an AND of all evaluators collected if there is more than one
   */
  Evaluator<T> parse() {
    tq.consumeWhitespace();

    if (tq.matchesAny(COMBINATORS)) { // if starts with a combinator, use root as elements
      evals.add(new StructuralEvaluator.Root<T>());
      combinator(tq.consume());
    } else {
      findElements();
    }

    while (!tq.isEmpty()) {
      // hierarchy and extras
      boolean seenWhite = tq.consumeWhitespace();

      if (tq.matchesAny(COMBINATORS)) {
        combinator(tq.consume());
      } else if (seenWhite) {
        // whitespace not followed by an explicit combinator is the descendant combinator
        combinator(' ');
      } else { // E.class, E#id, E[attr] etc. AND
        findElements(); // take next el, #. etc off queue
      }
    }

    if (evals.size() == 1) {
      return evals.get(0);
    }

    return new CombiningEvaluator.And<>(evals);
  }

  /**
   * Add a combinator to the evaluation and parse the sub query
   *
   * <p>On return {@code evals} holds exactly one evaluator: the combination of everything parsed so
   * far with the sub query that follows the combinator.
   *
   * @param combinator defining character
   * @throws Selector.SelectorParseException if the combinator is not one of the supported ones
   */
  private void combinator(char combinator) {
    tq.consumeWhitespace();
    String subQuery = consumeSubQuery(); // support multi > childs

    Evaluator<T> rootEval; // the new topmost evaluator
    Evaluator<T> currentEval; // the evaluator that the new evaluator will be combined to.
    // Could be root, or rightmost or.
    Evaluator<T> newEval = parse(subQuery); // the evaluator to add into target evaluator
    boolean replaceRightMost = false;

    if (evals.size() == 1) {
      rootEval = currentEval = evals.get(0);
      // make sure OR (,) has precedence:
      if (rootEval instanceof CombiningEvaluator.Or && combinator != ',') {
        currentEval = ((CombiningEvaluator.Or<T>) currentEval).rightMostEvaluator();
        replaceRightMost = true;
      }
    } else {
      rootEval = currentEval = new CombiningEvaluator.And<>(evals);
    }
    evals.clear();

    // for most combinators: change the current eval into an AND of the current eval and the new
    // eval
    if (combinator == '>') {
      currentEval =
          new CombiningEvaluator.And<>(
              newEval, new StructuralEvaluator.ImmediateParent<>(currentEval));
    } else if (combinator == ' ') {
      currentEval =
          new CombiningEvaluator.And<>(newEval, new StructuralEvaluator.Parent<>(currentEval));
    } else if (combinator == '+') {
      currentEval =
          new CombiningEvaluator.And<>(
              newEval, new StructuralEvaluator.ImmediatePreviousSibling<>(currentEval));
    } else if (combinator == '~') {
      currentEval =
          new CombiningEvaluator.And<>(
              newEval, new StructuralEvaluator.PreviousSibling<>(currentEval));
    } else if (combinator == ',') { // group or.
      CombiningEvaluator.Or<T> or;
      if (currentEval instanceof CombiningEvaluator.Or) {
        or = (CombiningEvaluator.Or<T>) currentEval;
        or.add(newEval);
      } else {
        or = new CombiningEvaluator.Or<>();
        or.add(currentEval);
        or.add(newEval);
      }
      currentEval = or;
    } else {
      throw new Selector.SelectorParseException("Unknown combinator: %s", combinator);
    }

    if (replaceRightMost) {
      ((CombiningEvaluator.Or<T>) rootEval).replaceRightMostEvaluator(currentEval);
    } else {
      rootEval = currentEval;
    }
    evals.add(rootEval);
  }

  /**
   * Consume the sub query, up to the next combinator or the end of the query.
   *
   * <p>Bracketed sections ({@code (...)} and {@code [...]}) are copied as a whole so combinator
   * characters inside them do not end the sub query.
   *
   * @return the sub query
   */
  private String consumeSubQuery() {
    StringBuilder sq = new StringBuilder();
    while (!tq.isEmpty()) {
      if (tq.matches("(")) {
        sq.append("(").append(tq.chompBalanced('(', ')')).append(")");
      } else if (tq.matches("[")) {
        sq.append("[").append(tq.chompBalanced('[', ']')).append("]");
      } else if (tq.matchesAny(COMBINATORS)) {
        break;
      } else {
        sq.append(tq.consume());
      }
    }
    return sq.toString();
  }

  /**
   * Match the next section of the query and add the corresponding evaluator
   *
   * @throws Selector.SelectorParseException if the head of the queue is not a recognised token
   */
  private void findElements() {
    if (tq.matchChomp("#")) {
      byId();
    } else if (tq.matchChomp(".")) {
      byClass();
    } else if (tq.matchesWord() || tq.matches("*|")) {
      byTypeName();
    } else if (tq.matches("[")) {
      byAttribute();
    } else if (tq.matchChomp("*")) {
      allNodes();
    } else if (tq.matchChomp(":lt(")) {
      indexLessThan();
    } else if (tq.matchChomp(":gt(")) {
      indexGreaterThan();
    } else if (tq.matchChomp(":eq(")) {
      indexEquals();
    } else if (tq.matches(":has(")) {
      has();
    } else if (tq.matches(":contains(")) {
      contains(false);
    } else if (tq.matches(":containsOwn(")) {
      contains(true);
    } else if (tq.matches(":matches(")) {
      matches(false);
    } else if (tq.matches(":matchesOwn(")) {
      matches(true);
    } else if (tq.matches(":not(")) {
      not();
    } else if (tq.matchChomp(":nth-child(")) {
      cssNthChild(false, false);
    } else if (tq.matchChomp(":nth-last-child(")) {
      cssNthChild(true, false);
    } else if (tq.matchChomp(":nth-of-type(")) {
      cssNthChild(false, true);
    } else if (tq.matchChomp(":nth-last-of-type(")) {
      cssNthChild(true, true);
    } else if (tq.matchChomp(":first-child")) {
      evals.add(new Evaluator.IsFirstChild<>());
    } else if (tq.matchChomp(":last-child")) {
      evals.add(new Evaluator.IsLastChild<>());
    } else if (tq.matchChomp(":first-of-type")) {
      evals.add(new Evaluator.IsFirstOfType<>());
    } else if (tq.matchChomp(":last-of-type")) {
      evals.add(new Evaluator.IsLastOfType<>());
    } else if (tq.matchChomp(":only-child")) {
      evals.add(new Evaluator.IsOnlyChild<>());
    } else if (tq.matchChomp(":only-of-type")) {
      evals.add(new Evaluator.IsOnlyOfType<>());
    } else if (tq.matchChomp(":empty")) {
      evals.add(new Evaluator.IsEmpty<>());
    } else if (tq.matchChomp(":root")) {
      evals.add(new Evaluator.IsRoot<>());
    } else {
      throw new Selector.SelectorParseException(
          "Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
    }
  }

  /** add the Id evaluator (the leading '#' has already been consumed) */
  private void byId() {
    String id = tq.consumeIdentifier();
    Validate.notEmpty(id);
    evals.add(new Evaluator.Id<>(id));
  }

  /** add the Class evaluator (the leading '.' has already been consumed) */
  private void byClass() {
    String className = tq.consumeIdentifier();
    Validate.notEmpty(className);
    evals.add(new Evaluator.Class<>(className.trim()));
  }

  /** add the Type Name evaluator */
  private void byTypeName() {
    String typeName = tq.consumeNodeSelector();
    Validate.notEmpty(typeName);
    evals.add(new Evaluator.TypeName<>(typeName.trim()));
  }

  /**
   * add an Attribute evaluator, selecting the correct one as required
   *
   * @throws Selector.SelectorParseException if the key is followed by an unknown operator
   */
  private void byAttribute() {
    TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content queue
    String key = cq.consumeToAny(ATTRIBUTE_EVALS); // eq, not, start, end, contain, match, (no val)
    Validate.notEmpty(key);
    cq.consumeWhitespace();

    if (cq.isEmpty()) {
      // no operator: either [^keyPrefix] or a plain [key] presence test
      if (key.startsWith("^")) {
        evals.add(new Evaluator.AttributeStarting<>(key.substring(1)));
      } else {
        evals.add(new Evaluator.Attribute<>(key));
      }
    } else {
      if (cq.matchChomp("=")) {
        evals.add(new Evaluator.AttributeWithValue<>(key, cq.remainder()));
      } else if (cq.matchChomp("!=")) {
        evals.add(new Evaluator.AttributeWithValueNot<>(key, cq.remainder()));
      } else if (cq.matchChomp("^=")) {
        evals.add(new Evaluator.AttributeWithValueStarting<>(key, cq.remainder()));
      } else if (cq.matchChomp("$=")) {
        evals.add(new Evaluator.AttributeWithValueEnding<>(key, cq.remainder()));
      } else if (cq.matchChomp("*=")) {
        evals.add(new Evaluator.AttributeWithValueContaining<>(key, cq.remainder()));
      } else if (cq.matchChomp("~=")) {
        evals.add(new Evaluator.AttributeWithValueMatching<>(key, Pattern.compile(cq.remainder())));
      } else {
        throw new Selector.SelectorParseException(
            "Could not parse attribute query '%s': unexpected token at '%s'",
            query,
            cq.remainder());
      }
    }
  }

  /** add the all nodes evaluator */
  private void allNodes() {
    evals.add(new Evaluator.AllNodes<>());
  }

  /** add index less than evaluator */
  private void indexLessThan() {
    evals.add(new Evaluator.IndexLessThan<>(consumeIndex()));
  }

  /** add index greater than evaluator */
  private void indexGreaterThan() {
    evals.add(new Evaluator.IndexGreaterThan<>(consumeIndex()));
  }

  /** add index equals evaluator */
  private void indexEquals() {
    evals.add(new Evaluator.IndexEquals<>(consumeIndex()));
  }

  /**
   * Add nth child evaluator
   *
   * <p>Accepted arguments are {@code odd}, {@code even}, an {@code an+b} expression (whitespace
   * around the sign of {@code b} is allowed) or a plain integer.
   *
   * @param backwards true if matching from last
   * @param ofType true if matching type
   * @throws Selector.SelectorParseException if the argument has none of the accepted forms
   */
  private void cssNthChild(boolean backwards, boolean ofType) {
    String argS = tq.chompTo(")").trim().toLowerCase();
    Matcher mAB = NTH_AB.matcher(argS);
    Matcher mB = NTH_B.matcher(argS);
    final int a;
    final int b;

    if ("odd".equals(argS)) {
      a = 2;
      b = 1;
    } else if ("even".equals(argS)) {
      a = 2;
      b = 0;
    } else if (mAB.matches()) {
      if (mAB.group(3) != null) {
        // explicit step, e.g. 3n or -3n
        a = parseSignedInt(mAB.group(1));
      } else {
        // bare n: the step is 1, or -1 when written as -n
        a = "-".equals(mAB.group(2)) ? -1 : 1;
      }
      b = mAB.group(4) != null ? parseSignedInt(mAB.group(4)) : 0;
    } else if (mB.matches()) {
      a = 0;
      b = parseSignedInt(mB.group());
    } else {
      throw new Selector.SelectorParseException(
          "Could not parse nth-index '%s': unexpected format", argS);
    }

    if (ofType) {
      if (backwards) {
        evals.add(new Evaluator.IsNthLastOfType<>(a, b));
      } else {
        evals.add(new Evaluator.IsNthOfType<>(a, b));
      }
    } else {
      if (backwards) {
        evals.add(new Evaluator.IsNthLastChild<>(a, b));
      } else {
        evals.add(new Evaluator.IsNthChild<>(a, b));
      }
    }
  }

  /**
   * Parse an optionally signed integer, ignoring any whitespace and a leading '+'.
   *
   * @param text the text to parse, e.g. {@code "+3"}, {@code " - 1"} or {@code "2"}
   * @return the parsed value
   * @throws NumberFormatException if the remaining text is not a valid int
   */
  private static int parseSignedInt(String text) {
    String compact = WHITESPACE.matcher(text).replaceAll("");
    if (compact.startsWith("+")) {
      compact = compact.substring(1);
    }
    return Integer.parseInt(compact);
  }

  /**
   * Consume the index off the queue
   *
   * @return the index
   * @throws IllegalArgumentException if the index is empty or not numeric
   */
  private int consumeIndex() {
    String indexS = tq.chompTo(")").trim();
    // isNumeric accepts the empty string, so reject it explicitly to get the intended message
    Validate.isTrue(!indexS.isEmpty() && StringUtils.isNumeric(indexS), "Index must be numeric");
    return Integer.parseInt(indexS);
  }

  /** add Has evaluator, parsing the bracketed argument as a sub query */
  private void has() {
    tq.consume(":has");
    String subQuery = tq.chompBalanced('(', ')');
    Validate.notEmpty(subQuery, ":has(el) subselect must not be empty");
    evals.add(new StructuralEvaluator.Has<>(parse(subQuery)));
  }

  /**
   * Add contains (or containsOwn) evaluator
   *
   * @param own true if own text
   */
  private void contains(boolean own) {
    tq.consume(own ? ":containsOwn" : ":contains");
    String searchText = TokenQueue.unescape(tq.chompBalanced('(', ')'));
    Validate.notEmpty(searchText, ":contains(text) query must not be empty");
    if (own) {
      evals.add(new Evaluator.ContainsOwnText<>(searchText));
    } else {
      evals.add(new Evaluator.ContainsText<>(searchText));
    }
  }

  /**
   * Add matches (or matchesOwn) evaluator
   *
   * @param own true if own text
   */
  private void matches(boolean own) {
    tq.consume(own ? ":matchesOwn" : ":matches");
    String regex = tq.chompBalanced('(', ')'); // don't unescape, as regex bits will be escaped
    Validate.notEmpty(regex, ":matches(regex) query must not be empty");
    if (own) {
      evals.add(new Evaluator.MatchesOwn<>(Pattern.compile(regex)));
    } else {
      evals.add(new Evaluator.Matches<>(Pattern.compile(regex)));
    }
  }

  /** add Not evaluator, parsing the bracketed argument as a sub query */
  private void not() {
    tq.consume(":not");
    String subQuery = tq.chompBalanced('(', ')');
    Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty");
    evals.add(new StructuralEvaluator.Not<>(parse(subQuery)));
  }
}