/* * eXist Open Source Native XML Database * Copyright (C) 2001-2015 The eXist Project * http://exist-db.org * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ package org.exist.indexing.lucene; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.index.*; import org.apache.lucene.search.*; import org.apache.lucene.search.spans.SpanFirstQuery; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.LevenshteinAutomata; import org.exist.xquery.XPathException; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.List; import java.util.Properties; /** * Parses the XML representation of a Lucene query and transforms * it into a tree of {@link org.apache.lucene.search.Query} objects. */ public class XMLToQuery { private final LuceneIndex index; public XMLToQuery(LuceneIndex index) { this.index = index; } public Query parse(String field, Element root, Analyzer analyzer, Properties options) throws XPathException { Query query = null; String localName = root.getLocalName(); if (null != localName) { switch (localName) { case "query": query = parseChildren(field, root, analyzer, options); break; case "term": query = termQuery(field, root, analyzer); break; case "wildcard": query = wildcardQuery(field, root, analyzer, options); break; case "prefix": query = prefixQuery(field, root, options); break; case "fuzzy": query = fuzzyQuery(field, root); break; case "bool": query = booleanQuery(field, root, analyzer, options); break; case "phrase": query = phraseQuery(field, root, analyzer); break; case "near": query = nearQuery(field, root, analyzer); break; case "first": query = getSpanFirst(field, root, analyzer); break; case "regex": query = regexQuery(field, root, options); break; default: throw new XPathException("Unknown element in lucene query expression: " + localName); } } if (query != null) { setBoost(root, query); } return query; } private Query phraseQuery(String field, Element node, Analyzer analyzer) throws XPathException { NodeList termList = node.getElementsByTagName("term"); if (termList.getLength() == 0) { PhraseQuery query = new PhraseQuery(); String qstr = getText(node); try { TokenStream stream = analyzer.tokenStream(field, new StringReader(qstr)); CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { query.add(new Term(field, termAttr.toString())); } stream.end(); stream.close(); } catch (IOException e) { throw new XPathException("Error while parsing phrase query: " + qstr); } int slop = getSlop(node); if (slop > -1) query.setSlop(slop); return query; } MultiPhraseQuery query = new MultiPhraseQuery(); for (int i = 0; i < termList.getLength(); i++) { Element elem = (Element) termList.item(i); String text = getText(elem); if (text.indexOf('?') > -1 || text.indexOf('*') > 0) { try { Term[] expanded = expandTerms(field, text); if (expanded.length > 0) query.add(expanded); } catch (IOException e) { throw new XPathException("IO error while expanding query terms: " + e.getMessage(), e); } } else { String termStr = getTerm(field, text, analyzer); if (termStr != null) query.add(new Term(field, text)); } } int slop = getSlop(node); if (slop > -1) query.setSlop(slop); return query; } private SpanQuery nearQuery(String field, Element node, Analyzer analyzer) throws XPathException { int slop = getSlop(node); if (slop < 0) slop = 0; boolean inOrder = true; if (node.hasAttribute("ordered")) inOrder = node.getAttribute("ordered").equals("yes"); if (!hasElementContent(node)) { String qstr = getText(node); List<SpanTermQuery> list = new ArrayList<>(8); try { TokenStream stream = analyzer.tokenStream(field, new StringReader(qstr)); CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { list.add(new SpanTermQuery(new Term(field, termAttr.toString()))); } stream.end(); stream.close(); } catch (IOException e) { throw new XPathException("Error while parsing phrase query: " + qstr); } return new SpanNearQuery(list.toArray(new SpanTermQuery[list.size()]), slop, inOrder); } SpanQuery[] children = parseSpanChildren(field, node, analyzer); return new SpanNearQuery(children, slop, inOrder); } private SpanQuery[] parseSpanChildren(String field, Element node, Analyzer analyzer) throws XPathException { List<SpanQuery> list = new ArrayList<>(8); Node child = node.getFirstChild(); while (child != null) { if (child.getNodeType() == Node.ELEMENT_NODE) { final String localName = child.getLocalName(); if (null != localName) { switch (localName) { case "term": getSpanTerm(list, field, (Element) child, analyzer); break; case "near": list.add(nearQuery(field, (Element) child, analyzer)); break; case "first": list.add(getSpanFirst(field, (Element) child, analyzer)); break; case "regex": list.add(getSpanRegex(field, (Element) child, analyzer)); break; default: throw new XPathException("Unknown query element: " + child.getNodeName()); } } } child = child.getNextSibling(); } return list.toArray(new SpanQuery[list.size()]); } private void getSpanTerm(List<SpanQuery> list, String field, Element node, Analyzer analyzer) throws XPathException { String termStr = getTerm(field, getText(node), analyzer); if (termStr != null) list.add(new SpanTermQuery(new Term(field, termStr))); } private SpanQuery getSpanRegex(String field, Element node, Analyzer analyzer) { String regex = getText(node); return new SpanMultiTermQueryWrapper<RegexpQuery>(new RegexpQuery(new Term(field, regex))); } private SpanQuery getSpanFirst(String field, Element node, Analyzer analyzer) throws XPathException { int slop = getSlop(node); if (slop < 0) slop = 0; boolean inOrder = true; if (node.hasAttribute("ordered")) inOrder = node.getAttribute("ordered").equals("yes"); SpanQuery query = null; if (hasElementContent(node)) { SpanQuery[] children = parseSpanChildren(field, node, analyzer); query = new SpanNearQuery(children, slop, inOrder); } else { String termStr = getTerm(field, getText(node), analyzer); if (termStr != null) query = new SpanTermQuery(new Term(field, termStr)); } int end = 0; if (node.hasAttribute("end")) { try { end = Integer.parseInt(node.getAttribute("end")); } catch (NumberFormatException e) { throw new XPathException("Attribute 'end' to query element 'first' should be a " + "valid integer. Got: " + node.getAttribute("end")); } } return query != null ? new SpanFirstQuery(query, end) : null; } private int getSlop(Element node) throws XPathException { String slop = node.getAttribute("slop"); if (slop != null && slop.length() > 0) { try { return Integer.parseInt(slop); } catch (NumberFormatException e) { throw new XPathException("Query parameter 'slop' should be an integer value. Got: " + slop); } } return -1; } private Term[] expandTerms(String field, String queryStr) throws XPathException, IOException { return index.withReader(reader -> { final Automaton automaton = WildcardQuery.toAutomaton(new Term(field, queryStr)); final CompiledAutomaton compiled = new CompiledAutomaton(automaton); final List<Term> termList = new ArrayList<>(8); for (AtomicReaderContext atomic : reader.leaves()) { Terms terms = atomic.reader().terms(field); if (terms != null) { TermsEnum termsEnum = compiled.getTermsEnum(terms); BytesRef data = termsEnum.next(); while (data != null) { String term = data.utf8ToString(); termList.add(new Term(field, term)); data = termsEnum.next(); } } } Term[] matchingTerms = new Term[termList.size()]; return termList.toArray(matchingTerms); }); } private Query termQuery(String field, Element node, Analyzer analyzer) throws XPathException { String termStr = getTerm(field, getText(node), analyzer); return termStr == null ? null : new TermQuery(new Term(field, termStr)); } private String getTerm(String field, String text, Analyzer analyzer) throws XPathException { String term = null; try { TokenStream stream = analyzer.tokenStream(field, new StringReader(text)); CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class); stream.reset(); if (stream.incrementToken()) { term = termAttr.toString(); } stream.end(); stream.close(); return term; } catch (IOException e) { throw new XPathException("Lucene index error while creating query: " + e.getMessage(), e); } } private Query wildcardQuery(String field, Element node, Analyzer analyzer, Properties options) throws XPathException { WildcardQuery query = new WildcardQuery(new Term(field, getText(node))); setRewriteMethod(query, node, options); return query; } private Query prefixQuery(String field, Element node, Properties options) { PrefixQuery query = new PrefixQuery(new Term(field, getText(node))); setRewriteMethod(query, node, options); return query; } private Query fuzzyQuery(String field, Element node) throws XPathException { int maxEdits = FuzzyQuery.defaultMaxEdits; String attr = node.getAttribute("max-edits"); if (attr != null && attr.length() > 0) { try { maxEdits = Integer.parseInt(attr); if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { throw new XPathException("Query parameter max-edits must by <= " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); } } catch (NumberFormatException e) { throw new XPathException("Query parameter 'max-edits' should be an integer value. Got: " + attr); } } return new FuzzyQuery(new Term(field, getText(node)), maxEdits); } private Query regexQuery(String field, Element node, Properties options) { RegexpQuery query = new RegexpQuery(new Term(field, getText(node))); setRewriteMethod(query, node, options); return query; } private Query booleanQuery(String field, Element node, Analyzer analyzer, Properties options) throws XPathException { BooleanQuery query = new BooleanQuery(); Node child = node.getFirstChild(); while (child != null) { if (child.getNodeType() == Node.ELEMENT_NODE) { Element elem = (Element) child; Query childQuery = parse(field, elem, analyzer, options); if (childQuery != null) { BooleanClause.Occur occur = getOccur(elem); query.add(childQuery, occur); } } child = child.getNextSibling(); } return query; } private void setRewriteMethod(MultiTermQuery query, Element node, Properties options) { String option = node.getAttribute("filter-rewrite"); if (option == null) option = "yes"; if (options != null) option = options.getProperty(LuceneIndexWorker.OPTION_FILTER_REWRITE, "yes"); if (option.equalsIgnoreCase("yes")) query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); else query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); } private BooleanClause.Occur getOccur(Element elem) { BooleanClause.Occur occur = BooleanClause.Occur.SHOULD; String occurOpt = elem.getAttribute("occur"); if (occurOpt != null) { switch (occurOpt) { case "must": occur = BooleanClause.Occur.MUST; break; case "not": occur = BooleanClause.Occur.MUST_NOT; break; case "should": occur = BooleanClause.Occur.SHOULD; break; } } return occur; } private Query parseChildren(String field, Element root, Analyzer analyzer, Properties options) throws XPathException { Query query = null; Node child = root.getFirstChild(); while (child != null) { if (child.getNodeType() == Node.ELEMENT_NODE) { Query childQuery = parse(field, (Element) child, analyzer, options); if (query != null) { if (query instanceof BooleanQuery) ((BooleanQuery) query).add(childQuery, BooleanClause.Occur.SHOULD); else { BooleanQuery boolQuery = new BooleanQuery(); boolQuery.add(query, BooleanClause.Occur.SHOULD); boolQuery.add(childQuery, BooleanClause.Occur.SHOULD); query = boolQuery; } } else query = childQuery; } child = child.getNextSibling(); } return query; } private void setBoost(Element node, Query query) throws XPathException { String boost = node.getAttribute("boost"); if (boost != null && boost.length() > 0) { try { query.setBoost(Float.parseFloat(boost)); } catch (NumberFormatException e) { throw new XPathException("Bad value for boost in query parameter. Got: " + boost); } } } private String getText(Element root) { StringBuilder buf = new StringBuilder(); Node child = root.getFirstChild(); while (child != null) { if (child.getNodeType() == Node.TEXT_NODE) buf.append(child.getNodeValue()); child = child.getNextSibling(); } return buf.toString(); } private boolean hasElementContent(Element root) { Node child = root.getFirstChild(); while (child != null) { if (child.getNodeType() == Node.ELEMENT_NODE) return true; child = child.getNextSibling(); } return false; } }