/* * eXist Open Source Native XML Database * Copyright (C) 2001-07 The eXist Project * http://exist-db.org * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * * \$Id\$ */ package org.exist.indexing.lucene; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.List; import java.util.Properties; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.WildcardTermEnum; import org.apache.lucene.search.regex.RegexQuery; import org.apache.lucene.search.regex.SpanRegexQuery; import org.apache.lucene.search.spans.SpanFirstQuery; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; import org.exist.xquery.XPathException; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; /** * Parses the XML representation of a Lucene query and transforms * it into a tree of {@link org.apache.lucene.search.Query} objects. */ public class XMLToQuery { private LuceneIndex index; public XMLToQuery(LuceneIndex index) { this.index = index; } public Query parse(String field, Element root, Analyzer analyzer, Properties options) throws XPathException { Query query; String localName = root.getLocalName(); if ("query".equals(localName)) query = parseChildren(field, root, analyzer, options); else if ("term".equals(localName)) query = termQuery(field, root, analyzer); else if ("wildcard".equals(localName)) query = wildcardQuery(field, root, options); else if ("prefix".equals(localName)) query = prefixQuery(field, root, options); else if ("fuzzy".equals(localName)) query = fuzzyQuery(field, root); else if ("bool".equals(localName)) query = booleanQuery(field, root, analyzer, options); else if ("phrase".equals(localName)) query = phraseQuery(field, root, analyzer); else if ("near".equals(localName)) query = nearQuery(field, root, analyzer); else if ("first".equals(localName)) query = getSpanFirst(field, root, analyzer); else if ("regex".equals(localName)) query = regexQuery(field, root, options); else throw new XPathException("Unknown element in lucene query expression: " + localName); if (query != null) setBoost(root, query); return query; } private Query phraseQuery(String field, Element node, Analyzer analyzer) throws XPathException { NodeList termList = node.getElementsByTagName("term"); if (termList.getLength() == 0) { PhraseQuery query = new PhraseQuery(); String qstr = getText(node); TokenStream stream = analyzer.tokenStream(field, new StringReader(qstr)); TermAttribute termAttr = (TermAttribute) stream.addAttribute(TermAttribute.class); try { stream.reset(); while (stream.incrementToken()) { query.add(new Term(field, termAttr.term())); } stream.end(); stream.close(); } catch (IOException e) { throw new XPathException("Error while parsing phrase query: " + qstr); } int slop = getSlop(node); if (slop > -1) query.setSlop(slop); return query; } MultiPhraseQuery query = new MultiPhraseQuery(); for (int i = 0; i < termList.getLength(); i++) { Element elem = (Element) termList.item(i); String text = getText(elem); if (text.indexOf('?') > -1 || text.indexOf('*') > 0) { Term[] expanded = expandTerms(field, text); if (expanded.length > 0) query.add(expanded); } else { String termStr = getTerm(field, text, analyzer); if (termStr != null) query.add(new Term(field, text)); } } int slop = getSlop(node); if (slop > -1) query.setSlop(slop); return query; } private SpanQuery nearQuery(String field, Element node, Analyzer analyzer) throws XPathException { int slop = getSlop(node); if (slop < 0) slop = 0; boolean inOrder = true; if (node.hasAttribute("ordered")) inOrder = node.getAttribute("ordered").equals("yes"); if (!hasElementContent(node)) { String qstr = getText(node); TokenStream stream = analyzer.tokenStream(field, new StringReader(qstr)); TermAttribute termAttr = (TermAttribute) stream.addAttribute(TermAttribute.class); List<SpanTermQuery> list = new ArrayList<SpanTermQuery>(8); try { stream.reset(); while (stream.incrementToken()) { list.add(new SpanTermQuery(new Term(field, termAttr.term()))); } stream.end(); stream.close(); } catch (IOException e) { throw new XPathException("Error while parsing phrase query: " + qstr); } return new SpanNearQuery(list.toArray(new SpanTermQuery[list.size()]), slop, inOrder); } SpanQuery[] children = parseSpanChildren(field, node, analyzer); return new SpanNearQuery(children, slop, inOrder); } private SpanQuery[] parseSpanChildren(String field, Element node, Analyzer analyzer) throws XPathException { List<SpanQuery> list = new ArrayList<SpanQuery>(8); Node child = node.getFirstChild(); while (child != null) { if (child.getNodeType() == Node.ELEMENT_NODE) { if ("term".equals(child.getLocalName())) getSpanTerm(list, field, (Element) child, analyzer); else if ("near".equals(child.getLocalName())) list.add(nearQuery(field, (Element) child, analyzer)); else if ("first".equals(child.getLocalName())) list.add(getSpanFirst(field, (Element) child, analyzer)); else if ("regex".equals(child.getLocalName())) list.add(getSpanRegex(field, (Element) child, analyzer)); else throw new XPathException("Unknown query element: " + child.getNodeName()); } child = child.getNextSibling(); } return list.toArray(new SpanQuery[list.size()]); } private void getSpanTerm(List<SpanQuery> list, String field, Element node, Analyzer analyzer) throws XPathException { String termStr = getTerm(field, getText(node), analyzer); if (termStr != null) list.add(new SpanTermQuery(new Term(field, termStr))); } private SpanQuery getSpanRegex(String field, Element node, Analyzer analyzer) { String regex = getText(node); return new SpanRegexQuery(new Term(field, regex)); } private SpanQuery getSpanFirst(String field, Element node, Analyzer analyzer) throws XPathException { int slop = getSlop(node); if (slop < 0) slop = 0; boolean inOrder = true; if (node.hasAttribute("ordered")) inOrder = node.getAttribute("ordered").equals("yes"); SpanQuery query = null; if (hasElementContent(node)) { SpanQuery[] children = parseSpanChildren(field, node, analyzer); query = new SpanNearQuery(children, slop, inOrder); } else { String termStr = getTerm(field, getText(node), analyzer); if (termStr != null) query = new SpanTermQuery(new Term(field, termStr)); } int end = 0; if (node.hasAttribute("end")) { try { end = Integer.parseInt(node.getAttribute("end")); } catch (NumberFormatException e) { throw new XPathException("Attribute 'end' to query element 'first' should be a " + "valid integer. Got: " + node.getAttribute("end")); } } return query != null ? new SpanFirstQuery(query, end) : null; } private int getSlop(Element node) throws XPathException { String slop = node.getAttribute("slop"); if (slop != null && slop.length() > 0) { try { return Integer.parseInt(slop); } catch (NumberFormatException e) { throw new XPathException("Query parameter 'slop' should be an integer value. Got: " + slop); } } return -1; } private Term[] expandTerms(String field, String queryStr) throws XPathException { IndexReader reader = null; try { reader = index.getReader(); List<Term> termList = new ArrayList<Term>(8); WildcardTermEnum terms = new WildcardTermEnum(reader, new Term(field, queryStr)); Term term; do { term = terms.term(); if (term != null && term.field().equals(field)) { termList.add(term); } } while (terms.next()); terms.close(); Term[] matchingTerms = new Term[termList.size()]; return termList.toArray(matchingTerms); } catch (IOException e) { throw new XPathException("Lucene index error while creating query: " + e.getMessage(), e); } finally { index.releaseReader(reader); } } private Query termQuery(String field, Element node, Analyzer analyzer) throws XPathException { String termStr = getTerm(field, getText(node), analyzer); return termStr == null ? null : new TermQuery(new Term(field, termStr)); } private String getTerm(String field, String text, Analyzer analyzer) throws XPathException { TokenStream stream = analyzer.tokenStream(field, new StringReader(text)); TermAttribute termAttr = (TermAttribute) stream.addAttribute(TermAttribute.class); String term = null; try { stream.reset(); if (stream.incrementToken()) { term = termAttr.term(); } stream.end(); stream.close(); return term; } catch (IOException e) { throw new XPathException("Lucene index error while creating query: " + e.getMessage(), e); } } private Query wildcardQuery(String field, Element node, Properties options) { WildcardQuery query = new WildcardQuery(new Term(field, getText(node))); setRewriteMethod(query, node, options); return query; } private Query prefixQuery(String field, Element node, Properties options) { PrefixQuery query = new PrefixQuery(new Term(field, getText(node))); setRewriteMethod(query, node, options); return query; } private Query fuzzyQuery(String field, Element node) throws XPathException { float minSimilarity = FuzzyQuery.defaultMinSimilarity; String attr = node.getAttribute("min-similarity"); if (attr != null && attr.length() > 0) { try { minSimilarity = Float.parseFloat(attr); } catch (NumberFormatException e) { throw new XPathException("Query parameter 'min-similarity' should be a float value. Got: " + attr); } } return new FuzzyQuery(new Term(field, getText(node)), minSimilarity); } private Query regexQuery(String field, Element node, Properties options) { RegexQuery query = new RegexQuery(new Term(field, getText(node))); setRewriteMethod(query, node, options); return query; } private Query booleanQuery(String field, Element node, Analyzer analyzer, Properties options) throws XPathException { BooleanQuery query = new BooleanQuery(); Node child = node.getFirstChild(); while (child != null) { if (child.getNodeType() == Node.ELEMENT_NODE) { Element elem = (Element) child; Query childQuery = parse(field, elem, analyzer, options); if (childQuery != null) { BooleanClause.Occur occur = getOccur(elem); query.add(childQuery, occur); } } child = child.getNextSibling(); } return query; } private void setRewriteMethod(MultiTermQuery query, Element node, Properties options) { String option = node.getAttribute("filter-rewrite"); if (option == null) option = "yes"; if (options != null) option = options.getProperty(LuceneIndexWorker.OPTION_FILTER_REWRITE, "yes"); if (option.equalsIgnoreCase("yes")) query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); else query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); } private BooleanClause.Occur getOccur(Element elem) { BooleanClause.Occur occur = BooleanClause.Occur.SHOULD; String occurOpt = elem.getAttribute("occur"); if (occurOpt != null) { if (occurOpt.equals("must")) occur = BooleanClause.Occur.MUST; else if (occurOpt.equals("not")) occur = BooleanClause.Occur.MUST_NOT; else if (occurOpt.equals("should")) occur = BooleanClause.Occur.SHOULD; } return occur; } private Query parseChildren(String field, Element root, Analyzer analyzer, Properties options) throws XPathException { Query query = null; Node child = root.getFirstChild(); while (child != null) { if (child.getNodeType() == Node.ELEMENT_NODE) { Query childQuery = parse(field, (Element) child, analyzer, options); if (query != null) { if (query instanceof BooleanQuery) ((BooleanQuery) query).add(childQuery, BooleanClause.Occur.SHOULD); else { BooleanQuery boolQuery = new BooleanQuery(); boolQuery.add(query, BooleanClause.Occur.SHOULD); boolQuery.add(childQuery, BooleanClause.Occur.SHOULD); query = boolQuery; } } else query = childQuery; } child = child.getNextSibling(); } return query; } private void setBoost(Element node, Query query) throws XPathException { String boost = node.getAttribute("boost"); if (boost != null && boost.length() > 0) { try { query.setBoost(Float.parseFloat(boost)); } catch (NumberFormatException e) { throw new XPathException("Bad value for boost in query parameter. Got: " + boost); } } } private String getText(Element root) { StringBuffer buf = new StringBuffer(); Node child = root.getFirstChild(); while (child != null) { if (child.getNodeType() == Node.TEXT_NODE) buf.append(child.getNodeValue()); child = child.getNextSibling(); } return buf.toString(); } private boolean hasElementContent(Element root) { Node child = root.getFirstChild(); while (child != null) { if (child.getNodeType() == Node.ELEMENT_NODE) return true; child = child.getNextSibling(); } return false; } }