/* * eXist Open Source Native XML Database * Copyright (C) 2007-09 The eXist Project * http://exist-db.org * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * * $Id$ */ package org.exist.xquery.modules.ngram; import org.exist.dom.*; import org.exist.indexing.ngram.NGramIndex; import org.exist.indexing.ngram.NGramIndexWorker; import org.exist.storage.ElementValue; import org.exist.xquery.*; import org.exist.xquery.NodeTest; import org.exist.xquery.util.Error; import org.exist.xquery.value.*; import java.util.ArrayList; import java.util.List; public class NGramSearch extends Function implements Optimizable { public final static FunctionSignature signatures[] = { new FunctionSignature( new QName("contains", NGramModule.NAMESPACE_URI, NGramModule.PREFIX), "Similar to the standard XQuery fn:contains function, but based on the NGram index. " + "Searches the given $queryString in the index defined on the input node set $nodes. " + "The string may appear at any position within the node content. String comparison " + "is case insensitive. Nodes need to have an ngram index to be searched.", new SequenceType[] { new FunctionParameterSequenceType("nodes", Type.NODE, Cardinality.ZERO_OR_MORE, "The input node set to search"), new FunctionParameterSequenceType("queryString", Type.STRING, Cardinality.ZERO_OR_ONE, "The exact string to search for") }, new FunctionReturnSequenceType(Type.NODE, Cardinality.ZERO_OR_MORE, "a set of nodes from the input node set $nodes containing the query string " + "or the empty sequence") ), new FunctionSignature( new QName("ends-with", NGramModule.NAMESPACE_URI, NGramModule.PREFIX), "Similar to the standard XQuery fn:ends-with function, but based on the NGram index. " + "Searches the given $queryString in the index defined on the input node set $nodes. " + "The string has to appear at the end of the node's content. String comparison " + "is case insensitive. Nodes need to have an ngram index to be searched.", new SequenceType[] { new FunctionParameterSequenceType("nodes", Type.NODE, Cardinality.ZERO_OR_MORE, "The input node set to search"), new FunctionParameterSequenceType("queryString", Type.STRING, Cardinality.ZERO_OR_ONE, "The exact string to search for") }, new FunctionReturnSequenceType(Type.NODE, Cardinality.ZERO_OR_MORE, "a set of nodes from the input node set $nodes ending with the query string " + "or the empty sequence") ), new FunctionSignature( new QName("starts-with", NGramModule.NAMESPACE_URI, NGramModule.PREFIX), "Similar to the standard XQuery fn:starts-with function, but based on the NGram index. " + "Searches the given $queryString in the index defined on the input node set $nodes. " + "The string may appear at any position within the node content. String comparison " + "is case insensitive. Nodes need to have an ngram index to be searched.", new SequenceType[] { new FunctionParameterSequenceType("nodes", Type.NODE, Cardinality.ZERO_OR_MORE, "The input node set to search"), new FunctionParameterSequenceType("queryString", Type.STRING, Cardinality.ZERO_OR_ONE, "The exact string to search for") }, new FunctionReturnSequenceType(Type.NODE, Cardinality.ZERO_OR_MORE, "a set of nodes from the input node set $nodes starting with the query string " + "or the empty sequence") ) }; private LocationStep contextStep = null; protected QName contextQName = null; protected int axis = Constants.UNKNOWN_AXIS; private NodeSet preselectResult = null; protected boolean optimizeSelf = false; public NGramSearch(XQueryContext context, FunctionSignature signature) { super(context, signature); } public void setArguments(List arguments) throws XPathException { Expression path = (Expression) arguments.get(0); steps.add(path); Expression arg = (Expression) arguments.get(1); arg = new DynamicCardinalityCheck(context, Cardinality.ZERO_OR_ONE, arg, new org.exist.xquery.util.Error(Error.FUNC_PARAM_CARDINALITY, "2", mySignature)); if(!Type.subTypeOf(arg.returnsType(), Type.ATOMIC)) arg = new Atomize(context, arg); steps.add(arg); } /* (non-Javadoc) * @see org.exist.xquery.PathExpr#analyze(org.exist.xquery.Expression) */ public void analyze(AnalyzeContextInfo contextInfo) throws XPathException { super.analyze(contextInfo); List steps = BasicExpressionVisitor.findLocationSteps(getArgument(0)); if (!steps.isEmpty()) { LocationStep firstStep = (LocationStep) steps.get(0); LocationStep lastStep = (LocationStep) steps.get(steps.size() - 1); if (steps.size() == 1 && firstStep.getAxis() == Constants.SELF_AXIS) { Expression outerExpr = contextInfo.getContextStep(); if (outerExpr != null && outerExpr instanceof LocationStep) { LocationStep outerStep = (LocationStep) outerExpr; NodeTest test = outerStep.getTest(); if (!test.isWildcardTest() && test.getName() != null) { contextQName = new QName(test.getName()); if (outerStep.getAxis() == Constants.ATTRIBUTE_AXIS || outerStep.getAxis() == Constants.DESCENDANT_ATTRIBUTE_AXIS) contextQName.setNameType(ElementValue.ATTRIBUTE); contextStep = firstStep; axis = outerStep.getAxis(); optimizeSelf = true; } } } else { NodeTest test = lastStep.getTest(); if (!test.isWildcardTest() && test.getName() != null) { contextQName = new QName(test.getName()); if (lastStep.getAxis() == Constants.ATTRIBUTE_AXIS || lastStep.getAxis() == Constants.DESCENDANT_ATTRIBUTE_AXIS) contextQName.setNameType(ElementValue.ATTRIBUTE); axis = firstStep.getAxis(); contextStep = lastStep; } } } } public boolean canOptimize(Sequence contextSequence) { return contextQName != null; } public boolean optimizeOnSelf() { return optimizeSelf; } public int getOptimizeAxis() { return axis; } public NodeSet preSelect(Sequence contextSequence, boolean useContext) throws XPathException { // the expression can be called multiple times, so we need to clear the previous preselectResult preselectResult = null; NGramIndexWorker index = (NGramIndexWorker) context.getBroker().getIndexController().getWorkerByIndexId(NGramIndex.ID); DocumentSet docs = contextSequence.getDocumentSet(); String key = getArgument(1).eval(contextSequence).getStringValue(); String[] ngrams = index.getDistinctNGrams(key); List qnames = new ArrayList(1); qnames.add(contextQName); preselectResult = processMatches(index, docs, qnames, ngrams, useContext ? contextSequence.toNodeSet() : null, NodeSet.DESCENDANT); return preselectResult; } public Sequence eval(Sequence contextSequence, Item contextItem) throws XPathException { if (contextItem != null) contextSequence = contextItem.toSequence(); NodeSet result; if (preselectResult == null) { Sequence input = getArgument(0).eval(contextSequence, contextItem); if (input.isEmpty()) result = NodeSet.EMPTY_SET; else { NodeSet inNodes = input.toNodeSet(); DocumentSet docs = inNodes.getDocumentSet(); NGramIndexWorker index = (NGramIndexWorker) context.getBroker().getIndexController().getWorkerByIndexId(NGramIndex.ID); //Alternate design //NGramIndexWorker index = (NGramIndexWorker)context.getBroker().getBrokerPool().getIndexManager().getIndexById(NGramIndex.ID).getWorker(); String key = getArgument(1).eval(contextSequence, contextItem).getStringValue(); String[] ngrams = index.getDistinctNGrams(key); List qnames = null; if (contextQName != null) { qnames = new ArrayList(1); qnames.add(contextQName); } result = processMatches(index, docs, qnames, ngrams, inNodes, NodeSet.ANCESTOR); } } else { contextStep.setPreloadedData(contextSequence.getDocumentSet(), preselectResult); result = getArgument(0).eval(contextSequence).toNodeSet(); } return result; } private NodeSet processMatches(NGramIndexWorker index, DocumentSet docs, List qnames, String[] ngrams, NodeSet nodeSet, int axis) throws TerminatedException { NodeSet result = null; for (int i = 0; i < ngrams.length; i++) { long start = System.currentTimeMillis(); String ngram = ngrams[i]; if (ngram.length() < index.getN() && i > 0) { // if this is the last ngram and its length is too small, // fill it up with characters from the previous ngram. too short // ngrams lead to a considerable performance loss. int fill = index.getN() - ngram.length(); ngram = ngrams[i - 1].substring(index.getN() - fill) + ngram; } NodeSet nodes = index.search(getExpressionId(), docs, qnames, ngram, ngrams[i], context, nodeSet, axis); if (LOG.isTraceEnabled()) LOG.trace("Found " + nodes.getLength() + " for " + ngram + " in " + (System.currentTimeMillis() - start)); if (result == null) { if (isCalledAs("starts-with")) result = startsWith(nodes); else result = nodes; } else { NodeSet temp = new ExtArrayNodeSet(); for (NodeSetIterator iterator = nodes.iterator(); iterator.hasNext();) { NodeProxy next = (NodeProxy) iterator.next(); NodeProxy before = result.get(next); if (before != null) { Match match = null; boolean found = false; Match mb = before.getMatches(); while (mb != null && !found) { Match mn = next.getMatches(); while (mn != null && !found) { if ((match = mb.isAfter(mn)) != null) { found = true; } mn = mn.getNextMatch(); } mb = mb.getNextMatch(); } if (found) { Match m = next.getMatches(); next.setMatches(null); while (m != null) { if (m.getContextId() != getExpressionId()) next.addMatch(m); m = m.getNextMatch(); } next.addMatch(match); temp.add(next); } } } result = temp; } } if (isCalledAs("starts-with")) result = startsWith(result); else if (isCalledAs("ends-with")) result = endsWith(result); return result; } private NodeSet startsWith(NodeSet nodes) { NodeSet temp = new ExtArrayNodeSet(); for (NodeSetIterator iterator = nodes.iterator(); iterator.hasNext();) { NodeProxy next = (NodeProxy) iterator.next(); Match mn = next.getMatches(); while (mn != null) { if (mn.hasMatchAt(0)) { temp.add(next); break; } mn = mn.getNextMatch(); } } return temp; } private NodeSet endsWith(NodeSet nodes) { NodeSet temp = new ExtArrayNodeSet(); if (LOG.isDebugEnabled()) LOG.debug("Filtering " + nodes.getLength()); for (NodeSetIterator iterator = nodes.iterator(); iterator.hasNext();) { NodeProxy next = (NodeProxy) iterator.next(); String data = next.getNodeValue(); int len = data.length(); Match mn = next.getMatches(); while (mn != null) { if (mn.hasMatchAround(len)) { temp.add(next); break; } mn = mn.getNextMatch(); } } return temp; } public int getDependencies() { final Expression stringArg = getArgument(0); if (Type.subTypeOf(stringArg.returnsType(), Type.NODE) && !Dependency.dependsOn(stringArg, Dependency.CONTEXT_ITEM)) { return Dependency.CONTEXT_SET; } else { return Dependency.CONTEXT_SET + Dependency.CONTEXT_ITEM; } } public int returnsType() { return Type.NODE; } }