/*
 * eXist Open Source Native XML Database
 * Copyright (C) 2001-2015 The eXist Project
 * http://exist-db.org
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
package org.exist.indexing.lucene;

import org.exist.dom.persistent.IStoredNode;
import org.exist.dom.QName;
import org.exist.dom.persistent.Match;
import org.exist.dom.persistent.NodeProxy;
import org.exist.dom.persistent.NewArrayNodeSet;
import org.exist.dom.persistent.NodeSet;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.PhraseQuery;
import org.exist.indexing.AbstractMatchListener;
import org.exist.numbering.NodeId;
import org.exist.stax.ExtendedXMLStreamReader;
import org.exist.stax.IEmbeddedXMLStreamReader;
import org.exist.storage.DBBroker;
import org.exist.storage.IndexSpec;
import org.exist.storage.NodePath;
import org.exist.util.serializer.AttrList;
import org.xml.sax.SAXException;

import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import java.io.IOException;
import java.io.StringReader;
import java.util.*;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeSource.State;

public class LuceneMatchListener extends AbstractMatchListener {

    private static final Logger LOG = LogManager.getLogger(LuceneMatchListener.class);

    private Match match;
    private Map<Object, Query> termMap;
    private Map<NodeId, Offset> nodesWithMatch;

    private final LuceneIndex index;
    private LuceneConfig config;
    private DBBroker broker;

    public LuceneMatchListener(LuceneIndex index, DBBroker broker, NodeProxy proxy) {
        this.index = index;
        reset(broker, proxy);
    }

    public boolean hasMatches(NodeProxy proxy) {
        Match nextMatch = proxy.getMatches();
        while (nextMatch != null) {
            if (nextMatch.getIndexId() == LuceneIndex.ID) {
                return true;
            }
            nextMatch = nextMatch.getNextMatch();
        }
        return false;
    }

    protected void reset(DBBroker broker, NodeProxy proxy) {
        this.broker = broker;
        this.match = proxy.getMatches();
        setNextInChain(null);

        IndexSpec indexConf = proxy.getOwnerDocument().getCollection().getIndexConfiguration(broker);
        if (indexConf != null)
            config = (LuceneConfig) indexConf.getCustomIndexSpec(LuceneIndex.ID);

        getTerms();
        nodesWithMatch = new TreeMap<>();
        /* Check if an index is defined on an ancestor of the current node.
         * If yes, scan the ancestor to get the offset of the first character
         * in the current node. For example, if the indexed node is <a>abc<b>de</b></a>
         * and we query for //a[text:ngram-contains(., 'de')]/b, proxy will be a <b> node, but
         * the offsets of the matches are relative to the start of <a>.
         */
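        /* Illustration of the case above (assuming the text extractor simply
         * concatenates the character data to "abcde"): a match on "de" then has
         * start offset 3 relative to <a>. scanMatches() maps that global offset
         * back to the text node inside <b>, whose text starts at offset 3, and
         * stores a node-relative Offset of 0..2, which characters() later wraps
         * in a match element.
         */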
        NodeSet ancestors = null;
        Match nextMatch = this.match;
        while (nextMatch != null) {
            if (proxy.getNodeId().isDescendantOf(nextMatch.getNodeId())) {
                if (ancestors == null)
                    ancestors = new NewArrayNodeSet();
                ancestors.add(new NodeProxy(proxy.getOwnerDocument(), nextMatch.getNodeId()));
            }
            nextMatch = nextMatch.getNextMatch();
        }

        if (ancestors != null && !ancestors.isEmpty()) {
            for (NodeProxy p : ancestors) {
                scanMatches(p);
            }
        }
    }

    @Override
    public void startElement(QName qname, AttrList attribs) throws SAXException {
        Match nextMatch = match;
        // Check if a match starts at the current element. If so, scan its text
        // content to compute the node-relative offsets of the matching terms.
        while (nextMatch != null) {
            if (nextMatch.getNodeId().equals(getCurrentNode().getNodeId())) {
                scanMatches(new NodeProxy(getCurrentNode()));
                break;
            }
            nextMatch = nextMatch.getNextMatch();
        }
        super.startElement(qname, attribs);
    }

    @Override
    public void characters(CharSequence seq) throws SAXException {
        NodeId nodeId = getCurrentNode().getNodeId();
        Offset offset = nodesWithMatch.get(nodeId);
        if (offset == null) {
            super.characters(seq);
        } else {
            String s = seq.toString();
            int pos = 0;
            while (offset != null) {
                if (offset.startOffset > pos) {
                    if (offset.startOffset > seq.length())
                        throw new SAXException("start offset out of bounds");
                    super.characters(s.substring(pos, offset.startOffset));
                }
                int end = offset.endOffset;
                if (end > s.length())
                    end = s.length();
                super.startElement(MATCH_ELEMENT, null);
                super.characters(s.substring(offset.startOffset, end));
                super.endElement(MATCH_ELEMENT);
                pos = end;
                offset = offset.next;
            }
            if (pos < seq.length())
                super.characters(s.substring(pos));
        }
    }

    private void scanMatches(NodeProxy p) {
        // Collect the text content of all descendants of p.
        // Remember the start offsets of the text nodes for later use.
        NodePath path = getPath(p);
        LuceneIndexConfig idxConf = config.getConfig(path).next();

        TextExtractor extractor = new DefaultTextExtractor();
        extractor.configure(config, idxConf);

        OffsetList offsets = new OffsetList();
        int level = 0;
        int textOffset = 0;
        try {
            IEmbeddedXMLStreamReader reader = broker.getXMLStreamReader(p, false);
            while (reader.hasNext()) {
                int ev = reader.next();
                switch (ev) {
                    case XMLStreamConstants.END_ELEMENT:
                        if (--level < 0) {
                            break;
                        }
                        // Call extractor.endElement unless this is the root of the current fragment.
                        if (level > 0) {
                            textOffset += extractor.endElement(reader.getQName());
                        }
                        break;
                    case XMLStreamConstants.START_ELEMENT:
                        // Call extractor.startElement unless this is the root of the current fragment.
                        if (level > 0) {
                            textOffset += extractor.startElement(reader.getQName());
                        }
                        ++level;
                        break;
                    case XMLStreamConstants.CHARACTERS:
                        NodeId nodeId = (NodeId) reader.getProperty(ExtendedXMLStreamReader.PROPERTY_NODE_ID);
                        textOffset += extractor.beforeCharacters();
                        offsets.add(textOffset, nodeId);
                        textOffset += extractor.characters(reader.getXMLText());
                        break;
                }
            }
        } catch (IOException | XMLStreamException e) {
            LOG.warn("Problem found while serializing XML: " + e.getMessage(), e);
        }
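        // At this point extractor.getText() holds the concatenated text of the
        // fragment rooted at p, and offsets records, for each text node, the
        // global character offset at which its text starts. The text is now
        // tokenized again with the same analyzer that was used for indexing, so
        // that token offsets can be matched against the query terms.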
        // Retrieve the Analyzer for the NodeProxy that was used for
        // indexing and querying.
        Analyzer analyzer = idxConf.getAnalyzer();
        if (analyzer == null) {
            // Otherwise use the system default Lucene analyzer (from conf.xml)
            // to tokenize the text and find matching query terms.
            analyzer = index.getDefaultAnalyzer();
        }
        if (LOG.isDebugEnabled())
            LOG.debug("Analyzer: " + analyzer + " for path: " + path);

        String str = extractor.getText().toString();
        try (TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(str))) {
            tokenStream.reset();
            MarkableTokenFilter stream = new MarkableTokenFilter(tokenStream);
            while (stream.incrementToken()) {
                String text = stream.getAttribute(CharTermAttribute.class).toString();
                Query query = termMap.get(text);
                if (query != null) {
                    // Phrase queries need to be handled differently to filter
                    // out wrong matches: only the phrase should be marked, not
                    // single words which may also occur elsewhere in the document.
                    if (query instanceof PhraseQuery) {
                        PhraseQuery phraseQuery = (PhraseQuery) query;
                        Term[] terms = phraseQuery.getTerms();
                        if (text.equals(terms[0].text())) {
                            // Scan the following text and collect tokens to see
                            // if they are part of the phrase.
                            stream.mark();
                            int t = 1;
                            List<State> stateList = new ArrayList<>(terms.length);
                            stateList.add(stream.captureState());

                            while (stream.incrementToken() && t < terms.length) {
                                text = stream.getAttribute(CharTermAttribute.class).toString();
                                if (text.equals(terms[t].text())) {
                                    stateList.add(stream.captureState());
                                    if (++t == terms.length) {
                                        break;
                                    }
                                } else {
                                    // Don't reset the token stream since we would
                                    // miss subsequent matches.
                                    break;
                                }
                            }

                            if (stateList.size() == terms.length) {
                                // We indeed have a phrase match: record the offsets of its terms.
                                int lastIdx = -1;
                                for (int i = 0; i < terms.length; i++) {
                                    stream.restoreState(stateList.get(i));
                                    OffsetAttribute offsetAttr = stream.getAttribute(OffsetAttribute.class);
                                    int idx = offsets.getIndex(offsetAttr.startOffset());
                                    NodeId nodeId = offsets.ids[idx];
                                    Offset offset = nodesWithMatch.get(nodeId);
                                    if (offset != null) {
                                        if (lastIdx == idx) {
                                            offset.setEndOffset(offsetAttr.endOffset() - offsets.offsets[idx]);
                                        } else {
                                            offset.add(offsetAttr.startOffset() - offsets.offsets[idx],
                                                    offsetAttr.endOffset() - offsets.offsets[idx]);
                                        }
                                    } else {
                                        nodesWithMatch.put(nodeId, new Offset(offsetAttr.startOffset() - offsets.offsets[idx],
                                                offsetAttr.endOffset() - offsets.offsets[idx]));
                                    }
                                    lastIdx = idx;
                                }
                            }
                        } // End of phrase handling
                    } else {
                        OffsetAttribute offsetAttr = stream.getAttribute(OffsetAttribute.class);
                        int idx = offsets.getIndex(offsetAttr.startOffset());
                        NodeId nodeId = offsets.ids[idx];
                        Offset offset = nodesWithMatch.get(nodeId);
                        if (offset != null) {
                            offset.add(offsetAttr.startOffset() - offsets.offsets[idx],
                                    offsetAttr.endOffset() - offsets.offsets[idx]);
                        } else {
                            nodesWithMatch.put(nodeId, new Offset(offsetAttr.startOffset() - offsets.offsets[idx],
                                    offsetAttr.endOffset() - offsets.offsets[idx]));
                        }
                    }
                }
            }
        } catch (IOException e) {
            LOG.warn("Problem found while analyzing the text content: " + e.getMessage(), e);
        }
    }
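    /**
     * Builds the {@link NodePath} leading to the given node by walking up its
     * ancestor chain (see walkAncestor below); scanMatches() uses this path to
     * look up the Lucene index configuration that applies to the node.
     */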
    private NodePath getPath(NodeProxy proxy) {
        NodePath path = new NodePath();
        IStoredNode<?> node = (IStoredNode<?>) proxy.getNode();
        walkAncestor(node, path);
        return path;
    }

    private void walkAncestor(IStoredNode node, NodePath path) {
        if (node == null)
            return;
        IStoredNode parent = node.getParentStoredNode();
        walkAncestor(parent, path);
        path.addComponent(node.getQName());
    }

    /**
     * Get all query terms from the original queries.
     */
    private void getTerms() {
        try {
            index.withReader(reader -> {
                Set<Query> queries = new HashSet<>();
                termMap = new TreeMap<>();
                Match nextMatch = this.match;
                while (nextMatch != null) {
                    if (nextMatch.getIndexId() == LuceneIndex.ID) {
                        Query query = ((LuceneIndexWorker.LuceneMatch) nextMatch).getQuery();
                        if (!queries.contains(query)) {
                            queries.add(query);
                            LuceneUtil.extractTerms(query, termMap, reader, false);
                        }
                    }
                    nextMatch = nextMatch.getNextMatch();
                }
                return null;
            });
        } catch (IOException e) {
            LOG.warn("Match listener caught IO exception while reading query terms: " + e.getMessage(), e);
        }
    }

    /**
     * Maps the global character offset at which each text node starts to that
     * node's {@link NodeId}. Backed by two parallel arrays which grow on demand.
     */
    private static class OffsetList {

        int[] offsets = new int[16];
        NodeId[] ids = new NodeId[16];
        int len = 0;

        void add(int offset, NodeId nodeId) {
            if (len == offsets.length) {
                // Double the capacity of both arrays.
                int[] tempOffsets = new int[len * 2];
                System.arraycopy(offsets, 0, tempOffsets, 0, len);
                offsets = tempOffsets;

                NodeId[] tempIds = new NodeId[len * 2];
                System.arraycopy(ids, 0, tempIds, 0, len);
                ids = tempIds;
            }
            offsets[len] = offset;
            ids[len++] = nodeId;
        }

        int getIndex(int offset) {
            // Linear scan for the text node whose range contains the given offset.
            for (int i = 0; i < len; i++) {
                if (offsets[i] <= offset && (i + 1 == len || offsets[i + 1] > offset)) {
                    return i;
                }
            }
            return -1;
        }
    }

    /**
     * Linked list of match offsets within a single text node, relative to the
     * start of that node's text.
     */
    private class Offset {

        int startOffset;
        int endOffset;
        Offset next = null;

        Offset(int startOffset, int endOffset) {
            this.startOffset = startOffset;
            this.endOffset = endOffset;
        }

        void add(int offset, int endOffset) {
            if (startOffset == offset) {
                // Duplicate match starting at the same offset: ignore.
                return;
            }
            getLast().next = new Offset(offset, endOffset);
        }

        private Offset getLast() {
            Offset next = this;
            while (next.next != null) {
                next = next.next;
            }
            return next;
        }

        void setEndOffset(int offset) {
            getLast().endOffset = offset;
        }
    }
}
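/* Illustrative note (assuming the standard exist:match wrapper element defined by
 * AbstractMatchListener): for a stored node <p>quick brown fox</p> and a phrase
 * query "quick brown", only the complete phrase is wrapped, yielding
 * <p><exist:match>quick brown</exist:match> fox</p>; a lone occurrence of "quick"
 * elsewhere in the text would be left unwrapped.
 */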