package org.exist.fulltext;
import org.apache.log4j.Logger;
import org.exist.dom.ExtArrayNodeSet;
import org.exist.dom.Match;
import org.exist.dom.NodeProxy;
import org.exist.dom.NodeSet;
import org.exist.dom.QName;
import org.exist.indexing.AbstractMatchListener;
import org.exist.numbering.NodeId;
import org.exist.stax.EmbeddedXMLStreamReader;
import org.exist.storage.DBBroker;
import org.exist.util.FastQSort;
import org.exist.util.serializer.AttrList;
import org.xml.sax.SAXException;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Stack;
/**
 * Implementation of {@link org.exist.indexing.MatchListener} for the fulltext index.
 * Right now, the serializer will directly plug this into the listener pipeline. This will
 * change once we move the fulltext index into its own module.
 *
 * While events pass through, every character sequence is checked against the
 * fulltext matches attached to the node being serialized. Matching ranges are
 * wrapped into a match element (<code>MATCH_ELEMENT</code>); all other text is
 * forwarded unchanged.
 */
public class FTMatchListener extends AbstractMatchListener {

    private final static Logger LOG = Logger.getLogger(FTMatchListener.class);

    /** Head of the linked list of matches taken from the proxy passed to {@link #reset}. */
    private Match match;

    /**
     * Stack of {@link NodeOffset} entries, one per node with matches whose
     * content is currently being tracked. Created lazily; <code>null</code>
     * while nothing is tracked.
     */
    private Stack offsetStack = null;

    /**
     * Creates a listener and initializes it for the given node.
     *
     * @param broker the broker used to scan ancestor nodes
     * @param proxy the node which is going to be serialized
     */
    public FTMatchListener(DBBroker broker, NodeProxy proxy) {
        reset(broker, proxy);
    }

    /**
     * Checks if the given node carries at least one match created by the
     * fulltext index.
     *
     * @param proxy the node whose match list is inspected
     * @return true if a match with the index id {@link FTIndex#ID} was found
     */
    public boolean hasMatches(NodeProxy proxy) {
        Match nextMatch = proxy.getMatches();
        while (nextMatch != null) {
            if (nextMatch.getIndexId() == FTIndex.ID) {
                return true;
            }
            nextMatch = nextMatch.getNextMatch();
        }
        return false;
    }

    /**
     * Prepares the listener for serializing a new node. Any tracking state
     * left over from a previous node is discarded.
     *
     * @param broker the broker used to scan ancestor nodes
     * @param proxy the node which is going to be serialized
     */
    protected void reset(DBBroker broker, NodeProxy proxy) {
        this.match = proxy.getMatches();
        setNextInChain(null);
        // Entries pushed below for ancestor matches are only removed if endElement
        // sees a match on the current node, so they may still be on the stack when
        // the listener is reused. Drop them: their offsets belong to the previous node.
        offsetStack = null;

        /* Check if an index is defined on an ancestor of the current node.
         * If yes, scan the ancestor to get the offset of the first character
         * in the current node. For example, if the indexed node is <a>abc<b>de</b></a>
         * and we query for //a[text:ngram-contains(., 'de')]/b, proxy will be a <b> node, but
         * the offsets of the matches are relative to the start of <a>.
         */
        NodeSet ancestors = null;
        Match nextMatch = this.match;
        while (nextMatch != null) {
            if (proxy.getNodeId().isDescendantOf(nextMatch.getNodeId())) {
                if (ancestors == null)
                    ancestors = new ExtArrayNodeSet();
                ancestors.add(new NodeProxy(proxy.getDocument(), nextMatch.getNodeId()));
            }
            nextMatch = nextMatch.getNextMatch();
        }
        if (ancestors != null && !ancestors.isEmpty()) {
            for (Iterator i = ancestors.iterator(); i.hasNext();) {
                NodeProxy p = (NodeProxy) i.next();
                int startOffset = computeStartOffset(broker, p, proxy);
                if (offsetStack == null)
                    offsetStack = new Stack();
                offsetStack.push(new NodeOffset(p.getNodeId(), startOffset));
            }
        }
    }

    /**
     * Counts the characters found in the content of <code>ancestor</code>
     * before the node <code>proxy</code> is reached. If reading fails, the
     * problem is logged and the number of characters counted so far is returned.
     */
    private int computeStartOffset(DBBroker broker, NodeProxy ancestor, NodeProxy proxy) {
        int startOffset = 0;
        try {
            XMLStreamReader reader = broker.getXMLStreamReader(ancestor, false);
            while (reader.hasNext()) {
                int ev = reader.next();
                NodeId nodeId = (NodeId) reader.getProperty(EmbeddedXMLStreamReader.PROPERTY_NODE_ID);
                if (nodeId.equals(proxy.getNodeId()))
                    break;
                if (ev == XMLStreamReader.CHARACTERS)
                    startOffset += reader.getText().length();
            }
        } catch (IOException e) {
            LOG.warn("Problem found while serializing XML: " + e.getMessage(), e);
        } catch (XMLStreamException e) {
            LOG.warn("Problem found while serializing XML: " + e.getMessage(), e);
        }
        return startOffset;
    }

    /**
     * If one of the matches refers to the element being started, a new
     * {@link NodeOffset} is pushed to track the text content of the element.
     * The event is then forwarded to the super class.
     */
    public void startElement(QName qname, AttrList attribs) throws SAXException {
        Match nextMatch = match;
        // check if there are any matches in the current element
        // if yes, push a NodeOffset object to the stack to track
        // the node contents
        while (nextMatch != null) {
            if (nextMatch.getNodeId().equals(getCurrentNode().getNodeId())) {
                if (offsetStack == null)
                    offsetStack = new Stack();
                offsetStack.push(new NodeOffset(nextMatch.getNodeId()));
                break;
            }
            nextMatch = nextMatch.getNextMatch();
        }
        super.startElement(qname, attribs);
    }

    /**
     * Removes the tracking entry pushed by {@link #startElement} if one of the
     * matches refers to the element being closed, then forwards the event to
     * the super class.
     */
    public void endElement(QName qname) throws SAXException {
        Match nextMatch = match;
        // check if we need to pop the stack
        while (nextMatch != null) {
            if (nextMatch.getNodeId().equals(getCurrentNode().getNodeId())) {
                // guard against a missing or already emptied stack instead of
                // failing with a NullPointerException/EmptyStackException
                if (offsetStack != null && !offsetStack.isEmpty())
                    offsetStack.pop();
                break;
            }
            nextMatch = nextMatch.getNextMatch();
        }
        super.endElement(qname);
    }

    /**
     * Forwards the text to the super class, wrapping every range covered by a
     * fulltext match into a match element. Ranges are taken from matches on
     * tracked nodes (offsets relative to the tracked node's content) and from
     * matches on the current node itself. Overlapping or duplicate ranges are
     * clipped so that every character of the text is written exactly once.
     */
    public void characters(CharSequence seq) throws SAXException {
        List offsets = null; // a list of offsets to process
        if (offsetStack != null) {
            // walk through the stack to find matches which start in
            // the current string of text
            for (int i = 0; i < offsetStack.size(); i++) {
                NodeOffset no = (NodeOffset) offsetStack.get(i);
                int end = no.offset + seq.length();
                // scan all matches
                Match next = match;
                while (next != null) {
                    if (next.getIndexId() == FTIndex.ID && next.getNodeId().equals(no.nodeId)) {
                        int freq = next.getFrequency();
                        for (int j = 0; j < freq; j++) {
                            Match.Offset offset = next.getOffset(j);
                            if (offset.getOffset() < end &&
                                    offset.getOffset() + offset.getLength() > no.offset) {
                                // add it to the list to be processed
                                if (offsets == null) {
                                    offsets = new ArrayList(4);
                                }
                                // adjust the offset and add it to the list
                                int start = offset.getOffset() - no.offset;
                                int len = offset.getLength();
                                if (start < 0) {
                                    // match started in a previous text chunk: keep the remainder only
                                    len = len - Math.abs(start);
                                    start = 0;
                                }
                                if (start + len > seq.length())
                                    len = seq.length() - start;
                                offsets.add(new Match.Offset(start, len));
                            }
                        }
                    }
                    next = next.getNextMatch();
                }
                // add the length of the current text to the element content length
                no.offset = end;
            }
        }
        // walk through the matches a second time to find matches in the text node itself
        Match next = match;
        while (next != null) {
            if (next.getIndexId() == FTIndex.ID &&
                    next.getNodeId().equals(getCurrentNode().getNodeId())) {
                if (offsets == null)
                    offsets = new ArrayList();
                int freq = next.getFrequency();
                for (int i = 0; i < freq; i++) {
                    offsets.add(next.getOffset(i));
                }
            }
            next = next.getNextMatch();
        }
        if (offsets == null) {
            super.characters(seq);
            return;
        }
        // now print out the text, marking all matches with a match element
        if (offsets.size() > 1)
            FastQSort.sort(offsets, 0, offsets.size() - 1);
        String s = seq.toString();
        int pos = 0; // index of the first character not yet written
        for (int i = 0; i < offsets.size(); i++) {
            Match.Offset offset = (Match.Offset) offsets.get(i);
            // clip the match against the text already written and the end of the
            // string: overlapping matches must not output characters twice
            int start = Math.max(offset.getOffset(), pos);
            int end = Math.min(offset.getOffset() + offset.getLength(), s.length());
            if (end <= start)
                continue; // nothing left of this match after clipping
            if (start > pos) {
                super.characters(s.substring(pos, start));
            }
            super.startElement(MATCH_ELEMENT, null);
            super.characters(s.substring(start, end));
            super.endElement(MATCH_ELEMENT);
            pos = end;
        }
        if (pos < s.length()) {
            super.characters(s.substring(pos));
        }
    }

    /**
     * Pairs the id of a tracked node with the number of characters of its
     * content seen so far. Static: it does not access the enclosing listener.
     */
    private static class NodeOffset {

        NodeId nodeId;
        int offset = 0;

        public NodeOffset(NodeId nodeId) {
            this.nodeId = nodeId;
        }

        public NodeOffset(NodeId nodeId, int offset) {
            this.nodeId = nodeId;
            this.offset = offset;
        }
    }
}