package lux.index.analysis;
import lux.xml.Offsets;
import net.sf.saxon.s9api.Processor;
import net.sf.saxon.s9api.QName;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.s9api.XdmNodeKind;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
/**
* A TokenStream that extracts text from a Saxon Document model (XdmNode) and generates
* a token for every "word" for every element that contains it.
* TODO: control over element transparency
*
* <p>Each element name may be one of: transparent, opaque, hidden, or container. The default may be
* set to either opaque or transparent. Unless hidden, text is tagged with its parent element.
* If its parent is transparent, it is also tagged with ancestor elements, stopping at the first opaque
* or container element. In addition, visible (non-hidden) text is tagged with all ancestor container elements.
* </p>
* <dl>
* <dt>{@link ElementVisibility#OPAQUE}</dt>
* <dd>text content is indexed only with the parent element tag. Opaque elements' start and end tags act as phrase boundaries.</dd>
* <dt>{@link ElementVisibility#TRANSPARENT}</dt>
* <dd>if the parent element is transparent, the text is also indexed as if it were a child of the grandparent element; and the same rule applies recursively.</dd>
* <dt>{@link ElementVisibility#HIDDEN}</dt>
* <dd>descendant content of hidden elements is not indexed.</dd>
* <dt>{@link ElementVisibility#CONTAINER}</dt>
* <dd>a container element tags all of its visible descendants, regardless of opacity.</dd>
* </dl>
*/
public final class ElementTokenStream extends TextOffsetTokenStream {
public ElementTokenStream(String fieldName, Analyzer analyzer, TokenStream wrapped, XdmNode doc, Offsets offsets, Processor processor) {
super(fieldName, analyzer, wrapped, doc, offsets, processor);
contentIter = new TextIterator(doc);
setWrappedTokenStream (qnameTokenFilter);
}
@Override
protected boolean updateNodeAtts () {
getAncestorQNames();
return qnameAtt.hasNext();
}
private void getAncestorQNames() {
// list the QNames of containing elements in qnameAtt filtered by the visibility rules
assert(curNode.getNodeKind() == XdmNodeKind.TEXT);
AncestorIterator nodeAncestors = new AncestorIterator(curNode);
qnameAtt.clearQNames();
boolean isOpaque = false;
while (nodeAncestors.hasNext()) {
XdmNode e = (XdmNode) nodeAncestors.next();
assert (e.getNodeKind() == XdmNodeKind.ELEMENT);
int nameCode = e.getUnderlyingNode().getNameCode();
ElementVisibility vis = eltVis.get(nameCode);
if (vis == null) {
// nothing configured for this QName, use the default visibility
vis = defVis;
}
if (vis == ElementVisibility.HIDDEN) {
// this node is hidden: don't index its content
qnameAtt.clearQNames();
return;
}
// TODO; avoid allocating all these QNames?
QName qname = e.getNodeName();
if (isOpaque) {
// we hit an opaque element in a previous iteration, so this element can't "see" the content
// unless it is a container, which sees through opaque elements
if (vis == ElementVisibility.CONTAINER) {
qnameAtt.addQName(new lux.xml.QName(qname.getNamespaceURI(), qname.getLocalName(), qname.getPrefix()));
}
} else {
// all elements so far have been transparent, so tag the content with this element name
qnameAtt.addQName(new lux.xml.QName(qname.getNamespaceURI(), qname.getLocalName(), qname.getPrefix()));
if (vis == ElementVisibility.OPAQUE || vis == ElementVisibility.CONTAINER) {
// set the opaque flag if this element is opaque (containers are always opaque).
// still continue, because there might be containers
isOpaque = true;
}
}
}
}
}
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */