package lux.index.analysis;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map.Entry;

import lux.index.IndexConfiguration;
import lux.index.XmlIndexer;
import lux.index.attribute.QNameAttribute;
import net.sf.saxon.om.NamePool;
import net.sf.saxon.om.NodeInfo;
import net.sf.saxon.s9api.Processor;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.s9api.XdmSequenceIterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * <p>
 * This is the root of a set of xml-aware TokenStream classes that work by selecting text
 * a node at a time from an XML document, and then
 * passing that text to the wrapped TokenStream. The wrapped TokenStream is re-used for each text node.
 * The outermost link in the chain will be a TokenFilter that applies a sequence of structure-related
 * Attributes to each text token (ie a list of QNames, but can be any kind of structural attribute
 * that should be composed with each text token).
 * </p>
 * <p>
 * The token stream topology is: this( this.wrapped (this.tokenizer ))
 * For example, for the element-text field we have ElementTokenStream (a subclass of this class):
 * </p>
 * <blockquote>
 * <code>ElementTokenStream (QNameTokenFilter (LowerCaseFilter (StandardTokenizer)))</code>
 * </blockquote>
 * <p>
 * We can't follow the standard Lucene pattern of Analyzer as a factory for a TokenStream
 * since we want to be able to extend any arbitrary textual Analyzer, but the constraints
 * of the Analyzer class design prevent it from being extended in a straightforward manner.
 * Thus we have essentially an outer (XML) stream wrapping an inner (Text) stream.
 * </p>
 *
 * FIXME: make the constructor protected; allow construction only through static builders
 * defined on each derived class. This will enable us to hide the complexity of wrapping the
 * token stream, which is the same pattern for each of these; only the classes vary. But we
 * can't do the work in the constructor due to Java structural issues.
 */
public abstract class XmlTokenStreamBase extends TokenStream {

    private final String fieldName;

    // The analyzer creates the wrapped TokenStream/Tokenizer that does the text analysis
    private final Analyzer analyzer;

    private TokenStream wrapped;

    // The pool that allocated the namecodes used as keys of eltVis; needed to translate
    // Clark names into namecodes in getElementVisibility(String). May be null until a
    // Processor (constructor) or an XmlIndexer (configureElementVisibility) supplies one.
    private NamePool namePool;

    protected XdmNode curNode;
    protected Iterator<XdmNode> contentIter; // retrieves the nodes with text to index
    protected CharTermAttribute termAtt;
    protected Reader charStream = new OffsetCharFilter(new StringReader(""));
    protected ElementVisibility defVis;
    // explicitly-specified element visibility, keyed by NamePool namecode
    protected HashMap<Integer,ElementVisibility> eltVis;
    protected final QNameAttribute qnameAtt;
    protected final QNameTokenFilter qnameTokenFilter;
    protected static final XdmSequenceIterator EMPTY = new EmptyXdmIterator(null);

    /**
     * @param fieldName the name of the field, passed to the analyzer whenever the stream is reset
     * @param analyzer the analyzer that is asked for a token stream on every {@link #reset(Reader)}
     * @param wrapped the text token stream wrapped by this stream; its attributes are shared with this stream
     * @param processor supplies the NamePool used to convert element Clark names to namecodes; it is
     * dereferenced unconditionally when wrapped is a QNameTokenFilter
     */
    XmlTokenStreamBase(String fieldName, Analyzer analyzer, TokenStream wrapped, Processor processor) {
        super (wrapped);
        this.wrapped = wrapped;
        this.fieldName = fieldName;
        this.analyzer = analyzer;
        termAtt = addAttribute(CharTermAttribute.class);
        eltVis = new HashMap<Integer, ElementVisibility>();
        // FIXME - don't use QNameTokenFilter for this -- that handles prefixing tokens
        // use instead an XmlVisibilityFilter that encapsulates the logic currently in ElementTokenStream
        if (wrapped instanceof QNameTokenFilter) {
            qnameTokenFilter = (QNameTokenFilter) wrapped;
            defVis = qnameTokenFilter.getDefaultVisibility();
            namePool = processor.getUnderlyingConfiguration().getNamePool();
            // copy the filter's name-keyed visibility settings into the namecode-keyed map
            for (Entry<String, ElementVisibility> entry : qnameTokenFilter.getElementVisibility().entrySet()) {
                int namecode = namePool.allocateClarkName(entry.getKey());
                eltVis.put(namecode, entry.getValue());
            }
        } else {
            defVis = ElementVisibility.OPAQUE;
            qnameTokenFilter = new QNameTokenFilter (getWrappedTokenStream());
            // the processor was never dereferenced on this path before, so tolerate null here
            if (processor != null) {
                namePool = processor.getUnderlyingConfiguration().getNamePool();
            }
        }
        qnameAtt = qnameTokenFilter.addAttribute(QNameAttribute.class);
    }

    /**
     * Re-acquires the analyzer's token stream over {@link #charStream}, then resets the wrapped stream.
     */
    @Override
    public void reset () throws IOException {
        reset (charStream);
        wrapped.reset();
    }

    /**
     * Closes the wrapped token stream.
     */
    @Override
    public void close () throws IOException {
        wrapped.close();
    }

    /**
     * Closes the wrapped stream and asks the analyzer for a token stream reading from the given reader.
     *
     * @param reader the source of the text to be tokenized next
     * @throws IOException if closing the wrapped stream or obtaining the token stream fails
     */
    public void reset (Reader reader) throws IOException {
        close();
        TokenStream reused = analyzer.tokenStream (fieldName, reader);
        // This must be the same token stream: ie the Analyzer must be re-usable, and the
        // original token stream must have arisen from it. We don't check for actual
        // identity with wrapped since that might get wrapped again (eg w/QNameTokenFilter).
        assert (reused.getAttribute(CharTermAttribute.class) == wrapped.getAttribute(CharTermAttribute.class));
    }

    /**
     * Advance the iteration by looping through the following:
     * 1) next text node
     * 2) next token in text
     * 3) next ancestor element node
     *
     * @return true if a token is available, false when the content is exhausted
     * @see org.apache.lucene.analysis.TokenStream#incrementToken()
     */
    @Override
    public boolean incrementToken() throws IOException {
        // first try the next token in the current node, then move on to the next node with a token
        return incrementWrappedTokenStream() || advanceToTokenNode();
    }

    /**
     * @return the underlying stream of text tokens to which additional xml-related attributes are added by this.
     */
    public TokenStream getWrappedTokenStream () {
        return wrapped;
    }

    protected void setWrappedTokenStream (TokenStream wrapped) {
        this.wrapped = wrapped;
    }

    /**
     * Advances the wrapped stream, skipping tokens whose term is empty.
     *
     * @return true if positioned on a non-empty token, false if the wrapped stream is exhausted
     */
    protected boolean incrementWrappedTokenStream() throws IOException {
        while (wrapped.incrementToken()) {
            if (termAtt.length() > 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Moves to the next content node that is not hidden and for which resetTokenizer reports success.
     *
     * @return false if the content iterator is exhausted before such a node is found
     */
    private boolean advanceToTokenNode() {
        while (contentIter.hasNext()) {
            curNode = contentIter.next();
            // wrap the content in a reader and hand it to the tokenizer
            NodeInfo nodeInfo = curNode.getUnderlyingNode();
            if (! updateNodeAtts ()) {
                continue;
            }
            if (resetTokenizer(nodeInfo.getStringValueCS())) {
                return true;
            }
        }
        return false;
    }

    abstract boolean resetTokenizer(CharSequence cs);

    /** @return false if the node is hidden */
    abstract boolean updateNodeAtts ();

    /**
     * @param clarkName the name of an element as a clarkName ({namespace}name)
     * @return the explicitly-specified visibility of the element name, or null if the element has the default
     * visibility (also null if clarkName is null, or if no NamePool has been supplied yet).
     */
    public ElementVisibility getElementVisibility(String clarkName) {
        if (clarkName == null || namePool == null) {
            return null;
        }
        // eltVis is keyed by namecode: looking it up with the String itself could never match
        return eltVis.get(namePool.allocateClarkName(clarkName));
    }

    /**
     * @param namecode the name of an element as a namecode from a {@link net.sf.saxon.om.NamePool}
     * @param visibility the explicitly-specified visibility of the element name, or null to give the element the default
     * visibility.
     */
    public void setElementVisibility(int namecode, ElementVisibility visibility) {
        if (visibility == null) {
            eltVis.remove(namecode);
        } else {
            eltVis.put(namecode, visibility);
        }
    }

    /**
     * @return the visibility of elements not explicitly specified using setElementVisibility.
     * This is {@link ElementVisibility#OPAQUE} unless it was taken from a wrapped QNameTokenFilter
     * at construction time or changed with {@link #setDefaultVisibility}.
     */
    public ElementVisibility getDefaultVisibility() {
        return defVis;
    }

    public void setDefaultVisibility(ElementVisibility vis) {
        this.defVis = vis;
    }

    /**
     * Applies the indexer's configuration: sets namespace-awareness on the QName filter, fills in the
     * default visibility if none is set, and adds the configured per-element visibility for every
     * element name that has no explicit setting yet (existing settings take precedence).
     *
     * @param indexer supplies the index configuration and the Saxon processor whose NamePool
     * is used to allocate the namecodes
     */
    public void configureElementVisibility(XmlIndexer indexer) {
        IndexConfiguration config = indexer.getConfiguration();
        qnameTokenFilter.setNamespaceAware(config.isOption(IndexConfiguration.NAMESPACE_AWARE));
        NamePool pool = indexer.getProcessor().getUnderlyingConfiguration().getNamePool();
        if (namePool == null) {
            // remember the pool so that getElementVisibility(String) can resolve names
            namePool = pool;
        }
        if (defVis == null) {
            defVis = config.getDefaultVisibility();
        }
        for (Entry<String, ElementVisibility> e : config.getVisibilityMap().entrySet()) {
            int namecode = pool.allocateClarkName(e.getKey());
            if (! eltVis.containsKey(namecode)) {
                eltVis.put(namecode, e.getValue());
            }
        }
    }

}

/*
 * This Source Code Form is subject to the terms of the Mozilla Public License,
 * v. 2.0. If a copy of the MPL was not distributed with this file, You can
 * obtain one at http://mozilla.org/MPL/2.0/.
 */