package lux.index; import static lux.index.IndexConfiguration.*; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.StringReader; import java.util.Collection; import java.util.HashMap; import java.util.Map.Entry; import javax.xml.stream.XMLStreamException; import lux.Compiler; import lux.exception.LuxException; import lux.index.field.FieldDefinition; import lux.xml.OffsetDocBuilder; import lux.xml.SaxonDocBuilder; import lux.xml.Serializer; import lux.xml.XmlReader; import net.sf.saxon.om.NodeInfo; import net.sf.saxon.s9api.Processor; import net.sf.saxon.s9api.SaxonApiException; import net.sf.saxon.s9api.XPathCompiler; import net.sf.saxon.s9api.XPathExecutable; import net.sf.saxon.s9api.XPathSelector; import net.sf.saxon.s9api.XdmNode; import net.sf.saxon.s9api.XdmValue; import org.apache.commons.io.IOUtils; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; /** * Indexes XML documents. The constructor accepts a set of flags that * define a set of fields known to XmlIndexer. The fields are represented * by instances of XmlField. Instances of XmlField are immutable; they * hold no data, merely serving as markers. Additional fields can also be * added using addField(). A field may be associated with a StAXHandler; * the indexer is responsible for feeding the handlers with StAX (XML) * events. Some fields may share the same handler. The association * between field and handler is implicit: the field calls an XmlIndexer * getter to retrieve the handler. * * This class is not thread-safe * * This is all kind of a mess, and not readily extendable. If you want to * add a new type of field (a new XmlField instance), you have to modify * the indexer, which has knowledge of all the possible fields. * * Also, not every combination of indexing options will actually work. We * need to consider which things one might actually want to turn on and * off. * * We could make each field act as a StAXHandler factory? For efficiency * though, some fields share the same handler instance. For now, we leave * things as they are; we'll refactor as we add more fields. * * Indexing is triggered by a call to indexDocument(). read(InputStream) * parses and gathers the values. which are retrieved by calling * XmlField.getFieldValues(XmlIndexer) for each field. */ public class XmlIndexer { private final IndexConfiguration configuration; private XmlReader xmlReader; private Processor processor; private XPathCompiler compiler; private SaxonDocBuilder saxonBuilder; private Serializer serializer; private XmlPathMapper pathMapper; private String uri; private byte[] documentBytes; private XdmNode xdmNode; private HashMap<String,XPathExecutable> xpathCache; /** * Make a new instance with default options */ public XmlIndexer () { this (new IndexConfiguration()); } /** * Make a new instance with the given configuration. Options in the configuration control * how documents are indexed, and which kinds of indexed values will be available after indexing * a document. * @param config the index configuration to use */ public XmlIndexer (IndexConfiguration config) { this.configuration = config; xpathCache = new HashMap<String, XPathExecutable>(); init(); } /** * Make a new instance with the given options. Used mostly for testing. * @param options the index configuration options to use */ public XmlIndexer (long options) { this (new IndexConfiguration(options)); } /** * Make a new instance with the given options and Compiler. The runtime uses this to * index documents from its nodes directly, without serializing and parsing. * @param indexConfig the index configuration options to use * @param compiler the indexer will make XPath that is compatible with this compiler */ public XmlIndexer(IndexConfiguration indexConfig, Compiler compiler) { this.configuration = indexConfig; xpathCache = new HashMap<String, XPathExecutable>(); this.processor = compiler.getProcessor(); this.compiler = null; init (); } /** * initialize the indexer; an extension of the constructors. Creates subsidiary objects * required for indexing based on the index options. */ protected void init () { xmlReader = new XmlReader(); if (isOption (INDEX_QNAMES) || isOption (INDEX_PATHS)) { // accumulate XML paths and QNames for indexing if (isOption (INDEX_VALUES)) { pathMapper = new XPathValueMapper(); } else { pathMapper = new XmlPathMapper(); } pathMapper.setNamespaceAware(isOption(NAMESPACE_AWARE)); xmlReader.addHandler (pathMapper); } if (isOption (INDEX_FULLTEXT)) { initDocBuilder(); } if (isOption (STORE_DOCUMENT)) { if (! isOption(STORE_TINY_BINARY)) { serializer = new Serializer(); xmlReader.addHandler(serializer); } } if (isOption (BUILD_DOCUMENT) && saxonBuilder == null) { initDocBuilder(); } if (isOption (STRIP_NAMESPACES)) { xmlReader.setStripNamespaces(true); } } /** * Constructs a new Lucene IndexWriter for the given index directory * supplied with the proper analyzers for each field. The directory * must exist: if there is no index in the directory, a new one will be * created. If there is an existing directory, it will be locked for * writing until the writer is closed. * @param dir the directory where the index is stored * @return the IndexWriter * @throws IOException if there is a problem with the index */ public IndexWriter newIndexWriter(Directory dir) throws IOException { return new IndexWriter(dir, new IndexWriterConfig(LUCENE_VERSION, configuration.getFieldAnalyzers())); } /** * this is primarily for internal use * @return an XPathCompiler */ public XPathCompiler getXPathCompiler () { if (compiler == null) { compiler = getProcessor().newXPathCompiler(); for (Entry<String, String> nsmap : configuration.getNamespaceMap().entrySet()) { compiler.declareNamespace(nsmap.getKey(), nsmap.getValue()); } } return compiler; } public Processor getProcessor () { if (processor == null) { processor = new Processor(false); } return processor; } /** * this is primarily for internal use * @param xpath an xpath expression to evaluate * @return the result of evaluating the xpath expression with the last indexed as context * @throws SaxonApiException if there is an error during compilation or evaluation */ public XdmValue evaluateXPath(String xpath) throws SaxonApiException { XPathExecutable xpathExec = xpathCache.get(xpath); if (xpathExec == null) { xpathExec = getXPathCompiler().compile(xpath); xpathCache.put(xpath, xpathExec); } XPathSelector xps = xpathExec.load(); xps.setContextItem(getXdmNode()); return xps.evaluate(); } private void initDocBuilder () { try { if (isOption (COMPUTE_OFFSETS)) { saxonBuilder = new OffsetDocBuilder(getProcessor()); } else { saxonBuilder = new SaxonDocBuilder(getProcessor()); } xmlReader.addHandler(saxonBuilder); } catch (SaxonApiException e) { throw new LuxException (e); } } /** * Index the document read from the stream, caching field values to be written * to the Lucene index. * @param xml the document, as a byte-based InputStream * @param inputUri the uri to assign to the document * @throws XMLStreamException */ public void index (InputStream xml, String inputUri) throws XMLStreamException { reset(); this.uri = inputUri; xmlReader.read (xml); xdmNode = getBuilderNode(); } /** * Index the document read from the Reader, caching field values to be written * to the Lucene index. * @param xml the document, as a character-based Reader * @param inputUri the uri to assign to the document * @throws XMLStreamException */ public void index (Reader xml, String inputUri) throws XMLStreamException { reset(); this.uri = inputUri; xmlReader.read (xml); xdmNode = getBuilderNode(); } /** * Index the document read from the String, caching field values to be * written to the Lucene index. * @param doc the document (or element) as a Saxon NodeInfo * @param inputUri the uri to assign to the document * @throws XMLStreamException */ public void index (NodeInfo doc, String inputUri) throws XMLStreamException { reset(); this.uri = inputUri; // We'd like to use the input node directly and skip building a copy of it, // however the input may be an element, and not a document, and we need a document. xmlReader.read (doc); xdmNode = getBuilderNode(); } /** Clear out internal storage cached by #index when indexing a document */ public void reset() { xmlReader.reset(); uri = null; xdmNode = null; documentBytes = null; } /** * * @param option an option flag; one of: NAMESPACE_AWARE, STORE_XML, * STORE_PTREE, INDEX_QNAMES, INDEX_PATHS, INDEX_FULLTEXT * @return whether the option is set */ private boolean isOption (int option) { return configuration.isOption(option); } private Collection<FieldDefinition> getFields () { return configuration.getFields(); } /** * @return the uri cached from the last invocation of #index */ public String getURI() { return uri; } /** * @return the document cached from the last invocation of #index, as a Saxon XdmNode. * This will be null if the indexer options don't require the generation of an XdmNode. */ public XdmNode getXdmNode () { return xdmNode; } private XdmNode getBuilderNode () { if (saxonBuilder == null) { return null; } try { return saxonBuilder.getDocument(); } catch (SaxonApiException e) { throw new LuxException (e); } } /** * @return the document cached from the last invocation of #index, as a * String. This will be null if the indexer options don't require the * generation of a serialized document. The document is always re-serialized * after parsing. */ public String getDocumentText() { if (serializer != null) { return serializer.getDocument(); } return null; } /** * @return the document bytes; this will be non-null if {@link #storeDocument(IndexWriter, String, InputStream)} * was called. */ public byte[] getDocumentBytes() { return documentBytes; } /** * Index and write a document to the Lucene index. * @param indexWriter the Lucene IndexWriter for the index to write to * @param docUri the uri to assign to the document; any scheme will * be stripped: only the path is stored in the index * @param xml the text of an xml document to index * @throws XMLStreamException if there is an error parsing the document * @throws IOException if there is an error writing to the index */ public void indexDocument(final IndexWriter indexWriter, final String docUri, final String xml) throws XMLStreamException, IOException { reset(); String path = normalizeUri(docUri); index(new StringReader(xml), path); addLuceneDocument(indexWriter); } /** * Index and write a document to the Lucene index. * @param indexWriter the Lucene IndexWriter for the index to write to * @param docUri the uri to assign to the document; any scheme will * be stripped: only the path is stored in the index * @param xmlStream a stream from which the text of an xml document is to be read * @throws XMLStreamException if there is an error parsing the document * @throws IOException if there is an error writing to the index */ public void indexDocument(final IndexWriter indexWriter, final String docUri, final InputStream xmlStream) throws XMLStreamException, IOException { reset(); String path = normalizeUri(docUri); index(xmlStream, path); addLuceneDocument(indexWriter); } /** * Fully read the stream and store it as a document without attempting to parse or index it. Used for * binary and other non-XML text. * @param indexWriter the Lucene IndexWriter for the index to write to * @param docUri the uri to assign to the document; any scheme will be stripped: only the path is stored in the index * @param input the stream to read the document from * @throws IOException if there is an error writing to the index */ public void storeDocument(final IndexWriter indexWriter, final String docUri, final InputStream input) throws IOException { storeDocument (indexWriter, docUri, IOUtils.toByteArray(input)); } /** * Fully read the stream and store it as a document without attempting to parse or index it. Used for * binary and other non-XML text. * @param indexWriter the Lucene IndexWriter for the index to write to * @param docUri the uri to assign to the document; any scheme will be stripped: only the path is stored in the index * @param bytes the document bytes to store * @throws IOException if there is an error writing to the index */ public void storeDocument(final IndexWriter indexWriter, final String docUri, final byte[] bytes) throws IOException { reset(); String path = normalizeUri(docUri); uri = path; documentBytes = bytes; addLuceneDocument(indexWriter); } private static String normalizeUri(String uri) { String path = uri.replaceFirst("^\\w+:/+", "/"); // strip the scheme part (file:/, lux:/, etc), if any path = path.replace('\\', '/'); return path; } /** * Index and write a document to the Lucene index. * @param indexWriter the Lucene IndexWriter for the index to write to * @param path the uri to assign to the document * @param node an xml document to index, as a Saxon NodeInfo * @throws XMLStreamException if there is an error parsing the document * @throws IOException if there is an error writing to the index */ public void indexDocument(IndexWriter indexWriter, String path, NodeInfo node) throws XMLStreamException, IOException { reset(); index(node, path); addLuceneDocument(indexWriter); } /** * @return a Lucene {@link org.apache.lucene.document.Document} created * from the field values stored in this indexer. The document is ready * to be inserted into Lucene via {@link IndexWriter#addDocument}. */ public org.apache.lucene.document.Document createLuceneDocument () { org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document(); for (FieldDefinition field : getFields()) { for (IndexableField f : field.getFieldValues(this)) { doc.add(f); } } return doc; } private void addLuceneDocument(IndexWriter indexWriter) throws CorruptIndexException, IOException { indexWriter.deleteDocuments(new Term(configuration.getUriFieldName(), uri)); indexWriter.addDocument(createLuceneDocument()); } /** Primarily for internal use. * @return the {@link SaxonDocBuilder} used by the indexer to construct XdmNodes. */ public SaxonDocBuilder getSaxonDocBuilder () { return saxonBuilder; } /** Primarily for internal use. * @return the {@link XmlPathMapper} used by the indexer to gather node paths. */ public XmlPathMapper getPathMapper() { return pathMapper; } /** @return the index configuration */ public IndexConfiguration getConfiguration() { return configuration; } } /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this file, * You can obtain one at http://mozilla.org/MPL/2.0/. */