package lux.index;
import static lux.index.IndexConfiguration.*;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import javax.xml.stream.XMLStreamException;
import lux.Compiler;
import lux.exception.LuxException;
import lux.index.field.FieldDefinition;
import lux.xml.OffsetDocBuilder;
import lux.xml.SaxonDocBuilder;
import lux.xml.Serializer;
import lux.xml.XmlReader;
import net.sf.saxon.om.NodeInfo;
import net.sf.saxon.s9api.Processor;
import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.XPathCompiler;
import net.sf.saxon.s9api.XPathExecutable;
import net.sf.saxon.s9api.XPathSelector;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.s9api.XdmValue;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
/**
* Indexes XML documents. The constructor accepts a set of flags that
* define a set of fields known to XmlIndexer. The fields are represented
* by instances of XmlField. Instances of XmlField are immutable; they
* hold no data, merely serving as markers. Additional fields can also be
* added using addField(). A field may be associated with a StAXHandler;
* the indexer is responsible for feeding the handlers with StAX (XML)
* events. Some fields may share the same handler. The association
* between field and handler is implicit: the field calls an XmlIndexer
* getter to retrieve the handler.
*
* This class is not thread-safe
*
* This is all kind of a mess, and not readily extendable. If you want to
* add a new type of field (a new XmlField instance), you have to modify
* the indexer, which has knowledge of all the possible fields.
*
* Also, not every combination of indexing options will actually work. We
* need to consider which things one might actually want to turn on and
* off.
*
* We could make each field act as a StAXHandler factory? For efficiency
* though, some fields share the same handler instance. For now, we leave
* things as they are; we'll refactor as we add more fields.
*
* Indexing is triggered by a call to indexDocument(). read(InputStream)
* parses and gathers the values, which are retrieved by calling
* XmlField.getFieldValues(XmlIndexer) for each field.
*/
public class XmlIndexer {

    /**
     * Matches a leading URI scheme together with its slashes (file:/, lux:/, etc).
     * Compiled once because it is applied to the uri of every document indexed or stored.
     */
    private static final Pattern URI_SCHEME = Pattern.compile("^\\w+:/+");

    private final IndexConfiguration configuration;

    /** Compiled XPath expressions, keyed by their source text; see {@link #evaluateXPath(String)}. */
    private final Map<String, XPathExecutable> xpathCache = new HashMap<String, XPathExecutable>();

    private XmlReader xmlReader;
    private Processor processor;
    private XPathCompiler compiler;
    private SaxonDocBuilder saxonBuilder;
    private Serializer serializer;
    private XmlPathMapper pathMapper;

    // State cached from the most recent index/store call; cleared by reset().
    private String uri;
    private byte[] documentBytes;
    private XdmNode xdmNode;

    /**
     * Make a new instance with default options
     */
    public XmlIndexer () {
        this (new IndexConfiguration());
    }

    /**
     * Make a new instance with the given configuration. Options in the configuration control
     * how documents are indexed, and which kinds of indexed values will be available after indexing
     * a document.
     * @param config the index configuration to use
     */
    public XmlIndexer (IndexConfiguration config) {
        this.configuration = config;
        init();
    }

    /**
     * Make a new instance with the given options. Used mostly for testing.
     * @param options the index configuration options to use
     */
    public XmlIndexer (long options) {
        this (new IndexConfiguration(options));
    }

    /**
     * Make a new instance with the given options and Compiler. The runtime uses this to
     * index documents from its nodes directly, without serializing and parsing.
     * @param indexConfig the index configuration options to use
     * @param compiler the indexer will make XPath that is compatible with this compiler;
     * the indexer adopts this compiler's Processor
     */
    public XmlIndexer(IndexConfiguration indexConfig, Compiler compiler) {
        this.configuration = indexConfig;
        // The processor must be assigned before init() runs: init() may build a document
        // builder from getProcessor(), which would otherwise create a fresh Processor.
        // That ordering is why this constructor does not delegate to XmlIndexer(IndexConfiguration).
        // The XPathCompiler itself is created lazily, from this processor, by getXPathCompiler().
        this.processor = compiler.getProcessor();
        init ();
    }

    /**
     * initialize the indexer; an extension of the constructors. Creates subsidiary objects
     * required for indexing based on the index options, and registers them as handlers
     * with the XML reader.
     */
    protected void init () {
        xmlReader = new XmlReader();
        if (isOption (INDEX_QNAMES) || isOption (INDEX_PATHS)) {
            // accumulate XML paths and QNames for indexing
            if (isOption (INDEX_VALUES)) {
                pathMapper = new XPathValueMapper();
            } else {
                pathMapper = new XmlPathMapper();
            }
            pathMapper.setNamespaceAware(isOption(NAMESPACE_AWARE));
            xmlReader.addHandler (pathMapper);
        }
        if (isOption (INDEX_FULLTEXT)) {
            initDocBuilder();
        }
        if (isOption (STORE_DOCUMENT)) {
            if (! isOption(STORE_TINY_BINARY)) {
                serializer = new Serializer();
                xmlReader.addHandler(serializer);
            }
        }
        // the builder may already have been created for INDEX_FULLTEXT; don't register it twice
        if (isOption (BUILD_DOCUMENT) && saxonBuilder == null) {
            initDocBuilder();
        }
        if (isOption (STRIP_NAMESPACES)) {
            xmlReader.setStripNamespaces(true);
        }
    }

    /**
     * Constructs a new Lucene IndexWriter for the given index directory
     * supplied with the proper analyzers for each field. The directory
     * must exist: if there is no index in the directory, a new one will be
     * created. If there is an existing directory, it will be locked for
     * writing until the writer is closed.
     * @param dir the directory where the index is stored
     * @return the IndexWriter
     * @throws IOException if there is a problem with the index
     */
    public IndexWriter newIndexWriter(Directory dir) throws IOException {
        return new IndexWriter(dir, new IndexWriterConfig(LUCENE_VERSION, configuration.getFieldAnalyzers()));
    }

    /**
     * this is primarily for internal use. The compiler is created on first use from
     * {@link #getProcessor()}, with the namespace bindings of the index configuration declared.
     * @return an XPathCompiler
     */
    public XPathCompiler getXPathCompiler () {
        if (compiler == null) {
            compiler = getProcessor().newXPathCompiler();
            for (Entry<String, String> nsmap : configuration.getNamespaceMap().entrySet()) {
                compiler.declareNamespace(nsmap.getKey(), nsmap.getValue());
            }
        }
        return compiler;
    }

    /**
     * @return the Saxon Processor used by this indexer; one is created on first use
     * if none was supplied at construction time.
     */
    public Processor getProcessor () {
        if (processor == null) {
            processor = new Processor(false);
        }
        return processor;
    }

    /**
     * this is primarily for internal use. Compiled expressions are cached by their source text.
     * The context item is {@link #getXdmNode()}, which is null unless a document has been
     * indexed with options that cause an XdmNode to be built.
     * @param xpath an xpath expression to evaluate
     * @return the result of evaluating the xpath expression with the last indexed document as context
     * @throws SaxonApiException if there is an error during compilation or evaluation
     */
    public XdmValue evaluateXPath(String xpath) throws SaxonApiException {
        XPathExecutable xpathExec = xpathCache.get(xpath);
        if (xpathExec == null) {
            xpathExec = getXPathCompiler().compile(xpath);
            xpathCache.put(xpath, xpathExec);
        }
        XPathSelector xps = xpathExec.load();
        xps.setContextItem(getXdmNode());
        return xps.evaluate();
    }

    /**
     * Creates the Saxon document builder (offset-tracking if COMPUTE_OFFSETS is set)
     * and registers it as a handler with the XML reader.
     */
    private void initDocBuilder () {
        try {
            if (isOption (COMPUTE_OFFSETS)) {
                saxonBuilder = new OffsetDocBuilder(getProcessor());
            } else {
                saxonBuilder = new SaxonDocBuilder(getProcessor());
            }
            xmlReader.addHandler(saxonBuilder);
        } catch (SaxonApiException e) {
            throw new LuxException (e);
        }
    }

    /**
     * Index the document read from the stream, caching field values to be written
     * to the Lucene index.
     * @param xml the document, as a byte-based InputStream
     * @param inputUri the uri to assign to the document
     * @throws XMLStreamException if there is an error reading the document
     */
    public void index (InputStream xml, String inputUri) throws XMLStreamException {
        reset();
        this.uri = inputUri;
        xmlReader.read (xml);
        xdmNode = getBuilderNode();
    }

    /**
     * Index the document read from the Reader, caching field values to be written
     * to the Lucene index.
     * @param xml the document, as a character-based Reader
     * @param inputUri the uri to assign to the document
     * @throws XMLStreamException if there is an error reading the document
     */
    public void index (Reader xml, String inputUri) throws XMLStreamException {
        reset();
        this.uri = inputUri;
        xmlReader.read (xml);
        xdmNode = getBuilderNode();
    }

    /**
     * Index the document read from the Saxon node, caching field values to be
     * written to the Lucene index.
     * @param doc the document (or element) as a Saxon NodeInfo
     * @param inputUri the uri to assign to the document
     * @throws XMLStreamException if there is an error reading the document
     */
    public void index (NodeInfo doc, String inputUri) throws XMLStreamException {
        reset();
        this.uri = inputUri;
        // We'd like to use the input node directly and skip building a copy of it,
        // however the input may be an element, and not a document, and we need a document.
        xmlReader.read (doc);
        xdmNode = getBuilderNode();
    }

    /** Clear out internal storage cached by #index when indexing a document */
    public void reset() {
        xmlReader.reset();
        uri = null;
        xdmNode = null;
        documentBytes = null;
    }

    /**
     * @param option an option flag; one of the option constants of IndexConfiguration,
     * such as NAMESPACE_AWARE, INDEX_QNAMES, INDEX_PATHS or INDEX_FULLTEXT
     * @return whether the option is set
     */
    private boolean isOption (int option) {
        return configuration.isOption(option);
    }

    /** @return the field definitions of the index configuration */
    private Collection<FieldDefinition> getFields () {
        return configuration.getFields();
    }

    /**
     * @return the uri cached from the last invocation of #index
     */
    public String getURI() {
        return uri;
    }

    /**
     * @return the document cached from the last invocation of #index, as a Saxon XdmNode.
     * This will be null if the indexer options don't require the generation of an XdmNode.
     */
    public XdmNode getXdmNode () {
        return xdmNode;
    }

    /**
     * @return the document held by the Saxon builder, or null if no builder was configured.
     */
    private XdmNode getBuilderNode () {
        if (saxonBuilder == null) {
            return null;
        }
        try {
            return saxonBuilder.getDocument();
        } catch (SaxonApiException e) {
            throw new LuxException (e);
        }
    }

    /**
     * @return the document cached from the last invocation of #index, as a
     * String. This will be null if the indexer options don't require the
     * generation of a serialized document. The document is always re-serialized
     * after parsing.
     */
    public String getDocumentText() {
        if (serializer != null) {
            return serializer.getDocument();
        }
        return null;
    }

    /**
     * @return the document bytes; this will be non-null if {@link #storeDocument(IndexWriter, String, InputStream)}
     * was called, and no other document has been indexed or reset since.
     */
    public byte[] getDocumentBytes() {
        return documentBytes;
    }

    /**
     * Index and write a document to the Lucene index.
     * @param indexWriter the Lucene IndexWriter for the index to write to
     * @param docUri the uri to assign to the document; any scheme will
     * be stripped: only the path is stored in the index
     * @param xml the text of an xml document to index
     * @throws XMLStreamException if there is an error parsing the document
     * @throws IOException if there is an error writing to the index
     */
    public void indexDocument(final IndexWriter indexWriter, final String docUri, final String xml) throws XMLStreamException, IOException {
        reset();
        String path = normalizeUri(docUri);
        index(new StringReader(xml), path);
        addLuceneDocument(indexWriter);
    }

    /**
     * Index and write a document to the Lucene index.
     * @param indexWriter the Lucene IndexWriter for the index to write to
     * @param docUri the uri to assign to the document; any scheme will
     * be stripped: only the path is stored in the index
     * @param xmlStream a stream from which the text of an xml document is to be read
     * @throws XMLStreamException if there is an error parsing the document
     * @throws IOException if there is an error writing to the index
     */
    public void indexDocument(final IndexWriter indexWriter, final String docUri, final InputStream xmlStream) throws XMLStreamException, IOException {
        reset();
        String path = normalizeUri(docUri);
        index(xmlStream, path);
        addLuceneDocument(indexWriter);
    }

    /**
     * Fully read the stream and store it as a document without attempting to parse or index it. Used for
     * binary and other non-XML text.
     * @param indexWriter the Lucene IndexWriter for the index to write to
     * @param docUri the uri to assign to the document; any scheme will be stripped: only the path is stored in the index
     * @param input the stream to read the document from; it is read to its end but not closed here
     * @throws IOException if there is an error reading the stream or writing to the index
     */
    public void storeDocument(final IndexWriter indexWriter, final String docUri, final InputStream input) throws IOException {
        storeDocument (indexWriter, docUri, IOUtils.toByteArray(input));
    }

    /**
     * Store the bytes as a document without attempting to parse or index them. Used for
     * binary and other non-XML text.
     * @param indexWriter the Lucene IndexWriter for the index to write to
     * @param docUri the uri to assign to the document; any scheme will be stripped: only the path is stored in the index
     * @param bytes the document bytes to store
     * @throws IOException if there is an error writing to the index
     */
    public void storeDocument(final IndexWriter indexWriter, final String docUri, final byte[] bytes) throws IOException {
        reset();
        String path = normalizeUri(docUri);
        uri = path;
        documentBytes = bytes;
        addLuceneDocument(indexWriter);
    }

    /**
     * Reduces a uri to the path form stored in the index: a leading scheme and its
     * slashes are replaced by a single slash, and backslashes become forward slashes.
     * @param uri the uri to normalize
     * @return the normalized path
     */
    private static String normalizeUri(String uri) {
        // strip the scheme part (file:/, lux:/, etc), if any
        String path = URI_SCHEME.matcher(uri).replaceFirst("/");
        return path.replace('\\', '/');
    }

    /**
     * Index and write a document to the Lucene index. Unlike the String and InputStream
     * variants, the path is stored as given: no scheme stripping is applied.
     * @param indexWriter the Lucene IndexWriter for the index to write to
     * @param path the uri to assign to the document
     * @param node an xml document to index, as a Saxon NodeInfo
     * @throws XMLStreamException if there is an error parsing the document
     * @throws IOException if there is an error writing to the index
     */
    public void indexDocument(IndexWriter indexWriter, String path, NodeInfo node) throws XMLStreamException, IOException {
        reset();
        index(node, path);
        addLuceneDocument(indexWriter);
    }

    /**
     * @return a Lucene {@link org.apache.lucene.document.Document} created
     * from the field values stored in this indexer. The document is ready
     * to be inserted into Lucene via {@link IndexWriter#addDocument}.
     */
    public org.apache.lucene.document.Document createLuceneDocument () {
        org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
        for (FieldDefinition field : getFields()) {
            for (IndexableField f : field.getFieldValues(this)) {
                doc.add(f);
            }
        }
        return doc;
    }

    /**
     * Writes the currently cached document to the index, replacing any document
     * already stored under the same uri.
     * @param indexWriter the Lucene IndexWriter for the index to write to
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException if there is an error writing to the index
     */
    private void addLuceneDocument(IndexWriter indexWriter) throws CorruptIndexException, IOException {
        // updateDocument deletes and adds atomically, so a failure while building or adding
        // the new document cannot leave the index with the old document already removed.
        indexWriter.updateDocument(new Term(configuration.getUriFieldName(), uri), createLuceneDocument());
    }

    /** Primarily for internal use.
     * @return the {@link SaxonDocBuilder} used by the indexer to construct XdmNodes;
     * null if the index options do not call for one.
     */
    public SaxonDocBuilder getSaxonDocBuilder () {
        return saxonBuilder;
    }

    /** Primarily for internal use.
     * @return the {@link XmlPathMapper} used by the indexer to gather node paths;
     * null if neither QName nor path indexing is enabled.
     */
    public XmlPathMapper getPathMapper() {
        return pathMapper;
    }

    /** @return the index configuration */
    public IndexConfiguration getConfiguration() {
        return configuration;
    }
}
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */