package org.exist.indexing.lucene;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.Stack;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.OpenBitSet;
import org.exist.collections.Collection;
import org.exist.dom.AttrImpl;
import org.exist.dom.CharacterDataImpl;
import org.exist.dom.DocumentImpl;
import org.exist.dom.DocumentSet;
import org.exist.dom.ElementImpl;
import org.exist.dom.Match;
import org.exist.dom.NewArrayNodeSet;
import org.exist.dom.NodeProxy;
import org.exist.dom.NodeSet;
import org.exist.dom.QName;
import org.exist.dom.StoredNode;
import org.exist.dom.SymbolTable;
import org.exist.indexing.AbstractStreamListener;
import org.exist.indexing.IndexController;
import org.exist.indexing.IndexWorker;
import org.exist.indexing.MatchListener;
import org.exist.indexing.OrderedValuesIndex;
import org.exist.indexing.QNamedKeysIndex;
import org.exist.indexing.StreamListener;
import org.exist.numbering.NodeId;
import org.exist.storage.DBBroker;
import org.exist.storage.ElementValue;
import org.exist.storage.IndexSpec;
import org.exist.storage.NodePath;
import org.exist.storage.txn.Txn;
import org.exist.util.ByteConversion;
import org.exist.util.DatabaseConfigurationException;
import org.exist.util.Occurrences;
import org.exist.xquery.Expression;
import org.exist.xquery.XPathException;
import org.exist.xquery.XQueryContext;
import org.exist.xquery.value.IntegerValue;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
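/**
* Index worker for the Lucene-based full text index. The worker maintains one
* Lucene document per indexed XML node and maps query hits back to the
* corresponding nodes stored in eXist.
*/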
public class LuceneIndexWorker implements OrderedValuesIndex, QNamedKeysIndex {
public static final String OPTION_DEFAULT_OPERATOR = "default-operator";
public static final String OPTION_PHRASE_SLOP = "phrase-slop";
public static final String OPTION_LEADING_WILDCARD = "leading-wildcard";
public static final String OPTION_FILTER_REWRITE = "filter-rewrite";
public static final String DEFAULT_OPERATOR_OR = "or";
private static final Logger LOG = Logger.getLogger(LuceneIndexWorker.class);
private static final FieldSelector NODE_FIELD_SELECTOR = new NodeFieldSelector();
private LuceneIndex index;
@SuppressWarnings("unused")
private IndexController controller;
private LuceneMatchListener matchListener = null;
private XMLToQuery queryTranslator;
private DBBroker broker;
private DocumentImpl currentDoc = null;
private int mode = 0;
private LuceneConfig config;
private Stack<TextExtractor> contentStack = null;
private Set<NodeId> nodesToRemove = null;
private List<PendingDoc> nodesToWrite = null;
private int cachedNodesSize = 0;
private int maxCachedNodesSize = 4096 * 1024;
private Analyzer analyzer;
public static final String FIELD_NODE_ID = "nodeId";
public static final String FIELD_DOC_ID = "docId";
public LuceneIndexWorker(LuceneIndex parent, DBBroker broker) {
this.index = parent;
this.broker = broker;
this.queryTranslator = new XMLToQuery(index);
}
public String getIndexId() {
return LuceneIndex.ID;
}
public String getIndexName() {
return index.getIndexName();
}
public Object configure(IndexController controller, NodeList configNodes, Map namespaces) throws DatabaseConfigurationException {
this.controller = controller;
LOG.debug("Configuring lucene index...");
config = new LuceneConfig(configNodes, namespaces);
return config;
}
public void flush() {
switch (mode) {
case StreamListener.STORE:
write();
break;
case StreamListener.REMOVE_ALL_NODES:
removeDocument(currentDoc.getDocId());
break;
case StreamListener.REMOVE_SOME_NODES:
removeNodes();
break;
}
}
public void setDocument(DocumentImpl document) {
setDocument(document, StreamListener.UNKNOWN);
}
public void setDocument(DocumentImpl document, int newMode) {
currentDoc = document;
//config = null;
contentStack = null;
IndexSpec indexConf = document.getCollection().getIndexConfiguration(broker);
if (indexConf != null) {
config = (LuceneConfig) indexConf.getCustomIndexSpec(LuceneIndex.ID);
if (config != null)
// Create a copy of the original LuceneConfig (there's only one per db instance),
// so we can safely work with it.
config = new LuceneConfig(config);
}
mode = newMode;
}
public void setMode(int mode) {
this.mode = mode;
switch (mode) {
case StreamListener.STORE:
if (nodesToWrite == null)
nodesToWrite = new ArrayList<PendingDoc>();
else
nodesToWrite.clear();
cachedNodesSize = 0;
break;
case StreamListener.REMOVE_SOME_NODES:
nodesToRemove = new TreeSet<NodeId>();
break;
}
}
public DocumentImpl getDocument() {
return currentDoc;
}
public int getMode() {
return this.mode;
}
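/**
* Determine the topmost node (the given node or one of its ancestors) which has to be
* reindexed according to the index configuration when the given node changes, or null
* if no reindexing is required.
*/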
public StoredNode getReindexRoot(StoredNode node, NodePath path, boolean includeSelf) {
if (node.getNodeType() == Node.ATTRIBUTE_NODE)
return null;
if (config == null)
return null;
NodePath p = new NodePath(path);
boolean reindexRequired = false;
if (node.getNodeType() == Node.ELEMENT_NODE && !includeSelf)
p.removeLastComponent();
for (int i = 0; i < p.length(); i++) {
if (config.matches(p)) {
reindexRequired = true;
break;
}
p.removeLastComponent();
}
if (reindexRequired) {
p = new NodePath(path);
StoredNode topMost = null;
StoredNode currentNode = node;
if (currentNode.getNodeType() != Node.ELEMENT_NODE)
currentNode = currentNode.getParentStoredNode();
while (currentNode != null) {
if (config.matches(p))
topMost = currentNode;
currentNode = currentNode.getParentStoredNode();
p.removeLastComponent();
}
return topMost;
}
return null;
}
private StreamListener listener = new LuceneStreamListener();
public StreamListener getListener() {
return listener;
}
public MatchListener getMatchListener(DBBroker broker, NodeProxy proxy) {
boolean needToFilter = false;
Match nextMatch = proxy.getMatches();
while (nextMatch != null) {
if (nextMatch.getIndexId() == LuceneIndex.ID) {
needToFilter = true;
break;
}
nextMatch = nextMatch.getNextMatch();
}
if (!needToFilter)
return null;
if (matchListener == null)
matchListener = new LuceneMatchListener(index, broker, proxy);
else
matchListener.reset(broker, proxy);
return matchListener;
}
protected void removeDocument(int docId) {
IndexReader reader = null;
try {
reader = index.getWritingReader();
Term dt = new Term(FIELD_DOC_ID, NumericUtils.intToPrefixCoded(docId));
reader.deleteDocuments(dt);
reader.flush();
} catch (IOException e) {
LOG.warn("Error while removing lucene index: " + e.getMessage(), e);
} finally {
index.releaseWritingReader(reader);
mode = StreamListener.STORE;
}
}
public void removeCollection(Collection collection, DBBroker broker) {
if (LOG.isDebugEnabled())
LOG.debug("Removing collection " + collection.getURI());
IndexReader reader = null;
try {
reader = index.getWritingReader();
for (Iterator<DocumentImpl> i = collection.iterator(broker); i.hasNext(); ) {
DocumentImpl doc = i.next();
Term dt = new Term(FIELD_DOC_ID, NumericUtils.intToPrefixCoded(doc.getDocId()));
TermDocs td = reader.termDocs(dt);
while (td.next()) {
reader.deleteDocument(td.doc());
}
}
reader.flush();
} catch (IOException e) {
LOG.warn("Error while removing lucene index: " + e.getMessage(), e);
} finally {
index.releaseWritingReader(reader);
mode = StreamListener.STORE;
}
if (LOG.isDebugEnabled())
LOG.debug("Collection removed.");
}
/**
* Remove specific nodes from the index. This method is used for node updates
* and called from flush() if the worker is in {@link StreamListener#REMOVE_SOME_NODES}
* mode.
*/
protected void removeNodes() {
if (nodesToRemove == null)
return;
IndexReader reader = null;
try {
reader = index.getWritingReader();
Term dt = new Term(FIELD_DOC_ID, NumericUtils.intToPrefixCoded(currentDoc.getDocId()));
TermDocs docsEnum = reader.termDocs(dt);
while (docsEnum.next()) {
Document doc = reader.document(docsEnum.doc());
NodeId nodeId = readNodeId(doc);
if (nodesToRemove.contains(nodeId)) {
reader.deleteDocument(docsEnum.doc());
}
}
nodesToRemove = null;
reader.flush();
} catch (IOException e) {
LOG.warn("Error while deleting lucene index entries: " + e.getMessage(), e);
} finally {
index.releaseWritingReader(reader);
}
}
/**
* Query the index. Returns a node set containing all matching nodes. Each node
* in the node set has a {@link org.exist.indexing.lucene.LuceneIndexWorker.LuceneMatch}
* element attached, which stores the score and a link to the query which generated it.
*
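* A minimal usage sketch (variable names are hypothetical):
* <pre>
* // query all configured qnames for "hello" and return the matching ancestors from contextSet
* NodeSet hits = worker.query(context, Expression.NO_CONTEXT_ID, docs, contextSet,
*         null, "hello", NodeSet.ANCESTOR, new Properties());
* </pre>
*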
* @param context current XQuery context
* @param contextId current context id, used to track the position inside nested XPath predicates
* @param docs query will be restricted to documents in this set
* @param contextSet if specified, returned nodes will be descendants of the nodes in this set
* @param qnames query will be restricted to nodes with the qualified names given here
* @param queryStr a lucene query string
* @param axis which node is returned: the node in which a match was found or the corresponding ancestor
* from the contextSet
* @return node set containing all matching nodes
*
* @throws IOException
* @throws ParseException
*/
public NodeSet query(XQueryContext context, int contextId, DocumentSet docs, NodeSet contextSet,
List<QName> qnames, String queryStr, int axis, Properties options)
throws IOException, ParseException {
qnames = getDefinedIndexes(qnames);
NodeSet resultSet = new NewArrayNodeSet();
boolean returnAncestor = axis == NodeSet.ANCESTOR;
IndexSearcher searcher = null;
try {
searcher = index.getSearcher();
for (QName qname : qnames) {
String field = encodeQName(qname);
Analyzer analyzer = getAnalyzer(null, qname, context.getBroker(), docs);
QueryParser parser = new QueryParser(field, analyzer);
setOptions(options, parser);
Query query = parser.parse(queryStr);
searchAndProcess(contextId, qname, docs, contextSet, resultSet,
returnAncestor, searcher, query);
}
} finally {
index.releaseSearcher(searcher);
}
return resultSet;
}
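/**
* Apply optional query parser settings. Recognized keys are
* {@link #OPTION_DEFAULT_OPERATOR}, {@link #OPTION_PHRASE_SLOP},
* {@link #OPTION_LEADING_WILDCARD} and {@link #OPTION_FILTER_REWRITE}.
* For example (a hypothetical caller):
* <pre>
* Properties options = new Properties();
* options.setProperty(LuceneIndexWorker.OPTION_DEFAULT_OPERATOR, "or");
* options.setProperty(LuceneIndexWorker.OPTION_PHRASE_SLOP, "2");
* </pre>
*/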
private void setOptions(Properties options, QueryParser parser) throws ParseException {
if (options == null)
return;
String option = options.getProperty(OPTION_DEFAULT_OPERATOR);
if (option != null) {
if (DEFAULT_OPERATOR_OR.equals(option))
parser.setDefaultOperator(QueryParser.OR_OPERATOR);
else
parser.setDefaultOperator(QueryParser.AND_OPERATOR);
}
option = options.getProperty(OPTION_LEADING_WILDCARD);
if (option != null)
parser.setAllowLeadingWildcard(option.equalsIgnoreCase("yes"));
option = options.getProperty(OPTION_PHRASE_SLOP);
if (option != null) {
try {
int slop = Integer.parseInt(option);
parser.setPhraseSlop(slop);
} catch (NumberFormatException e) {
throw new ParseException("value for option " + OPTION_PHRASE_SLOP + " needs to be a number");
}
}
option = options.getProperty(OPTION_FILTER_REWRITE);
if (option != null) {
if (option.equalsIgnoreCase("yes"))
parser.setMultiTermRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
else
parser.setMultiTermRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
}
}
/**
* Query the index. Returns a node set containing all matching nodes. Each node
* in the node set has a {@link org.exist.indexing.lucene.LuceneIndexWorker.LuceneMatch}
* element attached, which stores the score and a link to the query which generated it.
*
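* A hypothetical query element (see {@link XMLToQuery} for the accepted elements):
* <pre>
* &lt;query&gt;&lt;term&gt;hello&lt;/term&gt;&lt;/query&gt;
* </pre>
*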
* @param context current XQuery context
* @param contextId current context id, used to track the position inside nested XPath predicates
* @param docs query will be restricted to documents in this set
* @param contextSet if specified, returned nodes will be descendants of the nodes in this set
* @param qnames query will be restricted to nodes with the qualified names given here
* @param queryRoot an XML representation of the query, see {@link XMLToQuery}.
* @param axis which node is returned: the node in which a match was found or the corresponding ancestor
* from the contextSet
* @return node set containing all matching nodes
*
* @throws IOException
* @throws ParseException
*/
public NodeSet query(XQueryContext context, int contextId, DocumentSet docs, NodeSet contextSet,
List<QName> qnames, Element queryRoot, int axis, Properties options)
throws IOException, ParseException, XPathException {
qnames = getDefinedIndexes(qnames);
NodeSet resultSet = new NewArrayNodeSet();
boolean returnAncestor = axis == NodeSet.ANCESTOR;
IndexSearcher searcher = null;
try {
searcher = index.getSearcher();
for (QName qname : qnames) {
String field = encodeQName(qname);
analyzer = getAnalyzer(null, qname, context.getBroker(), docs);
Query query = queryTranslator.parse(field, queryRoot, analyzer, options);
if (query != null) {
searchAndProcess(contextId, qname, docs, contextSet, resultSet,
returnAncestor, searcher, query);
}
}
} finally {
index.releaseSearcher(searcher);
}
return resultSet;
}
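/**
* Query a field defined in the index configuration, addressed by its name rather than
* by the qname of a node, using an XML query description (see {@link XMLToQuery}).
*/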
public NodeSet queryField(XQueryContext context, int contextId, DocumentSet docs, NodeSet contextSet,
String field, Element queryRoot, int axis, Properties options)
throws IOException, XPathException {
NodeSet resultSet = new NewArrayNodeSet();
boolean returnAncestor = axis == NodeSet.ANCESTOR;
IndexSearcher searcher = null;
try {
searcher = index.getSearcher();
analyzer = getAnalyzer(field, null, context.getBroker(), docs);
Query query = queryTranslator.parse(field, queryRoot, analyzer, options);
if (query != null) {
searchAndProcess(contextId, null, docs, contextSet, resultSet,
returnAncestor, searcher, query);
}
} finally {
index.releaseSearcher(searcher);
}
return resultSet;
}
private void searchAndProcess(int contextId, QName qname, DocumentSet docs,
NodeSet contextSet, NodeSet resultSet, boolean returnAncestor,
IndexSearcher searcher, Query query) throws IOException {
LuceneHitCollector collector = new LuceneHitCollector();
searcher.search(query, collector);
processHits(collector.getDocs(), searcher, contextId, qname, docs, contextSet, resultSet, returnAncestor, query);
}
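/**
* Query a field defined in the index configuration, using a Lucene query string.
*/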
public NodeSet queryField(XQueryContext context, int contextId, DocumentSet docs, NodeSet contextSet,
String field, String queryString, int axis, Properties options)
throws IOException, ParseException {
NodeSet resultSet = new NewArrayNodeSet();
boolean returnAncestor = axis == NodeSet.ANCESTOR;
IndexSearcher searcher = null;
try {
searcher = index.getSearcher();
Analyzer analyzer = getAnalyzer(field, null, context.getBroker(), docs);
LOG.debug("Using analyzer " + analyzer + " for " + queryString);
QueryParser parser = new QueryParser(field, analyzer);
setOptions(options, parser);
Query query = parser.parse(queryString);
searchAndProcess(contextId, null, docs, contextSet, resultSet,
returnAncestor, searcher, query);
} finally {
index.releaseSearcher(searcher);
}
return resultSet;
}
/**
* Process the query results collected from the Lucene index and
* map them to the corresponding XML nodes in eXist.
*/
private void processHits(List<ScoreDoc> hits, IndexSearcher searcher, int contextId, QName qname, DocumentSet docs, NodeSet contextSet,
NodeSet resultSet, boolean returnAncestor, Query query) {
for (ScoreDoc scoreDoc : hits) {
try {
Document doc = searcher.doc(scoreDoc.doc, NODE_FIELD_SELECTOR);
String fDocId = doc.get(FIELD_DOC_ID);
int docId = Integer.parseInt(fDocId);
DocumentImpl storedDocument = docs.getDoc(docId);
if (storedDocument == null)
continue;
NodeId nodeId = readNodeId(doc);
NodeProxy storedNode = new NodeProxy(storedDocument, nodeId);
if (qname != null)
storedNode.setNodeType(qname.getNameType() == ElementValue.ATTRIBUTE ? Node.ATTRIBUTE_NODE : Node.ELEMENT_NODE);
// if a context set is specified, we can directly check if the
// matching node is a descendant of one of the nodes
// in the context set.
if (contextSet != null) {
int sizeHint = contextSet.getSizeHint(storedDocument);
if (returnAncestor) {
NodeProxy parentNode = contextSet.get(storedNode);
// NodeProxy parentNode = contextSet.parentWithChild(storedNode, false, true, NodeProxy.UNKNOWN_NODE_LEVEL);
if (parentNode != null) {
LuceneMatch match = new LuceneMatch(contextId, nodeId, query);
match.setScore(scoreDoc.score);
parentNode.addMatch(match);
resultSet.add(parentNode, sizeHint);
if (Expression.NO_CONTEXT_ID != contextId) {
parentNode.deepCopyContext(storedNode, contextId);
} else
parentNode.copyContext(storedNode);
}
} else {
LuceneMatch match = new LuceneMatch(contextId, nodeId, query);
match.setScore(scoreDoc.score);
storedNode.addMatch(match);
resultSet.add(storedNode, sizeHint);
}
} else {
LuceneMatch match = new LuceneMatch(contextId, nodeId, query);
match.setScore(scoreDoc.score);
storedNode.addMatch(match);
resultSet.add(storedNode);
}
} catch (IOException e) {
LOG.warn("Error while retrieving lucene query results: " + e.getMessage(), e);
}
}
}
private static class LuceneHitCollector extends Collector {
private List<ScoreDoc> docs = new ArrayList<ScoreDoc>();
private int docBase;
private Scorer scorer;
private LuceneHitCollector() {
//Nothing special to do
}
public List<ScoreDoc> getDocs() {
Collections.sort(docs, new Comparator<ScoreDoc>() {
public int compare(ScoreDoc scoreDoc, ScoreDoc scoreDoc1) {
if (scoreDoc.doc == scoreDoc1.doc)
return 0;
else if (scoreDoc.doc < scoreDoc1.doc)
return -1;
return 1;
}
});
return docs;
}
@Override
public void setScorer(Scorer scorer) throws IOException {
this.scorer = scorer;
}
@Override
public void setNextReader(IndexReader indexReader, int docBase) throws IOException {
this.docBase = docBase;
}
@Override
public boolean acceptsDocsOutOfOrder() {
return false;
}
@Override
public void collect(int doc) {
try {
float score = scorer.score();
docs.add(new ScoreDoc(doc + docBase, score));
} catch (IOException e) {
LOG.warn("Error while collecting lucene query results: " + e.getMessage(), e);
}
}
}
private NodeId readNodeId(Document doc) {
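// the stored binary value starts with a two-byte unit count, followed by the serialized node id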
byte[] temp = doc.getBinaryValue(FIELD_NODE_ID);
int units = ByteConversion.byteToShort(temp, 0);
return index.getBrokerPool().getNodeFactory()
.createFromData(units, temp, 2);
}
/**
* Return the list of qualified names on which indexes are defined. QNames in the
* given list which lack a local name or namespace are expanded against the field
* names actually present in the Lucene index; if the list is null or empty, all
* indexed qnames are returned.
*
* @return List of QName objects on which indexes are defined
*/
private List<QName> getDefinedIndexes(List<QName> qnames) {
List<QName> indexes = new ArrayList<QName>(20);
if (qnames != null && !qnames.isEmpty()) {
for (QName qname : qnames) {
if (qname.getLocalName() == null || qname.getNamespaceURI() == null)
getDefinedIndexesFor(qname, indexes);
else
indexes.add(qname);
}
return indexes;
}
return getDefinedIndexesFor(null, indexes);
}
private List<QName> getDefinedIndexesFor(QName qname, List<QName> indexes) {
IndexReader reader = null;
try {
reader = index.getReader();
java.util.Collection<String> fields = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
for (String field: fields) {
if (!FIELD_DOC_ID.equals(field)) {
QName name = decodeQName(field);
if (name != null && (qname == null || matchQName(qname, name)))
indexes.add(name);
}
}
} catch (IOException e) {
LOG.warn("Error while reading field names from the lucene index: " + e.getMessage(), e);
} finally {
index.releaseReader(reader);
}
return indexes;
}
private static boolean matchQName(QName qname, QName candidate) {
boolean match = true;
if (qname.getLocalName() != null)
match = qname.getLocalName().equals(candidate.getLocalName());
if (match && qname.getNamespaceURI() != null && qname.getNamespaceURI().length() > 0)
match = qname.getNamespaceURI().equals(candidate.getNamespaceURI());
return match;
}
/**
* Return the analyzer to be used for the given field or qname. Either field
* or qname should be specified.
*/
private Analyzer getAnalyzer(String field, QName qname, DBBroker broker, DocumentSet docs) {
for (Iterator<Collection> i = docs.getCollectionIterator(); i.hasNext(); ) {
Collection collection = i.next();
IndexSpec idxConf = collection.getIndexConfiguration(broker);
if (idxConf != null) {
LuceneConfig config = (LuceneConfig) idxConf.getCustomIndexSpec(LuceneIndex.ID);
if (config != null) {
Analyzer analyzer;
if (field == null)
analyzer = config.getAnalyzer(qname);
else
analyzer = config.getAnalyzer(field);
if (analyzer != null)
return analyzer;
}
}
}
return index.getDefaultAnalyzer();
}
public boolean checkIndex(DBBroker broker) {
return false; // index consistency checking is not implemented for this worker
}
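/**
* Collect term occurrences from the index. The hints map may restrict the scan:
* QNAMES_KEY limits the scan to a list of qnames, START_VALUE and END_VALUE define a
* start prefix and an upper bound for the terms, and VALUE_COUNT caps the number of
* distinct terms returned.
*/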
public Occurrences[] scanIndex(XQueryContext context, DocumentSet docs, NodeSet nodes, Map hints) {
List<QName> qnames = hints == null ? null : (List<QName>)hints.get(QNAMES_KEY);
qnames = getDefinedIndexes(qnames);
//Expects a StringValue
String start = null, end = null;
long max = Long.MAX_VALUE;
if (hints != null) {
Object vstart = hints.get(START_VALUE);
Object vend = hints.get(END_VALUE);
start = vstart == null ? null : vstart.toString();
end = vend == null ? null : vend.toString();
IntegerValue vmax = (IntegerValue) hints.get(VALUE_COUNT);
max = vmax == null ? Long.MAX_VALUE : vmax.getValue();
}
if (nodes == null || max < Long.MAX_VALUE)
return scanIndexByQName(qnames, docs, nodes, start, end, max);
return scanIndexByNodes(qnames, docs, nodes, start, end, max);
}
private Occurrences[] scanIndexByQName(List<QName> qnames, DocumentSet docs, NodeSet nodes, String start, String end, long max) {
TreeMap<String, Occurrences> map = new TreeMap<String, Occurrences>();
IndexReader reader = null;
try {
reader = index.getReader();
for (QName qname : qnames) {
String field = encodeQName(qname);
TermEnum terms;
if (start == null)
terms = reader.terms(new Term(field, ""));
else
terms = reader.terms(new Term(field, start));
if (terms == null)
continue;
Term term;
TermDocs termDocs = reader.termDocs();
do {
term = terms.term();
if (term != null && term.field().equals(field)) {
boolean include = true;
if (end != null) {
if (term.text().compareTo(end) > 0)
include = false;
} else if (start != null && !term.text().startsWith(start))
include = false;
if (include) {
termDocs.seek(term);
while (termDocs.next()) {
if (reader.isDeleted(termDocs.doc()))
continue;
Document doc = reader.document(termDocs.doc());
String fDocId = doc.get(FIELD_DOC_ID);
int docId = Integer.parseInt(fDocId);
DocumentImpl storedDocument = docs.getDoc(docId);
if (storedDocument == null)
continue;
NodeId nodeId = null;
if (nodes != null) {
// load document to check if the current node is in the passed context set, if any
nodeId = readNodeId(doc);
}
if (nodeId == null || nodes.get(storedDocument, nodeId) != null) {
Occurrences oc = map.get(term.text());
if (oc == null) {
oc = new Occurrences(term.text());
map.put(term.text(), oc);
}
oc.addDocument(storedDocument);
oc.addOccurrences(termDocs.freq());
}
}
}
}
if (map.size() >= max)
break;
} while (terms.next());
termDocs.close();
terms.close();
}
} catch (IOException e) {
LOG.warn("Error while scanning lucene index entries: " + e.getMessage(), e);
} finally {
index.releaseReader(reader);
}
Occurrences[] occur = new Occurrences[map.size()];
return map.values().toArray(occur);
}
private Occurrences[] scanIndexByNodes(List<QName> qnames, DocumentSet docs, NodeSet nodes, String start, String end, long max) {
TreeMap<String, Occurrences> map = new TreeMap<String, Occurrences>();
FieldSelector selector = new FieldSelector() {
private static final long serialVersionUID = 3270211696620175721L;
public FieldSelectorResult accept(String field) {
if (field.equals(FIELD_NODE_ID))
return FieldSelectorResult.LOAD_AND_BREAK;
return FieldSelectorResult.NO_LOAD;
}
};
IndexSearcher searcher = null;
try {
searcher = index.getSearcher();
IndexReader reader = searcher.getIndexReader();
for (Iterator<DocumentImpl> i = docs.getDocumentIterator(); i.hasNext(); ) {
DocumentImpl doc = i.next();
Query query = new TermQuery(new Term(FIELD_DOC_ID, NumericUtils.intToPrefixCoded(doc.getDocId())));
DocumentCollector collector = new DocumentCollector(searcher.maxDoc());
searcher.search(query, collector);
DocIdSetIterator iter = collector.docs.iterator();
int next;
while ((next = iter.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
NodeId nodeId = null;
if (nodes != null) {
// load document to check if the current node is in the passed context set, if any
Document luceneDoc = searcher.doc(next, selector);
nodeId = readNodeId(luceneDoc);
}
if (nodeId == null || nodes.get(doc, nodeId) != null) {
for (QName qname : qnames) {
String field = encodeQName(qname);
TermFreqVector tfv = reader.getTermFreqVector(next, field);
if (tfv != null) {
String[] terms = tfv.getTerms();
int[] freq = tfv.getTermFrequencies();
for (int j = 0; j < terms.length; j++) {
boolean include = true;
if (end != null) {
if (terms[j].compareTo(end) > 0)
include = false;
} else if (start != null && !terms[j].startsWith(start))
include = false;
if (include) {
Occurrences oc = map.get(terms[j]);
if (oc == null) {
oc = new Occurrences(terms[j]);
map.put(terms[j], oc);
}
oc.addDocument(doc);
oc.addOccurrences(freq[j]);
}
}
}
}
}
}
}
} catch (IOException e) {
LOG.warn("Error while scanning lucene index entries: " + e.getMessage(), e);
} finally {
index.releaseSearcher(searcher);
}
return occurrencesToArray(map);
}
private Occurrences[] occurrencesToArray(TreeMap<String, Occurrences> map) {
Occurrences[] occur = new Occurrences[map.size()];
return map.values().toArray(occur);
}
private static class DocumentCollector extends Collector {
OpenBitSet docs;
int base = 0;
private DocumentCollector(int size) {
docs = new OpenBitSet(size);
}
@Override
public void setScorer(Scorer scorer) throws IOException {
// not needed: this collector records matching document ids only, not scores
}
@Override
public void collect(int doc) throws IOException {
docs.set(base + doc);
}
@Override
public void setNextReader(IndexReader indexReader, int base) throws IOException {
this.base = base;
}
@Override
public boolean acceptsDocsOutOfOrder() {
return true;
}
}
/**
* Queue the passed character sequence for indexing. Queued nodes are written to the
* Lucene index by {@link #write()}, either when the cache size limit is exceeded or
* when the worker is flushed. We create one Lucene document per XML node, using two
* fields to identify the node:
*
* <ul>
* <li>docId: the eXist-internal document id, stored as a numeric field.</li>
* <li>nodeId: the id of the node, stored in binary compressed form.</li>
* </ul>
*
* The text is indexed into a field whose name is either taken from the index
* configuration or encodes the qualified name of the node. An encoded qualified name
* is a hex sequence pointing into the global symbol table.
*
* @param nodeId the id of the node to index
* @param qname the qualified name of the node
* @param path the node path
* @param config the index configuration which matched this node
* @param content the text content to index
*/
protected void indexText(NodeId nodeId, QName qname, NodePath path, LuceneIndexConfig config, CharSequence content) {
PendingDoc pending = new PendingDoc(nodeId, qname, path, content, config);
nodesToWrite.add(pending);
cachedNodesSize += content.length();
if (cachedNodesSize > maxCachedNodesSize)
write();
}
private class PendingDoc {
NodeId nodeId;
CharSequence text;
QName qname;
LuceneIndexConfig idxConf;
private PendingDoc(NodeId nodeId, QName qname, NodePath path, CharSequence text, LuceneIndexConfig idxConf) {
this.nodeId = nodeId;
this.qname = qname;
this.text = text;
this.idxConf = idxConf;
}
}
private void write() {
if (nodesToWrite == null || nodesToWrite.size() == 0)
return;
IndexWriter writer = null;
try {
writer = index.getWriter();
// by default, Lucene only indexes the first 10,000 terms in a field
writer.setMaxFieldLength(Integer.MAX_VALUE);
NumericField fDocId = new NumericField(FIELD_DOC_ID, Field.Store.YES, true);
Field fNodeId = new Field(FIELD_NODE_ID, new byte [] { 0 }, Field.Store.YES);
for (PendingDoc pending : nodesToWrite) {
Document doc = new Document();
if (pending.idxConf.getBoost() > 0)
doc.setBoost(pending.idxConf.getBoost());
else if (config.getBoost() > 0)
doc.setBoost(config.getBoost());
// store the node id
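// data[0..1]: number of units in the node id; data[2..]: the serialized node id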
int nodeIdLen = pending.nodeId.size();
byte[] data = new byte[nodeIdLen + 2];
ByteConversion.shortToByte((short) pending.nodeId.units(), data, 0);
pending.nodeId.serialize(data, 2);
String contentField;
// the text content is indexed in a field using either
// the qname of the element or attribute or the field
// name defined in the configuration
if (pending.idxConf.isNamed())
contentField = pending.idxConf.getName();
else
contentField = encodeQName(pending.qname);
fDocId.setIntValue(currentDoc.getDocId());
fNodeId.setValue(data);
doc.add(fDocId);
doc.add(fNodeId);
doc.add(new Field(contentField, pending.text.toString(), Field.Store.NO, Field.Index.ANALYZED,
Field.TermVector.YES));
if (pending.idxConf.getAnalyzer() == null)
writer.addDocument(doc);
else {
writer.addDocument(doc, pending.idxConf.getAnalyzer());
}
}
} catch (IOException e) {
LOG.warn("An exception was caught while indexing document: " + e.getMessage(), e);
} finally {
index.releaseWriter(writer);
nodesToWrite = new ArrayList<PendingDoc>();
cachedNodesSize = 0;
}
}
/**
* Optimize the Lucene index by merging all segments into a single one. This
* may take a while, and write operations are blocked while the optimization runs.
*
* @see http://lucene.apache.org/java/3_0_1/api/all/org/apache/lucene/index/IndexWriter.html#optimize()
*/
public void optimize() {
IndexWriter writer = null;
try {
writer = index.getWriter();
writer.optimize(true);
} catch (IOException e) {
LOG.warn("An exception was caught while optimizing the lucene index: " + e.getMessage(), e);
} finally {
index.releaseWriter(writer);
}
}
/**
* Encode an element or attribute qname into a lucene field name using the
* internal ids for namespace and local name.
*
* @param qname
* @return encoded qname
*/
private String encodeQName(QName qname) {
SymbolTable symbols = index.getBrokerPool().getSymbols();
short namespaceId = symbols.getNSSymbol(qname.getNamespaceURI());
short localNameId = symbols.getSymbol(qname.getLocalName());
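// encoded layout: bits 0-7 name type, bits 16-31 namespace symbol id, bits 32-47 local name symbol id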
long nameId = qname.getNameType() | (namespaceId & 0xFFFFL) << 16 | (localNameId & 0xFFFFFFFFL) << 32;
return Long.toHexString(nameId);
}
/**
* Decode the lucene field name into an element or attribute qname.
*
* @param s
* @return the qname
*/
private QName decodeQName(String s) {
SymbolTable symbols = index.getBrokerPool().getSymbols();
try {
long l = Long.parseLong(s, 16);
short namespaceId = (short) ((l >>> 16) & 0xFFFFL);
short localNameId = (short) ((l >>> 32) & 0xFFFFL);
byte type = (byte) (l & 0xFFL);
if (namespaceId < 0 || localNameId < 0)
return null;
String namespaceURI = symbols.getNamespace(namespaceId);
String localName = symbols.getName(localNameId);
if (namespaceURI == null || localName == null)
return null;
QName qname = new QName(localName, namespaceURI, "");
qname.setNameType(type);
return qname;
} catch (NumberFormatException e) {
return null;
}
}
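/**
* Stream listener which receives node events while documents are stored, updated or
* removed, and collects the text content to be indexed or the node ids to be removed.
*/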
private class LuceneStreamListener extends AbstractStreamListener {
@Override
public void startElement(Txn transaction, ElementImpl element, NodePath path) {
if (mode == STORE && config != null) {
if (contentStack != null && !contentStack.isEmpty()) {
for (TextExtractor extractor : contentStack) {
extractor.startElement(element.getQName());
}
}
Iterator<LuceneIndexConfig> configIter = config.getConfig(path);
if (configIter != null) {
if (contentStack == null) contentStack = new Stack<TextExtractor>();
while (configIter.hasNext()) {
LuceneIndexConfig configuration = configIter.next();
if (configuration.match(path)) {
TextExtractor extractor = new DefaultTextExtractor();
extractor.configure(config, configuration);
contentStack.push(extractor);
}
}
}
}
super.startElement(transaction, element, path);
}
@Override
public void endElement(Txn transaction, ElementImpl element, NodePath path) {
if (config != null) {
if (mode == STORE && contentStack != null && !contentStack.isEmpty()) {
for (TextExtractor extractor : contentStack) {
extractor.endElement(element.getQName());
}
}
Iterator<LuceneIndexConfig> configIter = config.getConfig(path);
if (mode != REMOVE_ALL_NODES && configIter != null) {
if (mode == REMOVE_SOME_NODES) {
nodesToRemove.add(element.getNodeId());
} else {
while (configIter.hasNext()) {
LuceneIndexConfig configuration = configIter.next();
if (configuration.match(path)) {
TextExtractor extractor = contentStack.pop();
indexText(element.getNodeId(), element.getQName(),
path, extractor.getIndexConfig(), extractor.getText());
}
}
}
}
}
super.endElement(transaction, element, path);
}
@Override
public void attribute(Txn transaction, AttrImpl attrib, NodePath path) {
path.addComponent(attrib.getQName());
Iterator<LuceneIndexConfig> configIter = null;
if (config != null)
configIter = config.getConfig(path);
if (mode != REMOVE_ALL_NODES && configIter != null) {
if (mode == REMOVE_SOME_NODES) {
nodesToRemove.add(attrib.getNodeId());
} else {
while (configIter.hasNext()) {
LuceneIndexConfig configuration = configIter.next();
if (configuration.match(path)) {
indexText(attrib.getNodeId(), attrib.getQName(), path,
configuration, attrib.getValue());
}
}
}
}
path.removeLastComponent();
super.attribute(transaction, attrib, path);
}
@Override
public void characters(Txn transaction, CharacterDataImpl text, NodePath path) {
if (contentStack != null && !contentStack.isEmpty()) {
for (TextExtractor extractor : contentStack) {
extractor.beforeCharacters();
extractor.characters(text.getXMLString());
}
}
super.characters(transaction, text, path);
}
@Override
public IndexWorker getWorker() {
return LuceneIndexWorker.this;
}
}
/**
* Match class containing the score of a match and a reference to
* the query that generated it.
*/
public class LuceneMatch extends Match {
private float score = 0.0f;
private Query query;
public LuceneMatch(int contextId, NodeId nodeId, Query query) {
super(contextId, nodeId, null);
this.query = query;
}
public LuceneMatch(LuceneMatch copy) {
super(copy);
this.score = copy.score;
this.query = copy.query;
}
@Override
public Match createInstance(int contextId, NodeId nodeId, String matchTerm) {
return null;
}
public Match createInstance(int contextId, NodeId nodeId, Query query) {
return new LuceneMatch(contextId, nodeId, query);
}
@Override
public Match newCopy() {
return new LuceneMatch(this);
}
@Override
public String getIndexId() {
return LuceneIndex.ID;
}
public Query getQuery() {
return query;
}
public float getScore() {
return score;
}
private void setScore(float score) {
this.score = score;
}
@Override
public boolean equals(Object other) {
if(!(other instanceof LuceneMatch))
return false;
LuceneMatch o = (LuceneMatch) other;
return (nodeId == o.nodeId || nodeId.equals(o.nodeId)) &&
query == o.query;
}
@Override
public boolean matchEquals(Match other) {
return equals(other);
}
}
private static class NodeFieldSelector implements FieldSelector {
private static final long serialVersionUID = -4899170629980829109L;
public FieldSelectorResult accept(String fieldName) {
if (FIELD_DOC_ID.equals(fieldName))
return FieldSelectorResult.LOAD;
if (FIELD_NODE_ID.equals(fieldName))
return FieldSelectorResult.LOAD_AND_BREAK;
return FieldSelectorResult.NO_LOAD;
}
}
}