package org.exist.indexing.lucene;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.Stack;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.OpenBitSet;
import org.exist.collections.Collection;
import org.exist.dom.AttrImpl;
import org.exist.dom.CharacterDataImpl;
import org.exist.dom.DocumentImpl;
import org.exist.dom.DocumentSet;
import org.exist.dom.ElementImpl;
import org.exist.dom.Match;
import org.exist.dom.NewArrayNodeSet;
import org.exist.dom.NodeProxy;
import org.exist.dom.NodeSet;
import org.exist.dom.QName;
import org.exist.dom.StoredNode;
import org.exist.dom.SymbolTable;
import org.exist.indexing.AbstractStreamListener;
import org.exist.indexing.IndexController;
import org.exist.indexing.IndexWorker;
import org.exist.indexing.MatchListener;
import org.exist.indexing.OrderedValuesIndex;
import org.exist.indexing.QNamedKeysIndex;
import org.exist.indexing.StreamListener;
import org.exist.numbering.NodeId;
import org.exist.storage.DBBroker;
import org.exist.storage.ElementValue;
import org.exist.storage.IndexSpec;
import org.exist.storage.NodePath;
import org.exist.storage.txn.Txn;
import org.exist.util.ByteConversion;
import org.exist.util.DatabaseConfigurationException;
import org.exist.util.Occurrences;
import org.exist.xquery.Expression;
import org.exist.xquery.XPathException;
import org.exist.xquery.XQueryContext;
import org.exist.xquery.value.IntegerValue;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
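/**
* Index worker for the Lucene-based full text index. The worker maintains one
* Lucene document per indexed XML node and maps query hits back to the
* corresponding nodes stored in eXist.
*/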
public class LuceneIndexWorker implements OrderedValuesIndex, QNamedKeysIndex {
public static final String OPTION_DEFAULT_OPERATOR = "default-operator";
public static final String OPTION_PHRASE_SLOP = "phrase-slop";
public static final String OPTION_LEADING_WILDCARD = "leading-wildcard";
public static final String OPTION_FILTER_REWRITE = "filter-rewrite";
public static final String DEFAULT_OPERATOR_OR = "or";
private static final Logger LOG = Logger.getLogger(LuceneIndexWorker.class);
private static final FieldSelector NODE_FIELD_SELECTOR = new NodeFieldSelector();
private LuceneIndex index;
@SuppressWarnings("unused")
private IndexController controller;
private LuceneMatchListener matchListener = null;
private XMLToQuery queryTranslator;
private DBBroker broker;
private DocumentImpl currentDoc = null;
private int mode = 0;
private LuceneConfig config;
private Stack<TextExtractor> contentStack = null;
private Set<NodeId> nodesToRemove = null;
private List<PendingDoc> nodesToWrite = null;
private int cachedNodesSize = 0;
private int maxCachedNodesSize = 4096 * 1024;
private Analyzer analyzer;
public static final String FIELD_NODE_ID = "nodeId";
public static final String FIELD_DOC_ID = "docId";
public LuceneIndexWorker(LuceneIndex parent, DBBroker broker) {
this.index = parent;
this.broker = broker;
this.queryTranslator = new XMLToQuery(index);
}
public String getIndexId() {
return LuceneIndex.ID;
}
public String getIndexName() {
return index.getIndexName();
}
public Object configure(IndexController controller, NodeList configNodes, Map namespaces) throws DatabaseConfigurationException {
this.controller = controller;
LOG.debug("Configuring lucene index...");
config = new LuceneConfig(configNodes, namespaces);
return config;
}
public void flush() {
switch (mode) {
case StreamListener.STORE:
write();
break;
case StreamListener.REMOVE_ALL_NODES:
removeDocument(currentDoc.getDocId());
break;
case StreamListener.REMOVE_SOME_NODES:
removeNodes();
break;
}
}
public void setDocument(DocumentImpl document) {
setDocument(document, StreamListener.UNKNOWN);
}
public void setDocument(DocumentImpl document, int newMode) {
currentDoc = document;
//config = null;
contentStack = null;
IndexSpec indexConf = document.getCollection().getIndexConfiguration(broker);
if (indexConf != null) {
config = (LuceneConfig) indexConf.getCustomIndexSpec(LuceneIndex.ID);
if (config != null)
// Create a copy of the original LuceneConfig (there's only one per db instance),
// so we can safely work with it.
config = new LuceneConfig(config);
}
mode = newMode;
}
public void setMode(int mode) {
this.mode = mode;
switch (mode) {
case StreamListener.STORE:
if (nodesToWrite == null)
nodesToWrite = new ArrayList<PendingDoc>();
else
nodesToWrite.clear();
cachedNodesSize = 0;
break;
case StreamListener.REMOVE_SOME_NODES:
nodesToRemove = new TreeSet<NodeId>();
break;
}
}
public DocumentImpl getDocument() {
return currentDoc;
}
public int getMode() {
return this.mode;
}
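/**
* Determine the topmost node (the given node or one of its ancestors) which has to be
* reindexed according to the index configuration when the given node changes, or null
* if no reindexing is required.
*/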
public StoredNode getReindexRoot(StoredNode node, NodePath path, boolean includeSelf) {
if (node.getNodeType() == Node.ATTRIBUTE_NODE)
return null;
if (config == null)
return null;
NodePath p = new NodePath(path);
boolean reindexRequired = false;
if (node.getNodeType() == Node.ELEMENT_NODE && !includeSelf)
p.removeLastComponent();
for (int i = 0; i < p.length(); i++) {
if (config.matches(p)) {
reindexRequired = true;
break;
}
p.removeLastComponent();
}
if (reindexRequired) {
p = new NodePath(path);
StoredNode topMost = null;
StoredNode currentNode = node;
if (currentNode.getNodeType() != Node.ELEMENT_NODE)
currentNode = currentNode.getParentStoredNode();
while (currentNode != null) {
if (config.matches(p))
topMost = currentNode;
currentNode = currentNode.getParentStoredNode();
p.removeLastComponent();
}
return topMost;
}
return null;
}
private StreamListener listener = new LuceneStreamListener();
public StreamListener getListener() {
return listener;
}
public MatchListener getMatchListener(DBBroker broker, NodeProxy proxy) {
boolean needToFilter = false;
Match nextMatch = proxy.getMatches();
while (nextMatch != null) {
if (nextMatch.getIndexId() == LuceneIndex.ID) {
needToFilter = true;
break;
}
nextMatch = nextMatch.getNextMatch();
}
if (!needToFilter)
return null;
if (matchListener == null)
matchListener = new LuceneMatchListener(index, broker, proxy);
else
matchListener.reset(broker, proxy);
return matchListener;
}
protected void removeDocument(int docId) {
IndexReader reader = null;
try {
reader = index.getWritingReader();
Term dt = new Term(FIELD_DOC_ID, NumericUtils.intToPrefixCoded(docId));
reader.deleteDocuments(dt);
reader.flush();
} catch (IOException e) {
LOG.warn("Error while removing lucene index: " + e.getMessage(), e);
} finally {
index.releaseWritingReader(reader);
mode = StreamListener.STORE;
}
}
public void removeCollection(Collection collection, DBBroker broker) {
if (LOG.isDebugEnabled())
LOG.debug("Removing collection " + collection.getURI());
IndexReader reader = null;
try {
reader = index.getWritingReader();
for (Iterator<DocumentImpl> i = collection.iterator(broker); i.hasNext(); ) {
DocumentImpl doc = i.next();
Term dt = new Term(FIELD_DOC_ID, NumericUtils.intToPrefixCoded(doc.getDocId()));
TermDocs td = reader.termDocs(dt);
while (td.next()) {
reader.deleteDocument(td.doc());
}
}
reader.flush();
} catch (IOException e) {
LOG.warn("Error while removing lucene index: " + e.getMessage(), e);
} finally {
index.releaseWritingReader(reader);
mode = StreamListener.STORE;
}
if (LOG.isDebugEnabled())
LOG.debug("Collection removed.");
}
/**
* Remove specific nodes from the index. This method is used for node updates
* and called from flush() if the worker is in {@link StreamListener#REMOVE_SOME_NODES}
* mode.
*/
protected void removeNodes() {
if (nodesToRemove == null)
return;
IndexReader reader = null;
try {
reader = index.getWritingReader();
Term dt = new Term(FIELD_DOC_ID, NumericUtils.intToPrefixCoded(currentDoc.getDocId()));
TermDocs docsEnum = reader.termDocs(dt);
while (docsEnum.next()) {
Document doc = reader.document(docsEnum.doc());
NodeId nodeId = readNodeId(doc);
if (nodesToRemove.contains(nodeId)) {
reader.deleteDocument(docsEnum.doc());
}
}
nodesToRemove = null;
reader.flush();
} catch (IOException e) {
LOG.warn("Error while deleting lucene index entries: " + e.getMessage(), e);
} finally {
index.releaseWritingReader(reader);
}
}
/**
* Query the index. Returns a node set containing all matching nodes. Each node
* in the node set has a {@link org.exist.indexing.lucene.LuceneIndexWorker.LuceneMatch}
* element attached, which stores the score and a link to the query which generated it.
*
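* A minimal usage sketch (variable names are hypothetical):
* <pre>
* // query all configured qnames for "hello" and return the matching ancestors from contextSet
* NodeSet hits = worker.query(context, Expression.NO_CONTEXT_ID, docs, contextSet,
*         null, "hello", NodeSet.ANCESTOR, new Properties());
* </pre>
*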
* @param context current XQuery context
* @param contextId current context id, used to track the position inside nested XPath predicates
* @param docs query will be restricted to documents in this set
* @param contextSet if specified, returned nodes will be descendants of the nodes in this set
* @param qnames query will be restricted to nodes with the qualified names given here
* @param queryStr a lucene query string
* @param axis which node is returned: the node in which a match was found or the corresponding ancestor
* from the contextSet
* @return node set containing all matching nodes
*
* @throws IOException
* @throws ParseException
*/
public NodeSet query(XQueryContext context, int contextId, DocumentSet docs, NodeSet contextSet,
List<QName> qnames, String queryStr, int axis, Properties options)
throws IOException, ParseException {
qnames = getDefinedIndexes(qnames);
NodeSet resultSet = new NewArrayNodeSet();
boolean returnAncestor = axis == NodeSet.ANCESTOR;
IndexSearcher searcher = null;
try {
searcher = index.getSearcher();
for (QName qname : qnames) {
String field = encodeQName(qname);
Analyzer analyzer = getAnalyzer(null, qname, context.getBroker(), docs);
QueryParser parser = new QueryParser(field, analyzer);
setOptions(options, parser);
Query query = parser.parse(queryStr);
searchAndProcess(contextId, qname, docs, contextSet, resultSet,
returnAncestor, searcher, query);
}
} finally {
index.releaseSearcher(searcher);
}
return resultSet;
}
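/**
* Apply optional query parser settings. Recognized keys are
* {@link #OPTION_DEFAULT_OPERATOR}, {@link #OPTION_PHRASE_SLOP},
* {@link #OPTION_LEADING_WILDCARD} and {@link #OPTION_FILTER_REWRITE}.
* For example (a hypothetical caller):
* <pre>
* Properties options = new Properties();
* options.setProperty(LuceneIndexWorker.OPTION_DEFAULT_OPERATOR, "or");
* options.setProperty(LuceneIndexWorker.OPTION_PHRASE_SLOP, "2");
* </pre>
*/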
private void setOptions(Properties options, QueryParser parser) throws ParseException {
if (options == null)
return;
String option = options.getProperty(OPTION_DEFAULT_OPERATOR);
if (option != null) {
if (DEFAULT_OPERATOR_OR.equals(option))
parser.setDefaultOperator(QueryParser.OR_OPERATOR);
else
parser.setDefaultOperator(QueryParser.AND_OPERATOR);
}
option = options.getProperty(OPTION_LEADING_WILDCARD);
if (option != null)
parser.setAllowLeadingWildcard(option.equalsIgnoreCase("yes"));
option = options.getProperty(OPTION_PHRASE_SLOP);
if (option != null) {
try {
int slop = Integer.parseInt(option);
parser.setPhraseSlop(slop);
} catch (NumberFormatException e) {
throw new ParseException("value for option " + OPTION_PHRASE_SLOP + " needs to be a number");
}
}
option = options.getProperty(OPTION_FILTER_REWRITE);
if (option != null) {
if (option.equalsIgnoreCase("yes"))
parser.setMultiTermRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
else
parser.setMultiTermRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
}
}
/**
* Query the index. Returns a node set containing all matching nodes. Each node
* in the node set has a {@link org.exist.indexing.lucene.LuceneIndexWorker.LuceneMatch}
* element attached, which stores the score and a link to the query which generated it.
*
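* A hypothetical query element (see {@link XMLToQuery} for the accepted elements):
* <pre>
* &lt;query&gt;&lt;term&gt;hello&lt;/term&gt;&lt;/query&gt;
* </pre>
*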
* @param context current XQuery context
* @param contextId current context id, used to track the position inside nested XPath predicates
* @param docs query will be restricted to documents in this set
* @param contextSet if specified, returned nodes will be descendants of the nodes in this set
* @param qnames query will be restricted to nodes with the qualified names given here
* @param queryRoot an XML representation of the query, see {@link XMLToQuery}.
* @param axis which node is returned: the node in which a match was found or the corresponding ancestor
* from the contextSet
* @return node set containing all matching nodes
*
* @throws IOException
* @throws ParseException
*/
public NodeSet query(XQueryContext context, int contextId, DocumentSet docs, NodeSet contextSet,
List<QName> qnames, Element queryRoot, int axis, Properties options)
throws IOException, ParseException, XPathException {
qnames = getDefinedIndexes(qnames);
NodeSet resultSet = new NewArrayNodeSet();
boolean returnAncestor = axis == NodeSet.ANCESTOR;
IndexSearcher searcher = null;
try {
searcher = index.getSearcher();
for (QName qname : qnames) {
String field = encodeQName(qname);
analyzer = getAnalyzer(null, qname, context.getBroker(), docs);
Query query = queryTranslator.parse(field, queryRoot, analyzer, options);
if (query != null) {
searchAndProcess(contextId, qname, docs, contextSet, resultSet,
returnAncestor, searcher, query);
}
}
} finally {
index.releaseSearcher(searcher);
}
return resultSet;
}
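/**
* Query a field defined in the index configuration, addressed by its name rather than
* by the qname of a node, using an XML query description (see {@link XMLToQuery}).
*/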
public NodeSet queryField(XQueryContext context, int contextId, DocumentSet docs, NodeSet contextSet,
String field, Element queryRoot, int axis, Properties options)
throws IOException, XPathException {
NodeSet resultSet = new NewArrayNodeSet();
boolean returnAncestor = axis == NodeSet.ANCESTOR;
IndexSearcher searcher = null;
try {
searcher = index.getSearcher();
analyzer = getAnalyzer(field, null, context.getBroker(), docs);
Query query = queryTranslator.parse(field, queryRoot, analyzer, options);
if (query != null) {
searchAndProcess(contextId, null, docs, contextSet, resultSet,
returnAncestor, searcher, query);
}
} finally {
index.releaseSearcher(searcher);
}
return resultSet;
}
private void searchAndProcess(int contextId, QName qname, DocumentSet docs,
NodeSet contextSet, NodeSet resultSet, boolean returnAncestor,
IndexSearcher searcher, Query query) throws IOException {
LuceneHitCollector collector = new LuceneHitCollector();
searcher.search(query, collector);
processHits(collector.getDocs(), searcher, contextId, qname, docs, contextSet, resultSet, returnAncestor, query);
}
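/**
* Query a field defined in the index configuration, using a Lucene query string.
*/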
public NodeSet queryField(XQueryContext context, int contextId, DocumentSet docs, NodeSet contextSet,
String field, String queryString, int axis, Properties options)
throws IOException, ParseException {
NodeSet resultSet = new NewArrayNodeSet();
boolean returnAncestor = axis == NodeSet.ANCESTOR;
IndexSearcher searcher = null;
try {
searcher = index.getSearcher();
Analyzer analyzer = getAnalyzer(field, null, context.getBroker(), docs);
LOG.debug("Using analyzer " + analyzer + " for " + queryString);
QueryParser parser = new QueryParser(field, analyzer);
setOptions(options, parser);
Query query = parser.parse(queryString);
searchAndProcess(contextId, null, docs, contextSet, resultSet,
returnAncestor, searcher, query);
} finally {
index.releaseSearcher(searcher);
}
return resultSet;
}
/**
* Process the query results collected from the Lucene index and
* map them to the corresponding XML nodes in eXist.
*/
private void processHits(List<ScoreDoc> hits, IndexSearcher searcher, int contextId, QName qname, DocumentSet docs, NodeSet contextSet,
NodeSet resultSet, boolean returnAncestor, Query query) {
for (ScoreDoc scoreDoc : hits) {
try {
Document doc = searcher.doc(scoreDoc.doc, NODE_FIELD_SELECTOR);
String fDocId = doc.get(FIELD_DOC_ID);
int docId = Integer.parseInt(fDocId);
DocumentImpl storedDocument = docs.getDoc(docId);
if (storedDocument == null)
continue;
NodeId nodeId = readNodeId(doc);
NodeProxy storedNode = new NodeProxy(storedDocument, nodeId);
if (qname != null)
storedNode.setNodeType(qname.getNameType() == ElementValue.ATTRIBUTE ? Node.ATTRIBUTE_NODE : Node.ELEMENT_NODE);
// if a context set is specified, we can directly check if the
// matching node is a descendant of one of the nodes
// in the context set.
if (contextSet != null) {
int sizeHint = contextSet.getSizeHint(storedDocument);
if (returnAncestor) {
NodeProxy parentNode = contextSet.get(storedNode);
// NodeProxy parentNode = contextSet.parentWithChild(storedNode, false, true, NodeProxy.UNKNOWN_NODE_LEVEL);
if (parentNode != null) {
LuceneMatch match = new LuceneMatch(contextId, nodeId, query);
match.setScore(scoreDoc.score);
parentNode.addMatch(match);
resultSet.add(parentNode, sizeHint);
if (Expression.NO_CONTEXT_ID != contextId) {
parentNode.deepCopyContext(storedNode, contextId);
} else
parentNode.copyContext(storedNode);
}
} else {
LuceneMatch match = new LuceneMatch(contextId, nodeId, query);
match.setScore(scoreDoc.score);
storedNode.addMatch(match);
resultSet.add(storedNode, sizeHint);
}
} else {
LuceneMatch match = new LuceneMatch(contextId, nodeId, query);
match.setScore(scoreDoc.score);
storedNode.addMatch(match);
resultSet.add(storedNode);
}
} catch (IOException e) {
LOG.warn("Error while retrieving lucene query results: " + e.getMessage(), e);
}
}
}
private static class LuceneHitCollector extends Collector {
private List<ScoreDoc> docs = new ArrayList<ScoreDoc>();
private int docBase;
private Scorer scorer;
private LuceneHitCollector() {
//Nothing special to do
}
public List<ScoreDoc> getDocs() {
Collections.sort(docs, new Comparator<ScoreDoc>() {
public int compare(ScoreDoc scoreDoc, ScoreDoc scoreDoc1) {
if (scoreDoc.doc == scoreDoc1.doc)
return 0;
else if (scoreDoc.doc < scoreDoc1.doc)
return -1;
return 1;
}
});
return docs;
}
@Override
public void setScorer(Scorer scorer) throws IOException {
this.scorer = scorer;
}
@Override
public void setNextReader(IndexReader indexReader, int docBase) throws IOException {
this.docBase = docBase;
}
@Override
public boolean acceptsDocsOutOfOrder() {
return false;
}
@Override
public void collect(int doc) {
try {
float score = scorer.score();
docs.add(new ScoreDoc(doc + docBase, score));
} catch (IOException e) {
LOG.warn("Error while collecting lucene query results: " + e.getMessage(), e);
}
}
}
private NodeId readNodeId(Document doc) {
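// the stored binary value starts with a two-byte unit count, followed by the serialized node id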
byte[] temp = doc.getBinaryValue(FIELD_NODE_ID);
int units = ByteConversion.byteToShort(temp, 0);
return index.getBrokerPool().getNodeFactory()
.createFromData(units, temp, 2);
}
/**
* Return the list of qualified names on which indexes are defined. QNames in the
* given list which lack a local name or namespace are expanded against the field
* names actually present in the Lucene index; if the list is null or empty, all
* indexed qnames are returned.
*
* @return List of QName objects on which indexes are defined
*/
private List<QName> getDefinedIndexes(List<QName> qnames) {
List<QName> indexes = new ArrayList<QName>(20);
if (qnames != null && !qnames.isEmpty()) {
for (QName qname : qnames) {
if (qname.getLocalName() == null || qname.getNamespaceURI() == null)
getDefinedIndexesFor(qname, indexes);
else
indexes.add(qname);
}
return indexes;
}
return getDefinedIndexesFor(null, indexes);
}
private List<QName> getDefinedIndexesFor(QName qname, List<QName> indexes) {
IndexReader reader = null;
try {
reader = index.getReader();
java.util.Collection<String> fields = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
for (String field: fields) {
if (!FIELD_DOC_ID.equals(field)) {
QName name = decodeQName(field);
if (name != null && (qname == null || matchQName(qname, name)))
indexes.add(name);
}
}
} catch (IOException e) {
LOG.warn("Error while reading field names from the lucene index: " + e.getMessage(), e);
} finally {
index.releaseReader(reader);
}
return indexes;
}
private static boolean matchQName(QName qname, QName candidate) {
boolean match = true;
if (qname.getLocalName() != null)
match = qname.getLocalName().equals(candidate.getLocalName());
if (match && qname.getNamespaceURI() != null && qname.getNamespaceURI().length() > 0)
match = qname.getNamespaceURI().equals(candidate.getNamespaceURI());
return match;
}
/**
* Return the analyzer to be used for the given field or qname. Either field
* or qname should be specified.
*/
private Analyzer getAnalyzer(String field, QName qname, DBBroker broker, DocumentSet docs) {
for (Iterator<Collection> i = docs.getCollectionIterator(); i.hasNext(); ) {
Collection collection = i.next();
IndexSpec idxConf = collection.getIndexConfiguration(broker);
if (idxConf != null) {
LuceneConfig config = (LuceneConfig) idxConf.getCustomIndexSpec(LuceneIndex.ID);
if (config != null) {
Analyzer analyzer;
if (field == null)
analyzer = config.getAnalyzer(qname);
else
analyzer = config.getAnalyzer(field);
if (analyzer != null)
return analyzer;
}
}
}
return index.getDefaultAnalyzer();
}
public boolean checkIndex(DBBroker broker) {
return false; // index consistency checking is not implemented for this worker
}
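/**
* Collect term occurrences from the index. The hints map may restrict the scan:
* QNAMES_KEY limits the scan to a list of qnames, START_VALUE and END_VALUE define a
* start prefix and an upper bound for the terms, and VALUE_COUNT caps the number of
* distinct terms returned.
*/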
public Occurrences[] scanIndex(XQueryContext context, DocumentSet docs, NodeSet nodes, Map hints) {
List<QName> qnames = hints == null ? null : (List<QName>)hints.get(QNAMES_KEY);
qnames = getDefinedIndexes(qnames);
//Expects a StringValue
String start = null, end = null;
long max = Long.MAX_VALUE;
if (hints != null) {
Object vstart = hints.get(START_VALUE);
Object vend = hints.get(END_VALUE);
start = vstart == null ? null : vstart.toString();
end = vend == null ? null : vend.toString();
IntegerValue vmax = (IntegerValue) hints.get(VALUE_COUNT);
max = vmax == null ? Long.MAX_VALUE : vmax.getValue();
}
if (nodes == null || max < Long.MAX_VALUE)
return scanIndexByQName(qnames, docs, nodes, start, end, max);
return scanIndexByNodes(qnames, docs, nodes, start, end, max);
}
private Occurrences[] scanIndexByQName(List<QName> qnames, DocumentSet docs, NodeSet nodes, String start, String end, long max) {
TreeMap<String, Occurrences> map = new TreeMap<String, Occurrences>();
IndexReader reader = null;
try {
reader = index.getReader();
for (QName qname : qnames) {
String field = encodeQName(qname);
TermEnum terms;
if (start == null)
terms = reader.terms(new Term(field, ""));
else
terms = reader.terms(new Term(field, start));
if (terms == null)
continue;
Term term;
TermDocs termDocs = reader.termDocs();
do {
term = terms.term();
if (term != null && term.field().equals(field)) {
boolean include = true;
if (end != null) {
if (term.text().compareTo(end) > 0)
include = false;
} else if (start != null && !term.text().startsWith(start))
include = false;
if (include) {
termDocs.seek(term);
while (termDocs.next()) {
if (reader.isDeleted(termDocs.doc()))
continue;
Document doc = reader.document(termDocs.doc());
String fDocId = doc.get(FIELD_DOC_ID);
int docId = Integer.parseInt(fDocId);
DocumentImpl storedDocument = docs.getDoc(docId);
if (storedDocument == null)
continue;
NodeId nodeId = null;
if (nodes != null) {
// load document to check if the current node is in the passed context set, if any
nodeId = readNodeId(doc);
}
if (nodeId == null || nodes.get(storedDocument, nodeId) != null) {
Occurrences oc = map.get(term.text());
if (oc == null) {
oc = new Occurrences(term.text());
map.put(term.text(), oc);
}
oc.addDocument(storedDocument);
oc.addOccurrences(termDocs.freq());
}
}
}
}
if (map.size() >= max)
break;
} while (terms.next());
termDocs.close();
terms.close();
}
} catch (IOException e) {
LOG.warn("Error while scanning lucene index entries: " + e.getMessage(), e);
} finally {
index.releaseReader(reader);
}
Occurrences[] occur = new Occurrences[map.size()];
return map.values().toArray(occur);
}
private Occurrences[] scanIndexByNodes(List<QName> qnames, DocumentSet docs, NodeSet nodes, String start, String end, long max) {
TreeMap<String, Occurrences> map = new TreeMap<String, Occurrences>();
FieldSelector selector = new FieldSelector() {
private static final long serialVersionUID = 3270211696620175721L;
public FieldSelectorResult accept(String field) {
if (field.equals(FIELD_NODE_ID))
return FieldSelectorResult.LOAD_AND_BREAK;
return FieldSelectorResult.NO_LOAD;
}
};
IndexSearcher searcher = null;
try {
searcher = index.getSearcher();
IndexReader reader = searcher.getIndexReader();
for (Iterator<DocumentImpl> i = docs.getDocumentIterator(); i.hasNext(); ) {
DocumentImpl doc = i.next();
Query query = new TermQuery(new Term(FIELD_DOC_ID, NumericUtils.intToPrefixCoded(doc.getDocId())));
DocumentCollector collector = new DocumentCollector(searcher.maxDoc());
searcher.search(query, collector);
DocIdSetIterator iter = collector.docs.iterator();
int next;
while ((next = iter.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
NodeId nodeId = null;
if (nodes != null) {
// load document to check if the current node is in the passed context set, if any
Document luceneDoc = searcher.doc(next, selector);
nodeId = readNodeId(luceneDoc);
}
if (nodeId == null || nodes.get(doc, nodeId) != null) {
for (QName qname : qnames) {
String field = encodeQName(qname);
TermFreqVector tfv = reader.getTermFreqVector(next, field);
if (tfv != null) {
String[] terms = tfv.getTerms();
int[] freq = tfv.getTermFrequencies();
for (int j = 0; j < terms.length; j++) {
boolean include = true;
if (end != null) {
if (terms[j].compareTo(end) > 0)
include = false;
} else if (start != null && !terms[j].startsWith(start))
include = false;
if (include) {
Occurrences oc = map.get(terms[j]);
if (oc == null) {
oc = new Occurrences(terms[j]);
map.put(terms[j], oc);
}
oc.addDocument(doc);
oc.addOccurrences(freq[j]);
}
}
}
}
}
}
}
} catch (IOException e) {
LOG.warn("Error while scanning lucene index entries: " + e.getMessage(), e);
} finally {
index.releaseSearcher(searcher);
}
return occurrencesToArray(map);
}
private Occurrences[] occurrencesToArray(TreeMap<String, Occurrences> map) {
Occurrences[] occur = new Occurrences[map.size()];
return map.values().toArray(occur);
}
private static class DocumentCollector extends Collector {
OpenBitSet docs;
int base = 0;
private DocumentCollector(int size) {
docs = new OpenBitSet(size);
}
@Override
public void setScorer(Scorer scorer) throws IOException {
// not needed: this collector records matching document ids only, not scores
}
@Override
public void collect(int doc) throws IOException {
docs.set(base + doc);
}
@Override
public void setNextReader(IndexReader indexReader, int base) throws IOException {
this.base = base;
}
@Override
public boolean acceptsDocsOutOfOrder() {
return true;
}
}
/**
* Queue the passed character sequence for indexing. Queued nodes are written to the
* Lucene index by {@link #write()}, either when the cache size limit is exceeded or
* when the worker is flushed. We create one Lucene document per XML node, using two
* fields to identify the node:
*
* <ul>
* <li>docId: the eXist-internal document id, stored as a numeric field.</li>
* <li>nodeId: the id of the node, stored in binary compressed form.</li>
* </ul>
*
* The text is indexed into a field whose name is either taken from the index
* configuration or encodes the qualified name of the node. An encoded qualified name
* is a hex sequence pointing into the global symbol table.
*
* @param nodeId the id of the node to index
* @param qname the qualified name of the node
* @param path the node path
* @param config the index configuration which matched this node
* @param content the text content to index
*/
protected void indexText(NodeId nodeId, QName qname, NodePath path, LuceneIndexConfig config, CharSequence content) {
PendingDoc pending = new PendingDoc(nodeId, qname, path, content, config);
nodesToWrite.add(pending);
cachedNodesSize += content.length();
if (cachedNodesSize > maxCachedNodesSize)
write();
}
private class PendingDoc {
NodeId nodeId;
CharSequence text;
QName qname;
LuceneIndexConfig idxConf;
private PendingDoc(NodeId nodeId, QName qname, NodePath path, CharSequence text, LuceneIndexConfig idxConf) {
this.nodeId = nodeId;
this.qname = qname;
this.text = text;
this.idxConf = idxConf;
}
}
private void write() {
if (nodesToWrite == null || nodesToWrite.size() == 0)
return;
IndexWriter writer = null;
try {
writer = index.getWriter();
// by default, Lucene only indexes the first 10,000 terms in a field
writer.setMaxFieldLength(Integer.MAX_VALUE);
NumericField fDocId = new NumericField(FIELD_DOC_ID, Field.Store.YES, true);
Field fNodeId = new Field(FIELD_NODE_ID, new byte [] { 0 }, Field.Store.YES);
for (PendingDoc pending : nodesToWrite) {
Document doc = new Document();
if (pending.idxConf.getBoost() > 0)
doc.setBoost(pending.idxConf.getBoost());
else if (config.getBoost() > 0)
doc.setBoost(config.getBoost());
// store the node id
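// data[0..1]: number of units in the node id; data[2..]: the serialized node id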
int nodeIdLen = pending.nodeId.size();
byte[] data = new byte[nodeIdLen + 2];
ByteConversion.shortToByte((short) pending.nodeId.units(), data, 0);
pending.nodeId.serialize(data, 2);
String contentField;
// the text content is indexed in a field using either
// the qname of the element or attribute or the field
// name defined in the configuration
if (pending.idxConf.isNamed())
contentField = pending.idxConf.getName();
else
contentField = encodeQName(pending.qname);
fDocId.setIntValue(currentDoc.getDocId());
fNodeId.setValue(data);
doc.add(fDocId);
doc.add(fNodeId);
doc.add(new Field(contentField, pending.text.toString(), Field.Store.NO, Field.Index.ANALYZED,
Field.TermVector.YES));
if (pending.idxConf.getAnalyzer() == null)
writer.addDocument(doc);
else {
writer.addDocument(doc, pending.idxConf.getAnalyzer());
}
}
} catch (IOException e) {
LOG.warn("An exception was caught while indexing document: " + e.getMessage(), e);
} finally {
index.releaseWriter(writer);
nodesToWrite = new ArrayList<PendingDoc>();
cachedNodesSize = 0;
}
}
/**
* Optimize the Lucene index by merging all segments into a single one. This
* may take a while, and write operations are blocked while the optimization runs.
*
* @see http://lucene.apache.org/java/3_0_1/api/all/org/apache/lucene/index/IndexWriter.html#optimize()
*/
public void optimize() {
IndexWriter writer = null;
try {
writer = index.getWriter();
writer.optimize(true);
} catch (IOException e) {
LOG.warn("An exception was caught while optimizing the lucene index: " + e.getMessage(), e);
} finally {
index.releaseWriter(writer);
}
}
/**
* Encode an element or attribute qname into a lucene field name using the
* internal ids for namespace and local name.
*
* @param qname
* @return encoded qname
*/
private String encodeQName(QName qname) {
SymbolTable symbols = index.getBrokerPool().getSymbols();
short namespaceId = symbols.getNSSymbol(qname.getNamespaceURI());
short localNameId = symbols.getSymbol(qname.getLocalName());
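// encoded layout: bits 0-7 name type, bits 16-31 namespace symbol id, bits 32-47 local name symbol id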
long nameId = qname.getNameType() | (namespaceId & 0xFFFFL) << 16 | (localNameId & 0xFFFFFFFFL) << 32;
return Long.toHexString(nameId);
}
/**
* Decode the lucene field name into an element or attribute qname.
*
* @param s
* @return the qname
*/
private QName decodeQName(String s) {
SymbolTable symbols = index.getBrokerPool().getSymbols();
try {
long l = Long.parseLong(s, 16);
short namespaceId = (short) ((l >>> 16) & 0xFFFFL);
short localNameId = (short) ((l >>> 32) & 0xFFFFL);
byte type = (byte) (l & 0xFFL);
if (namespaceId < 0 || localNameId < 0)
return null;
String namespaceURI = symbols.getNamespace(namespaceId);
String localName = symbols.getName(localNameId);
if (namespaceURI == null || localName == null)
return null;
QName qname = new QName(localName, namespaceURI, "");
qname.setNameType(type);
return qname;
} catch (NumberFormatException e) {
return null;
}
}
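/**
* Stream listener which receives node events while documents are stored, updated or
* removed, and collects the text content to be indexed or the node ids to be removed.
*/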
private class LuceneStreamListener extends AbstractStreamListener {
@Override
public void startElement(Txn transaction, ElementImpl element, NodePath path) {
if (mode == STORE && config != null) {
if (contentStack != null && !contentStack.isEmpty()) {
for (TextExtractor extractor : contentStack) {
extractor.startElement(element.getQName());
}
}
Iterator<LuceneIndexConfig> configIter = config.getConfig(path);
if (configIter != null) {
if (contentStack == null) contentStack = new Stack<TextExtractor>();
while (configIter.hasNext()) {
LuceneIndexConfig configuration = configIter.next();
if (configuration.match(path)) {
TextExtractor extractor = new DefaultTextExtractor();
extractor.configure(config, configuration);
contentStack.push(extractor);
}
}
}
}
super.startElement(transaction, element, path);
}
@Override
public void endElement(Txn transaction, ElementImpl element, NodePath path) {
if (config != null) {
if (mode == STORE && contentStack != null && !contentStack.isEmpty()) {
for (TextExtractor extractor : contentStack) {
extractor.endElement(element.getQName());
}
}
Iterator<LuceneIndexConfig> configIter = config.getConfig(path);
if (mode != REMOVE_ALL_NODES && configIter != null) {
if (mode == REMOVE_SOME_NODES) {
nodesToRemove.add(element.getNodeId());
} else {
while (configIter.hasNext()) {
LuceneIndexConfig configuration = configIter.next();
if (configuration.match(path)) {
TextExtractor extractor = contentStack.pop();
indexText(element.getNodeId(), element.getQName(),
path, extractor.getIndexConfig(), extractor.getText());
}
}
}
}
}
super.endElement(transaction, element, path);
}
@Override
public void attribute(Txn transaction, AttrImpl attrib, NodePath path) {
path.addComponent(attrib.getQName());
Iterator<LuceneIndexConfig> configIter = null;
if (config != null)
configIter = config.getConfig(path);
if (mode != REMOVE_ALL_NODES && configIter != null) {
if (mode == REMOVE_SOME_NODES) {
nodesToRemove.add(attrib.getNodeId());
} else {
while (configIter.hasNext()) {
LuceneIndexConfig configuration = configIter.next();
if (configuration.match(path)) {
indexText(attrib.getNodeId(), attrib.getQName(), path,
configuration, attrib.getValue());
}
}
}
}
path.removeLastComponent();
super.attribute(transaction, attrib, path);
}
@Override
public void characters(Txn transaction, CharacterDataImpl text, NodePath path) {
if (contentStack != null && !contentStack.isEmpty()) {
for (TextExtractor extractor : contentStack) {
extractor.beforeCharacters();
extractor.characters(text.getXMLString());
}
}
super.characters(transaction, text, path);
}
@Override
public IndexWorker getWorker() {
return LuceneIndexWorker.this;
}
}
/**
* Match class containing the score of a match and a reference to
* the query that generated it.
*/
public class LuceneMatch extends Match {
private float score = 0.0f;
private Query query;
public LuceneMatch(int contextId, NodeId nodeId, Query query) {
super(contextId, nodeId, null);
this.query = query;
}
public LuceneMatch(LuceneMatch copy) {
super(copy);
this.score = copy.score;
this.query = copy.query;
}
@Override
public Match createInstance(int contextId, NodeId nodeId, String matchTerm) {
return null;
}
public Match createInstance(int contextId, NodeId nodeId, Query query) {
return new LuceneMatch(contextId, nodeId, query);
}
@Override
public Match newCopy() {
return new LuceneMatch(this);
}
@Override
public String getIndexId() {
return LuceneIndex.ID;
}
public Query getQuery() {
return query;
}
public float getScore() {
return score;
}
private void setScore(float score) {
this.score = score;
}
@Override
public boolean equals(Object other) {
if(!(other instanceof LuceneMatch))
return false;
LuceneMatch o = (LuceneMatch) other;
return (nodeId == o.nodeId || nodeId.equals(o.nodeId)) &&
query == o.query;
}
@Override
public boolean matchEquals(Match other) {
return equals(other);
}
}
private static class NodeFieldSelector implements FieldSelector {
private static final long serialVersionUID = -4899170629980829109L;
public FieldSelectorResult accept(String fieldName) {
if (FIELD_DOC_ID.equals(fieldName))
return FieldSelectorResult.LOAD;
if (FIELD_NODE_ID.equals(fieldName))
return FieldSelectorResult.LOAD_AND_BREAK;
return FieldSelectorResult.NO_LOAD;
}
}
}