/*
 * eXist Open Source Native XML Database
 * Copyright (C) 2001-07 The eXist Project
 * http://exist-db.org
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * $Id$
 */
package org.exist.indexing.ngram;

import org.apache.log4j.Logger;
import org.exist.collections.Collection;
import org.exist.dom.*;
import org.exist.indexing.AbstractMatchListener;
import org.exist.indexing.AbstractStreamListener;
import org.exist.indexing.Index;
import org.exist.indexing.IndexController;
import org.exist.indexing.IndexWorker;
import org.exist.indexing.MatchListener;
import org.exist.indexing.OrderedValuesIndex;
import org.exist.indexing.QNamedKeysIndex;
import org.exist.indexing.StreamListener;
import org.exist.numbering.NodeId;
import org.exist.stax.EmbeddedXMLStreamReader;
import org.exist.storage.DBBroker;
import org.exist.storage.ElementValue;
import org.exist.storage.IndexSpec;
import org.exist.storage.NodePath;
import org.exist.storage.OccurrenceList;
import org.exist.storage.btree.BTreeCallback;
import org.exist.storage.btree.BTreeException;
import org.exist.storage.btree.IndexQuery;
import org.exist.storage.btree.Value;
import org.exist.storage.index.BFile;
import org.exist.storage.io.VariableByteInput;
import org.exist.storage.io.VariableByteOutputStream;
import org.exist.storage.lock.Lock;
import org.exist.storage.txn.Txn;
import org.exist.util.ByteArray;
import org.exist.util.ByteConversion;
import org.exist.util.DatabaseConfigurationException;
import org.exist.util.FastQSort;
import org.exist.util.LockException;
import org.exist.util.Occurrences;
import org.exist.util.ReadOnlyException;
import org.exist.util.UTF8;
import org.exist.util.XMLString;
import org.exist.util.serializer.AttrList;
import org.exist.xquery.Constants;
import org.exist.xquery.TerminatedException;
import org.exist.xquery.XPathException;
import org.exist.xquery.XQueryContext;
import org.exist.xquery.value.StringValue;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import java.util.TreeMap;

/**
 * Each index entry maps a key (collectionId, qname, ngram) to a list of occurrences, which has the
 * following structure:
 *
 * <pre>[docId : int, nameType: byte, occurrenceCount: int, entrySize: int, [id: NodeId, offset: int, ...]* ]</pre>
 */
public class NGramIndexWorker implements OrderedValuesIndex, QNamedKeysIndex {

    private static final Logger LOG = Logger.getLogger(NGramIndexWorker.class);

    private final static String INDEX_ELEMENT = "ngram";
    private static final String QNAME_ATTR = "qname";

    private static final byte IDX_QNAME = 0;
    private static final byte IDX_GENERIC = 1;

    private int mode = 0;
    private org.exist.indexing.ngram.NGramIndex index;
    private char[] buf = new char[1024];
    private int currentChar = 0;
    private DocumentImpl currentDoc = null;
    private DBBroker broker;
    private IndexController controller;
    private Map ngrams = new TreeMap();
    private VariableByteOutputStream os = new VariableByteOutputStream(7);
    private NGramMatchListener matchListener = null;

    public NGramIndexWorker(DBBroker broker, org.exist.indexing.ngram.NGramIndex index) {
        this.broker = broker;
        this.index = index;
        Arrays.fill(buf, ' ');
    }

    public String getIndexId() {
        return index.ID;
    }

    public String getIndexName() {
        return index.getIndexName();
    }

    public Index getIndex() {
        return index;
    }

    public int getN() {
        return index.getN();
    }

    public Object configure(IndexController controller, NodeList configNodes, Map namespaces)
            throws DatabaseConfigurationException {
        this.controller = controller;
        // We use a map to store the QNames to be indexed
        Map map = new TreeMap();
        Node node;
        for (int i = 0; i < configNodes.getLength(); i++) {
            node = configNodes.item(i);
            if (node.getNodeType() == Node.ELEMENT_NODE && INDEX_ELEMENT.equals(node.getLocalName())) {
                String qname = ((Element) node).getAttribute(QNAME_ATTR);
                if (qname == null || qname.length() == 0)
                    throw new DatabaseConfigurationException("Configuration error: element " +
                            node.getNodeName() + " must have an attribute " + QNAME_ATTR);
                if (LOG.isTraceEnabled())
                    LOG.trace("NGram index defined on " + qname);
                NGramIndexConfig config = new NGramIndexConfig(namespaces, qname);
                map.put(config.getQName(), config);
            }
        }
        return map;
    }
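
    /*
     * For illustration only: configure() above consumes the <ngram> elements found in a
     * collection's index configuration. A minimal, hypothetical example (element names
     * match what configure() parses; the namespace prefix and QNames are made up) would
     * look like:
     *
     *   <index xmlns:ex="http://example.org/ns">
     *       <ngram qname="ex:title"/>
     *       <ngram qname="ex:author"/>
     *   </index>
     *
     * Each <ngram> element must carry a qname attribute; configure() turns the list into
     * a Map of QName -> NGramIndexConfig, which is later retrieved per collection via
     * IndexSpec.getCustomIndexSpec(NGramIndex.ID).
     */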
    public void flush() {
        switch (mode) {
            case StreamListener.STORE :
                saveIndex();
                break;
            case StreamListener.REMOVE_ALL_NODES :
            case StreamListener.REMOVE_SOME_NODES :
                dropIndex(mode);
                break;
        }
    }

    private void saveIndex() {
        if (ngrams.size() == 0)
            return;
        for (Iterator iterator = ngrams.entrySet().iterator(); iterator.hasNext();) {
            Map.Entry entry = (Map.Entry) iterator.next();
            QNameTerm key = (QNameTerm) entry.getKey();
            OccurrenceList occurrences = (OccurrenceList) entry.getValue();
            occurrences.sort();
            os.clear();
            os.writeInt(currentDoc.getDocId());
            os.writeByte(key.qname.getNameType());
            os.writeInt(occurrences.getTermCount());
            // Mark the position
            int lenOffset = os.position();
            // Dummy value: the actual one will be written below
            os.writeFixedInt(0);
            NodeId previous = null;
            for (int m = 0; m < occurrences.getSize(); ) {
                try {
                    previous = occurrences.getNode(m).write(previous, os);
                } catch (IOException e) {
                    LOG.error("IOException while writing n-gram index: " + e.getMessage(), e);
                }
                int freq = occurrences.getOccurrences(m);
                os.writeInt(freq);
                for (int n = 0; n < freq; n++) {
                    os.writeInt(occurrences.getOffset(m + n));
                }
                m += freq;
            }
            // Write the (variable) length of node IDs + frequency + offsets
            os.writeFixedInt(lenOffset, os.position() - lenOffset - 4);
            ByteArray data = os.data();
            if (data.size() == 0)
                continue;
            Lock lock = index.db.getLock();
            try {
                lock.acquire(Lock.WRITE_LOCK);
                NGramQNameKey value = new NGramQNameKey(currentDoc.getCollection().getId(), key.qname,
                        index.getBrokerPool().getSymbols(), key.term);
                index.db.append(value, data);
            } catch (LockException e) {
                LOG.warn("Failed to acquire lock for file " + index.db.getFile().getName(), e);
            } catch (IOException e) {
                LOG.warn("IO error for file " + index.db.getFile().getName(), e);
            } catch (ReadOnlyException e) {
                LOG.warn("Read-only error for file " + index.db.getFile().getName(), e);
            } finally {
                lock.release(Lock.WRITE_LOCK);
                os.clear();
            }
        }
        ngrams.clear();
    }
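
    /*
     * Worked example (illustrative values only): saveIndex() appends one record per
     * (qname, ngram) term for the current document. For document 17, with the 3-gram
     * "abc" occurring twice in node n1 (offsets 0 and 9) and once in node n2 (offset 3),
     * the record written under the key (collectionId, qname, "abc") is:
     *
     *   [17 : docId] [nameType : byte] [3 : occurrenceCount] [len : fixed int]
     *   [n1] [2 : freq] [0] [9]  [n2] [1 : freq] [3]
     *
     * where len is back-patched via writeFixedInt(lenOffset, ...) so that readers can
     * skip the whole block for documents they are not interested in.
     */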
    private void dropIndex(int mode) {
        if (ngrams.size() == 0)
            return;
        for (Iterator iterator = ngrams.entrySet().iterator(); iterator.hasNext();) {
            Map.Entry entry = (Map.Entry) iterator.next();
            QNameTerm key = (QNameTerm) entry.getKey();
            OccurrenceList occurrencesList = (OccurrenceList) entry.getValue();
            occurrencesList.sort();
            os.clear();
            Lock lock = index.db.getLock();
            try {
                lock.acquire(Lock.WRITE_LOCK);
                NGramQNameKey value = new NGramQNameKey(currentDoc.getCollection().getId(), key.qname,
                        index.getBrokerPool().getSymbols(), key.term);
                boolean changed = false;
                os.clear();
                VariableByteInput is = index.db.getAsStream(value);
                if (is == null)
                    continue;
                while (is.available() > 0) {
                    int storedDocId = is.readInt();
                    byte nameType = is.readByte();
                    int occurrences = is.readInt();
                    // Read the (variable) length of node IDs + frequency + offsets
                    int length = is.readFixedInt();
                    if (storedDocId != currentDoc.getDocId()) {
                        // data belong to another document:
                        // copy them over to the existing data
                        os.writeInt(storedDocId);
                        os.writeByte(nameType);
                        os.writeInt(occurrences);
                        os.writeFixedInt(length);
                        is.copyRaw(os, length);
                    } else {
                        // data belong to our document:
                        if (mode == StreamListener.REMOVE_ALL_NODES) {
                            // skip them
                            is.skipBytes(length);
                        } else {
                            // removing some nodes: we need to filter out the node IDs to be
                            // removed and feed the new list with the remaining ones
                            NodeId previous = null;
                            OccurrenceList newOccurrences = new OccurrenceList();
                            for (int m = 0; m < occurrences; m++) {
                                NodeId nodeId = index.getBrokerPool().getNodeFactory().createFromStream(previous, is);
                                previous = nodeId;
                                int freq = is.readInt();
                                // add the node to the new list if it is not
                                // in the list of removed nodes
                                if (!occurrencesList.contains(nodeId)) {
                                    for (int n = 0; n < freq; n++) {
                                        newOccurrences.add(nodeId, is.readInt());
                                    }
                                } else {
                                    is.skip(freq);
                                }
                            }
                            // append the data from the new list
                            if (newOccurrences.getSize() > 0) {
                                // Don't forget this one
                                newOccurrences.sort();
                                os.writeInt(currentDoc.getDocId());
                                os.writeByte(nameType);
                                os.writeInt(newOccurrences.getTermCount());
                                // Mark the position
                                int lenOffset = os.position();
                                // Dummy value: the actual one will be written below
                                os.writeFixedInt(0);
                                previous = null;
                                for (int m = 0; m < newOccurrences.getSize(); ) {
                                    previous = newOccurrences.getNode(m).write(previous, os);
                                    int freq = newOccurrences.getOccurrences(m);
                                    os.writeInt(freq);
                                    for (int n = 0; n < freq; n++) {
                                        os.writeInt(newOccurrences.getOffset(m + n));
                                    }
                                    m += freq;
                                }
                                // Write the (variable) length of node IDs + frequency + offsets
                                os.writeFixedInt(lenOffset, os.position() - lenOffset - 4);
                            }
                        }
                        changed = true;
                    }
                }
                // Store the new data, if relevant
                if (changed) {
                    // Well, nothing left to store: remove the existing data
                    if (os.data().size() == 0) {
                        index.db.remove(value);
                    } else {
                        if (index.db.put(value, os.data()) == BFile.UNKNOWN_ADDRESS) {
                            LOG.error("Could not put index data for token '" + key.term + "' in '" +
                                    index.db.getFile().getName() + "'");
                        }
                    }
                }
            } catch (LockException e) {
                LOG.warn("Failed to acquire lock for file " + index.db.getFile().getName(), e);
            } catch (IOException e) {
                LOG.warn("IO error for file " + index.db.getFile().getName(), e);
            } catch (ReadOnlyException e) {
                LOG.warn("Read-only error for file " + index.db.getFile().getName(), e);
            } finally {
                lock.release(Lock.WRITE_LOCK);
                os.clear();
            }
        }
        ngrams.clear();
    }
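
    /*
     * Illustration (hypothetical values): with REMOVE_SOME_NODES, if the stored entry for
     * a term lists nodes {n1, n2, n3} and only n2 is queued for removal in the in-memory
     * occurrence list, dropIndex() rewrites the entry so that it contains just {n1, n3}.
     * With REMOVE_ALL_NODES the whole block for the current document is skipped and thus
     * dropped, while blocks belonging to other documents are always copied through.
     */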
    public void removeCollection(Collection collection, DBBroker broker) {
        if (LOG.isDebugEnabled())
            LOG.debug("Dropping NGram index for collection " + collection.getURI());
        final Lock lock = index.db.getLock();
        try {
            lock.acquire(Lock.WRITE_LOCK);
            // remove generic index
            Value value = new NGramQNameKey(collection.getId());
            index.db.removeAll(null, new IndexQuery(IndexQuery.TRUNC_RIGHT, value));
        } catch (LockException e) {
            LOG.warn("Failed to acquire lock for '" + index.db.getFile().getName() + "'", e);
        } catch (BTreeException e) {
            LOG.error(e.getMessage(), e);
        } catch (IOException e) {
            LOG.error(e.getMessage(), e);
        } finally {
            lock.release(Lock.WRITE_LOCK);
        }
    }

    public NodeSet search(int contextId, DocumentSet docs, List qnames, String query, String ngram,
                          XQueryContext context, NodeSet contextSet, int axis)
            throws TerminatedException {
        if (qnames == null || qnames.isEmpty())
            qnames = getDefinedIndexes(context.getBroker(), docs);
        final NodeSet result = new ExtArrayNodeSet(docs.getDocumentCount(), 250);
        for (Iterator iter = docs.getCollectionIterator(); iter.hasNext();) {
            final int collectionId = ((org.exist.collections.Collection) iter.next()).getId();
            for (int i = 0; i < qnames.size(); i++) {
                QName qname = (QName) qnames.get(i);
                NGramQNameKey key = new NGramQNameKey(collectionId, qname,
                        index.getBrokerPool().getSymbols(), query);
                final Lock lock = index.db.getLock();
                try {
                    lock.acquire(Lock.READ_LOCK);
                    SearchCallback cb = new SearchCallback(contextId, query, ngram, docs, contextSet,
                            context, result, axis == NodeSet.ANCESTOR);
                    int op = query.length() < getN() ? IndexQuery.TRUNC_RIGHT : IndexQuery.EQ;
                    index.db.query(new IndexQuery(op, key), cb);
                } catch (LockException e) {
                    LOG.warn("Failed to acquire lock for '" + index.db.getFile().getName() + "'", e);
                } catch (IOException e) {
                    LOG.error(e.getMessage() + " in '" + index.db.getFile().getName() + "'", e);
                } catch (BTreeException e) {
                    LOG.error(e.getMessage() + " in '" + index.db.getFile().getName() + "'", e);
                } finally {
                    lock.release(Lock.READ_LOCK);
                }
            }
        }
        return result;
    }
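
    /*
     * Note on the lookup strategy above (values are illustrative): with a 3-gram index,
     * a search term shorter than n, e.g. "de", can never equal a full stored key, so the
     * B-tree is queried with IndexQuery.TRUNC_RIGHT and "de" matches "dea", "deb", ...
     * A term of length >= n is queried with IndexQuery.EQ against its exact ngram key.
     */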
    /**
     * Check the index configurations for all collections in the given DocumentSet and return
     * a list of QNames for which indexes are defined.
     *
     * @param broker
     * @param docs
     * @return the list of indexed QNames
     */
    private List getDefinedIndexes(DBBroker broker, DocumentSet docs) {
        List indexes = new ArrayList(20);
        for (Iterator i = docs.getCollectionIterator(); i.hasNext(); ) {
            Collection collection = (Collection) i.next();
            IndexSpec idxConf = collection.getIndexConfiguration(broker);
            if (idxConf != null) {
                Map config = (Map) idxConf.getCustomIndexSpec(NGramIndex.ID);
                if (config != null) {
                    for (Iterator ci = config.keySet().iterator(); ci.hasNext();) {
                        QName qn = (QName) ci.next();
                        indexes.add(qn);
                    }
                }
            }
        }
        return indexes;
    }

    public boolean checkIndex(DBBroker broker) {
        return true;
    }

    public Occurrences[] scanIndex(XQueryContext context, DocumentSet docs, NodeSet contextSet, Map hints) {
        List qnames = hints == null ? null : (List) hints.get(QNAMES_KEY);
        // Expects a StringValue
        Object start = hints == null ? null : hints.get(START_VALUE);
        // Expects a StringValue
        Object end = hints == null ? null : hints.get(END_VALUE);
        // TODO : does this fallback make sense ? I guess yes.
        if (qnames == null || qnames.isEmpty())
            qnames = getDefinedIndexes(context.getBroker(), docs);
        // TODO : use the IndexWorker.VALUE_COUNT hint, if present, to limit the number of returned entries
        final Lock lock = index.db.getLock();
        final IndexScanCallback cb = new IndexScanCallback(docs, contextSet);
        for (int q = 0; q < qnames.size(); q++) {
            for (Iterator i = docs.getCollectionIterator(); i.hasNext();) {
                final int collectionId = ((Collection) i.next()).getId();
                final IndexQuery query;
                if (start == null) {
                    Value startRef = new NGramQNameKey(collectionId);
                    query = new IndexQuery(IndexQuery.TRUNC_RIGHT, startRef);
                } else if (end == null) {
                    Value startRef = new NGramQNameKey(collectionId, (QName) qnames.get(q),
                            index.getBrokerPool().getSymbols(),
                            ((StringValue) start).getStringValue().toLowerCase());
                    query = new IndexQuery(IndexQuery.TRUNC_RIGHT, startRef);
                } else {
                    Value startRef = new NGramQNameKey(collectionId, (QName) qnames.get(q),
                            index.getBrokerPool().getSymbols(),
                            ((StringValue) start).getStringValue().toLowerCase());
                    Value endRef = new NGramQNameKey(collectionId, (QName) qnames.get(q),
                            index.getBrokerPool().getSymbols(),
                            ((StringValue) end).getStringValue().toLowerCase());
                    query = new IndexQuery(IndexQuery.BW, startRef, endRef);
                }
                try {
                    lock.acquire(Lock.READ_LOCK);
                    index.db.query(query, cb);
                } catch (LockException e) {
                    LOG.warn("Failed to acquire lock for '" + index.db.getFile().getName() + "'", e);
                } catch (IOException e) {
                    LOG.error(e.getMessage(), e);
                } catch (BTreeException e) {
                    LOG.error(e.getMessage(), e);
                } catch (TerminatedException e) {
                    LOG.warn(e.getMessage(), e);
                } finally {
                    lock.release(Lock.READ_LOCK);
                }
            }
        }
        Occurrences[] result = new Occurrences[cb.map.size()];
        return (Occurrences[]) cb.map.values().toArray(result);
    }
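
    /*
     * Illustration of the hints contract above (the keys are the IndexWorker constants
     * already referenced in the code; the values are hypothetical): a caller scanning all
     * ngrams between "abc" and "abz" for a list of indexed QNames would pass:
     *
     *   Map hints = new TreeMap();
     *   hints.put(QNAMES_KEY, qnameList);               // a List of QName
     *   hints.put(START_VALUE, new StringValue("abc"));
     *   hints.put(END_VALUE, new StringValue("abz"));
     *
     * With no START_VALUE the whole collection range is scanned; with no END_VALUE a
     * right-truncated scan starting at START_VALUE is used.
     */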
    // This listener is always the same whatever the document and the mode.
    // It should thus be declared static.
    private StreamListener listener = new NGramStreamListener();

    public StreamListener getListener() {
        return listener;
    }

    public MatchListener getMatchListener(DBBroker broker, NodeProxy proxy) {
        return getMatchListener(broker, proxy, null);
    }

    public MatchListener getMatchListener(DBBroker broker, NodeProxy proxy, NGramMatchCallback callback) {
        boolean needToFilter = false;
        Match nextMatch = proxy.getMatches();
        while (nextMatch != null) {
            if (nextMatch.getIndexId() == org.exist.indexing.ngram.NGramIndex.ID) {
                needToFilter = true;
                break;
            }
            nextMatch = nextMatch.getNextMatch();
        }
        if (!needToFilter)
            return null;
        if (matchListener == null)
            matchListener = new NGramMatchListener(broker, proxy);
        else
            matchListener.reset(broker, proxy);
        matchListener.setMatchCallback(callback);
        return matchListener;
    }

    public StoredNode getReindexRoot(StoredNode node, NodePath path, boolean includeSelf) {
        if (node.getNodeType() == Node.ATTRIBUTE_NODE)
            return null;
        IndexSpec indexConf = node.getDocument().getCollection().getIndexConfiguration(broker);
        if (indexConf != null) {
            Map config = (Map) indexConf.getCustomIndexSpec(NGramIndex.ID);
            if (config == null)
                return null;
            boolean reindexRequired = false;
            int len = node.getNodeType() == Node.ELEMENT_NODE && !includeSelf ?
                    path.length() - 1 : path.length();
            for (int i = 0; i < len; i++) {
                QName qn = path.getComponent(i);
                if (config.get(qn) != null) {
                    reindexRequired = true;
                    break;
                }
            }
            if (reindexRequired) {
                StoredNode topMost = null;
                StoredNode currentNode = node;
                while (currentNode != null) {
                    if (config.get(currentNode.getQName()) != null)
                        topMost = currentNode;
                    if (currentNode.getDocument().getCollection().isTempCollection() &&
                            currentNode.getNodeId().getTreeLevel() == 2)
                        break;
                    //currentNode = (StoredNode) currentNode.getParentNode();
                    currentNode = currentNode.getParentStoredNode();
                }
                return topMost;
            }
        }
        return null;
    }

    public String[] tokenize(CharSequence text) {
        int len = text.length();
        int gramSize = index.getN();
        String[] ngrams = new String[len];
        int next = 0;
        for (int i = 0; i < len; i++) {
            checkBuffer();
            for (int j = 0; j < gramSize && i + j < len; j++) {
                // TODO: case sensitivity should be configurable
                buf[currentChar + j] = Character.toLowerCase(text.charAt(i + j));
            }
            ngrams[next++] = new String(buf, currentChar, gramSize);
            currentChar += gramSize;
        }
        return ngrams;
    }
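
    /*
     * Example of tokenize() output (illustrative, assuming n = 3): tokenize("Hello")
     * yields one gram per start position, lower-cased; grams near the end of the input
     * are padded with spaces left over from the pre-filled buffer:
     *
     *   "hel", "ell", "llo", "lo ", "o  "
     */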
    /**
     * Split the specified string into a sequence of ngrams to be used
     * for querying the index. For example, if we have a 3-gram index, the
     * string 'distinct' will be split into the ngrams 'dis', 'tin' and 'ct'.
     *
     * @param text the character sequence to split
     * @return a sequence of ngrams. The last item might be shorter than n.
     */
    public String[] getDistinctNGrams(CharSequence text) {
        int ngramSize = index.getN();
        int count = text.length() / ngramSize;
        int remainder = text.length() % ngramSize;
        String[] n = new String[remainder > 0 ? count + 1 : count];
        int pos = 0;
        for (int i = 0; i < count; i++) {
            char[] ch = new char[ngramSize];
            for (int j = 0; j < ngramSize; j++) {
                ch[j] = Character.toLowerCase(text.charAt(pos++));
            }
            n[i] = new String(ch);
        }
        if (remainder > 0) {
            char[] ch = new char[remainder];
            for (int i = 0; i < remainder; i++)
                ch[i] = Character.toLowerCase(text.charAt(pos++));
            n[count] = new String(ch);
        }
        return n;
    }

    private void indexText(NodeId nodeId, QName qname, CharSequence text) {
        String[] ngram = tokenize(text);
        int len = ngram.length;
        for (int i = 0; i < len; i++) {
            QNameTerm key = new QNameTerm(qname, ngram[i]);
            OccurrenceList list = (OccurrenceList) ngrams.get(key);
            if (list == null) {
                list = new OccurrenceList();
                list.add(nodeId, i);
                ngrams.put(key, list);
            } else {
                list.add(nodeId, i);
            }
        }
    }

    private void checkBuffer() {
        if (currentChar + index.getN() > buf.length) {
            buf = new char[1024];
            Arrays.fill(buf, ' ');
            currentChar = 0;
        }
    }

    private Map config;
    private Stack contentStack = null;

    public void setDocument(DocumentImpl document) {
        setDocument(document, StreamListener.UNKNOWN);
    }

    public void setMode(int newMode) {
        // wolf: unnecessary call to setDocument?
        // setDocument(currentDoc, newMode);
        mode = newMode;
    }

    public DocumentImpl getDocument() {
        return currentDoc;
    }

    public int getMode() {
        return mode;
    }

    public void setDocument(DocumentImpl document, int newMode) {
        currentDoc = document;
        //config = null;
        contentStack = null;
        IndexSpec indexConf = document.getCollection().getIndexConfiguration(broker);
        if (indexConf != null)
            config = (Map) indexConf.getCustomIndexSpec(org.exist.indexing.ngram.NGramIndex.ID);
        mode = newMode;
    }

    private class NGramStreamListener extends AbstractStreamListener {

        public NGramStreamListener() {
        }

        public void startElement(Txn transaction, ElementImpl element, NodePath path) {
            if (config != null && config.get(element.getQName()) != null) {
                if (contentStack == null)
                    contentStack = new Stack();
                XMLString contentBuf = new XMLString();
                contentStack.push(contentBuf);
            }
            super.startElement(transaction, element, path);
        }

        public void attribute(Txn transaction, AttrImpl attrib, NodePath path) {
            if (config != null && config.get(attrib.getQName()) != null) {
                indexText(attrib.getNodeId(), attrib.getQName(), attrib.getValue());
            }
            super.attribute(transaction, attrib, path);
        }

        public void endElement(Txn transaction, ElementImpl element, NodePath path) {
            if (config != null && config.get(element.getQName()) != null) {
                XMLString content = (XMLString) contentStack.pop();
                indexText(element.getNodeId(), element.getQName(), content);
            }
            super.endElement(transaction, element, path);
        }

        public void characters(Txn transaction, CharacterDataImpl text, NodePath path) {
            if (contentStack != null && !contentStack.isEmpty()) {
                for (int i = 0; i < contentStack.size(); i++) {
                    XMLString next = (XMLString) contentStack.get(i);
                    next.append(text.getXMLString());
                }
            }
            super.characters(transaction, text, path);
        }

        public IndexWorker getWorker() {
            return NGramIndexWorker.this;
        }
    }
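
    /*
     * How the stream listener accumulates text (illustrative): given nested indexed
     * elements <a>xx<b>yy</b></a> where both a and b are configured, startElement pushes
     * one buffer per open indexed element, and characters() appends "yy" to the buffers
     * of both a and b. So b is indexed with "yy" and a with "xxyy", matching the string
     * value of each element.
     */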
    private class NGramMatchListener extends AbstractMatchListener {

        private Match match;
        private Stack offsetStack = null;
        private NGramMatchCallback callback = null;
        private NodeProxy root;

        public NGramMatchListener(DBBroker broker, NodeProxy proxy) {
            reset(broker, proxy);
        }

        protected void setMatchCallback(NGramMatchCallback cb) {
            this.callback = cb;
        }

        protected void reset(DBBroker broker, NodeProxy proxy) {
            this.root = proxy;
            this.match = proxy.getMatches();
            setNextInChain(null);
            /* Check if an index is defined on an ancestor of the current node.
             * If yes, scan the ancestor to get the offset of the first character
             * in the current node. For example, if the indexed node is <a>abc<b>de</b></a>
             * and we query for //a[text:ngram-contains(., 'de')]/b, proxy will be a <b> node, but
             * the offsets of the matches are relative to the start of <a>.
             */
            NodeSet ancestors = null;
            Match nextMatch = this.match;
            while (nextMatch != null) {
                if (proxy.getNodeId().isDescendantOf(nextMatch.getNodeId())) {
                    if (ancestors == null)
                        ancestors = new ExtArrayNodeSet();
                    ancestors.add(new NodeProxy(proxy.getDocument(), nextMatch.getNodeId()));
                }
                nextMatch = nextMatch.getNextMatch();
            }
            if (ancestors != null && !ancestors.isEmpty()) {
                for (Iterator i = ancestors.iterator(); i.hasNext(); ) {
                    NodeProxy p = (NodeProxy) i.next();
                    int startOffset = 0;
                    try {
                        XMLStreamReader reader = broker.getXMLStreamReader(p, false);
                        while (reader.hasNext()) {
                            int ev = reader.next();
                            NodeId nodeId = (NodeId) reader.getProperty(EmbeddedXMLStreamReader.PROPERTY_NODE_ID);
                            if (nodeId.equals(proxy.getNodeId()))
                                break;
                            if (ev == XMLStreamReader.CHARACTERS)
                                startOffset += reader.getText().length();
                        }
                    } catch (IOException e) {
                        LOG.warn("Problem found while serializing XML: " + e.getMessage(), e);
                    } catch (XMLStreamException e) {
                        LOG.warn("Problem found while serializing XML: " + e.getMessage(), e);
                    }
                    if (offsetStack == null)
                        offsetStack = new Stack();
                    offsetStack.push(new NodeOffset(p.getNodeId(), startOffset));
                }
            }
        }

        public void startElement(QName qname, AttrList attribs) throws SAXException {
            Match nextMatch = match;
            // check if there are any matches in the current element;
            // if yes, push a NodeOffset object onto the stack to track
            // the node contents
            while (nextMatch != null) {
                if (nextMatch.getNodeId().equals(getCurrentNode().getNodeId())) {
                    if (offsetStack == null)
                        offsetStack = new Stack();
                    offsetStack.push(new NodeOffset(nextMatch.getNodeId()));
                    break;
                }
                nextMatch = nextMatch.getNextMatch();
            }
            super.startElement(qname, attribs);
        }

        public void endElement(QName qname) throws SAXException {
            Match nextMatch = match;
            // check if we need to pop the stack
            while (nextMatch != null) {
                if (nextMatch.getNodeId().equals(getCurrentNode().getNodeId())) {
                    offsetStack.pop();
                    break;
                }
                nextMatch = nextMatch.getNextMatch();
            }
            super.endElement(qname);
        }
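
        /*
         * Offset bookkeeping used by characters() below (illustrative): a match offset is
         * relative to the start of the indexed element's text, while each SAX characters()
         * event only sees one chunk of that text. NodeOffset.offset tracks how much of the
         * element's text has already been streamed, so a match at element offset 12 that
         * falls into a chunk starting at offset 10 is emitted at chunk-local position 2.
         */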
        public void characters(CharSequence seq) throws SAXException {
            // a list of match offsets to process
            List offsets = null;
            if (offsetStack != null) {
                // walk through the stack to find matches which start in
                // the current string of text
                for (int i = 0; i < offsetStack.size(); i++) {
                    NodeOffset no = (NodeOffset) offsetStack.get(i);
                    int end = no.offset + seq.length();
                    // scan all matches
                    Match next = match;
                    while (next != null) {
                        if (next.getIndexId() == NGramIndex.ID && next.getNodeId().equals(no.nodeId)) {
                            int freq = next.getFrequency();
                            for (int j = 0; j < freq; j++) {
                                Match.Offset offset = next.getOffset(j);
                                if (offset.getOffset() < end &&
                                        offset.getOffset() + offset.getLength() > no.offset) {
                                    // add it to the list to be processed
                                    if (offsets == null) {
                                        offsets = new ArrayList(4);
                                    }
                                    // adjust the offset and add it to the list
                                    int start = offset.getOffset() - no.offset;
                                    int len = offset.getLength();
                                    if (start < 0) {
                                        len = len - Math.abs(start);
                                        start = 0;
                                    }
                                    if (start + len > seq.length())
                                        len = seq.length() - start;
                                    offsets.add(new Match.Offset(start, len));
                                }
                            }
                        }
                        next = next.getNextMatch();
                    }
                    // add the length of the current text to the element content length
                    no.offset = end;
                }
            }
            // now print out the text, marking all matches with a match element
            if (offsets != null) {
                FastQSort.sort(offsets, 0, offsets.size() - 1);
                String s = seq.toString();
                int pos = 0;
                for (int i = 0; i < offsets.size(); i++) {
                    Match.Offset offset = (Match.Offset) offsets.get(i);
                    if (offset.getOffset() > pos) {
                        super.characters(s.substring(pos, pos + (offset.getOffset() - pos)));
                    }
                    if (callback == null) {
                        super.startElement(MATCH_ELEMENT, null);
                        super.characters(s.substring(offset.getOffset(),
                                offset.getOffset() + offset.getLength()));
                        super.endElement(MATCH_ELEMENT);
                    } else {
                        try {
                            callback.match(nextListener,
                                    s.substring(offset.getOffset(), offset.getOffset() + offset.getLength()),
                                    new NodeProxy(getCurrentNode()));
                        } catch (XPathException e) {
                            throw new SAXException("An error occurred while calling the match callback: " +
                                    e.getMessage(), e);
                        }
                    }
                    pos = offset.getOffset() + offset.getLength();
                }
                if (pos < s.length()) {
                    super.characters(s.substring(pos));
                }
            } else
                super.characters(seq);
        }
    }

    private class NodeOffset {

        NodeId nodeId;
        int offset = 0;

        public NodeOffset(NodeId nodeId) {
            this.nodeId = nodeId;
        }

        public NodeOffset(NodeId nodeId, int offset) {
            this.nodeId = nodeId;
            this.offset = offset;
        }
    }

    private class QNameTerm implements Comparable {

        QName qname;
        String term;

        public QNameTerm(QName qname, String term) {
            this.qname = qname;
            this.term = term;
        }

        public int compareTo(Object o) {
            QNameTerm other = (QNameTerm) o;
            int cmp = qname.compareTo(other.qname);
            if (cmp == 0)
                return term.compareTo(other.term);
            else
                return cmp;
        }
    }

    private static class NGramQNameKey extends Value {

        private final static int COLLECTION_ID_OFFSET = 1;
        private final static int NAMETYPE_OFFSET = COLLECTION_ID_OFFSET + Collection.LENGTH_COLLECTION_ID; // 5
        private final static int NAMESPACE_OFFSET = NAMETYPE_OFFSET + ElementValue.LENGTH_TYPE; // 6
        private final static int LOCALNAME_OFFSET = NAMESPACE_OFFSET + SymbolTable.LENGTH_NS_URI; // 8
        private final static int NGRAM_OFFSET = LOCALNAME_OFFSET + SymbolTable.LENGTH_LOCAL_NAME; // 10

        public NGramQNameKey(int collectionId) {
            len = Collection.LENGTH_COLLECTION_ID + 1;
            data = new byte[len];
            data[0] = IDX_QNAME;
            ByteConversion.intToByte(collectionId, data, COLLECTION_ID_OFFSET);
        }

        public NGramQNameKey(int collectionId, QName qname, SymbolTable symbols) {
            len = NGRAM_OFFSET;
            data = new byte[len];
            data[0] = IDX_QNAME;
            ByteConversion.intToByte(collectionId, data, COLLECTION_ID_OFFSET);
            final short namespaceId = symbols.getNSSymbol(qname.getNamespaceURI());
            final short localNameId = symbols.getSymbol(qname.getLocalName());
            data[NAMETYPE_OFFSET] = qname.getNameType();
            ByteConversion.shortToByte(namespaceId, data, NAMESPACE_OFFSET);
            ByteConversion.shortToByte(localNameId, data, LOCALNAME_OFFSET);
        }

        public NGramQNameKey(int collectionId, QName qname, SymbolTable symbols, String ngram) {
            len = UTF8.encoded(ngram) + NGRAM_OFFSET;
            data = new byte[len];
            data[0] = IDX_QNAME;
            ByteConversion.intToByte(collectionId, data, COLLECTION_ID_OFFSET);
            final short namespaceId = symbols.getNSSymbol(qname.getNamespaceURI());
            final short localNameId = symbols.getSymbol(qname.getLocalName());
            data[NAMETYPE_OFFSET] = qname.getNameType();
            ByteConversion.shortToByte(namespaceId, data, NAMESPACE_OFFSET);
            ByteConversion.shortToByte(localNameId, data, LOCALNAME_OFFSET);
            UTF8.encode(ngram, data, NGRAM_OFFSET);
        }
    }
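
    /*
     * Byte layout of the full NGramQNameKey (derived from the offsets above):
     *
     *   [0]     IDX_QNAME marker (1 byte)
     *   [1-4]   collection id (int)
     *   [5]     name type (byte)
     *   [6-7]   namespace symbol id (short)
     *   [8-9]   local-name symbol id (short)
     *   [10-]   the ngram itself, UTF-8 encoded
     *
     * Keys sort first by collection, then by qname, then by ngram, which is what makes
     * the right-truncated range queries used in search() and scanIndex() possible.
     */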
    private final class SearchCallback implements BTreeCallback {

        private int contextId;
        private String query;
        private String ngram;
        private DocumentSet docs;
        private NodeSet contextSet;
        private XQueryContext context;
        private NodeSet resultSet;
        private boolean returnAncestor;

        public SearchCallback(int contextId, String query, String ngram, DocumentSet docs,
                              NodeSet contextSet, XQueryContext context, NodeSet result,
                              boolean returnAncestor) {
            this.contextId = contextId;
            this.query = query;
            this.ngram = ngram;
            this.docs = docs;
            this.context = context;
            this.contextSet = contextSet;
            this.resultSet = result;
            this.returnAncestor = returnAncestor;
        }

        public boolean indexInfo(Value key, long pointer) throws TerminatedException {
            String ngram;
            try {
                ngram = new String(key.getData(), NGramQNameKey.NGRAM_OFFSET,
                        key.getLength() - NGramQNameKey.NGRAM_OFFSET, "UTF-8");
            } catch (UnsupportedEncodingException e) {
                LOG.error(e.getMessage(), e);
                return true;
            }
            VariableByteInput is;
            try {
                is = index.db.getAsStream(pointer);
                // Does the token already have data in the index?
                if (is == null)
                    return true;
                while (is.available() > 0) {
                    int storedDocId = is.readInt();
                    is.readByte();
                    int occurrences = is.readInt();
                    // Read the (variable) length of node IDs + frequency + offsets
                    int length = is.readFixedInt();
                    DocumentImpl storedDocument = docs.getDoc(storedDocId);
                    // Skip the block if the document is not in the document set
                    if (storedDocument == null) {
                        is.skipBytes(length);
                        continue;
                    }
                    NodeId previous = null;
                    for (int m = 0; m < occurrences; m++) {
                        NodeId nodeId = index.getBrokerPool().getNodeFactory().createFromStream(previous, is);
                        previous = nodeId;
                        int freq = is.readInt();
                        NodeProxy storedNode = new NodeProxy(storedDocument, nodeId);
                        // if a context set is specified, we can directly check if the
                        // matching node is a descendant of one of the nodes
                        // in the context set.
                        if (contextSet != null) {
                            int sizeHint = contextSet.getSizeHint(storedDocument);
                            if (returnAncestor) {
                                NodeProxy parentNode = contextSet.parentWithChild(storedNode, false, true,
                                        NodeProxy.UNKNOWN_NODE_LEVEL);
                                if (parentNode != null) {
                                    readMatches(ngram, is, nodeId, freq, parentNode);
                                    resultSet.add(parentNode, sizeHint);
                                } else
                                    is.skip(freq);
                            } else {
                                readMatches(ngram, is, nodeId, freq, storedNode);
                                resultSet.add(storedNode, sizeHint);
                            }
                        } else {
                            // otherwise, we add all text nodes without further checks
                            readMatches(ngram, is, nodeId, freq, storedNode);
                            resultSet.add(storedNode, Constants.NO_SIZE_HINT);
                        }
                        context.proceed();
                    }
                }
                return false;
            } catch (IOException e) {
                LOG.error(e.getMessage(), e);
                return true;
            }
        }

        private void readMatches(String current, VariableByteInput is, NodeId nodeId, int freq,
                                 NodeProxy parentNode) throws IOException {
            int diff = 0;
            if (current.length() > ngram.length())
                diff = current.indexOf(ngram);
            Match match = new NGramMatch(contextId, nodeId, ngram, freq);
            for (int n = 0; n < freq; n++) {
                int offset = is.readInt();
                if (diff > 0)
                    offset += diff;
                match.addOffset(offset, ngram.length());
            }
            parentNode.addMatch(match);
        }
    }
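
    /*
     * Note on readMatches() above (illustrative): the stored key may be longer than the
     * ngram to highlight. The stored offsets point at the start of the stored key, so if
     * the requested ngram occurs at position diff inside it (e.g. "de" at position 1 of
     * "xde"), every offset is shifted by diff before being recorded, and the match length
     * is the ngram's own length.
     */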
    private final class IndexScanCallback implements BTreeCallback {

        private DocumentSet docs;
        private NodeSet contextSet;
        private Map map = new TreeMap();

        IndexScanCallback(DocumentSet docs) {
            this.docs = docs;
        }

        IndexScanCallback(DocumentSet docs, NodeSet contextSet) {
            this.docs = docs;
            this.contextSet = contextSet;
        }

        /* (non-Javadoc)
         * @see org.dbxml.core.filer.BTreeCallback#indexInfo(org.dbxml.core.data.Value, long)
         */
        public boolean indexInfo(Value key, long pointer) throws TerminatedException {
            String term;
            try {
                term = new String(key.getData(), NGramQNameKey.NGRAM_OFFSET,
                        key.getLength() - NGramQNameKey.NGRAM_OFFSET, "UTF-8");
            } catch (UnsupportedEncodingException e) {
                LOG.error(e.getMessage(), e);
                return true;
            }
            VariableByteInput is;
            try {
                is = index.db.getAsStream(pointer);
            } catch (IOException e) {
                LOG.error(e.getMessage(), e);
                return true;
            }
            try {
                while (is.available() > 0) {
                    boolean docAdded = false;
                    int storedDocId = is.readInt();
                    byte nameType = is.readByte();
                    int occurrences = is.readInt();
                    // Read the (variable) length of node IDs + frequency + offsets
                    int length = is.readFixedInt();
                    DocumentImpl storedDocument = docs.getDoc(storedDocId);
                    // Skip the block if the document is not in the document set
                    if (storedDocument == null) {
                        is.skipBytes(length);
                        continue;
                    }
                    NodeId previous = null;
                    for (int m = 0; m < occurrences; m++) {
                        NodeId nodeId = index.getBrokerPool().getNodeFactory().createFromStream(previous, is);
                        previous = nodeId;
                        int freq = is.readInt();
                        is.skip(freq);
                        boolean include = true;
                        // TODO : revisit
                        if (contextSet != null) {
                            NodeProxy parentNode = contextSet.parentWithChild(storedDocument, nodeId, false, true);
                            include = (parentNode != null);
                        }
                        if (include) {
                            Occurrences oc = (Occurrences) map.get(term);
                            if (oc == null) {
                                oc = new Occurrences(term);
                                map.put(term, oc);
                            }
                            if (!docAdded) {
                                oc.addDocument(storedDocument);
                                docAdded = true;
                            }
                            oc.addOccurrences(freq);
                        }
                    }
                }
            } catch (IOException e) {
                LOG.error(e.getMessage() + " in '" + index.db.getFile().getName() + "'", e);
            }
            return true;
        }
    }

    public class NGramMatch extends Match {

        public NGramMatch(int contextId, NodeId nodeId, String matchTerm) {
            super(contextId, nodeId, matchTerm);
        }

        public NGramMatch(int contextId, NodeId nodeId, String matchTerm, int frequency) {
            super(contextId, nodeId, matchTerm, frequency);
        }

        public NGramMatch(Match match) {
            super(match);
        }

        public Match createInstance(int contextId, NodeId nodeId, String matchTerm) {
            return new NGramMatch(contextId, nodeId, matchTerm);
        }

        public Match newCopy() {
            return new NGramMatch(this);
        }

        public String getIndexId() {
            return org.exist.indexing.ngram.NGramIndex.ID;
        }
    }
}