/*
 *  eXist Open Source Native XML Database
 *  Copyright (C) 2001-07 The eXist Project
 *  http://exist-db.org
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public License
 *  as published by the Free Software Foundation; either version 2
 *  of the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *  $Id$
 */
package org.exist.storage;

import org.exist.EXistException;
import org.exist.collections.Collection;
import org.exist.dom.*;
import org.exist.fulltext.ElementContent;
import org.exist.fulltext.FTMatch;
import org.exist.numbering.NodeId;
import org.exist.security.PermissionDeniedException;
import org.exist.storage.analysis.TextToken;
import org.exist.storage.btree.BTreeCallback;
import org.exist.storage.btree.BTreeException;
import org.exist.storage.btree.DBException;
import org.exist.storage.btree.IndexQuery;
import org.exist.storage.btree.Value;
import org.exist.storage.index.BFile;
import org.exist.storage.io.VariableByteArrayInput;
import org.exist.storage.io.VariableByteInput;
import org.exist.storage.io.VariableByteOutputStream;
import org.exist.storage.lock.Lock;
import org.exist.util.ByteArray;
import org.exist.util.ByteConversion;
import org.exist.util.Configuration;
import org.exist.util.LockException;
import org.exist.util.Occurrences;
import org.exist.util.ProgressIndicator;
import org.exist.util.ReadOnlyException;
import org.exist.util.UTF8;
import org.exist.util.XMLString;
import org.exist.xquery.Constants;
import org.exist.xquery.TerminatedException;
import org.exist.xquery.XQueryContext;
import org.w3c.dom.Node;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;

/**
 * This class is responsible for full-text indexing. Text nodes are handed over
 * to this class to be full-text indexed. Method storeText() is called by the
 * broker whenever it finds a text node. Method getNodesContaining() is used by
 * the XQuery engine to process queries where a full-text operator is involved.
 * Token occurrences are buffered in the in-memory {@link InvertedIndex} and
 * flushed to the <code>dbTokens</code> index file (words.dbx), which maps each
 * token, per collection, to the documents, node ids, frequencies and offsets
 * at which it occurs.
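 *
 * Sketch of one index entry as written by {@link InvertedIndex#flush()} and
 * read back by the search callbacks; field widths follow the read/write calls
 * in this file:
 * <pre>
 * docId     : int        the document id
 * section   : byte       TEXT_SECTION | ATTRIBUTE_SECTION | QNAME_SECTION
 * gidsCount : int        number of node ids that follow
 * length    : fixed int  byte length of the node id/frequency/offset block
 * gidsCount * { nodeId (delta-coded against the previous one), freq : int, freq * offset : int }
 * </pre>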
 *
 * TODO: store node type (attribute or text) with each entry
 *
 * @author Wolfgang Meier
 */
public class NativeTextEngine extends TextSearchEngine implements ContentLoadingObserver {

    public static final String FILE_NAME = "words.dbx";
    public static final String FILE_KEY_IN_CONFIG = "db-connection.words";

    public static final double DEFAULT_WORD_CACHE_GROWTH = 1.4;
    public static final double DEFAULT_WORD_KEY_THRESHOLD = 0.01;
    public static final double DEFAULT_WORD_VALUE_THRESHOLD = 0.015;

    public final static byte TEXT_SECTION = 0;
    public final static byte ATTRIBUTE_SECTION = 1;
    public final static byte QNAME_SECTION = 2;

    private final static byte IDX_GENERIC = 0;
    private final static byte IDX_QNAME = 1;

    //Indexing hints : discrete values, not bit flags
    public final static int ATTRIBUTE_BY_QNAME = 0;
    public final static int ATTRIBUTE_NOT_BY_QNAME = 1;
    public final static int TOKENIZE = 0;
    public final static int DO_NOT_TOKENIZE = 1;
    public final static int TEXT_BY_QNAME = 2;
    public final static int FOURTH_OPTION = 3;

    public final static int LENGTH_NODE_TYPE = 1; //sizeof byte
    public final static int LENGTH_NODE_IDS_FREQ_OFFSETS = 4; //sizeof int

    public final static int OFFSET_NODE_TYPE = 0;
    public final static int OFFSET_ELEMENT_CHILDREN_COUNT = OFFSET_NODE_TYPE + LENGTH_NODE_TYPE; //1
    public final static int OFFSET_ATTRIBUTE_DLN_LENGTH = OFFSET_NODE_TYPE + LENGTH_NODE_TYPE; //1
    public final static int OFFSET_TEXT_DLN_LENGTH = OFFSET_NODE_TYPE + LENGTH_NODE_TYPE; //1
    public final static int OFFSET_DLN = OFFSET_TEXT_DLN_LENGTH + NodeId.LENGTH_NODE_ID_UNITS; //3

    /** Length limit for the tokens */
    public final static int MAX_TOKEN_LENGTH = 2048;

    /** The datastore for this token index */
    protected BFile dbTokens;

    protected InvertedIndex invertedIndex;

    /** The current document */
    private DocumentImpl doc;

    /** Working output stream; must be cleared before every use */
    private VariableByteOutputStream os = new VariableByteOutputStream(7);

    public NativeTextEngine(DBBroker broker, BFile dbFile, Configuration config) throws DBException {
        super(broker, config);
        this.invertedIndex = new InvertedIndex();
        this.dbTokens = dbFile;
    }

    public String getFileName() {
        return FILE_NAME;
    }

    public String getConfigKeyForFile() {
        return FILE_KEY_IN_CONFIG;
    }

    public NativeTextEngine getInstance() {
        return this;
    }

    /**
     * Checks if the given string contains wildcard characters and should thus
     * be treated as a regular expression.
     *
     * @param str The string
     */
    public final static boolean containsWildcards(String str) {
        if (str == null || str.length() == 0)
            return false;
        for (int i = 0; i < str.length(); i++)
            switch (str.charAt(i)) {
                case '*' :
                case '?' :
                case '\\' :
                case '[' :
                case ']' :
                    return true;
            }
        return false;
    }

    public final static boolean startsWithWildcard(String str) {
        if (str == null || str.length() == 0)
            return false;
        switch (str.charAt(0)) {
            case '*' :
            case '?' :
            case '\\' :
            case '[' :
                return true;
        }
        return false;
    }

    public int getTrackMatches() {
        return trackMatches;
    }

    public void setTrackMatches(int flags) {
        trackMatches = flags;
    }

    public void setDocument(DocumentImpl document) {
        if (this.doc != null && this.doc.getDocId() != document.getDocId())
            flush();
        this.doc = document;
        invertedIndex.setDocument(doc);
    }

    /**
     * Indexes the tokens contained in an attribute.
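     * The <code>indexingHint</code> selects the target index section :
     * <code>ATTRIBUTE_BY_QNAME</code> adds the tokens to the QName section,
     * <code>ATTRIBUTE_NOT_BY_QNAME</code> to the generic attribute section.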
     *
     * @param node The attribute to be indexed
     */
    //TODO : unify functionalities with storeText -pb
    public void storeAttribute(AttrImpl node, NodePath currentPath, int indexingHint,
            FulltextIndexSpec indexSpec, boolean remove) {
        //The indexing hints are discrete values, not bit flags : compare by equality
        if (indexingHint == ATTRIBUTE_BY_QNAME || indexingHint == ATTRIBUTE_NOT_BY_QNAME) {
            //TODO : case conversion should be handled by the tokenizer -pb
            tokenizer.setText(node.getValue().toLowerCase());
            TextToken token;
            while (null != (token = tokenizer.nextToken())) {
                if (token.length() > MAX_TOKEN_LENGTH) {
                    LOG.warn("Token length exceeded " + MAX_TOKEN_LENGTH + ": " +
                            token.getText().substring(0, 20) + "...");
                    continue;
                }
                if (stoplist.contains(token)) {
                    continue;
                }
                //TODO : the tokenizer should strip unwanted token types itself -pb
                if (!token.isAlpha() && indexSpec != null && !indexSpec.getIncludeAlphaNum()) {
                    continue;
                }
                if (indexingHint == ATTRIBUTE_BY_QNAME)
                    invertedIndex.addAttribute(token, node, remove);
                else
                    invertedIndex.addAttribute(token, node.getNodeId(), remove);
            }
        }
    }

    //TODO : unify with above choosing one of these 2 strategies :
    //1) compute the indexing strategy from the broker (introduce some kind of dependency)
    //2) read the configuration from the indexer (possible performance loss)
    public void storeAttribute(AttrImpl node, NodePath currentPath, int indexingHint,
            RangeIndexSpec idx, boolean remove) {
    }

    /**
     * Indexes the tokens contained in a text node.
     *
     * @param indexSpec The index configuration
     * @param node The text node to be indexed
     * @param indexingHint
     *            <code>DO_NOT_TOKENIZE</code> to index the given text as a single token,
     *            <code>TOKENIZE</code> to tokenize it before indexing
     */
    //TODO : use an indexSpec member in order to get rid of the indexingHint parameter
    public void storeText(CharacterDataImpl node, int indexingHint, FulltextIndexSpec indexSpec,
            boolean remove) {
        if (indexingHint == TOKENIZE || indexingHint == DO_NOT_TOKENIZE) {
            //TODO : case conversion should be handled by the tokenizer -pb
            final XMLString t = node.getXMLString().transformToLower();
            TextToken token;
            if (indexingHint == DO_NOT_TOKENIZE) {
                token = new TextToken(TextToken.ALPHA, t, 0, t.length());
                invertedIndex.addText(token, node.getNodeId(), remove);
            } else if (indexingHint == TOKENIZE) {
                tokenizer.setText(t);
                while (null != (token = tokenizer.nextToken())) {
                    if (token.length() > MAX_TOKEN_LENGTH) {
                        LOG.warn("Token length exceeded " + MAX_TOKEN_LENGTH + ": " +
                                token.getText().substring(0, 20) + "...");
                        continue;
                    }
                    if (stoplist.contains(token)) {
                        continue;
                    }
                    if (indexSpec != null) {
                        //TODO : the tokenizer should strip unwanted token types itself -pb
                        if (!indexSpec.getIncludeAlphaNum() && !token.isAlpha()) {
                            continue;
                        }
                    }
                    invertedIndex.addText(token, node.getNodeId(), remove);
                }
            }
        }
    }

    public void storeText(StoredNode parent, ElementContent text, int indexingHint,
            FulltextIndexSpec indexSpec, boolean remove) {
        //TODO : case conversion should be handled by the tokenizer -pb
        TextToken token;
        ElementContent.TextSpan span = text.getFirst();
        XMLString data = null;
        int currentOffset = 0;
        while (span != null) {
            if (data == null)
                data = span.getContent().transformToLower();
            else {
                currentOffset = data.length();
                data.append(span.getContent().transformToLower());
            }
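            //Spans are accumulated into a single buffer so that token offsets are
            //relative to the element's whole text content; currentOffset positions
            //the tokenizer at the start of the newly appended span, so content of
            //earlier spans is not tokenized twice.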
            tokenizer.setText(data, currentOffset);
            while (null != (token = tokenizer.nextToken())) {
                if (token.length() > MAX_TOKEN_LENGTH) {
                    LOG.warn("Token length exceeded " + MAX_TOKEN_LENGTH + ": " +
                            token.getText().substring(0, 20) + "...");
                    continue;
                }
                if (stoplist.contains(token)) {
                    continue;
                }
                if (indexSpec != null) {
                    //TODO : the tokenizer should strip unwanted token types itself -pb
                    if (!indexSpec.getIncludeAlphaNum() && !token.isAlpha()) {
                        continue;
                    }
                }
                if (indexingHint == TEXT_BY_QNAME)
                    invertedIndex.addText(token, (ElementImpl) parent, remove);
                else
                    invertedIndex.addText(token, parent.getNodeId(), remove);
            }
            span = span.getNext();
        }
    }

    public void storeText(TextImpl node, NodePath currentPath, int indexingHint) {
        // TODO Auto-generated method stub
    }

    public void removeNode(StoredNode node, NodePath currentPath, String content) {
        // TODO Auto-generated method stub
    }

    /* (non-Javadoc)
     * @see org.exist.storage.ContentLoadingObserver#sync()
     */
    public void sync() {
        final Lock lock = dbTokens.getLock();
        try {
            lock.acquire(Lock.WRITE_LOCK);
            dbTokens.flush();
        } catch (LockException e) {
            LOG.warn("Failed to acquire lock for '" + dbTokens.getFile().getName() + "'", e);
            //TODO : throw an exception ? -pb
        } catch (DBException e) {
            LOG.error(e.getMessage(), e);
            //TODO : throw an exception ? -pb
        } finally {
            lock.release(Lock.WRITE_LOCK);
        }
    }

    /* (non-Javadoc)
     * @see org.exist.storage.ContentLoadingObserver#flush()
     */
    public void flush() {
        invertedIndex.flush();
    }

    public void remove() {
        invertedIndex.remove();
    }

    /* Drop all index entries for the given collection.
     * @see org.exist.storage.ContentLoadingObserver#dropIndex(org.exist.collections.Collection)
     */
    public void dropIndex(Collection collection) {
        final Lock lock = dbTokens.getLock();
        try {
            lock.acquire(Lock.WRITE_LOCK);
            // remove generic index
            Value value = new WordRef(collection.getId());
            dbTokens.removeAll(null, new IndexQuery(IndexQuery.TRUNC_RIGHT, value));
            // remove QName index
            value = new QNameWordRef(collection.getId());
            dbTokens.removeAll(null, new IndexQuery(IndexQuery.TRUNC_RIGHT, value));
        } catch (LockException e) {
            LOG.warn("Failed to acquire lock for '" + dbTokens.getFile().getName() + "'", e);
        } catch (BTreeException e) {
            LOG.error(e.getMessage(), e);
        } catch (IOException e) {
            LOG.error(e.getMessage(), e);
        } finally {
            lock.release(Lock.WRITE_LOCK);
        }
    }

    /* Drop all index entries for the given document.
     * @see org.exist.storage.ContentLoadingObserver#dropIndex(org.exist.dom.DocumentImpl)
     */
    public void dropIndex(DocumentImpl document) {
        invertedIndex.dropIndex(document);
    }

    public NodeSet getNodesContaining(XQueryContext context, DocumentSet docs, NodeSet contextSet,
            int axis, QName qname, String expr, int type, boolean matchAll)
            throws TerminatedException {
        if (type == DBBroker.MATCH_EXACT && containsWildcards(expr)) {
            //TODO : log this fallback ? -pb
            type = DBBroker.MATCH_WILDCARDS;
        }
        switch (type) {
            case DBBroker.MATCH_EXACT :
                return getNodesExact(context, docs, contextSet, axis, qname, expr);
            //TODO : stricter control -pb
            default :
                return getNodesRegexp(context, docs, contextSet, axis, qname, expr, type, matchAll);
        }
    }

    /**
     * Get all nodes whose content exactly matches the given expression.
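     *
     * A hypothetical invocation (receiver and argument values are illustrative) :
     * <pre>
     * NodeSet hits = engine.getNodesExact(context, docs, contextSet,
     *     NodeSet.ANCESTOR, null, "exist");
     * </pre>
     * Passing a <code>null</code> qname searches the generic text and attribute
     * sections of the index; a non-null qname restricts the lookup to the QName
     * section.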
     */
    public NodeSet getNodesExact(XQueryContext context, DocumentSet docs, NodeSet contextSet,
            int axis, QName qname, String expr) throws TerminatedException {
        //Return early
        if (expr == null)
            return null;
        //TODO : filter the expression *before* -pb
        if (stoplist.contains(expr))
            return null;
        //TODO : case conversion should be handled by the tokenizer -pb
        expr = expr.toLowerCase();
        //TODO : use an indexSpec member in order to get rid of this or do the job *before* -pb
        String token;
        if (stem)
            token = stemmer.stem(expr);
        else
            token = expr;
        final NodeSet result = new NewArrayNodeSet(docs.getDocumentCount(), 250);
        for (Iterator iter = docs.getCollectionIterator(); iter.hasNext();) {
            final int collectionId = ((Collection) iter.next()).getId();
            Value key;
            if (qname == null)
                key = new WordRef(collectionId, token);
            else {
                key = new QNameWordRef(collectionId, qname, token,
                        broker.getBrokerPool().getSymbols());
            }
            final Lock lock = dbTokens.getLock();
            try {
                lock.acquire(Lock.READ_LOCK);
                VariableByteInput is = dbTokens.getAsStream(key);
                //Does the token already have data in the index ?
                if (is == null)
                    continue;
                while (is.available() > 0) {
                    int storedDocId = is.readInt();
                    int storedSection = is.readByte();
                    int gidsCount = is.readInt();
                    //Read (variable) length of node IDs + frequency + offsets
                    int length = is.readFixedInt();
                    DocumentImpl storedDocument = docs.getDoc(storedDocId);
                    //Skip the entry if the document is not in the input set
                    if (storedDocument == null) {
                        is.skipBytes(length);
                        continue;
                    }
                    //Process the nodes
                    NodeId previous = null;
                    for (int m = 0; m < gidsCount; m++) {
                        NodeId nodeId = broker.getBrokerPool().getNodeFactory()
                                .createFromStream(previous, is);
                        previous = nodeId;
                        int freq = is.readInt();
                        NodeProxy storedNode;
                        switch (storedSection) {
                            case ATTRIBUTE_SECTION :
                                storedNode = new NodeProxy(storedDocument, nodeId, Node.ATTRIBUTE_NODE);
                                break;
                            case TEXT_SECTION :
                                storedNode = new NodeProxy(storedDocument, nodeId, Node.TEXT_NODE);
                                break;
                            case QNAME_SECTION :
                                storedNode = new NodeProxy(storedDocument, nodeId,
                                        qname.getNameType() == ElementValue.ATTRIBUTE ?
                                        Node.ATTRIBUTE_NODE : Node.ELEMENT_NODE);
                                break;
                            default :
                                throw new IllegalArgumentException("Invalid section type in '" +
                                        dbTokens.getFile().getName() + "'");
                        }
                        // if a context set is specified, we can directly check if the
                        // matching text node is a descendant of one of the nodes
                        // in the context set.
                        if (contextSet != null) {
                            NodeProxy parent;
                            switch (storedSection) {
                                case ATTRIBUTE_SECTION :
                                    if (contextSet instanceof VirtualNodeSet) {
                                        parent = contextSet.parentWithChild(storedNode, false, true,
                                                NodeProxy.UNKNOWN_NODE_LEVEL);
                                        if (parent != null && !parent.getNodeId().equals(storedNode.getNodeId()))
                                            parent = null;
                                    } else
                                        parent = contextSet.get(storedNode);
                                    break;
                                case QNAME_SECTION:
                                case TEXT_SECTION :
                                    parent = contextSet.parentWithChild(storedNode, false, true,
                                            NodeProxy.UNKNOWN_NODE_LEVEL);
                                    break;
                                default :
                                    throw new IllegalArgumentException("Invalid section type in '" +
                                            dbTokens.getFile().getName() + "'");
                            }
                            if (parent != null) {
                                Match match = new FTMatch(-1, nodeId, token, freq);
                                readOccurrences(freq, is, match, token.length());
                                if (axis == NodeSet.ANCESTOR) {
                                    parent.addMatch(match);
                                    int sizeHint = contextSet.getSizeHint(storedDocument);
                                    result.add(parent, sizeHint);
                                } else {
                                    storedNode.addMatch(match);
                                    int sizeHint = contextSet.getSizeHint(storedDocument);
                                    result.add(storedNode, sizeHint);
                                }
                            } else {
                                is.skip(freq);
                            }
                        // otherwise, we add all text nodes without further checks
                        } else {
                            Match match = new FTMatch(-1, nodeId, token, freq);
                            readOccurrences(freq, is, match, token.length());
                            storedNode.addMatch(match);
                            result.add(storedNode, Constants.NO_SIZE_HINT);
                        }
                        context.proceed();
                    }
                }
            } catch (LockException e) {
                LOG.warn("Failed to acquire lock for '" + dbTokens.getFile().getName() + "'", e);
            } catch (IOException e) {
                LOG.error(e.getMessage() + " in '" + dbTokens.getFile().getName() + "'", e);
                //TODO : return ?
            } finally {
                lock.release(Lock.READ_LOCK);
            }
        }
        return result;
    }

    private NodeSet getNodesRegexp(XQueryContext context, DocumentSet docs, NodeSet contextSet,
            int axis, QName qname, String expr, int type, boolean matchAll)
            throws TerminatedException {
        //Return early
        if (expr == null)
            return null;
        if (stoplist.contains(expr))
            return null;
        //TODO : case conversion should be handled by the tokenizer -pb
        expr = expr.toLowerCase();
        // if the regexp starts with a character sequence, we restrict the index scan to entries
        // starting with the same sequence. Otherwise, we have to scan the whole index.
        CharSequence start = "";
        if (matchAll) {
            StringBuilder buf = new StringBuilder();
            for (int i = 0; i < expr.length(); i++) {
                if (Character.isLetterOrDigit(expr.charAt(i)))
                    buf.append(expr.charAt(i));
                else
                    break;
            }
            start = buf;
        }
        try {
            TermMatcher comparator = new RegexMatcher(expr, type,
                    Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE, matchAll);
            return getNodes(context, docs, contextSet, axis, qname, comparator, start);
        } catch (EXistException e) {
            return null;
        }
    }

    /* Return all nodes for which the matcher matches.
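     * Used by the regular expression and wildcard searches (see getNodesRegexp()).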
     * @see org.exist.storage.TextSearchEngine#getNodes(org.exist.xquery.XQueryContext,
     *      org.exist.dom.DocumentSet, org.exist.dom.NodeSet, org.exist.storage.TermMatcher,
     *      java.lang.CharSequence)
     */
    public NodeSet getNodes(XQueryContext context, DocumentSet docs, NodeSet contextSet,
            int axis, QName qname, TermMatcher matcher, CharSequence startTerm)
            throws TerminatedException {
        if (LOG.isTraceEnabled() && qname != null)
            LOG.trace("Index lookup by QName: " + qname);
        final NodeSet result = new NewArrayNodeSet();
        final SearchCallback cb = new SearchCallback(context, matcher, result, contextSet,
                axis, docs, qname);
        final Lock lock = dbTokens.getLock();
        for (Iterator iter = docs.getCollectionIterator(); iter.hasNext();) {
            final int collectionId = ((Collection) iter.next()).getId();
            //Compute a key for the token
            Value value;
            if (startTerm != null && startTerm.length() > 0) {
                //TODO : case conversion should be handled by the tokenizer -pb
                if (qname == null) {
                    value = new WordRef(collectionId, startTerm.toString().toLowerCase());
                } else {
                    value = new QNameWordRef(collectionId, qname,
                            startTerm.toString().toLowerCase(), broker.getBrokerPool().getSymbols());
                }
            } else {
                if (qname == null) {
                    value = new WordRef(collectionId);
                } else {
                    value = new QNameWordRef(collectionId, qname, broker.getBrokerPool().getSymbols());
                }
            }
            IndexQuery query = new IndexQuery(IndexQuery.TRUNC_RIGHT, value);
            try {
                lock.acquire(Lock.READ_LOCK);
                dbTokens.query(query, cb);
            } catch (LockException e) {
                LOG.warn("Failed to acquire lock for '" + dbTokens.getFile().getName() + "'", e);
            } catch (BTreeException e) {
                LOG.error(e.getMessage(), e);
                //TODO return null ? rethrow ? -pb
            } catch (IOException e) {
                LOG.error(e.getMessage(), e);
                //TODO return null ? rethrow ? -pb
            } finally {
                lock.release(Lock.READ_LOCK);
            }
        }
        return result;
    }

    public String[] getIndexTerms(DocumentSet docs, TermMatcher matcher) {
        final IndexCallback cb = new IndexCallback(null, matcher);
        final Lock lock = dbTokens.getLock();
        for (Iterator iter = docs.getCollectionIterator(); iter.hasNext();) {
            final int collectionId = ((Collection) iter.next()).getId();
            //Compute a key for the token
            Value value = new WordRef(collectionId);
            IndexQuery query = new IndexQuery(IndexQuery.TRUNC_RIGHT, value);
            try {
                lock.acquire(Lock.READ_LOCK);
                dbTokens.query(query, cb);
            } catch (LockException e) {
                LOG.warn("Failed to acquire lock for '" + dbTokens.getFile().getName() + "'", e);
            } catch (IOException e) {
                LOG.error(e.getMessage(), e);
            } catch (BTreeException e) {
                LOG.error(e.getMessage(), e);
            } catch (TerminatedException e) {
                LOG.warn(e.getMessage(), e);
            } finally {
                lock.release(Lock.READ_LOCK);
            }
        }
        return cb.getMatches();
    }

    public Occurrences[] scanIndexTerms(DocumentSet docs, NodeSet contextSet, String start, String end)
            throws PermissionDeniedException {
        final IndexScanCallback cb = new IndexScanCallback(docs, contextSet, false);
        final Lock lock = dbTokens.getLock();
        for (Iterator i = docs.getCollectionIterator(); i.hasNext();) {
            final int collectionId = ((Collection) i.next()).getId();
            final IndexQuery query;
            if (start == null) {
                Value startRef = new WordRef(collectionId);
                query = new IndexQuery(IndexQuery.TRUNC_RIGHT, startRef);
            } else if (end == null) {
                Value startRef = new WordRef(collectionId, start.toLowerCase());
                query = new IndexQuery(IndexQuery.TRUNC_RIGHT, startRef);
            } else {
                Value startRef = new WordRef(collectionId, start.toLowerCase());
                Value endRef = new WordRef(collectionId, end.toLowerCase());
                query = new IndexQuery(IndexQuery.BW, startRef, endRef);
            }
            try {
                lock.acquire(Lock.READ_LOCK);
                dbTokens.query(query, cb);
            } catch (LockException e) {
                LOG.warn("Failed to acquire lock for '" + dbTokens.getFile().getName() + "'", e);
            } catch (IOException e) {
                LOG.error(e.getMessage(), e);
            } catch (BTreeException e) {
                LOG.error(e.getMessage(), e);
            } catch (TerminatedException e) {
                LOG.warn(e.getMessage(), e);
            } finally {
                lock.release(Lock.READ_LOCK);
            }
        }
        Occurrences[] result = new Occurrences[cb.map.size()];
        return (Occurrences[]) cb.map.values().toArray(result);
    }

    public Occurrences[] scanIndexTerms(DocumentSet docs, NodeSet contextSet, QName[] qnames,
            String start, String end) throws PermissionDeniedException {
        final Lock lock = dbTokens.getLock();
        final IndexScanCallback cb = new IndexScanCallback(docs, contextSet, true);
        for (int q = 0; q < qnames.length; q++) {
            for (Iterator i = docs.getCollectionIterator(); i.hasNext();) {
                final int collectionId = ((Collection) i.next()).getId();
                final IndexQuery query;
                if (start == null) {
                    Value startRef = new QNameWordRef(collectionId, qnames[q],
                            broker.getBrokerPool().getSymbols());
                    query = new IndexQuery(IndexQuery.TRUNC_RIGHT, startRef);
                } else if (end == null) {
                    Value startRef = new QNameWordRef(collectionId, qnames[q],
                            start.toLowerCase(), broker.getBrokerPool().getSymbols());
                    query = new IndexQuery(IndexQuery.TRUNC_RIGHT, startRef);
                } else {
                    Value startRef = new QNameWordRef(collectionId, qnames[q],
                            start.toLowerCase(), broker.getBrokerPool().getSymbols());
                    Value endRef = new QNameWordRef(collectionId, qnames[q],
                            end.toLowerCase(), broker.getBrokerPool().getSymbols());
                    query = new IndexQuery(IndexQuery.BW, startRef, endRef);
                }
                try {
                    lock.acquire(Lock.READ_LOCK);
                    dbTokens.query(query, cb);
                } catch (LockException e) {
                    LOG.warn("Failed to acquire lock for '" + dbTokens.getFile().getName() + "'", e);
                } catch (IOException e) {
                    LOG.error(e.getMessage(), e);
                } catch (BTreeException e) {
                    LOG.error(e.getMessage(), e);
                } catch (TerminatedException e) {
                    LOG.warn(e.getMessage(), e);
                } finally {
                    lock.release(Lock.READ_LOCK);
                }
            }
        }
        Occurrences[] result = new Occurrences[cb.map.size()];
        return (Occurrences[]) cb.map.values().toArray(result);
    }

    /**
     * Reads <code>freq</code> occurrence offsets from the input stream and adds
     * them to the given match.
     *
     * @param freq the number of offsets to read
     * @param is the input stream, positioned at the first offset
     * @param match the match the offsets are added to
     * @param length the length of the matched token
     * @throws IOException
     */
    private void readOccurrences(int freq, VariableByteInput is, Match match, int length)
            throws IOException {
        for (int n = 0; n < freq; n++) {
            match.addOffset(is.readInt(), length);
        }
    }

    /**
     * Collect all words in a document to be removed
     *
     * @param words the set the collected words are added to
     * @param domIterator iterator over the raw node data of the document
     */
    //TODO : unify functionalities with storeText -pb
    private void collect(Set words, Iterator domIterator) {
        byte[] data = ((Value) domIterator.next()).getData();
        short type = Signatures.getType(data[OFFSET_NODE_TYPE]);
        switch (type) {
            case Node.ELEMENT_NODE :
                int childrenCount = ByteConversion.byteToInt(data, OFFSET_ELEMENT_CHILDREN_COUNT);
                for (int i = 0; i < childrenCount; i++)
                    //recursive call on children
                    collect(words, domIterator);
                break;
            case Node.TEXT_NODE :
                int dlnLen = ByteConversion.byteToShort(data, OFFSET_TEXT_DLN_LENGTH);
                int nodeIdLen = broker.getBrokerPool().getNodeFactory().lengthInBytes(dlnLen, data, OFFSET_DLN);
                try {
                    int readOffset = nodeIdLen + OFFSET_DLN;
                    String s = new String(data, readOffset, data.length - readOffset, "UTF-8");
                    tokenizer.setText(s);
                    TextToken token;
                    while (null != (token = tokenizer.nextToken())) {
                        String word = token.getText();
                        if (stoplist.contains(word))
                            continue;
                        words.add(word.toLowerCase());
                    }
                } catch (UnsupportedEncodingException e) {
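                    //UTF-8 support is mandatory for every JVM, so this should be unreachable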
                    LOG.error(e.getMessage(), e);
                }
                break;
            case Node.ATTRIBUTE_NODE :
                byte idSizeType = (byte) (data[OFFSET_NODE_TYPE] & 0x3);
                boolean hasNamespace = (data[OFFSET_NODE_TYPE] & 0x10) == 0x10;
                dlnLen = ByteConversion.byteToShort(data, OFFSET_ATTRIBUTE_DLN_LENGTH);
                nodeIdLen = broker.getBrokerPool().getNodeFactory().lengthInBytes(dlnLen, data, OFFSET_DLN);
                int readOffset = Signatures.getLength(idSizeType) + nodeIdLen + OFFSET_DLN;
                if (hasNamespace) {
                    //TODO : check the order in which both infos are read (and discarded) -pb
                    readOffset += SymbolTable.LENGTH_LOCAL_NAME; // skip namespace id
                    final short prefixLen = ByteConversion.byteToShort(data, readOffset);
                    readOffset += prefixLen + SymbolTable.LENGTH_NS_URI; // skip prefix
                }
                try {
                    String val = new String(data, readOffset, data.length - readOffset, "UTF-8");
                    tokenizer.setText(val);
                    TextToken token;
                    while (null != (token = tokenizer.nextToken())) {
                        String word = token.getText();
                        if (stoplist.contains(word))
                            continue;
                        words.add(word.toLowerCase());
                    }
                } catch (UnsupportedEncodingException e) {
                    LOG.error(e.getMessage(), e);
                }
                break;
            default :
                //Other types are ignored : some may be useful though -pb
                //TOUNDERSTAND : it looks like other types (got : Node.PROCESSING_INSTRUCTION_NODE)
                //are stored in the index ??? -pb
        }
    }

    public void closeAndRemove() {
        config.setProperty(getConfigKeyForFile(), null);
        dbTokens.closeAndRemove();
    }

    public boolean close() throws DBException {
        config.setProperty(getConfigKeyForFile(), null);
        return dbTokens.close();
    }

    public void printStatistics() {
        dbTokens.printStatistics();
    }

    public String toString() {
        return this.getClass().getName() + " at " + dbTokens.getFile().getName() +
                " owned by " + broker.toString();
    }

    /**
     * This inner class is responsible for actually storing the list of
     * occurrences.
     *
     * @author Wolfgang Meier <meier@ifs.tu-darmstadt.de>
     */
    final class InvertedIndex {

        private class QNameTerm implements Comparable {

            QName qname;
            String term;

            public QNameTerm(QName qname, String term) {
                this.qname = qname;
                this.term = term;
            }

            public int compareTo(Object o) {
                QNameTerm other = (QNameTerm) o;
                int cmp = qname.compareTo(other.qname);
                if (cmp == 0)
                    return term.compareTo(other.term);
                else
                    return cmp;
            }
        }

        private DocumentImpl doc = null;

        // To distinguish between attribute values, text and QName-indexed terms, we use
        // three maps : words[TEXT_NODES] collects text, words[ATTRIBUTE_NODES] stores
        // attribute values and words[BY_QNAME] stores the terms indexed by QName.
        //TODO : very tricky. Why not 3 inverted indexes ??? -pb
        private Map words[] = new Map[3];

        private final static int TEXT_NODES = 0;
        private final static int ATTRIBUTE_NODES = 1;
        private final static int BY_QNAME = 2;

        public InvertedIndex() {
            words[TEXT_NODES] = new HashMap(512);
            words[ATTRIBUTE_NODES] = new HashMap(256);
            //TreeMap because QNameTerm keys are compared, not hashed
            words[BY_QNAME] = new TreeMap();
        }

        public void setDocument(DocumentImpl document) {
            if (this.doc != null && this.doc.getDocId() != document.getDocId())
                flush();
            this.doc = document;
        }

        public void addText(TextToken token, NodeId nodeId, boolean remove) {
            if (!remove) {
                //Is this token already pending ?
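                //Note : insertions below key the map by the token text (a String)
                //while lookups use the TextToken itself; this relies on TextToken
                //hashing and comparing like its text.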
                OccurrenceList list = (OccurrenceList) words[TEXT_NODES].get(token);
                //Create a GIDs list
                if (list == null) {
                    list = new OccurrenceList();
                    list.add(nodeId, token.startOffset());
                    words[TEXT_NODES].put(token.getText(), list);
                } else {
                    //Add node's GID to the list
                    list.add(nodeId, token.startOffset());
                }
            } else {
                //Use the token text as key, consistent with the branch above
                if (!words[TEXT_NODES].containsKey(token))
                    words[TEXT_NODES].put(token.getText(), null);
            }
        }

        public void addText(TextToken token, ElementImpl ancestor, boolean remove) {
            QNameTerm term = new QNameTerm(ancestor.getQName(), token.getText());
            if (!remove) {
                //Is this token already pending ?
                OccurrenceList list = (OccurrenceList) words[BY_QNAME].get(term);
                //Create a GIDs list
                if (list == null) {
                    list = new OccurrenceList();
                    list.add(ancestor.getNodeId(), token.startOffset());
                    words[BY_QNAME].put(term, list);
                } else {
                    //Add node's GID to the list
                    list.add(ancestor.getNodeId(), token.startOffset());
                }
            } else {
                if (!words[BY_QNAME].containsKey(term))
                    words[BY_QNAME].put(term, null);
            }
        }

        //TODO : unify functionalities with addText -pb
        public void addAttribute(TextToken token, NodeId nodeId, boolean remove) {
            if (!remove) {
                //Is this token already pending ?
                OccurrenceList list = (OccurrenceList) words[ATTRIBUTE_NODES].get(token);
                //Create a GIDs list
                if (list == null) {
                    list = new OccurrenceList();
                    list.add(nodeId, token.startOffset());
                    words[ATTRIBUTE_NODES].put(token.getText(), list);
                } else {
                    //Add node's GID to the list
                    list.add(nodeId, token.startOffset());
                }
            } else {
                //Use the token text as key, consistent with the branch above
                if (!words[ATTRIBUTE_NODES].containsKey(token))
                    words[ATTRIBUTE_NODES].put(token.getText(), null);
            }
        }

        public void addAttribute(TextToken token, AttrImpl attr, boolean remove) {
            QNameTerm term = new QNameTerm(attr.getQName(), token.getText());
            if (!remove) {
                //Is this token already pending ?
                OccurrenceList list = (OccurrenceList) words[BY_QNAME].get(term);
                //Create a GIDs list
                if (list == null) {
                    list = new OccurrenceList();
                    list.add(attr.getNodeId(), token.startOffset());
                    words[BY_QNAME].put(term, list);
                } else {
                    //Add node's GID to the list
                    list.add(attr.getNodeId(), token.startOffset());
                }
            } else {
                if (!words[BY_QNAME].containsKey(term))
                    words[BY_QNAME].put(term, null);
            }
        }

        public void flush() {
            //Return early
            if (this.doc == null)
                return;
            final int wordsCount = words[TEXT_NODES].size() + words[ATTRIBUTE_NODES].size() +
                    words[BY_QNAME].size();
            if (wordsCount == 0)
                return;
            final ProgressIndicator progress = new ProgressIndicator(wordsCount, 100);
            final int collectionId = this.doc.getCollection().getId();
            int count = 0;
            for (byte currentSection = 0; currentSection <= QNAME_SECTION; currentSection++) {
                //Not strictly necessary, but anyway...
                switch (currentSection) {
                    case TEXT_SECTION :
                    case ATTRIBUTE_SECTION :
                    case QNAME_SECTION :
                        break;
                    default :
                        throw new IllegalArgumentException("Invalid section type in '" +
                                dbTokens.getFile().getName() + "' (inverted index)");
                }
                for (Iterator i = words[currentSection].entrySet().iterator(); i.hasNext(); count++) {
                    Map.Entry entry = (Map.Entry) i.next();
                    Object token = entry.getKey();
                    OccurrenceList occurrences = (OccurrenceList) entry.getValue();
                    if (occurrences == null)
                        continue; // may happen if the index is in an invalid state due to earlier errors
                    //The list must be sorted before being written
                    occurrences.sort();
                    os.clear();
                    os.writeInt(this.doc.getDocId());
                    os.writeByte(currentSection);
                    os.writeInt(occurrences.getTermCount());
                    //Mark position
                    int lenOffset = os.position();
                    //Dummy value : actual one will be written below
                    os.writeFixedInt(0);
                    NodeId previous = null;
                    for (int m = 0; m < occurrences.getSize(); ) {
                        try {
                            previous = occurrences.getNode(m).write(previous, os);
                        } catch (IOException e) {
                            LOG.error("IOException while writing fulltext index: " + e.getMessage(), e);
                        }
                        int freq = occurrences.getOccurrences(m);
                        os.writeInt(freq);
                        for (int n = 0; n < freq; n++) {
                            os.writeInt(occurrences.getOffset(m + n));
                        }
                        m += freq;
                    }
                    //Write (variable) length of node IDs + frequency + offsets
                    os.writeFixedInt(lenOffset, os.position() - lenOffset - LENGTH_NODE_IDS_FREQ_OFFSETS);
                    flushWord(currentSection, collectionId, token, os.data());
                    progress.setValue(count);
                    if (progress.changed()) {
                        setChanged();
                        notifyObservers(progress);
                    }
                }
                //TOUNDERSTAND : is this a flush ?
                //If so, the ProgressIndicator should be reinitialized -pb
                if (wordsCount > 100) {
                    progress.finish();
                    setChanged();
                    notifyObservers(progress);
                }
                words[currentSection].clear();
            }
        }

        private void flushWord(int currentSection, int collectionId, Object token, ByteArray data) {
            //Return early
            //TODO : is this ever called ? -pb
            if (data.size() == 0)
                return;
            final Lock lock = dbTokens.getLock();
            try {
                lock.acquire(Lock.WRITE_LOCK);
                Value key;
                if (currentSection == QNAME_SECTION) {
                    QNameTerm term = (QNameTerm) token;
                    key = new QNameWordRef(collectionId, term.qname, term.term,
                            broker.getBrokerPool().getSymbols());
                } else {
                    key = new WordRef(collectionId, token.toString());
                }
                dbTokens.append(key, data);
            } catch (LockException e) {
                LOG.warn("Failed to acquire lock for '" + dbTokens.getFile().getName() +
                        "' (inverted index)", e);
            } catch (ReadOnlyException e) {
                LOG.warn("Read-only error on '" + dbTokens.getFile().getName() +
                        "' (inverted index)", e);
            } catch (IOException e) {
                LOG.error(e.getMessage() + " in '" + dbTokens.getFile().getName() +
                        "' (inverted index)", e);
            } finally {
                lock.release(Lock.WRITE_LOCK);
                os.clear();
            }
        }

        public void dropIndex(DocumentImpl document) {
            //Return early
            if (document == null)
                return;
            final int collectionId = document.getCollection().getId();
            final Lock lock = dbTokens.getLock();
            for (byte currentSection = 0; currentSection <= QNAME_SECTION; currentSection++) {
                //Not strictly necessary, but anyway...
                switch (currentSection) {
                    case TEXT_SECTION :
                    case ATTRIBUTE_SECTION :
                    case QNAME_SECTION :
                        break;
                    default :
                        throw new IllegalArgumentException("Invalid section type in '" +
                                dbTokens.getFile().getName() + "' (inverted index)");
                }
                LOG.debug("Removing " + words[currentSection].size() + " tokens");
                for (Iterator i = words[currentSection].entrySet().iterator(); i.hasNext();) {
                    //Compute a key for the token
                    Map.Entry entry = (Map.Entry) i.next();
                    Object token = entry.getKey();
                    Value key;
                    if (currentSection == QNAME_SECTION) {
                        QNameTerm term = (QNameTerm) token;
                        key = new QNameWordRef(collectionId, term.qname, term.term,
                                broker.getBrokerPool().getSymbols());
                    } else {
                        key = new WordRef(collectionId, token.toString());
                    }
                    try {
                        lock.acquire(Lock.WRITE_LOCK);
                        boolean changed = false;
                        os.clear();
                        VariableByteInput is = dbTokens.getAsStream(key);
                        //Does the token already have data in the index ?
                        if (is == null)
                            continue;
                        while (is.available() > 0) {
                            int storedDocId = is.readInt();
                            byte section = is.readByte();
                            int gidsCount = is.readInt();
                            //Read (variable) length of node IDs + frequency + offsets
                            int length = is.readFixedInt();
                            if (storedDocId != document.getDocId()) {
                                // data are related to another document:
                                // copy them to any existing data
                                os.writeInt(storedDocId);
                                os.writeByte(section);
                                os.writeInt(gidsCount);
                                os.writeFixedInt(length);
                                is.copyRaw(os, length);
                            } else {
                                // data are related to our document:
                                // skip them
                                changed = true;
                                is.skipBytes(length);
                            }
                        }
                        //Store new data, if relevant
                        if (changed) {
                            //Nothing left to store : remove the existing entry
                            if (os.data().size() == 0) {
                                dbTokens.remove(key);
                            } else {
                                if (dbTokens.put(key, os.data()) == BFile.UNKNOWN_ADDRESS) {
                                    LOG.error("Could not put index data for token '" + token +
                                            "' in '" + dbTokens.getFile().getName() + "'");
                                    //TODO : throw an exception ?
                                }
                            }
                        }
                    } catch (LockException e) {
                        LOG.warn("Failed to acquire lock for '" + dbTokens.getFile().getName() + "'", e);
                    } catch (IOException e) {
                        LOG.error(e.getMessage() + " in '" + dbTokens.getFile().getName() + "'", e);
                    } catch (ReadOnlyException e) {
                        LOG.error(e.getMessage() + " in '" + dbTokens.getFile().getName() + "'", e);
                    } finally {
                        lock.release(Lock.WRITE_LOCK);
                        os.clear();
                    }
                }
                words[currentSection].clear();
            }
        }

        /**
         * Remove the entries in the current list from the index.
         */
        //TODO: use VariableInputStream
        public void remove() {
            //Return early
            if (doc == null)
                return;
            final int collectionId = this.doc.getCollection().getId();
            final Lock lock = dbTokens.getLock();
            for (byte currentSection = 0; currentSection <= QNAME_SECTION; currentSection++) {
                //Not strictly necessary, but anyway...
                switch (currentSection) {
                    case TEXT_SECTION :
                    case ATTRIBUTE_SECTION :
                    case QNAME_SECTION :
                        break;
                    default :
                        throw new IllegalArgumentException("Invalid section type in '" +
                                dbTokens.getFile().getName() + "' (inverted index)");
                }
                for (Iterator i = words[currentSection].entrySet().iterator(); i.hasNext();) {
                    //Compute a key for the token
                    Map.Entry entry = (Map.Entry) i.next();
                    OccurrenceList storedOccurrencesList = (OccurrenceList) entry.getValue();
                    Object token = entry.getKey();
                    Value key;
                    if (currentSection == QNAME_SECTION) {
                        QNameTerm term = (QNameTerm) token;
                        key = new QNameWordRef(collectionId, term.qname, term.term,
                                broker.getBrokerPool().getSymbols());
                    } else {
                        key = new WordRef(collectionId, token.toString());
                    }
                    OccurrenceList newOccurrencesList = new OccurrenceList();
                    os.clear();
                    try {
                        lock.acquire(Lock.WRITE_LOCK);
                        Value value = dbTokens.get(key);
                        if (value == null)
                            continue;
                        //Add its data to the new list
                        VariableByteArrayInput is = new VariableByteArrayInput(value.getData());
                        while (is.available() > 0) {
                            int storedDocId = is.readInt();
                            byte storedSection = is.readByte();
                            int termCount = is.readInt();
                            //Read (variable) length of node IDs + frequency + offsets
                            int length = is.readFixedInt();
                            if (storedSection != currentSection || storedDocId != this.doc.getDocId()) {
                                // data are related to another section or document:
                                // append them to any existing data
                                os.writeInt(storedDocId);
                                os.writeByte(storedSection);
                                os.writeInt(termCount);
                                os.writeFixedInt(length);
                                is.copyRaw(os, length);
                            } else {
                                // data are related to our section and document:
                                // feed the new list with the GIDs
                                NodeId previous = null;
                                for (int m = 0; m < termCount; m++) {
                                    NodeId nodeId = broker.getBrokerPool().getNodeFactory()
                                            .createFromStream(previous, is);
                                    previous = nodeId;
                                    int freq = is.readInt();
                                    // add the node to the new list if it is not
                                    // in the list of removed nodes
                                    if (!storedOccurrencesList.contains(nodeId)) {
                                        for (int n = 0; n < freq; n++) {
                                            newOccurrencesList.add(nodeId, is.readInt());
                                        }
                                    } else {
                                        is.skip(freq);
                                    }
                                }
                            }
                        }
                        //Append the data from the new list
                        if (newOccurrencesList.getSize() > 0) {
                            //The list must be sorted before being written
                            newOccurrencesList.sort();
                            os.writeInt(this.doc.getDocId());
                            os.writeByte(currentSection);
                            os.writeInt(newOccurrencesList.getTermCount());
                            //Mark position
                            int lenOffset = os.position();
                            //Dummy value : actual one will be written below
                            os.writeFixedInt(0);
                            NodeId previous = null;
                            for (int m = 0; m < newOccurrencesList.getSize(); ) {
                                previous = newOccurrencesList.getNode(m).write(previous, os);
                                int freq = newOccurrencesList.getOccurrences(m);
                                os.writeInt(freq);
                                for (int n = 0; n < freq; n++) {
                                    os.writeInt(newOccurrencesList.getOffset(m + n));
                                }
                                m += freq;
                            }
                            //Write (variable) length of node IDs + frequency + offsets
                            os.writeFixedInt(lenOffset, os.position() - lenOffset - LENGTH_NODE_IDS_FREQ_OFFSETS);
                        }
                        //Store the data
                        if (os.data().size() == 0)
                            dbTokens.remove(key);
                        else if (dbTokens.update(value.getAddress(), key, os.data()) == BFile.UNKNOWN_ADDRESS) {
                            LOG.error("Could not update index data for token '" + token +
                                    "' in '" + dbTokens.getFile().getName() + "' (inverted index)");
                            //TODO : throw an exception ?
                        }
                    } catch (LockException e) {
                        LOG.warn("Failed to acquire lock for '" + dbTokens.getFile().getName() +
                                "' (inverted index)", e);
                    } catch (ReadOnlyException e) {
                        LOG.warn("Read-only error on '" + dbTokens.getFile().getName() +
                                "' (inverted index)", e);
                    } catch (IOException e) {
                        LOG.error(e.getMessage() + " in '" + dbTokens.getFile().getName() +
                                "' (inverted index)", e);
                    } finally {
                        lock.release(Lock.WRITE_LOCK);
                        os.clear();
                    }
                }
                words[currentSection].clear();
            }
        }
    }

    private class IndexCallback implements BTreeCallback {

        List matches = new ArrayList();
        TermMatcher matcher;
        XQueryContext context;

        public IndexCallback(XQueryContext context, TermMatcher matcher) {
            this.matcher = matcher;
            this.context = context;
        }

        public String[] getMatches() {
            String[] a = new String[matches.size()];
            return (String[]) matches.toArray(a);
        }

        /* (non-Javadoc)
         * @see org.dbxml.core.filer.BTreeCallback#indexInfo(org.dbxml.core.data.Value, long)
         */
        public boolean indexInfo(Value key, long pointer) throws TerminatedException {
            if (context != null)
                context.proceed();
            try {
                //Skip the index type byte and the collection id of the key
                final int prefixLength = WordRef.LENGTH_IDX_TYPE + Collection.LENGTH_COLLECTION_ID;
                final String word = new String(key.getData(), prefixLength,
                        key.getLength() - prefixLength, "UTF-8");
                if (matcher.matches(word))
                    matches.add(word);
                return true;
            } catch (UnsupportedEncodingException e) {
                LOG.error(e.getMessage(), e);
                return true;
            }
        }
    }

    private final class SearchCallback implements BTreeCallback {

        DocumentSet docs;
        TermMatcher matcher;
        NodeSet result;
        NodeSet contextSet;
        int axis;
        XQueryContext context;
        XMLString word = new XMLString(64);
        QName qname;

        public SearchCallback(XQueryContext context, TermMatcher comparator, NodeSet result,
                NodeSet contextSet, int axis, DocumentSet docs, QName qname) {
            this.matcher = comparator;
            this.result = result;
            this.docs = docs;
            this.contextSet = contextSet;
            this.context = context;
            this.qname = qname;
            this.axis = axis;
        }

        public boolean indexInfo(Value key, long pointer) throws TerminatedException {
            VariableByteInput is;
            try {
                is = dbTokens.getAsStream(pointer);
            } catch (IOException e) {
                LOG.error(e.getMessage(), e);
                return true;
            }
            word.reuse();
            if (qname == null)
                WordRef.decode(key, word);
            else
                QNameWordRef.decode(key, word);
            if (matcher.matches(word)) {
                try {
                    while (is.available() > 0) {
                        if (context != null)
                            context.proceed();
                        int storedDocId = is.readInt();
                        byte storedSection = is.readByte();
                        int termCount = is.readInt();
                        //Read (variable) length of node IDs + frequency + offsets
                        int length = is.readFixedInt();
                        DocumentImpl storedDocument = docs.getDoc(storedDocId);
                        //Skip the entry if the document is not in the input set
                        if (storedDocument == null) {
                            is.skipBytes(length);
                            continue;
                        }
                        NodeId previous = null;
                        for (int m = 0; m < termCount; m++) {
                            NodeId nodeId = broker.getBrokerPool().getNodeFactory()
                                    .createFromStream(previous, is);
                            previous = nodeId;
                            int freq = is.readInt();
                            NodeProxy storedNode;
                            switch (storedSection) {
                                case TEXT_SECTION :
                                    storedNode = new NodeProxy(storedDocument, nodeId, Node.TEXT_NODE);
                                    break;
                                case ATTRIBUTE_SECTION :
                                    storedNode = new NodeProxy(storedDocument, nodeId, Node.ATTRIBUTE_NODE);
                                    break;
                                case QNAME_SECTION :
                                    storedNode = new NodeProxy(storedDocument, nodeId,
                                            qname.getNameType() == ElementValue.ATTRIBUTE ?
                                            Node.ATTRIBUTE_NODE : Node.ELEMENT_NODE);
                                    break;
                                default :
                                    throw new IllegalArgumentException("Invalid section type in '" +
                                            dbTokens.getFile().getName() + "'");
                            }
                            if (contextSet != null) {
                                NodeProxy parentNode;
                                switch (storedSection) {
                                    case TEXT_SECTION :
                                    case QNAME_SECTION:
                                        parentNode = contextSet.parentWithChild(storedNode, false, true,
                                                NodeProxy.UNKNOWN_NODE_LEVEL);
                                        break;
                                    case ATTRIBUTE_SECTION :
                                        if (contextSet instanceof VirtualNodeSet) {
                                            parentNode = contextSet.parentWithChild(storedNode, false, true,
                                                    NodeProxy.UNKNOWN_NODE_LEVEL);
                                            //With a virtual context, only the attribute node itself
                                            //qualifies, as in getNodesExact() above
                                            if (parentNode != null && !parentNode.getNodeId().equals(nodeId))
                                                parentNode = null;
                                        } else {
                                            parentNode = contextSet.get(storedNode);
                                        }
                                        break;
                                    default :
                                        throw new IllegalArgumentException("Invalid section type in '" +
                                                dbTokens.getFile().getName() + "'");
                                }
                                if (parentNode != null) {
                                    Match match = new FTMatch(-1, nodeId, word.toString(), freq);
                                    readOccurrences(freq, is, match, word.length());
                                    int sizeHint = contextSet.getSizeHint(storedDocument);
                                    if (axis == NodeSet.ANCESTOR) {
                                        parentNode.addMatch(match);
                                        result.add(parentNode, sizeHint);
                                    } else {
                                        storedNode.addMatch(match);
                                        result.add(storedNode, sizeHint);
                                    }
                                } else
                                    is.skip(freq);
                            } else {
                                Match match = new FTMatch(-1, nodeId, word.toString(), freq);
                                readOccurrences(freq, is, match, word.length());
                                storedNode.addMatch(match);
                                result.add(storedNode, Constants.NO_SIZE_HINT);
                            }
                        }
                    }
                } catch (IOException e) {
                    LOG.error(e.getMessage() + " in '" + dbTokens.getFile().getName() + "'", e);
                    //TODO : return early -pb
                }
            }
            //TOUNDERSTAND : why sort here ? -pb
            if (contextSet != null)
                ((NewArrayNodeSet) result).sort();
            return true;
        }
    }

    private final class IndexScanCallback implements BTreeCallback {

        private DocumentSet docs;
        private NodeSet contextSet;
        private Map map = new TreeMap();
        private XMLString word = new XMLString(64);
        private boolean byQName;

        IndexScanCallback(DocumentSet docs, NodeSet contextSet, boolean byQName) {
            this.docs = docs;
            this.contextSet = contextSet;
            this.byQName = byQName;
        }

        /* (non-Javadoc)
         * @see org.dbxml.core.filer.BTreeCallback#indexInfo(org.dbxml.core.data.Value, long)
         */
        public boolean indexInfo(Value key, long pointer) throws TerminatedException {
            word.reuse();
            if (byQName)
                QNameWordRef.decode(key, word);
            else
                WordRef.decode(key, word);
            final String term = word.toString();
            VariableByteInput is;
            try {
                is = dbTokens.getAsStream(pointer);
            } catch (IOException e) {
                LOG.error(e.getMessage(), e);
                return true;
            }
            try {
                while (is.available() > 0) {
                    boolean docAdded = false;
                    int storedDocId = is.readInt();
                    byte storedSection = is.readByte();
                    int termCount = is.readInt();
                    //Read (variable) length of node IDs + frequency + offsets
                    int length = is.readFixedInt();
                    DocumentImpl storedDocument = docs.getDoc(storedDocId);
                    //Skip the entry if the document is not in the input set
                    if (storedDocument == null) {
                        is.skipBytes(length);
                        continue;
                    }
                    NodeId previous = null;
                    for (int m = 0; m < termCount; m++) {
                        NodeId nodeId = broker.getBrokerPool().getNodeFactory()
                                .createFromStream(previous, is);
                        previous = nodeId;
                        int freq = is.readInt();
                        is.skip(freq);
                        if (contextSet != null) {
                            boolean include = false;
                            NodeProxy parentNode = contextSet.parentWithChild(storedDocument,
                                    nodeId, false, true);
                            switch (storedSection) {
                                case TEXT_SECTION :
                                case QNAME_SECTION :
                                    //TODO : also test on Node.TEXT_NODE like below ? -pb
                                    include = (parentNode != null);
                                    break;
                                case ATTRIBUTE_SECTION :
                                    include = (parentNode != null &&
                                            parentNode.getNodeType() == Node.ATTRIBUTE_NODE);
                                    break;
                                default :
                                    throw new IllegalArgumentException("Invalid section type in '" +
                                            dbTokens.getFile().getName() + "'");
                            }
                            if (include) {
                                Occurrences oc = (Occurrences) map.get(term);
                                if (oc == null) {
                                    oc = new Occurrences(term);
                                    map.put(term, oc);
                                }
                                if (!docAdded) {
                                    oc.addDocument(storedDocument);
                                    docAdded = true;
                                }
                                oc.addOccurrences(freq);
                            }
                        }
                    }
                }
            } catch (IOException e) {
                LOG.error(e.getMessage() + " in '" + dbTokens.getFile().getName() + "'", e);
                //TODO : return early -pb
            }
            return true;
        }
    }

    private static class TermFrequencyList {

        protected static class TermFreq implements Comparable {

            long l;
            int count = 1;
            TermFreq next = null;

            public TermFreq(long l) {
                this.l = l;
            }

            public void increment() {
                ++count;
            }

            public int compareTo(Object o) {
                final TermFreq other = (TermFreq) o;
                if (l == other.l)
                    return Constants.EQUAL;
                else
                    return l < other.l ? Constants.INFERIOR : Constants.SUPERIOR;
            }
        }

        private TermFreq first = null;
        private TermFreq last = null;
        private int count = 0;

        public void add(long l) {
            if (first == null) {
                first = new TermFreq(l);
                last = first;
            } else {
                TermFreq next = new TermFreq(l);
                last.next = next;
                last = next;
            }
            ++count;
        }

        public void incLastTerm() {
            if (last != null)
                last.increment();
        }

        public void setLastTermFreq(int freq) {
            if (last != null)
                last.count = freq;
        }

        public long getLast() {
            if (last != null)
                return last.l;
            else
                return -1;
        }

        public boolean contains(long l) {
            TermFreq next = first;
            while (next != null) {
                if (next.l == l)
                    return true;
                next = next.next;
            }
            return false;
        }

        public int getSize() {
            return count;
        }

        public TermFreq[] toArray() {
            TermFreq[] data = new TermFreq[count];
            TermFreq next = first;
            int i = 0;
            while (next != null) {
                data[i++] = next;
                next = next.next;
            }
            return data;
        }
    }

    private final static class WordRef extends Value {

        public final static int LENGTH_IDX_TYPE = 1; //sizeof byte

        public final static int OFFSET_IDX_TYPE = 0;
        public final static int OFFSET_COLLECTION_ID = OFFSET_IDX_TYPE + WordRef.LENGTH_IDX_TYPE; //1
        public final static int OFFSET_WORD = OFFSET_COLLECTION_ID + Collection.LENGTH_COLLECTION_ID; //3

        public WordRef(int collectionId) {
            len = WordRef.LENGTH_IDX_TYPE + Collection.LENGTH_COLLECTION_ID;
            data = new byte[len];
            data[OFFSET_IDX_TYPE] = IDX_GENERIC;
            ByteConversion.intToByte(collectionId, data, OFFSET_COLLECTION_ID);
        }

        public WordRef(int collectionId, String word) {
            len = WordRef.LENGTH_IDX_TYPE + Collection.LENGTH_COLLECTION_ID + UTF8.encoded(word);
            data = new byte[len];
            data[OFFSET_IDX_TYPE] = IDX_GENERIC;
            ByteConversion.intToByte(collectionId, data, OFFSET_COLLECTION_ID);
            UTF8.encode(word, data, OFFSET_WORD);
        }

        public static XMLString decode(Value key, XMLString word) {
            int prefixLength = WordRef.LENGTH_IDX_TYPE + Collection.LENGTH_COLLECTION_ID;
            return UTF8.decode(key.getData(), prefixLength, key.getLength() - prefixLength, word);
        }

        public String toString() {
            if (len > OFFSET_WORD)
                return new String(data, OFFSET_WORD, len - OFFSET_WORD);
            else
                return "no word";
        }
    }

    //TODO : extend WordRef ?
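    //Sketch of the QNameWordRef key layout, derived from the offsets below
    //(field widths are the referenced LENGTH_* constants) :
    //[idxType][collectionId][qnameType][nsURI symbol][localName symbol][word as UTF-8]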
    private final static class QNameWordRef extends Value {

        public final static int LENGTH_IDX_TYPE = 1; //sizeof byte
        public final static int LENGTH_QNAME_TYPE = 1; //sizeof byte

        public final static int OFFSET_IDX_TYPE = 0;
        public final static int OFFSET_COLLECTION_ID = OFFSET_IDX_TYPE + QNameWordRef.LENGTH_IDX_TYPE; //1
        public final static int OFFSET_QNAME_TYPE = OFFSET_COLLECTION_ID + Collection.LENGTH_COLLECTION_ID; //3
        public final static int OFFSET_NS_URI = OFFSET_QNAME_TYPE + LENGTH_QNAME_TYPE; //4
        public final static int OFFSET_LOCAL_NAME = OFFSET_NS_URI + SymbolTable.LENGTH_NS_URI; //6
        public final static int OFFSET_WORD = OFFSET_LOCAL_NAME + SymbolTable.LENGTH_LOCAL_NAME; //8

        public QNameWordRef(int collectionId) {
            len = QNameWordRef.LENGTH_IDX_TYPE + Collection.LENGTH_COLLECTION_ID;
            data = new byte[len];
            data[OFFSET_IDX_TYPE] = IDX_QNAME;
            ByteConversion.intToByte(collectionId, data, OFFSET_COLLECTION_ID);
            pos = OFFSET_IDX_TYPE;
        }

        public QNameWordRef(int collectionId, QName qname, SymbolTable symbols) {
            len = QNameWordRef.LENGTH_IDX_TYPE + Collection.LENGTH_COLLECTION_ID +
                    QNameWordRef.LENGTH_QNAME_TYPE + SymbolTable.LENGTH_NS_URI +
                    SymbolTable.LENGTH_LOCAL_NAME;
            data = new byte[len];
            final short namespaceId = symbols.getNSSymbol(qname.getNamespaceURI());
            final short localNameId = symbols.getSymbol(qname.getLocalName());
            data[OFFSET_IDX_TYPE] = IDX_QNAME;
            ByteConversion.intToByte(collectionId, data, OFFSET_COLLECTION_ID);
            data[OFFSET_QNAME_TYPE] = qname.getNameType();
            ByteConversion.shortToByte(namespaceId, data, OFFSET_NS_URI);
            ByteConversion.shortToByte(localNameId, data, OFFSET_LOCAL_NAME);
        }

        public QNameWordRef(int collectionId, QName qname, String word, SymbolTable symbols) {
            len = UTF8.encoded(word) + QNameWordRef.LENGTH_IDX_TYPE +
                    Collection.LENGTH_COLLECTION_ID + LENGTH_QNAME_TYPE +
                    SymbolTable.LENGTH_NS_URI + SymbolTable.LENGTH_LOCAL_NAME;
            data = new byte[len];
            final short namespaceId = symbols.getNSSymbol(qname.getNamespaceURI());
            final short localNameId = symbols.getSymbol(qname.getLocalName());
            data[OFFSET_IDX_TYPE] = IDX_QNAME;
            ByteConversion.intToByte(collectionId, data, OFFSET_COLLECTION_ID);
            data[OFFSET_QNAME_TYPE] = qname.getNameType();
            ByteConversion.shortToByte(namespaceId, data, OFFSET_NS_URI);
            ByteConversion.shortToByte(localNameId, data, OFFSET_LOCAL_NAME);
            UTF8.encode(word, data, OFFSET_WORD);
        }

        public static XMLString decode(Value key, XMLString word) {
            int prefixLength = QNameWordRef.LENGTH_IDX_TYPE + Collection.LENGTH_COLLECTION_ID +
                    QNameWordRef.LENGTH_QNAME_TYPE + SymbolTable.LENGTH_NS_URI +
                    SymbolTable.LENGTH_LOCAL_NAME;
            return UTF8.decode(key.getData(), prefixLength, key.getLength() - prefixLength, word);
        }

        public String toString() {
            if (len > OFFSET_WORD)
                return new String(data, OFFSET_WORD, len - OFFSET_WORD);
            else
                return "no word";
        }
    }
}