package org.cdlib.xtf.lazyTree; /** * Copyright (c) 2004, Regents of the University of California * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the University of California nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ import java.io.FileNotFoundException; import java.io.IOException; import java.io.StringReader; import java.lang.ref.SoftReference; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.Set; import java.util.regex.Pattern; import net.sf.saxon.Configuration; import net.sf.saxon.om.AxisIterator; import net.sf.saxon.om.Item; import net.sf.saxon.om.NodeListIterator; import net.sf.saxon.om.StrippedNode; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.index.Term; import org.apache.lucene.mark.ContextMarker; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.spans.SpanQuery; import org.cdlib.xtf.textEngine.DocHit; import org.cdlib.xtf.textEngine.QueryProcessor; import org.cdlib.xtf.textEngine.QueryRequest; import org.cdlib.xtf.textEngine.QueryResult; import org.cdlib.xtf.textEngine.Snippet; import org.cdlib.xtf.util.CharMap; import org.cdlib.xtf.util.CheckingTokenStream; import org.cdlib.xtf.util.EasyNode; import org.cdlib.xtf.util.FastStringReader; import org.cdlib.xtf.util.FastTokenizer; import org.cdlib.xtf.util.StructuredStore; import org.cdlib.xtf.util.Trace; import org.cdlib.xtf.util.WordMap; /** * <p>SearchTree annotates a lazy-loading tree with TextEngine search results. * Many careful gyrations are required to load as little as possible of the * lazy tree from disk.</p> * * <p>This class maintains the illusion that the entire tree has been loaded * from disk, carefully searched, each hit annotated, and a list of all the * snippets inserted at the top. 
In reality, this is done on-the-fly as
 * needed, leaving as much as possible on disk.</p>
 *
 * <p>To use SearchTree, simply call the constructor:
 * {@link #SearchTree(Configuration, String, StructuredStore)},
 * passing it the key to use for index
 * lookups and the persistent file to load from. Then call the
 * {@link #search(QueryProcessor, QueryRequest)} method to perform the
 * actual search, and use the tree normally. As you access various parts of
 * the tree, they'll be annotated on the fly.
 *
 * @author Martin Haye
 */
public class SearchTree extends LazyDocument 
{
  /** Prefix for this document in the Lucene index */
  String sourceKey;

  /**
   * Set containing all text terms used in the query (taken from the query
   * result when there is at least one hit).
   */
  Set termMap;

  /** Set of "stop-words" (i.e. short words like "the", "and", "a", etc.) */
  Set stopSet;

  /** Mapping used to change words from plural to singular */
  WordMap pluralMap;

  /** Mapping used to remove diacritics from accented chars */
  CharMap accentMap;

  /** Total number of hits (might be greater than the number of snippets) */
  int totalHits = 0;

  /** Number of hit snippets within this document */
  int nHits = 0;

  /** Array of snippets sorted by descending score */
  Snippet[] hitsByScore = new Snippet[0];

  /**
   * Original DocHit, and snippet number within that DocHit, for each
   * Snippet. Both arrays are indexed in the same order as
   * {@link #hitsByScore}.
   */
  DocHit[] hitsToDocHit = new DocHit[0];
  int[] hitsToDocHitNum = new int[0];

  /** Array of snippets sorted in document order */
  Snippet[] hitsByLocation = new Snippet[0];

  /**
   * Mapping from hitsByScore -> hitsByLocation: indexed by snippet rank,
   * yields the snippet's index in {@link #hitsByLocation}.
   */
  int[] hitRankToNum;

  /** Where to mark terms (all, context only, etc.) */
  int termMode;

  /**
   * True to suppress marking the hits with scores (useful for automated
   * testing where the exact score isn't being tested).
   */
  boolean suppressScores;

  /**
   * All the synthetic nodes added in the tree are assigned a node number
   * >= MARKER_BASE
   */
  static final int MARKER_BASE = 1000000000;

  /**
   * There are several kinds of synthetic nodes; each one takes up a range
   * of node numbers of size MARKER_RANGE.
   */
  static final int MARKER_RANGE = 100000000;

  /**
   * Special node numbers are used to mark an un-loaded sibling so that
   * getNode() can catch them and secretly load the node before anybody
   * notices. These special elements all have node numbers x such that:
   * PREV_SIB_MARKER <= x < PREV_SIB_MARKER+MARKER_RANGE
   */
  static final int PREV_SIB_MARKER = MARKER_BASE + MARKER_RANGE * 1;

  /**
   * Each hit in the document is marked by a {@code <hit>} element. These
   * elements all have node numbers x such that:
   * HIT_ELMT_MARKER <= x < HIT_ELMT_MARKER+MARKER_RANGE
   */
  static final int HIT_ELMT_MARKER = MARKER_BASE + MARKER_RANGE * 2;

  /**
   * At the start of the document, the SearchTree adds a synthetic
   * {@code <xtf:snippets>} element, and under that creates on demand a
   * {@code <xtf:snippet>} element for each snippet. These elements all
   * have node numbers x such that:
   * SNIPPET_MARKER <= x < SNIPPET_MARKER+MARKER_RANGE
   */
  static final int SNIPPET_MARKER = MARKER_BASE + MARKER_RANGE * 3;

  /**
   * Marking a hit in the middle of a string of text requires splitting
   * up real nodes and inserting virtual ones. These virtual nodes all have
   * node numbers x such that:
   * VIRTUAL_MARKER <= x < VIRTUAL_MARKER+MARKER_RANGE
   */
  static final int VIRTUAL_MARKER = MARKER_BASE + MARKER_RANGE * 4;

  /**
   * Keeps track of the node number to assign the next virtual node (see
   * {@link #VIRTUAL_MARKER} for more info.)
   */
  int nextVirtualNum = VIRTUAL_MARKER + 1;

  /**
   * The top-level {@code <xtf:snippets>} element (the parent of every
   * on-demand {@code <xtf:snippet>} element).
   */
  SearchElementImpl topSnippetNode;

  /**
   * Snippet, hit, and term elements will all be marked with the XTF
   * namespace, given by this URI: "http://cdlib.org/xtf"
   */
  static final String xtfURI = "http://cdlib.org/xtf";

  /** Namespace code for the XTF namespace */
  int xtfNamespaceCode;

  /** Name fingerprint for {@code <xtf:hit>} elements (includes namespace) */
  int hitElementFingerprint;

  /**
   * Name fingerprint for the {@code <xtf:snippet>} element
   * (includes namespace)
   */
  int snippetElementFingerprint;

  /** Name-code for all {@code <hit>} elements (in the XTF namespace) */
  int hitElementCode;

  /** Name-code for all {@code <more>} elements (in the XTF namespace) */
  int moreElementCode;

  /** Name-code for all {@code <term>} elements (in the XTF namespace) */
  int termElementCode;

  /** Name-code for all {@code <snippet>} elements (in the XTF namespace) */
  int snippetElementCode;

  /** Name-code for the {@code <snippets>} element (in the XTF namespace) */
  int snippetsElementCode;

  /**
   * Name-code for all {@code xtf:hitCount} attributes
   * (includes namespace)
   */
  int xtfHitCountAttrCode;

  /**
   * Name-code for all {@code xtf:firstHit} attributes
   * (includes namespace)
   */
  int xtfFirstHitAttrCode;

  /** Name-code for all {@code hitCount} attributes (no namespace) */
  int hitCountAttrCode;

  /** Name-code for all {@code totalHitCount} attributes (no namespace) */
  int totalHitCountAttrCode;

  /** Name-code for all {@code score} attributes (no namespace) */
  int scoreAttrCode;

  /** Name-code for all {@code rank} attributes (no namespace) */
  int rankAttrCode;

  /** Name-code for all {@code hitNum} attributes (no namespace) */
  int hitNumAttrCode;

  /**
   * Name-code for the attribute telling whether a hit continues into a
   * later element. Note that the constructor allocates it with the local
   * name "more", not "continues".
   */
  int continuesAttrCode;

  /** Name-code for all {@code sectionType} attributes (no namespace) */
  int sectionTypeAttrCode;

  /** Name-code for all {@code subDocument} attributes (no namespace) */
  int subDocumentAttrCode;

  /**
   * Load the tree from a disk file, and get ready to search it. To start
   * the actual search, use the {@link #search(QueryProcessor, QueryRequest)}
   * method.
*/ public SearchTree(Configuration config, String sourceKey, StructuredStore treeStore) throws FileNotFoundException, IOException { super(config); this.sourceKey = sourceKey; LazyTreeBuilder builder = new LazyTreeBuilder(config); builder.setNamePool(config.getNamePool()); builder.load(treeStore, this); // We'll be using a special namespace. addXTFNamespace(); // Get all the namecodes we'll be using, so we only have to do it once. hitElementCode = getNameCode("hit", true); moreElementCode = getNameCode("more", true); termElementCode = getNameCode("term", true); snippetElementCode = getNameCode("snippet", true); snippetsElementCode = getNameCode("snippets", true); xtfHitCountAttrCode = getNameCode("hitCount", true); // special xtfFirstHitAttrCode = getNameCode("firstHit", true); // special hitCountAttrCode = getNameCode("hitCount", false); totalHitCountAttrCode = getNameCode("totalHitCount", false); scoreAttrCode = getNameCode("score", false); rankAttrCode = getNameCode("rank", false); hitNumAttrCode = getNameCode("hitNum", false); continuesAttrCode = getNameCode("more", false); sectionTypeAttrCode = getNameCode("sectionType", false); subDocumentAttrCode = getNameCode("subDocument", false); hitElementFingerprint = namePool.getFingerprint(xtfURI, "hit"); snippetElementFingerprint = namePool.getFingerprint(xtfURI, "snippet"); } // constructor /** * Retrieve the proper name code from the name pool. */ private int getNameCode(String name, boolean withNamespace) { if (!withNamespace) return namePool.allocate("", "", name); String prefix = namePool.suggestPrefixForURI(xtfURI); if (prefix == null) prefix = "xtf"; return namePool.allocate(prefix, xtfURI, name); } // getNameCode /** * Suppresses score attributes on the snippets. Generally this is useful * when running regressions, since the scoring algorithm changes frequently. */ public void suppressScores(boolean flag) { suppressScores = flag; } /** * Run the search and save the results for annotating the tree. 
*
   * @param processor  Processor used to run the query
   * @param origReq    Query to run
   *
   * @throws IOException  If anything goes wrong reading from the Lucene
   *                      index or the lazy tree file.
   */
  public void search(QueryProcessor processor, QueryRequest origReq)
    throws IOException 
  {
    // Don't modify the original query request, since it might be in use
    // by another thread at the same time. Rather, make a clone and then
    // modify that with our restricted query.
    //
    QueryRequest req = (QueryRequest)origReq.clone();

    // Make sure the input request is reasonable
    if (req.query instanceof SpanQuery) {
      // -1, or some positive int, is OK.
      assert ((SpanQuery)req.query).getSpanRecording() != 0;
    }
    assert req.maxDocs != 0; // -1, or some positive int, is OK.
    assert req.startDoc == 0;

    // Record the real term mode (which we'll respond to directly). Then
    // limit the one in the query to only terms within the context, since
    // marking all terms would be really slow.
    //
    termMode = req.termMode;
    req.termMode = Math.min(req.termMode, ContextMarker.MARK_CONTEXT_TERMS);

    // Add a meta-query that restricts to this document alone. Besides
    // giving us only the hits we want, this also makes the query faster.
    //
    BooleanQuery bq = new BooleanQuery();
    bq.add(new TermQuery(new Term("docInfo", "1")), BooleanClause.Occur.MUST);
    Term t = new Term("key", sourceKey);
    bq.add(new TermQuery(t), BooleanClause.Occur.MUST);
    bq.add(req.query, BooleanClause.Occur.MUST);
    req.query = bq;

    // Run the query and get the results.
    QueryResult result = processor.processRequest(req);

    // First pass: count the snippets so the arrays can be sized.
    nHits = 0;
    totalHits = 0;
    for (int i = 0; i < result.docHits.length; i++) {
      nHits += result.docHits[i].nSnippets();
      totalHits += result.docHits[i].totalSnippets();
    }

    // Second pass: flatten the snippets of every DocHit into hitsByScore,
    // remembering which DocHit (and which snippet within it) each came from.
    hitsToDocHit = new DocHit[nHits];
    hitsToDocHitNum = new int[nHits];
    hitsByScore = new Snippet[nHits];
    int n = 0;
    for (int i = 0; i < result.docHits.length; i++) {
      DocHit docHit = result.docHits[i];
      for (int j = 0; j < docHit.nSnippets(); j++) {
        hitsToDocHit[n] = docHit;
        hitsToDocHitNum[n] = j;
        hitsByScore[n] = docHit.snippet(j, false);
        n++;
      }
    }
    assert n == nHits;

    // We'll need the term map later when we're marking hits.
    if (nHits > 0) {
      termMap = result.textTerms;

      // We also need the stopword set, the plural map and the accent map.
      stopSet = result.context.stopSet;
      pluralMap = result.context.pluralMap;
      accentMap = result.context.accentMap;
    }

    // Make a second array of the hits, this time sorted by location.
    hitsByLocation = new Snippet[hitsByScore.length];
    System.arraycopy(hitsByScore, 0, hitsByLocation, 0, hitsByScore.length);
    Arrays.sort(hitsByLocation,
      new Comparator() 
      {
        public int compare(Object o1, Object o2) {
          final Snippet s1 = (Snippet)o1;
          final Snippet s2 = (Snippet)o2;
          int n;

          // Order by start node, then by start offset within the node.
          if ((n = s1.startNode - s2.startNode) != 0)
            return n;
          if ((n = s1.startOffset - s2.startOffset) != 0)
            return n;

          // Two hits starting at the same place: fetch their text purely
          // so it can be inspected in a debugger, then complain.
          @SuppressWarnings("unused")
          String str1 = hitsToDocHit[s1.rank].snippet(hitsToDocHitNum[s1.rank], true).text;
          @SuppressWarnings("unused")
          String str2 = hitsToDocHit[s2.rank].snippet(hitsToDocHitNum[s2.rank], true).text;
          assert false : "Chunk hits should never overlap!";
          return 0;
        }
      });

    // Extra check to be absolutely sure the hits don't overlap.
    for (int i = 0; i < nHits - 1; i++) 
    {
      Snippet s1 = hitsByLocation[i];
      Snippet s2 = hitsByLocation[i + 1];
      assert s1.endNode >= s1.startNode;
      assert s2.endNode >= s2.startNode;
      assert s2.startNode >= s1.endNode;
      if (s2.startNode == s1.endNode) {
        if (s2.startOffset < s1.endOffset) {
          s1 = hitsToDocHit[s1.rank].snippet(hitsToDocHitNum[s1.rank], true);
          s2 = hitsToDocHit[s2.rank].snippet(hitsToDocHitNum[s2.rank], true);

          // Debugging help
          @SuppressWarnings("unused")
          String t1 = s1.text;
          @SuppressWarnings("unused")
          String t2 = s2.text;
          assert false;
        }
      }
    }

    // Make a mapping between the two arrays (snippet rank -> index in
    // hitsByLocation).
    hitRankToNum = new int[nHits];
    for (int i = 0; i < nHits; i++)
      hitRankToNum[hitsByLocation[i].rank] = i;

    // Add special nodes for snippets
    addSnippets();
  } // search()

  /**
   * Get a node by its node number. Handles generating synthetic nodes if
   * necessary.
   *
   * @param num   The number of the node to get
   * @return      A node, or null if the number is invalid.
   */
  public NodeImpl getNode(int num) 
  {
    // Early out for not-a-node.
    if (num == -1)
      return null;

    // First, check the cache. Note that virtual nodes, once created,
    // will *always* be in the cache.
    //
    NodeImpl node = checkCache(num);
    if (node != null)
      return node;

    // Catch requests for snippet nodes. They are only created when needed,
    // on-the-fly.
    //
    if (num >= SNIPPET_MARKER && num < SNIPPET_MARKER + MARKER_RANGE)
      return (SearchElementImpl)createSnippetNode(num, true);

    // Catch requests for out-of-context hit nodes.
    if (num >= HIT_ELMT_MARKER && num < HIT_ELMT_MARKER + MARKER_RANGE)
      return getHitElement(num - HIT_ELMT_MARKER);

    // We have to treat requests for the previous sibling as special. This
    // is because the previous node might be a text node which needs to be
    // expanded. We use this detection logic so we can secretly expand it
    // and get the last node from the expansion as the previous sibling of
    // this node.
    //
    int normNum = num;
    if (num >= PREV_SIB_MARKER && num < PREV_SIB_MARKER + MARKER_RANGE) 
    {
      normNum = num - PREV_SIB_MARKER;
      node = checkCache(normNum);
      if (node != null) {
        // Already loaded: also file it under the marker number so the
        // next lookup by that number hits the cache directly.
        if (allPermanent)
          nodeCache.put(Integer.valueOf(num), node);
        else
          nodeCache.put(Integer.valueOf(num), new SoftReference(node));
        return node;
      }
    }

    // Okay, load the node from disk. This also puts it into the cache.
    node = super.getNode(normNum);
    if (node == null)
      return null;
    if (allPermanent)
      nodeCache.put(Integer.valueOf(normNum), node);

    // A freshly loaded node must only refer to real (non-synthetic) nodes.
    assert node.parentNum >= 0 || node == this;
    assert node.nextSibNum >= -1;
    assert node.prevSibNum >= -1;
    assert node.parentNum < 0 || node.parentNum < MARKER_BASE;
    assert node.nextSibNum < MARKER_BASE;
    assert node.prevSibNum < MARKER_BASE;
    assert node.prevSibNum != node.nextSibNum || node.prevSibNum < 0;

    // We need to differentiate backward references to other nodes.
    if (node.prevSibNum >= 0) {
      node.prevSibNum += PREV_SIB_MARKER;
      assert node.prevSibNum >= 0;
    }

    // Gotta do special stuff to text nodes. And if we're getting the
    // previous sibling, return the *last* node of the expansion rather
    // than the first.
    //
    if (node instanceof SearchTextImpl)
      node = expandText((SearchTextImpl)node, normNum != num);

    // All done. Requests made through a marker number are cached under
    // that number as well.
    if (num >= MARKER_BASE)
      nodeCache.put(Integer.valueOf(num), node);
    return node;
  } // getNode()

  /**
   * Add our namespace to the list of namespaces.
*/ private void addXTFNamespace() { assert numberOfNamespaces >= 1 : "must start with root namespace"; numberOfNamespaces++; int[] codes2 = new int[numberOfNamespaces]; System.arraycopy(namespaceCode, 0, codes2, 0, numberOfNamespaces - 1); namespaceCode = codes2; int[] parents2 = new int[numberOfNamespaces]; System.arraycopy(namespaceParent, 0, parents2, 0, numberOfNamespaces - 1); namespaceParent = parents2; namespaceCode[numberOfNamespaces - 1] = xtfNamespaceCode = namePool.allocateNamespaceCode( "xtf", xtfURI); namespaceParent[numberOfNamespaces - 1] = 1; ElementImpl rootKid = getRootKid(); modifyNode(rootKid); rootKid.nameSpace = numberOfNamespaces - 1; } // addXTFNamespace() /** * Get the top-level element that can actually be modified. */ private ElementImpl getRootKid() { EasyNode root = new EasyNode(this); for (int i=0; i<root.nChildren(); i++) { EasyNode kid = root.child(i); if (kid.isElement()) return (ElementImpl) kid.getWrappedNode(); } throw new RuntimeException("Internal error: Search tree does not appear to have a root element"); } /** * Given a hit number, this method retrieves the synthetic hit node for it. */ private SearchElementImpl getHitElement(int hitNum) { // Get the associated text node. This will have the effect of generating // the element we need. // NodeImpl tn = getNode(hitsByLocation[hitNum].startNode); assert tn instanceof SearchTextImpl : "Lazy file text node does not match index node number"; // The element we want should now be in the cache. SearchElementImpl el = (SearchElementImpl)nodeCache.get( Integer.valueOf(HIT_ELMT_MARKER + hitNum)); assert el != null : "Search element must be created with its text"; return el; } // getHitElement /** * Create an element node. Derived classes can override this to provide * their own element implementation. */ protected @Override NodeImpl createElementNode() { return new SearchElementImpl(this); } /** * Create a text node. 
Derived classes can override this to provide their * own text implementation. */ protected @Override NodeImpl createTextNode() { return new SearchTextImpl(this); } /** * Checks to see if we've already loaded the node corresponding with the * given number. If so, return it, else null. */ protected NodeImpl checkCache(int num) { NodeImpl node = super.checkCache(num); assert !(node == null && num >= VIRTUAL_MARKER && num < VIRTUAL_MARKER + MARKER_RANGE) : "Missing virtual node"; return node; } // checkCache() /** * Annotate a text node with search results. * * @param origNode * The text node as loaded from disk. * @param returnLastNode * true to return the last added node, else first. * @return The adjusted node. */ private NodeImpl expandText(SearchTextImpl origNode, boolean returnLastNode) { // Figure out the first hit that involves this node. final int num = origNode.nodeNum; int hitNum = findFirstHit(num); SearchTextImpl curNode = origNode; final String text = curNode.getStringValue(); final int textLen = text.length(); Snippet snippet = null; int hitStart = -1; int hitEnd = -1; if (hitNum < nHits) { snippet = hitsByLocation[hitNum]; if (num < snippet.startNode) snippet = null; else { hitStart = (num == snippet.startNode) ? snippet.startOffset : 0; hitEnd = (num == snippet.endNode) ? snippet.endOffset : Integer.MAX_VALUE; } } // If we're only marking terms within hits and there are no hits in this // node, then we need do nothing more. // if (termMode < ContextMarker.MARK_ALL_TERMS && hitStart < 0) return origNode; // Okay, now scan every word. Use a fast tokenizer, since the Standard // one is dog-slow. Special case: if the 'check' flag is turned on, we // run both tokenizers in parallel and check that they give the exact // same tokens. 
// boolean check = false; TokenStream tokenizer = new FastTokenizer(new FastStringReader(text)); if (check) { TokenStream stdTok = new StandardTokenizer(new StringReader(text)); tokenizer = new CheckingTokenStream(tokenizer, stdTok); } tokenizer = new StandardFilter(tokenizer); int wordOffset = 0; int startChar = 0; int endChar = 0; boolean inHit = false; while (true) { // Get the next word. Token token; try { token = tokenizer.next(); } catch (Exception e) { assert false : "How can string tokenization fail?!"; throw new RuntimeException(e); } // At the start of a hit, skip any leading non-token chars. Don't // do that in mid-hit, since it could introduce non-marked gaps // at tag boundaries. // if (token != null && snippet != null && snippet.startNode == num) endChar = token.startOffset(); // Convert the term to lower-case, and depluralize if necessary. String mappedTerm = null; if (token != null) { mappedTerm = token.termText().toLowerCase(); if (pluralMap != null) { String singular = pluralMap.lookup(mappedTerm); if (singular != null) mappedTerm = singular; } if (accentMap != null) { String unaccented = accentMap.mapWord(mappedTerm); if (unaccented != null) mappedTerm = unaccented; } } // Are we at the start of a hit? if (wordOffset == hitStart) { assert snippet.startNode != num || termMap.contains(mappedTerm) : "first hit token must be in search terms"; assert !inHit; inHit = true; // Truncate the string in the current text node. curNode.setStringValue(text.substring(startChar, endChar)); // Add the "hit" element boolean firstForHit = (snippet.startNode == num); boolean lastForHit = (snippet.endNode == num); SearchElementImpl el = (SearchElementImpl)createHitElement(firstForHit, lastForHit, hitNum, true); // real not proxy linkSibling(curNode, el); // Resume inside the new element. curNode = addText(el, text.substring(endChar, textLen), true); startChar = endChar; inHit = true; } // if // Are we out of words? 
if (token == null) break; wordOffset++; // If the token matches a query term, mark it. if (termMap != null && termMap.contains(mappedTerm) && (termMode == ContextMarker.MARK_ALL_TERMS || (termMode >= ContextMarker.MARK_SPAN_TERMS && inHit)) && (inHit || stopSet == null || !stopSet.contains(mappedTerm))) { final int soff = token.startOffset(); final int eoff = token.endOffset(); // Truncate the string in the current text node. curNode.setStringValue(text.substring(startChar, soff)); // Add the "xtfTerm" element SearchElementImpl el = addElement(curNode, termElementCode, 0, false); // Put the term text inside it addText(el, text.substring(soff, eoff), true); // Resume with the text after the term. curNode = addText(el, text.substring(eoff, textLen), false); startChar = token.endOffset(); } // if endChar = token.endOffset(); // Are we at the end of a hit? If not, go again. if (hitEnd < 0 || wordOffset <= hitEnd) continue; assert snippet.endNode != num || termMap.contains(mappedTerm) : "last hit token must be a search term"; assert inHit; inHit = false; // Truncate the string in the current text node. curNode.setStringValue(text.substring(startChar, endChar)); // Resume outside the "hit" element. SearchElementImpl el = (SearchElementImpl)curNode.getParent(); curNode = addText(el, text.substring(endChar, textLen), false); startChar = endChar; inHit = false; // Try the next hit. hitNum++; snippet = null; hitStart = hitEnd = -1; if (hitNum < nHits) { snippet = hitsByLocation[hitNum]; if (num < snippet.startNode) snippet = null; else { hitStart = (num == snippet.startNode) ? snippet.startOffset : 0; hitEnd = (num == snippet.endNode) ? snippet.endOffset : Integer.MAX_VALUE; } // else } // if } // while // All done! if (returnLastNode) { if (inHit) return (NodeImpl)curNode.getParent(); return curNode; } return origNode; } // expandText() /** * Does the work of creating a "hit" element. 
* * @param firstForHit true if this is the first element for the hit * @param lastForHit true if this is the last element for the hit * @param hitNum The hit being referenced * @param realNotProxy true to create a real node, else make a proxy. */ SearchElement createHitElement(boolean firstForHit, boolean lastForHit, int hitNum, boolean realNotProxy) { Snippet snippet = hitsByLocation[hitNum]; int nameCode = firstForHit ? hitElementCode : moreElementCode; int nAttrs = suppressScores ? 3 : 4; SearchElement el = realNotProxy ? (SearchElement)new SearchElementImpl(this) : (SearchElement)new ProxyElement(this); initElement(el, nameCode, nAttrs); // If this is the first element for the hit, give it a special number. if (firstForHit) el.setNodeNum(HIT_ELMT_MARKER + hitNum); // Add the identifying attributes. We add one to the hit number // because XSLT generally expects 1-based counting. // int attrNum = 0; if (!suppressScores) el.setAttribute(attrNum++, scoreAttrCode, Integer.toString(Math.round(snippet.score * 100))); el.setAttribute(attrNum++, rankAttrCode, Integer.toString(snippet.rank + 1)); el.setAttribute(attrNum++, hitNumAttrCode, Integer.toString(hitNum + 1)); el.setAttribute(attrNum++, continuesAttrCode, lastForHit ? "no" : "yes"); assert attrNum == nAttrs; // All done! return el; } // createHitElement() /** * Create an element as the sibling of another node. * * @param prev * Node to add sibling to * @param elNameCode * Name of the new element * @param nAttribs * How many attributes it will have * @param addAsChild * true to add as a child of 'prev', false to add as a sibling. */ private SearchElementImpl addElement(NodeImpl prev, int elNameCode, int nAttribs, boolean addAsChild) { // Create the new node and link it in. SearchElementImpl el = createElement(elNameCode, nAttribs); if (addAsChild) linkChild((ParentNodeImpl)prev, el); else linkSibling(prev, el); return el; } /** * Create a text node. 
* * @param prev * Node to add sibling to * @param text * Initial text string for the new node * @param addAsChild * true to add as a child of 'prev', false to add as a sibling. */ private SearchTextImpl addText(NodeImpl prev, String text, boolean addAsChild) { // Create the new node and link it in. SearchTextImpl textNode = createText(text); if (addAsChild) linkChild((ParentNodeImpl)prev, textNode); else linkSibling(prev, textNode); return textNode; } // addSiblingElement() /** * Does the work of creating an element, but doesn't link it into the tree. * * @param elNameCode The name for the new element * @param nAttribs How many attributes it will have. * @return The new element. */ private SearchElementImpl createElement(int elNameCode, int nAttribs) { SearchElementImpl el = new SearchElementImpl(this); initElement(el, elNameCode, nAttribs); return el; } // createElement() /** * Initialize all the fields of a new element node. */ private void initElement(SearchElement el, int elNameCode, int nAttrs) { initNode(el); el.allocateAttributes(nAttrs); el.setNameCode(elNameCode); } // initElement() /** * Does the work of creating a text node, but doesn't link it into the tree. * * @param text * The initial text for the node * @return The newly created node. */ private SearchTextImpl createText(String text) { SearchTextImpl node = new SearchTextImpl(this); initNode(node); node.setStringValue(text); return node; } // createText() /** * Does the work of linking in a new sibling element or text node. */ private void linkSibling(NodeImpl prev, NodeImpl node) { // Mark the nodes to modify. NodeImpl next = (NodeImpl)prev.getNextSibling(); modifyNode(prev); modifyNode(next); // Link it in node.parentNum = prev.parentNum; node.prevSibNum = prev.nodeNum; node.nextSibNum = prev.nextSibNum; prev.nextSibNum = node.nodeNum; if (next != null) next.prevSibNum = node.nodeNum; } // linkSibling() /** * Does the work of linking in a new child element or text node. 
It will be * added as the first child. */ private void linkChild(ParentNodeImpl parent, NodeImpl node) { // Mark the node to modify. modifyNode(parent); // Link it in node.parentNum = parent.nodeNum; node.prevSibNum = -1; node.nextSibNum = parent.childNum; parent.childNum = node.nodeNum; } // linkChild() /** * Performs initialization tasks common to text and element nodes. */ private void initNode(SearchNode node) { node.setNodeNum(nextVirtualNum); if (!(node instanceof ProxyElement)) nodeCache.put(Integer.valueOf(nextVirtualNum), node); nextVirtualNum++; } // initNode /** * Prepares a node for modification. Essentially, makes sure that it will be * cached and never reloaded from disk. */ private void modifyNode(NodeImpl node) { // Before modifying the node, make sure we hold onto a hard reference // (normally the node cache only contains weak references.) // if (node != null) nodeCache.put(Integer.valueOf(node.nodeNum), node); } // modifyNode() /** * Adds the top-level <xtf:snippets> element. If its children are * fetched later, they'll be created on the fly. */ private void addSnippets() { ElementImpl rootKid = getRootKid(); topSnippetNode = addElement(rootKid, snippetsElementCode, 2, true); topSnippetNode.setAttribute(0, totalHitCountAttrCode, Integer.toString(totalHits)); topSnippetNode.setAttribute(1, hitCountAttrCode, Integer.toString(nHits)); if (nHits > 0) topSnippetNode.childNum = SNIPPET_MARKER + 0; } // addSnippets() /** * Creates an on-the-fly snippet node. * * @param num The node number (SNIPPET_MARKER + hit #) */ private SearchElement createSnippetNode(int num, boolean realNotProxy) { // Figure out which hit is being referenced int hitNum = num - SNIPPET_MARKER; DocHit docHit = hitsToDocHit[hitNum]; Snippet snippet = realNotProxy ? docHit.snippet(hitsToDocHitNum[hitNum], true) // we need the text. : hitsByScore[hitNum]; // Make the element, and create its links to other elements. int nAttribs = 2 + (snippet.sectionType != null ? 
1 : 0) + (suppressScores ? 0 : 1) + (docHit.subDocument() == null ? 0 : 1); SearchElement snippetElement = realNotProxy ? (SearchElement)new SearchElementImpl(this) : (SearchElement)new ProxyElement(this); initElement(snippetElement, snippetElementCode, nAttribs); snippetElement.setPrevSibNum((hitNum == 0) ? -1 : SNIPPET_MARKER + hitNum - 1); snippetElement.setNextSibNum( (hitNum == nHits - 1) ? -1 : SNIPPET_MARKER + hitNum + 1); snippetElement.setParentNum(topSnippetNode.nodeNum); // Give it a special place in the node cache so we can find it again. snippetElement.setNodeNum(num); if (realNotProxy) nodeCache.put(Integer.valueOf(num), snippetElement); // Add the score (if not suppressed), hit number, and (if present) // the section type. // int attrNum = 0; if (!suppressScores) snippetElement.setAttribute(attrNum++, scoreAttrCode, Integer.toString(Math.round(snippet.score * 100))); snippetElement.setAttribute(attrNum++, rankAttrCode, Integer.toString(hitNum + 1)); // XSLT expects 1-based snippetElement.setAttribute(attrNum++, hitNumAttrCode, Integer.toString(hitRankToNum[hitNum] + 1)); if (snippet.sectionType != null) snippetElement.setAttribute(attrNum++, sectionTypeAttrCode, snippet.sectionType); if (docHit.subDocument() != null) snippetElement.setAttribute(attrNum++, subDocumentAttrCode, docHit.subDocument()); assert attrNum == nAttribs; // If we're only making a proxy node, don't do the text stuff. if (!realNotProxy) return snippetElement; // Add text before the <hit> marker. 
String text = snippet.text; int hitStart = text.indexOf("<hit"); assert hitStart >= 0 : "missing <hit> in snippet"; String beforeText = text.substring(0, hitStart); NodeImpl prev = breakupText(beforeText, (SearchElementImpl)snippetElement, true); // Add the stuff inside the <hit> int hitTextStart = text.indexOf('>', hitStart) + 1; int hitEnd = text.indexOf("</hit>"); assert hitEnd >= 0 : "missing </hit> in snippet"; prev = addElement(prev, hitElementCode, 0, false); String hitText = text.substring(hitTextStart, hitEnd); breakupText(hitText, prev, true); // as child, don't update prev. // And add the text after the <hit> int textResume = text.indexOf('>', hitEnd) + 1; String afterText = text.substring(textResume); prev = breakupText(afterText, prev, false); // All done! return snippetElement; } // createSnippetNode() // Precompiled patterns for undoing entity expansion in snippets private static final Pattern ampPattern = Pattern.compile("&"); private static final Pattern ltPattern = Pattern.compile("<"); private static final Pattern gtPattern = Pattern.compile(">"); /** * Change entities back into normal text (entities are created inside * snippets to differentiate them from normal tags.) * * @param str String to replace entities within * @return Modified string (or same string if no entities found). */ private String undoEntities(String str) { if (str.indexOf("&") >= 0) str = ampPattern.matcher(str).replaceAll("&"); if (str.indexOf("<") >= 0) str = ltPattern.matcher(str).replaceAll("<"); if (str.indexOf(">") >= 0) str = gtPattern.matcher(str).replaceAll(">"); return str; } // undoEntities() /** * Create the appropriate node(s) for text within a snippet, including * elements for any marked <term>s. * * @param text * Text to process, with " <term>" stuff inside it. * @param prev * Node to add to * @param addAsChild * true to add to prev as a child, else as sibling. * @return Last node added. 
*/ private NodeImpl breakupText(String text, NodeImpl prev, boolean addAsChild) { int startPos = 0; while (true) { // Is there markup we need to worry about? int markerPos = text.indexOf("<term", startPos); if (markerPos < 0) markerPos = text.length(); // Add a text node for everything up to the marker (or the end if // there isn't a marker). // String beforeText = text.substring(startPos, markerPos); beforeText = undoEntities(beforeText); prev = addText(prev, beforeText, addAsChild); addAsChild = false; // If no marker, we're done. if (markerPos == text.length()) break; // Now insert the term element and its text. int termStart = text.indexOf('>', markerPos) + 1; int markEnd = text.indexOf("</term>", markerPos); String termText = text.substring(termStart, markEnd); termText = undoEntities(termText); prev = addElement(prev, termElementCode, 0, false); addText(prev, termText, true); // Go again... startPos = text.indexOf('>', markEnd) + 1; } // while return prev; } // breakupText() /** * Locates the first hit that could conceivably involve this node, that is, * the first hit with node number >= 'nodeNum'. * * @param nodeNum The node of interest. * @return Index of the hit (might be == nHits, meaning no hit * could apply.) */ int findFirstHit(final int nodeNum) { // Figure out the first hit that involves this node. int hitNum = Arrays.binarySearch(hitsByLocation, null, new Comparator() { public int compare(Object o1, Object o2) { return ((Snippet)o1).endNode < nodeNum ? -1 : 1; } }); // None? Get out. assert hitNum < 0 : "Comparator should never return an exact match"; hitNum = -hitNum - 1; assert hitNum >= 0 && hitNum <= nHits; return hitNum; } // findFirstHit /** * Locates the last hit that could conceivably involve this node, that is, * the last hit with node number >= 'nodeNum'. * * @param nodeNum Node number of the element in question. * @return Index of the hit (might be == nHits, meaning no hit * could apply.) 
*/ int findLastHit(final int nodeNum) { // To figure out the last hit, we need the number of the next sibling, // or sibling of a parent. // NodeImpl node = getNode(nodeNum); while (node != null) { NodeImpl sib = (NodeImpl)node.getNextSibling(); if (sib != null) { node = sib; break; } node = (ParentNodeImpl)node.getParent(); } int lastNodeNum = (node == null) ? numberOfNodes : node.nodeNum; return findFirstHit(lastNodeNum); } // findLastHit() /** * Writes a disk-based version of an index. Use getIndex() later to read * it. This method is overriden to ensure that no virtual nodes ever get * written to a disk index. * * @param indexName Uniquely computed name * @param index HashMap mapping String -> ArrayList[NodeImpl] */ public void putIndex(String indexName, HashMap index) throws IOException { // Check each node in the index to make sure it's not virtual or // otherwise synthetic. // for (Iterator iter = index.values().iterator(); iter.hasNext();) { ArrayList list = (ArrayList)iter.next(); for (int i = 0; i < list.size(); i++) { Item item = (Item)list.get(i); NodeImpl node = (item instanceof ProxyElement) ? null : (item instanceof NodeImpl) ? ((NodeImpl)item) : (item instanceof StrippedNode) ? ((NodeImpl)((StrippedNode)item).getUnderlyingNode()) : null; if (node == null || node.nodeNum >= MARKER_BASE || ((node instanceof ParentNodeImpl) && ((ParentNodeImpl)node).childNum >= MARKER_BASE)) { throw new RuntimeException( "Error: Key index '" + indexName + "' references virtual search-related nodes.\n" + "Change the key so it doesn't reference dynamic " + "nodes, or else change the key's name to contain " + "'dynamic' so it won't be stored."); } } // for i } // for iter // All nodes are real... write the index to disk. super.putIndex(indexName, index); } // putIndex() /** * Get a list of all elements with a given name. This is implemented * as a memo function: the first time it is called for a particular * element type, it remembers the result for next time. 
* * It's overriden here to take the special case where "xtf:hit" or * "xtf:snippet" is specified. */ protected AxisIterator getAllElements(int fingerprint) { // Is it one of our special names? If not, do the normal thing. if (fingerprint != hitElementFingerprint && fingerprint != snippetElementFingerprint) return super.getAllElements(fingerprint); Trace.debug( " Building DYNAMIC list of elements named '" + namePool.getClarkName(fingerprint) + "'..."); // Both cases will result in a list of 'nHits' hits. ArrayList items = new ArrayList(nHits); // Now handle the cases. if (fingerprint == snippetElementFingerprint) { for (int i = 0; i < nHits; i++) items.add(createSnippetNode(i, false)); } // if else { assert fingerprint == hitElementFingerprint : "incorrect switching"; for (int i = 0; i < nHits; i++) { Snippet snippet = hitsByLocation[i]; boolean lastForHit = (snippet.startNode == snippet.endNode); items.add(createHitElement(true, lastForHit, i, false)); } } // else Trace.debug("done"); return new NodeListIterator(items); } // getAllElements() /** * DEBUGGING ONLY: Removes parts of the tree that haven't been loaded yet. * This can be useful to view the subset of the tree that have actually * been accessed. * * Note that to be useful, {@link #setAllPermanent(boolean)} should be * called before accessing the tree to ensure that all nodes referenced * are kept in RAM. */ public void pruneUnused() { assert allPermanent : "allPermanent should be true for pruneUnused()"; // To be able to reach every node in memory, we have to make sure all // of their parents and previous siblings are also in memory. Use a // stack to make sure we get them all (and any others we have to load). 
// NodeImpl[] stack = new NodeImpl[(numberOfNodes + nHits) * 3]; int top = 0; for (Iterator iter = nodeCache.values().iterator(); iter.hasNext();) { Object ref = iter.next(); if (ref instanceof NodeImpl) stack[top++] = (NodeImpl)ref; else assert false : "allPermanent should be true for pruneUnused()"; } // for iter // Keep processing until we've finished everything. while (top > 0) { NodeImpl node = stack[--top]; // Have we loaded the previous sibling? If not, load it and add it // to the stack for processing. // if (node.prevSibNum >= 0) { if (!nodeCache.containsKey(Integer.valueOf(node.prevSibNum))) stack[top++] = getNode(node.prevSibNum); assert nodeCache.containsKey(Integer.valueOf(node.prevSibNum)); } // Ditto the parent. if (node.parentNum != 0) stack[top++] = getNode(node.parentNum); } // while // Cool. We've loaded everything necessary to get to the nodes that // were loaded before. Kill off all other links. // for (Iterator iter = nodeCache.values().iterator(); iter.hasNext();) { NodeImpl node = (NodeImpl)iter.next(); if (node.prevSibNum >= 0 && !nodeCache.containsKey(Integer.valueOf(node.prevSibNum))) assert false : "Should have loaded prev sib"; if (node.nextSibNum >= 0 && !nodeCache.containsKey(Integer.valueOf(node.nextSibNum))) node.nextSibNum = -1; if (node instanceof ParentNodeImpl) { ParentNodeImpl pnode = (ParentNodeImpl)node; if (pnode.childNum >= 0 && !nodeCache.containsKey(Integer.valueOf(pnode.childNum))) pnode.childNum = -1; } } // for iter } // pruneUnused() public int getTotalHits() { return totalHits; } } // class SearchTree