package org.cdlib.xtf.textEngine;

/**
 * Copyright (c) 2004, Regents of the University of California
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 * - Neither the name of the University of California nor the names of its
 *   contributors may be used to endorse or promote products derived from this
 *   software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
import java.io.IOException;
import org.apache.lucene.chunk.DocNumMap;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;

/**
 * Used to map chunk indexes to the corresponding document index, and
 * vice-versa. Only performs the load when necessary (typically dynaXML uses
 * the DocNumMap, while crossQuery doesn't).
 *
 * @author Martin Haye
 */
public class XtfDocNumMap implements DocNumMap 
{
  /** Where to get the data from */
  private IndexReader reader;

  /** Max number of words in a chunk */
  private int chunkSize;

  /** Number of words one chunk overlaps with the next */
  private int chunkOverlap;

  /** Total number of docInfo chunks found */
  private int nDocs;

  /** Array of indexes, one for each docInfo chunk */
  private int[] docNums = null; /* null until load() called */

  /** Caches the number passed to the previous scan, used for speed */
  private int prevNum = -1;

  /** Used in binary searching */
  private int low = -1;

  /** Used in binary searching */
  private int high = -1;

  /**
   * Make a map for the given reader. The actual read of the docInfo chunks
   * (to determine the range of text chunks for each document) is deferred
   * until the map is first consulted.
   */
  public XtfDocNumMap(IndexReader reader, int chunkSize, int chunkOverlap)
    throws IOException 
  {
    this.reader = reader;
    this.chunkSize = chunkSize;
    this.chunkOverlap = chunkOverlap;
  } // constructor

  private synchronized void load() 
  {
    // If already loaded, don't do it again.
    if (docNums != null)
      return;

    try {
      // Figure out how many entries we'll have, and make our array
      // that big.
      //
      Term term = new Term("docInfo", "1");
      nDocs = reader.docFreq(term);
      docNums = new int[nDocs];

      // Get a list of all the "header" chunks for documents in this
      // index (i.e., documents with a "docInfo" field.)
      //
      TermDocs docHeaders = reader.termDocs(term);

      // Record each document number. Since the TermDocs enumeration skips
      // deleted documents, it may yield fewer entries than docFreq()
      // reported above.
      int i = 0;
      while (docHeaders.next())
        docNums[i++] = docHeaders.doc();
      nDocs = i; // Account for possibly deleted docs
    }
    catch (IOException e) {
      throw new RuntimeException(e);
    }
  } // load()
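  // To illustrate the layout the accessors below depend on (inferred from
  // their logic; the chunk numbers are hypothetical): each document's text
  // chunks are stored contiguously and are immediately followed by that
  // document's "docInfo" header chunk, whose Lucene doc number is what
  // docNums[] records. For an index holding document A (3 chunks) and
  // document B (2 chunks):
  //
  //   0:    index info chunk
  //   1-3:  text chunks of A
  //   4:    docInfo for A      <- docNums[0] == 4
  //   5-6:  text chunks of B
  //   7:    docInfo for B      <- docNums[1] == 7
  //
  // So getDocNum(5) == 7, getFirstChunk(7) == 5, and getLastChunk(7) == 6.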
  /** Get the max number of words per chunk */
  public int getChunkSize() {
    return chunkSize;
  }

  /** Get the number of words one chunk overlaps with the next */
  public int getChunkOverlap() {
    return chunkOverlap;
  }

  /**
   * Return a count of the number of documents (not chunks) in the index.
   */
  public final int getDocCount() {
    return nDocs;
  }

  /**
   * Given a chunk number, return the number of the document it is part of.
   * Note that like all Lucene document numbers, the result is ephemeral and
   * only applies to the given reader. If not found, returns -1; this can
   * basically only happen if the chunk number is greater than all document
   * numbers.
   *
   * @param chunkNumber Chunk number to translate
   * @return Document index, or -1 if no match.
   */
  public final synchronized int getDocNum(int chunkNumber) 
  {
    // Do a binary search for the chunk.
    scan(chunkNumber);

    // Return the upper end, since the document info is written after
    // all of its chunks.
    //
    if (high == nDocs)
      return -1;
    return docNums[high];
  } // getDocNum()

  /**
   * Given a document number, this method returns the number of its first
   * chunk.
   */
  public final synchronized int getFirstChunk(int docNum) 
  {
    // Scan for the document.
    scan(docNum);

    // If not found, get out.
    if (low < 0 || docNums[low] != docNum)
      return -1;

    if (low == 0)
      return 1; // Account for index info chunk
    else
      return docNums[low - 1] + 1;
  } // getFirstChunk()

  /**
   * Given a document number, this method returns the number of its last
   * chunk.
   */
  public final int getLastChunk(int docNum) {
    return docNum - 1;
  }

  /**
   * Perform a binary search looking for the given number. On exit, the
   * 'low' and 'high' member variables will be indexes into the array that
   * bracket the value.
   *
   * @param num The number to look for.
   */
  private void scan(int num) 
  {
    // Early-out if we just searched for this same number.
    if (num == prevNum)
      return;

    // Make sure we load the data the first time. We do this lazily because
    // some indexes are only used for crossQuery, which doesn't really use
    // the info in a DocNumMap.
    //
    load();

    // Perform a simple binary search.
    int high = nDocs;
    int low = -1;
    int probe;
    while (high - low > 1) {
      probe = (high + low) / 2;
      if (docNums[probe] > num)
        high = probe;
      else
        low = probe;
    }

    // At this point, low and high bracket the value searched for.
    assert low == -1 || docNums[low] <= num;
    assert high == nDocs || docNums[high] > num;

    this.low = low;
    this.high = high;
    prevNum = num; // Remember the query so the early-out above works.
  } // scan()
} // class XtfDocNumMap
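// A minimal usage sketch, assuming an XTF chunked index at a hypothetical
// path; chunkSize and chunkOverlap must match the values the index was
// built with (200 and 20 below are illustrative, not defaults taken
// from XTF):
//
//   IndexReader reader = IndexReader.open("/path/to/index");
//   DocNumMap map = new XtfDocNumMap(reader, 200, 20);
//   int doc   = map.getDocNum(chunk);    // chunk -> owning document
//   int first = map.getFirstChunk(doc);  // document -> first text chunk
//   int last  = map.getLastChunk(doc);   // document -> last text chunk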