package org.cdlib.xtf.textEngine;

/**
 * Copyright (c) 2004, Regents of the University of California
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 * - Neither the name of the University of California nor the names of its
 *   contributors may be used to endorse or promote products derived from this
 *   software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import org.apache.lucene.bigram.BigramQueryRewriter;
import org.apache.lucene.chunk.DocNumMap;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.spelt.SpellReader;
import org.apache.lucene.store.Directory;
// Added: the path-only constructor below calls NativeFSDirectory.getDirectory();
// the package is assumed to match Directory's, as in XTF's patched Lucene.
import org.apache.lucene.store.NativeFSDirectory;
import org.cdlib.xtf.textIndexer.TextIndexer;
import org.cdlib.xtf.util.CharMap;
import org.cdlib.xtf.util.Path;
import org.cdlib.xtf.util.WordMap;

/**
 * Used to keep a set of searcher, reader, and doc-num-map that are consistent
 * with each other and also up-to-date.
 *
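 * <p>A minimal usage sketch (hypothetical caller code; the index path and
 * check interval are illustrative, not taken from this class):</p>
 * <pre>
 *   XtfSearcher searcher = new XtfSearcher("/path/to/index", 60);
 *   searcher.update();                          // re-opens the reader if the index changed
 *   IndexReader reader = searcher.indexReader();
 *   DocNumMap numMap   = searcher.docNumMap();  // chunk/document number translation
 * </pre>
 *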
 * @author Martin Haye
 */
public class XtfSearcher 
{
  /** Path to the index directory */
  private String indexPath;

  /** The index directory to read from */
  private Directory directory;

  /** How often to check for an out-of-date directory */
  private long updatePeriod;

  /** Last time we checked for out-of-date */
  private long lastCheckTime;

  /** Version number of the index in memory */
  private long curVersion;

  /** Version number of index on disk */
  private long newVersion;

  /** Reader used to access the index */
  private IndexReader indexReader;

  /** Keeps track of which chunks belong to which documents */
  private DocNumMap docNumMap;

  /** Reader used to fetch spelling suggestions */
  private SpellReader spellReader;

  /** Max # of words in a chunk */
  private int chunkSize;

  /** Amount of overlap, in words, between adjacent chunks */
  private int chunkOverlap;

  /** Stop-words associated with the index (e.g. "the", "a", "and", etc.) */
  private Set stopSet;

  /** Map of plural words to singular words */
  private WordMap pluralMap;

  /** Map of accented chars to remove diacritics from */
  private CharMap accentMap;

  /** Set of all indexed fields in the index */
  private Set indexedFields;

  /** Set of all fields which are tokenized in the index */
  private Set tokenizedFields;

  /** Whether this index is "sparse" (i.e. more than 5 chunks per doc) */
  private boolean isSparse;

  /**
   * Construct a searcher set on the given directory.
   *
   * @param indexPath          Directory to load index data from
   * @param updateCheckSeconds How often to check for an updated index
   */
  public XtfSearcher(String indexPath, int updateCheckSeconds)
    throws IOException 
  {
    this(indexPath, NativeFSDirectory.getDirectory(indexPath), updateCheckSeconds);
  } // XtfSearcher

  /**
   * Construct a searcher set on the given directory.
   *
   * @param indexPath          Path to index directory
   * @param dir                Lucene version of the index directory
   * @param updateCheckSeconds How often to check for an updated index
   */
  public XtfSearcher(String indexPath, Directory dir, int updateCheckSeconds)
    throws IOException 
  {
    this.indexPath = indexPath;
    this.directory = dir;
    curVersion = -99;
    updatePeriod = ((long)updateCheckSeconds) * 1000;
    update();
  } // XtfSearcher

  /**
   * Check if the version we have in memory is up-to-date relative to that
   * on disk.
   */
  public boolean isUpToDate()
    throws IOException 
  {
    // Get the version on disk. If it's the same as the one we have in
    // memory, no problem.
    //
    newVersion = IndexReader.getCurrentVersion(directory);
    return (newVersion == curVersion);
  }

  /**
   * Ensures that this searcher is up-to-date with regard to the index on
   * disk.
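   *
   * <p>Illustrative polling pattern (hypothetical caller code; the actual
   * disk check is bounded by the updateCheckSeconds given at construction):</p>
   * <pre>
   *   while (serving) {
   *     searcher.update();  // cheap no-op until the check period elapses
   *     // ... run queries against searcher.indexReader() ...
   *   }
   * </pre>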
   */
  public void update()
    throws IOException 
  {
    // It would be too time-consuming to read the index version number
    // every single time. So only check it periodically.
    //
    if (curVersion >= 0) {
      long time = System.currentTimeMillis();
      if (time - lastCheckTime < updatePeriod)
        return;
      lastCheckTime = time;
    }

    // Get the version on disk. If it's the same as the one we have in
    // memory, no problem.
    //
    if (isUpToDate())
      return;

    // If we've been requested to never re-update, forget it.
    if (curVersion >= 0 && updatePeriod == 0)
      return;

    // Okay, better re-open to get the fresh data.
    close();
    indexReader = IndexReader.open(directory);

    // Fetch the index information chunk.
    Hits match = new IndexSearcher(indexReader).search(
        new TermQuery(new Term("indexInfo", "1")));
    if (match.length() == 0)
      throw new IOException("Index missing indexInfo doc");
    assert match.id(0) == 0 : "indexInfo chunk must be first in index";
    Document doc = match.doc(0);

    // Ensure that the index version is compatible.
    String indexVersion = doc.get("xtfIndexVersion");
    if (indexVersion == null)
      indexVersion = "1.0";
    if (indexVersion.compareTo(TextIndexer.REQUIRED_VERSION) < 0) {
      throw new IOException(
          "Incompatible index version " + indexVersion + "; require at least " +
          TextIndexer.REQUIRED_VERSION + "... consider re-indexing with '-clean'.");
    }

    // Validate the chunk size and overlap
    chunkSize = Integer.parseInt(doc.get("chunkSize"));
    chunkOverlap = Integer.parseInt(doc.get("chunkOvlp"));
    if (chunkSize <= 0 || chunkOverlap <= 0 || chunkOverlap >= chunkSize)
      throw new IOException("Invalid chunkSize/overlap in index");

    // Construct a map from doc # to chunk #'s (and vice-versa)
    docNumMap = new XtfDocNumMap(indexReader, chunkSize, chunkOverlap);

    // Get the stop-word set.
    String stopWords = doc.get("stopWords");
    stopSet = null;
    if (stopWords != null && stopWords.length() > 0)
      stopSet = BigramQueryRewriter.makeStopSet(stopWords);

    // If there's an accent map specified, load it.
    String accentMapName = doc.get("accentMap");
    if (accentMapName != null && accentMapName.length() > 0) {
      File accentFile = new File(indexPath, accentMapName);
      InputStream stream = new FileInputStream(accentFile);
      if (accentMapName.endsWith(".gz"))
        stream = new GZIPInputStream(stream);
      accentMap = new CharMap(stream);
    }

    // If there's a plural map specified, load it. Be sure to apply
    // the accent map, if any, so that plural words get mapped
    // whether they're accented or not.
    //
    String pluralMapName = doc.get("pluralMap");
    if (pluralMapName != null && pluralMapName.length() > 0) {
      File pluralFile = new File(indexPath, pluralMapName);
      InputStream stream = new FileInputStream(pluralFile);
      if (pluralMapName.endsWith(".gz"))
        stream = new GZIPInputStream(stream);
      pluralMap = new WordMap(stream, accentMap);
    }

    // If there's a spelling correction dictionary, attach to it.
    File spellDir = new File(indexPath, "spellDict");
    if (SpellReader.isValidDictionary(spellDir)) {
      spellReader = SpellReader.open(spellDir);
      spellReader.setStopwords(stopSet);
      spellReader.setWordEquiv(new XtfWordEquiv(accentMap, pluralMap));
    }

    // Determine whether this is a "sparse" index. Our definition of
    // sparse is that there are more than 5 chunks per document, meaning
    // that meta-data sorting and grouping will waste a lot of memory
    // if they allocate a slot per chunk.
    //
    int nDocs = indexReader.docFreq(new Term("docInfo", "1"));
    int nChunks = indexReader.maxDoc();
    isSparse = nChunks > (nDocs * 5);

    // Determine the list of all fields.
    indexedFields = new LinkedHashSet(
        indexReader.getFieldNames(IndexReader.FieldOption.ALL));

    // Determine which fields are tokenized.
    tokenizedFields = readTokenizedFields(indexPath, indexReader);

    // Remember the version that's now in memory.
    curVersion = newVersion;
  } // update()

  /**
   * Read in the list of fields that are tokenized in this index.
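   *
   * <p>tokenizedFields.txt is read one field name per line; illustrative
   * contents (the field names here are hypothetical):</p>
   * <pre>
   *   title
   *   creator
   *   subject
   * </pre>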
   */
  public static LinkedHashSet readTokenizedFields(String indexPath,
                                                  IndexReader indexReader)
    throws IOException 
  {
    LinkedHashSet tokenizedFields = new LinkedHashSet();

    // Read in the file listing all the tokenized fields (if any).
    File tokFieldsFile = new File(
        Path.normalizePath(indexPath + "/tokenizedFields.txt"));
    if (tokFieldsFile.canRead()) {
      BufferedReader reader = new BufferedReader(new FileReader(tokFieldsFile));
      String line;
      while ((line = reader.readLine()) != null)
        tokenizedFields.add(line);
      reader.close();
    }

    // Previous versions of XTF stored the list of tokenized fields in a
    // Lucene field. Turns out this was a bad idea, because it gets lost if
    // the document attached to that field is deleted. Still, let's retain
    // compatibility and read it if present.
    //
    TermEnum tokTerms = indexReader.terms(new Term("tokenizedFields", ""));
    do {
      Term t = tokTerms.term();
      if (t == null)
        break;
      if (!t.field().equals("tokenizedFields"))
        break;
      tokenizedFields.add(t.text());
    } while (tokTerms.next());

    // Of course, the "text" field is always tokenized.
    tokenizedFields.add("text");

    // Also of interest: the "sectionType" special field is always tokenized.
    tokenizedFields.add("sectionType");

    // All done.
    return tokenizedFields;
  }

  /**
   * Get the list of all tokenized fields.
   */
  public Set tokenizedFields() {
    return tokenizedFields;
  }

  /**
   * Gets the reader this searcher is using to read indexes.
   */
  public IndexReader indexReader() {
    return indexReader;
  }

  /**
   * Gets the set of all fields that have been indexed.
   */
  public Set indexedFields() {
    return indexedFields;
  }

  /**
   * Gets a map for translating chunk IDs to document IDs (and vice-versa)
   */
  public DocNumMap docNumMap() {
    return docNumMap;
  }

  /**
   * Find out how many words (max) are in a chunk.
   */
  public int chunkSize() {
    return chunkSize;
  }

  /**
   * Find out how many words adjacent chunks can overlap.
   */
  public int chunkOverlap() {
    return chunkOverlap;
  }

  /**
   * Find out the set of stop words, or null if none.
   */
  public Set stopSet() {
    return stopSet;
  }

  /**
   * Find out the plural mapping, or null if none.
   */
  public WordMap pluralMap() {
    return pluralMap;
  }

  /**
   * Find out the accent mapping, or null if none.
   */
  public CharMap accentMap() {
    return accentMap;
  }

  /**
   * Get the spelling suggestion reader, or null if none.
   */
  public SpellReader spellReader() {
    return spellReader;
  }

  /**
   * Find out if the index is sparse (i.e. more than 5 chunks per doc)
   */
  public boolean isSparse() {
    return isSparse;
  }

  /**
   * Close down the searcher and all its dependencies.
   */
  public void close()
    throws IOException 
  {
    docNumMap = null;

    if (indexReader != null) {
      indexReader.close();
      indexReader = null;
    }

    if (spellReader != null) {
      spellReader.close();
      spellReader = null;
    }

    // Sentinel value: forces a full re-open on the next update() call.
    curVersion = -99;
  } // close()
} // class XtfSearcher