package org.cdlib.xtf.textIndexer; import java.util.ArrayList; /** * Copyright (c) 2004, Regents of the University of California * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the University of California nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// /** * This class maintains configuration information about the current index that * the TextIndexer program is processing. <br><br> * * Information stored by this class includes: <br><br> * * - The name of the current index being processed. <br> * - The path where the Lucene index database is (to be) stored. <br> * - The path where the source text for this index can be found. <br> * - The path where any XSLT input filters for this index can be found. <br> * - A specification for source text files to ignore. <br> * - The text chunk size and overlap attributes for the current index. <br> * - Specifications for stop word removal. <br><br> * */ public class IndexInfo { /** Name of the current index being processed (as specified in the index * configuration file.) */ public String indexName; /** Name of a sub-directory to index, or null to index everything */ public ArrayList<String> subDirs; /** Name of the path to the current index's Lucene database. */ public String indexPath; /** Whether index rotation is enabled */ public boolean rotate = false; /** Path to the source text for the current index. */ public String sourcePath; /** * True to scan all dirs, false for pruned (e.g. stop at first data). * Defaults to false for backward compatibility. */ public boolean scanAllDirs = false; /** * True to make a clone of the data in index/dataClone. Useful so that * dynaXML can always get to files that match the index. */ public boolean cloneData = false; /** Path to stylesheet used to determine which documents to index */ public String docSelectorPath; /** Set of stop words to remove. Stop words are common words such as "the", * "and", etc. which are so ubiquitous as to add little value to queries. * Rather than remove them entirely however, we take an approach suggested * by Doug Cutting (inventor of Lucene).<br><br> * * Basically, stop words are joined to surrounding normal words. This speeds * queries while still producing good results for requests that contain * a mixture of stop words and normal words (which is by far the most common * case for queries.) <br><br> * * For example, the string "man of war" would be indexed like this: * "man man-of of-war war". This way, searching for "man war" will pull up a * hit, but a search for "man of war" will score higher, as long as the same * stop-word approach is applied to the query.<br><br> * * You might ask what happens in this case: "joke of the year" (two stop * words in a row.) We could index it as "joke joke-of of-the the-year", or * as the longer but more complete "joke joke-of joke-of-the of-the * of-the-year the-year". The second form doesn't offer much improvement * in searching and would make the index bigger and logic more complex. * So we always combine a stop word with at most one neighboring word. * <br><br> * * The words in this list may be separated by spaces, commas, and/or * semicolons. */ public String stopWords; /** Path to a mapping from plural words to their corresponding singular * forms that the textIndexer should fold together. This can yield better * search results. For instance, if a user searches for "cat" they probably * also would like results for "cats." * * The file should be a plain text file, with one word pair per line. * First is the plural form of a word, followed by a "|" character, * followed by the singular form. All should be lowercase, even in the * case of acronyms. * * Optionally, the file may be compressed in GZIP format, in which case * it must end in the extension ".gz". * * Non-ASCII characters should be encoded in UTF-8 format. */ public String pluralMapPath; /** Path to a mapping from accented characters to their corresponding * chars with teh diacritics removed. These chars will be folded together * which can yield better search results. For instance, a German user * on an American keyboard might want to find "Hut" with an umlaut over the * "u", but can't type the umlaut. This way, if they type "hat" they'll still * get a match. * * The file should be a plain text file, with one code pair per line. * First is the 4-digit hex Unicode point for the accented character, * followed by "|", then the 4-digit hex code for the unaccented form. */ public String accentMapPath; /** Path to a set of validation specifications for this index. This is * essentially a list of URLs, with specifications on how many hits * should be returned by each one. Validation is applied at index time * to determine if the index is valid (and before rotating), and is * also applied by the servlets before rotating in a new index. * * The file should be XML in the defined format. */ public String validationPath; /** Whether to create a spellcheck dictionary for this index */ public boolean createSpellcheckDict = false; /** * Whether to strip whitespace between elements in lazy tree files. Not * strictly safe for all XML documents, but it can make lazy trees * somewhat smaller and faster. */ public boolean stripWhitespace = false; /** Text chunk attribute array. Currently this array consists of two entries: * <br><br> * * - The size of the text chunk in words. <br> * - The overlap in words of adjacent text chunks. <br><br> * * These array members should be addressed using <code>chunkSize</code>} * and <code>chunkOvlp</code> constants defined by this class. * <br><br> * * @.notes For an explanation of the text chunk size and overlap attributes, * see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize} * and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}. */ public int[] chunkAtt; /** Index into Chunk Attribute Array for the chunk size attribute. <br><br> * * Indexed text stored in the a Lucine index is broken up in to small chunks * so that search result "summary blurbs" can be easily generated without * having to load the entire source text. The chunk size attribute reflects * the chunk size (in words) used by the current index. * */ public final static int chunkSize = 0; /** Index into Chunk Attribute Array for the chunk size attribute. <br><br> * * Indexed text stored in the a Lucine index is broken up in to small chunks * that overlap with adjacent chunks so that "summary blurbs" for proximity * searches can be easily generated without having to load the entire source * text. The chunk overlap attribute reflects the overlap (in words) used by * the current index. * */ public final static int chunkOvlp = 1; /** Constant defining the minimum size (in words) of a text chunk. * Value = {@value}. <br><br> * * @.notes For an explanation of the text chunk size and overlap attributes, * see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize} * and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}. */ public final static int minChunkSize = 2; /** Constant defining the default size (in words) of a text chunk. * Value = {@value}. <br><br> * * @.notes For an explanation of the text chunk size and overlap attributes, * see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize} * and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}. */ public final static int defaultChunkSize = 100; /** Constant defining the default overlap (in words) of two adjacent text * chunks. Value = {@value}. * * @.notes For an explanation of the text chunk size and overlap attributes, * see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize} * and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}. */ public final static int defaultChunkOvlp = 50; /** Constant defining the default list of stop words. These are common words * that are so ubiquitous as to be of little use in queries. Value = {@value}. * * @.notes For an explanation of stop word handling, * see {@link #stopWords stopWords} */ public final static String defaultStopWords = "a an and are as at be but by for if in into is it no not of on or s " + "such t that the their then there these they this to was will with"; //////////////////////////////////////////////////////////////////////////// /** * Default constructor. <br><br> * * Creates the chunk attribute array, and initializes the * <code>chunkSize</code> entry to * {@link org.cdlib.xtf.textIndexer.IndexInfo#defaultChunkSize defaultChunkSize}, * and the <code>chunkOvlp</code> entry to * {@link org.cdlib.xtf.textIndexer.IndexInfo#defaultChunkOvlp defaultChunkOvlp}. */ public IndexInfo() { // Create the chunk attribute array. chunkAtt = new int[2]; // Set the default chunk size and overlap. chunkAtt[chunkSize] = defaultChunkSize; chunkAtt[chunkOvlp] = defaultChunkOvlp; } //public IndexInfo() //////////////////////////////////////////////////////////////////////////// /** * Alternate constructor. <br><br> * * Initializes the fields needed to use InputStream-based indexing (that is, * all fields except subDir, sourcePath, and docSelectorPath.) * * Uses default values for chunk size/overlap, and for the stop word list. * After construction, these may of course be altered if desired. */ public IndexInfo(String indexName, String indexPath) { // Record the input parameters this.indexName = indexName; this.indexPath = indexPath; // Create the chunk attribute array. chunkAtt = new int[2]; // Set the default chunk size and overlap. chunkAtt[chunkSize] = defaultChunkSize; chunkAtt[chunkOvlp] = defaultChunkOvlp; // Use a default stop-word list. stopWords = defaultStopWords; } //public IndexInfo() //////////////////////////////////////////////////////////////////////////// /** Return the size of a text chunk for the current index. <br><br> * * @return The value of the <code>chunkSize</code> attribute. <br><br> * * @.notes * For an explanation of the text chunk size and overlap attributes, * see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize} * and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}. */ public int getChunkSize() { return chunkAtt[chunkSize]; } //////////////////////////////////////////////////////////////////////////// /** Return the size of a text chunk (in words) for the current index * as a string. <br><br> * * @return The value of the <code>chunkSize</code> attribute converted * to a String. <br><br> * * @.notes This method is intended as a convenience call for code that * creats Lucene fields, which are all stored as strings. * <br><br> * * For an explanation of the text chunk size and overlap attributes, * see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize} * and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}. */ public String getChunkSizeStr() { return Integer.toString(chunkAtt[chunkSize]); } //////////////////////////////////////////////////////////////////////////// /** Return the overlap of two adjacent text chunks for the current index. * <br><br> * * @return The value of the <code>chunkOvlp</code> attribute. <br><br> * * @.notes * For an explanation of the text chunk size and overlap attributes, * see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize} * and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}. */ public int getChunkOvlp() { return chunkAtt[chunkOvlp]; } //////////////////////////////////////////////////////////////////////////// /** Return the overlap (in words) for two adjacent text text chunks in the * current index as a string. <br><br> * * @return The value of the <code>chunkOvlp</code> attribute * converted to a String. <br><br> * * @.notes This method is intended as a convenience call for code that * creats Lucene fields, which are all stored as strings. * <br><br> * * For an explanation of the text chunk size and overlap attributes, * see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize} * and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}. */ public String getChunkOvlpStr() { return Integer.toString(chunkAtt[chunkOvlp]); } //////////////////////////////////////////////////////////////////////////// /** Sets the text chunk size attribute for the current index. <br><br> * * This method sets the value for the <code>chunkSize</code> * attribute, coercing its value to be greater than or equal to the * {@link org.cdlib.xtf.textIndexer.IndexInfo#minChunkSize minChunkSize} * value. <br><br> * * @return The resulting coerced chunkSize value. <br><br> * * @.notes This function also calls the * {@link org.cdlib.xtf.textIndexer.IndexInfo#setChunkOvlp(int) setChunkOvlp()} * method to ensure that the overlap value is valid for the * chunk size set by this call. * <br><br> * * For an explanation of the text chunk size and overlap attributes, * see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize} * and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}. */ public int setChunkSize(int newChunkSize) { // If a negative chunk size was passed in, default to entire // document indexing. // if (newChunkSize < minChunkSize) newChunkSize = minChunkSize; // Set the new chunk size. chunkAtt[chunkSize] = newChunkSize; // Force the chunk overlap to be valid for the new size. chunkAtt[chunkOvlp] = setChunkOvlp(chunkAtt[chunkOvlp]); // Return the (possibly coerced) chunk size to the caller. return chunkAtt[chunkSize]; } // public setChunkSize() //////////////////////////////////////////////////////////////////////////// /** Sets the adjacent chunk overlap attribute for the current index. <br><br> * * This method sets the value for the * {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp} attribute, * coercing its value to be less than or equal to the half the current chunk * size for the current index. <br><br> * * @return The resulting coerced chunkOvlp value. <br><br> * * For an explanation of the text chunk size and overlap attributes, * see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize} * and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}. */ public int setChunkOvlp(int newChunkOverlap) { // If the chunk overlap is more than 1/2 the chunk size, // force it to be half the chunk size. // if (newChunkOverlap > chunkAtt[chunkSize] / 2) newChunkOverlap = chunkAtt[chunkSize] / 2; // Set the new chunk overlap value. chunkAtt[chunkOvlp] = newChunkOverlap; // And return the (possibly coerced) result to the caller. return chunkAtt[chunkOvlp]; } // public setChunkOverlap() } // class IndexInfo