IndexInfo.java example

Explorer

xtf-dsc-master
- WEB-INF
  - contrib
    - xtf-lucene
      - src
        java
        org
        apache
        lucene
        bigram
        BigramQueryRewriter.java
        BigramSpanRangeQuery.java
        BigramSpanWildcardQuery.java
        BigramStopFilter.java
        chunk
        Chunk.java
        ChunkMarkPos.java
        ChunkSource.java
        ChunkedWordIter.java
        DocNumMap.java
        SpanChunkedNotQuery.java
        SpanDechunkingQuery.java
        SparseStringComparator.java
        limit
        ExcessiveWorkException.java
        LimIndexReader.java
        LimTermDocs.java
        LimTermPositions.java
        TermLimitException.java
        mark
        BasicMarkPos.java
        BasicWordIter.java
        ContextMarker.java
        MarkCollector.java
        MarkPos.java
        WordIter.java
        search
        FieldSpanSource.java
        FlippableStringComparator.java
        QueryRewriter.java
        QueryTraverser.java
        RecordingSearcher.java
        SpanHitCollector.java
        spans
        EmptySpans.java
        FieldSpans.java
        NearSpans.java
        OrNearSpans.java
        Span.java
        SpanFirstQuery.java
        SpanNearQuery.java
        SpanNotNearQuery.java
        SpanNotQuery.java
        SpanOrNearQuery.java
        SpanOrQuery.java
        SpanPosComparator.java
        SpanQuery.java
        SpanRangeQuery.java
        SpanRecordingScorer.java
        SpanScorer.java
        SpanTermQuery.java
        SpanWeight.java
        SpanWildcardQuery.java
        Spans.java
        spell
        SpellKeywordTest.java
        spelt
        DoubleMetaphone.java
        FreqData.java
        LuceneIndexToDict.java
        MinimalAnalyzer.java
        QuerySpeller.java
        SimpleQueryRewriter.java
        SpellReader.java
        SpellTestCmdLine.java
        SpellWriter.java
        SpellWritingAnalyzer.java
        SpellWritingFilter.java
        TRStringDistance2.java
        WordEquiv.java
        util
        CountedInputStream.java
        CountedOutputStream.java
        FileSorter.java
        Hash64.java
        IntList.java
        LongList.java
        LongSet.java
        Prime.java
        PriorityQueue.java
        ProgressTracker.java
        RandomAccessInputStream.java
        StringUtil.java
      - test
        java
        org
        apache
        lucene
        spelt
        DoubleMetaphoneTest.java
        FreqDataTest.java
        LuceneIndexToDictTest.java
        QuerySpellerTest.java
        SimpleQueryRewriterTest.java
        SpellReadWriteTest.java
        SpellWritingAnalyzerTest.java
        TRStringDistance2Test.java
        util
        CountedInputStreamTest.java
        CountedOutputStreamTest.java
        FileSorterTest.java
        Hash64Test.java
        IntListTest.java
        LongListTest.java
        LongSetTest.java
        PrimeTest.java
        ProgressTrackerTest.java
        RandomAccessInputStreamTest.java
        StringUtilTest.java
  - src
    - net
      - sf
        saxon
        tinytree
        HackedTinyBuilder.java
        trans
        KeyManager.java
    - org
      - cdlib
        xtf
        cache
        Cache.java
        CacheDependency.java
        Dependency.java
        FileDependency.java
        GeneratingCache.java
        SimpleCache.java
        StringCache.java
        crossQuery
        CrossQuery.java
        CrossQueryConfig.java
        QueryRoute.java
        QueryRouteException.java
        TimeProfilingListener.java
        raw
        RawQuery.java
        test
        TestableCrossQuery.java
        dynaXML
        Authenticator.java
        DefaultDocLocator.java
        DocLocator.java
        DocRequest.java
        DynaXML.java
        DynaXMLConfig.java
        DynaXMLException.java
        InvalidDocumentException.java
        IpList.java
        NoPermissionException.java
        UnsupportedQueryException.java
        test
        TestableDynaXML.java
        lazyTree
        AncestorEnumeration.java
        AttributeEnumeration.java
        AttributeImpl.java
        ChildEnumeration.java
        DescendantEnumeration.java
        ElementImpl.java
        FastNodeTestPattern.java
        Flag.java
        FollowingEnumeration.java
        FollowingSiblingEnumeration.java
        LazyDocument.java
        LazyHashMap.java
        LazyKeyManager.java
        LazyProfilingListener.java
        LazyTreeBuilder.java
        NodeImpl.java
        ParentNodeImpl.java
        PersistentTree.java
        PrecedingEnumeration.java
        PrecedingOrAncestorEnumeration.java
        PrecedingSiblingEnumeration.java
        ProxyAttributeEnumeration.java
        ProxyAttributeImpl.java
        ProxyElement.java
        SearchElement.java
        SearchElementImpl.java
        SearchNode.java
        SearchTextImpl.java
        SearchTree.java
        TextImpl.java
        TreeEnumeration.java
        saxonExt
        ElementWithContent.java
        Exec.java
        Image.java
        InstructionWithContent.java
        Mail.java
        Pipe.java
        Redirect.java
        SQL.java
        exec
        ArgElement.java
        InputElement.java
        PipeImageElement.java
        PipeImageInstruction.java
        RunElement.java
        RunInstruction.java
        image
        ImageCache.java
        OutputElement.java
        mail
        SendElement.java
        pipe
        PipeBufferPool.java
        PipeFileElement.java
        PipeFopElement.java
        PipeRequestElement.java
        redirect
        HttpErrorElement.java
        RedirectElement.java
        sql
        SQLClose.java
        SQLColumn.java
        SQLConnect.java
        SQLDelete.java
        SQLInsert.java
        SQLProperty.java
        SQLQuery.java
        SQLUpdate.java
        servletBase
        CQLParseException.java
        DTDSuppressingXMLReader.java
        LatencyCutoffStream.java
        RedirectException.java
        SessionURLRewriter.java
        StylesheetCache.java
        TextConfig.java
        TextServlet.java
        test
        FakeOutputStream.java
        FakeServletConfig.java
        FakeServletContext.java
        FakeServletRequest.java
        FakeServletResponse.java
        NullOutputStream.java
        RegressTest.java
        textEngine
        AccentFoldingRewriter.java
        BoostSet.java
        BoostSetParams.java
        BoundedMarkPos.java
        BoundedWordIter.java
        ConfigCache.java
        Constants.java
        DefaultQueryProcessor.java
        DocHit.java
        DocHitImpl.java
        FlippingDirectory.java
        HitLoadException.java
        HitQueue.java
        IndexUtil.java
        IndexValidator.java
        IndexWarmer.java
        MoreLikeThisQuery.java
        NativeFSDirectory.java
        NumericFieldData.java
        NumericRangeQuery.java
        PluralFoldingRewriter.java
        QueryContext.java
        QueryGenException.java
        QueryProcessor.java
        QueryRequest.java
        QueryRequestParser.java
        QueryResult.java
        RefieldingQueryRewriter.java
        SlopFixupRewriter.java
        Snippet.java
        SnippetMaker.java
        SpanExactQuery.java
        SpanSectionTypeQuery.java
        SpellSuggRewriter.java
        SpellcheckParams.java
        SpellingSuggestion.java
        StdTermFilter.java
        StdTermRewriter.java
        TotalHitsComparator.java
        UnspanningQueryRewriter.java
        XtfBigramQueryRewriter.java
        XtfChunk.java
        XtfChunkMarkPos.java
        XtfChunkSource.java
        XtfChunkedWordIter.java
        XtfDocNumMap.java
        XtfLimIndexReader.java
        XtfQueryRewriter.java
        XtfQueryTraverser.java
        XtfSearcher.java
        XtfSpanRangeQuery.java
        XtfSpanWildcardQuery.java
        XtfWordEquiv.java
        facet
        ChildSelector.java
        DescendantSelector.java
        DocsSelector.java
        DynamicGroupData.java
        EmptySelector.java
        FRBRData.java
        FRBRGroupData.java
        FacetSpec.java
        GroupCounts.java
        GroupData.java
        GroupSelector.java
        MarkSelector.java
        NameSelector.java
        PageSelector.java
        ParseException.java
        RangeSelector.java
        ResultFacet.java
        ResultGroup.java
        RootSelector.java
        SelectedSelector.java
        SelectorParser.java
        SelectorParserConstants.java
        SelectorParserTokenManager.java
        SiblingSelector.java
        SimpleCharStream.java
        SingletonSelector.java
        StaticGroupData.java
        Token.java
        TokenMgrError.java
        TopChoiceSelector.java
        UnionSelector.java
        freeform
        CharStream.java
        FreeformQueryParser.java
        FreeformQueryParserConstants.java
        FreeformQueryParserTokenManager.java
        ParseException.java
        SimpleCharStream.java
        Token.java
        TokenMgrError.java
        textIndexer
        AccentFoldingFilter.java
        CrimsonBugWorkaround.java
        DocSelCache.java
        FacetTokenizer.java
        HTMLIndexSource.java
        HTMLToString.java
        IdxTreeCleaner.java
        IdxTreeCuller.java
        IdxTreeDictMaker.java
        IdxTreeOptimizer.java
        IndexDump.java
        IndexInfo.java
        IndexMerge.java
        IndexRecord.java
        IndexSource.java
        IndexStats.java
        IndexSync.java
        IndexerConfig.java
        MARCIndexSource.java
        MSWordIndexSource.java
        PDFIndexSource.java
        PDFToString.java
        PluralFoldingFilter.java
        SectionInfo.java
        SectionInfoStack.java
        SpellWritingFilter.java
        SrcTreeProcessor.java
        StartEndFilter.java
        StructuredFileProxy.java
        TagFilter.java
        TextIndexSource.java
        TextIndexer.java
        TextIndexerException.java
        XMLConfigParser.java
        XMLIndexSource.java
        XMLTextProcessor.java
        XTFTextAnalyzer.java
        XtfSpecialTokensFilter.java
        tokenizer
        CharStream.java
        FastCharStream.java
        ParseException.java
        Token.java
        TokenMgrError.java
        Tokenizer.java
        XTFTokenizer.java
        XTFTokenizerConstants.java
        XTFTokenizerTokenManager.java
        util
        ArrayUtil.java
        Attrib.java
        AttribList.java
        Base64.java
        CharMap.java
        CheckingTokenStream.java
        CircularQueue.java
        ConsecutiveMap.java
        DirSync.java
        DiskHashReader.java
        DiskHashWriter.java
        DocTypeDeclRemover.java
        EasyNode.java
        EmbeddedList.java
        FastIntCache.java
        FastStringCache.java
        FastStringReader.java
        FastTokenizer.java
        FileWalker.java
        FloatList.java
        GeneralException.java
        IntHash.java
        IntMultiMap.java
        LimitedOutputStream.java
        LineReader.java
        Linkable.java
        LinkableImpl.java
        Normalizer.java
        PackedByteBuf.java
        Path.java
        ProcessRunner.java
        StringHash.java
        StructuredFile.java
        StructuredStore.java
        SubDirFilter.java
        SubFileReader.java
        SubFileWriter.java
        SubStoreReader.java
        SubStoreWriter.java
        TagArray.java
        TagChars.java
        Tester.java
        ThreadWatcher.java
        Trace.java
        TraceWriter.java
        WordMap.java
        XMLFormatter.java
        XMLWriter.java
        XTFSaxonErrorListener.java
        xslt
        CharUtils.java
        FileUtils.java
        FreeformQuery.java
        Session.java
        XMLStubReader.java
        zing
        SRU.java
        SRUConfig.java

package org.cdlib.xtf.textIndexer;

import java.util.ArrayList;


/**
 * Copyright (c) 2004, Regents of the University of California
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 * - Neither the name of the University of California nor the names of its
 *   contributors may be used to endorse or promote products derived from this
 *   software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////

/**
 * This class maintains configuration information about the current index that
 * the TextIndexer program is processing. <br><br>
 *
 * Information stored by this class includes: <br><br>
 *
 * - The name of the current index being processed. <br>
 * - The path where the Lucene index database is (to be) stored. <br>
 * - The path where the source text for this index can be found. <br>
 * - The path where any XSLT input filters for this index can be found. <br>
 * - A specification for source text files to ignore. <br>
 * - The text chunk size and overlap attributes for the current index. <br>
 * - Specifications for stop word removal. <br><br>
 *
 */
public class IndexInfo 
{
  /** Name of the current index being processed (as specified in the index
   *  configuration file.)
   */
  public String indexName;

  /** Names of the sub-directories to index, or null to index everything */
  public ArrayList<String> subDirs;

  /** Name of the path to the current index's Lucene database. */
  public String indexPath;
  
  /** Whether index rotation is enabled */
  public boolean rotate = false;

  /** Path to the source text for the current index. */
  public String sourcePath;
  
  /** 
   * True to scan all dirs, false for pruned (e.g. stop at first data).
   * Defaults to false for backward compatibility.
   */
  public boolean scanAllDirs = false;
  
  /**
   * True to make a clone of the data in index/dataClone. Useful so that
   * dynaXML can always get to files that match the index.
   */
  public boolean cloneData = false;
  
  /** Path to stylesheet used to determine which documents to index */
  public String docSelectorPath;

  /** Set of stop words to remove. Stop words are common words such as "the",
   *  "and", etc. which are so ubiquitous as to add little value to queries.
   *  Rather than remove them entirely however, we take an approach suggested
   *  by Doug Cutting (inventor of Lucene).<br><br>
   *
   *  Basically, stop words are joined to surrounding normal words. This speeds
   *  queries while still producing good results for requests that contain
   *  a mixture of stop words and normal words (which is by far the most common
   *  case for queries.) <br><br>
   *
   *  For example, the string "man of war" would be indexed like this:
   *  "man man-of of-war war". This way, searching for "man war" will pull up a
   *  hit, but a search for "man of war" will score higher, as long as the same
   *  stop-word approach is applied to the query.<br><br>
   *
   *  You might ask what happens in this case: "joke of the year" (two stop
   *  words in a row.) We could index it as "joke joke-of of-the the-year", or
   *  as the longer but more complete "joke joke-of joke-of-the of-the
   *  of-the-year the-year". The second form doesn't offer much improvement
   *  in searching and would make the index bigger and logic more complex.
   *  So we always combine a stop word with at most one neighboring word.
   *  <br><br>
   *
   *  The words in this list may be separated by spaces, commas, and/or
   *  semicolons. <br><br>
   *
   *  Note that only the two-argument constructor fills this in (with
   *  {@link #defaultStopWords defaultStopWords}); the default constructor
   *  leaves it null.
   */
  public String stopWords;

  /** Path to a mapping from plural words to their corresponding singular
   *  forms that the textIndexer should fold together. This can yield better
   *  search results. For instance, if a user searches for "cat" they probably
   *  also would like results for "cats."
   *
   *  The file should be a plain text file, with one word pair per line.
   *  First is the plural form of a word, followed by a "|" character,
   *  followed by the singular form. All should be lowercase, even in the
   *  case of acronyms.
   *
   *  Optionally, the file may be compressed in GZIP format, in which case
   *  it must end in the extension ".gz".
   *
   *  Non-ASCII characters should be encoded in UTF-8 format.
   */
  public String pluralMapPath;

  /** Path to a mapping from accented characters to their corresponding
   *  chars with the diacritics removed. These chars will be folded together
   *  which can yield better search results. For instance, a German user
   *  on an American keyboard might want to find "Hut" with an umlaut over the
   *  "u", but can't type the umlaut. This way, if they type "hut" they'll still
   *  get a match.
   *
   *  The file should be a plain text file, with one code pair per line.
   *  First is the 4-digit hex Unicode point for the accented character,
   *  followed by "|", then the 4-digit hex code for the unaccented form.
   */
  public String accentMapPath;

  /** Path to a set of validation specifications for this index. This is
   *  essentially a list of URLs, with specifications on how many hits
   *  should be returned by each one. Validation is applied at index time
   *  to determine if the index is valid (and before rotating), and is
   *  also applied by the servlets before rotating in a new index.
   *
   *  The file should be XML in the defined format.
   */
  public String validationPath;

  /** Whether to create a spellcheck dictionary for this index */
  public boolean createSpellcheckDict = false;

  /**
   * Whether to strip whitespace between elements in lazy tree files. Not
   * strictly safe for all XML documents, but it can make lazy trees
   * somewhat smaller and faster.
   */
  public boolean stripWhitespace = false;

  /** Text chunk attribute array. Currently this array consists of two entries:
   *  <br><br>
   *
   *  - The size of the text chunk in words. <br>
   *  - The overlap in words of adjacent text chunks. <br><br>
   *
   * These array members should be addressed using the <code>chunkSize</code>
   * and <code>chunkOvlp</code> constants defined by this class.
   * <br><br>
   *
   * @.notes  For an explanation of the text chunk size and overlap attributes,
   *       see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize}
   *       and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}.
   */
  public int[] chunkAtt;

  /** Index into Chunk Attribute Array for the chunk size attribute. <br><br>
   *
   *  Indexed text stored in a Lucene index is broken up into small chunks
   *  so that search result "summary blurbs" can be easily generated without
   *  having to load the entire source text. The chunk size attribute reflects
   *  the chunk size (in words) used by the current index.
   *
   */
  public final static int chunkSize = 0;

  /** Index into Chunk Attribute Array for the chunk overlap attribute. <br><br>
   *
   *  Indexed text stored in a Lucene index is broken up into small chunks
   *  that overlap with adjacent chunks so that "summary blurbs" for proximity
   *  searches can be easily generated without having to load the entire source
   *  text. The chunk overlap attribute reflects the overlap (in words) used by
   *  the current index.
   *
   */
  public final static int chunkOvlp = 1;

  /** Constant defining the minimum size (in words) of a text chunk.
   *  Value = {@value}. <br><br>
   *
   * @.notes  For an explanation of the text chunk size and overlap attributes,
   *       see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize}
   *       and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}.
   */
  public final static int minChunkSize = 2;

  /** Constant defining the default size (in words) of a text chunk.
   *  Value = {@value}. <br><br>
   *
   * @.notes  For an explanation of the text chunk size and overlap attributes,
   *       see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize}
   *       and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}.
   */
  public final static int defaultChunkSize = 100;

  /** Constant defining the default overlap (in words) of two adjacent text
   *  chunks. Value = {@value}.
   *
   * @.notes  For an explanation of the text chunk size and overlap attributes,
   *       see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize}
   *       and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}.
   */
  public final static int defaultChunkOvlp = 50;

  /** Constant defining the default list of stop words. These are common words
   *  that are so ubiquitous as to be of little use in queries. Value = {@value}.
   *
   * @.notes  For an explanation of stop word handling,
   *       see {@link #stopWords stopWords}
   */
  public final static String defaultStopWords = "a an and are as at be but by for if in into is it no not of on or s " +
                                                "such t that the their then there these they this to was will with";

  //////////////////////////////////////////////////////////////////////////// 

  /**
   * Default constructor. <br><br>
   *
   * Creates the chunk attribute array, and initializes the
   * <code>chunkSize</code> entry to
   * {@link org.cdlib.xtf.textIndexer.IndexInfo#defaultChunkSize defaultChunkSize},
   * and the <code>chunkOvlp</code> entry to
   * {@link org.cdlib.xtf.textIndexer.IndexInfo#defaultChunkOvlp defaultChunkOvlp}.
   * All other fields keep their declared defaults; in particular
   * {@link #stopWords stopWords} is left null.
   */
  public IndexInfo() 
  {
    // Create the chunk attribute array, one slot per attribute.
    chunkAtt = new int[2];

    // Set the default chunk size and overlap.
    chunkAtt[chunkSize] = defaultChunkSize;
    chunkAtt[chunkOvlp] = defaultChunkOvlp;
  } // public IndexInfo()

  //////////////////////////////////////////////////////////////////////////// 

  /**
   * Alternate constructor. <br><br>
   *
   * Initializes the fields needed to use InputStream-based indexing (that is,
   * all fields except subDirs, sourcePath, and docSelectorPath.)
   *
   * Uses default values for chunk size/overlap, and for the stop word list.
   * After construction, these may of course be altered if desired.
   *
   * @param indexName   Name to record for the index.
   * @param indexPath   Path to record for the index's Lucene database.
   */
  public IndexInfo(String indexName, String indexPath) 
  {
    // Let the default constructor build the chunk attribute array and
    // fill in the default chunk size and overlap.
    this();

    // Record the input parameters
    this.indexName = indexName;
    this.indexPath = indexPath;

    // Use a default stop-word list.
    stopWords = defaultStopWords;
  } // public IndexInfo(String, String)

  ////////////////////////////////////////////////////////////////////////////    

  /** Return the size of a text chunk for the current index. <br><br>
   *
   *  @return    The value of the <code>chunkSize</code> attribute. <br><br>
   *
   *  @.notes
   *      For an explanation of the text chunk size and overlap attributes,
   *       see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize}
   *       and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}.
   */
  public int getChunkSize() {
    return chunkAtt[chunkSize];
  }

  ////////////////////////////////////////////////////////////////////////////    

  /** Return the size of a text chunk (in words) for the current index
   *  as a string. <br><br>
   *
   *  @return     The value of the <code>chunkSize</code> attribute converted
   *              to a String. <br><br>
   *
   *  @.notes      This method is intended as a convenience call for code that
   *               creates Lucene fields, which are all stored as strings.
   *               <br><br>
   *
   *       For an explanation of the text chunk size and overlap attributes,
   *       see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize}
   *       and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}.
   */
  public String getChunkSizeStr() {
    return Integer.toString(chunkAtt[chunkSize]);
  }

  //////////////////////////////////////////////////////////////////////////// 

  /** Return the overlap of two adjacent text chunks for the current index.
   *  <br><br>
   *
   *  @return    The value of the <code>chunkOvlp</code> attribute. <br><br>
   *
   *  @.notes
   *       For an explanation of the text chunk size and overlap attributes,
   *       see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize}
   *       and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}.
   */
  public int getChunkOvlp() {
    return chunkAtt[chunkOvlp];
  }

  //////////////////////////////////////////////////////////////////////////// 

  /** Return the overlap (in words) for two adjacent text chunks in the
   *  current index as a string. <br><br>
   *
   *  @return     The value of the <code>chunkOvlp</code> attribute
   *              converted to a String. <br><br>
   *
   *  @.notes     This method is intended as a convenience call for code that
   *              creates Lucene fields, which are all stored as strings.
   *              <br><br>
   *
   *       For an explanation of the text chunk size and overlap attributes,
   *       see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize}
   *       and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}.
   */
  public String getChunkOvlpStr() {
    return Integer.toString(chunkAtt[chunkOvlp]);
  }

  //////////////////////////////////////////////////////////////////////////// 

  /** Sets the text chunk size attribute for the current index. <br><br>
   *
   *  This method sets the value for the <code>chunkSize</code>
   *  attribute, coercing its value to be greater than or equal to the
   *  {@link org.cdlib.xtf.textIndexer.IndexInfo#minChunkSize minChunkSize}
   *  value. <br><br>
   *
   *  @param newChunkSize   The requested chunk size, in words. Any value
   *                        below <code>minChunkSize</code> (including zero
   *                        and negative values) is raised to
   *                        <code>minChunkSize</code>. <br><br>
   *
   *  @return     The resulting coerced chunkSize value. <br><br>
   *
   *  @.notes     This function also calls the
   *              {@link org.cdlib.xtf.textIndexer.IndexInfo#setChunkOvlp(int) setChunkOvlp()}
   *              method to ensure that the overlap value is valid for the
   *              chunk size set by this call.
   *              <br><br>
   *
   *       For an explanation of the text chunk size and overlap attributes,
   *       see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize}
   *       and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}.
   */
  public int setChunkSize(int newChunkSize) 
  {
    // If the requested size is below the minimum (this includes zero and
    // negative values), coerce it up to the minimum chunk size.
    //
    if (newChunkSize < minChunkSize) {
      newChunkSize = minChunkSize;
    }

    // Set the new chunk size.
    chunkAtt[chunkSize] = newChunkSize;

    // Force the chunk overlap to be valid for the new size. The setter
    // stores the coerced value itself, so no further assignment is needed.
    setChunkOvlp(chunkAtt[chunkOvlp]);

    // Return the (possibly coerced) chunk size to the caller.
    return chunkAtt[chunkSize];
  } // public setChunkSize()

  //////////////////////////////////////////////////////////////////////////// 

  /** Sets the adjacent chunk overlap attribute for the current index. <br><br>
   *
   *  This method sets the value for the
   *  {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp} attribute,
   *  coercing its value to lie between zero and half the current chunk
   *  size for the current index (inclusive). <br><br>
   *
   *  @param newChunkOverlap   The requested overlap, in words. Negative
   *                           values are raised to zero; values greater than
   *                           half the current chunk size are lowered to half
   *                           the current chunk size (integer division).
   *                           <br><br>
   *
   *  @return    The resulting coerced chunkOvlp value. <br><br>
   *
   *       For an explanation of the text chunk size and overlap attributes,
   *       see {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkSize chunkSize}
   *       and {@link org.cdlib.xtf.textIndexer.IndexInfo#chunkOvlp chunkOvlp}.
   */
  public int setChunkOvlp(int newChunkOverlap) 
  {
    // An overlap is a count of words, so a negative value is meaningless;
    // treat it as "no overlap".
    //
    if (newChunkOverlap < 0) {
      newChunkOverlap = 0;
    }

    // If the chunk overlap is more than 1/2 the chunk size, 
    // force it to be half the chunk size.
    //
    int maxChunkOverlap = chunkAtt[chunkSize] / 2;
    if (newChunkOverlap > maxChunkOverlap) {
      newChunkOverlap = maxChunkOverlap;
    }

    // Set the new chunk overlap value.
    chunkAtt[chunkOvlp] = newChunkOverlap;

    // And return the (possibly coerced) result to the caller.
    return chunkAtt[chunkOvlp];
  } // public setChunkOvlp()    
} // class IndexInfo