package org.cdlib.xtf.textIndexer;
import java.util.LinkedList;
/**
* Copyright (c) 2004, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/**
* This class maintains information about the current section in a text
* document that the TextIndexer program is processing. <br><br>
*
* On-line documents are stored as "nodes" in XML files that contain information
* about the document, and the document text itself. The nodes usually form
* a heirarchical tree structure, with the outer-most nodes recording
* various bits of information about the text within. Inside the outer nodes
* are additional nodes that record the organization of the text itself,
* including things like section, chapter, and paragraph information. To the
* text indexer program and search engine, sections have special significance.
* Text in two adjacent sections that have different names, are considered
* to not be "near" one another, so that proximity searches will not produce
* results that span across two or more sections. <br><br>
*
* Since sections can be nested inside one-another, a stack of the current
* nesting level needs to be maintained by the text indexer when a document
* is being processed. Doing so does two things: <br><br>
*
* - It allows unnamed inner sections to inherit properties from the parent
* sections that contain them. <br>
* - When the end of an named section has been reached, the text indexer can
* return to using the parent section's properties and continue processing.
* <br><br>
*
* Information recorded for each section consists of the following: <br><br>
*
* - The type name of the current section. <br>
* - The repeat depth, if the section name is the same as the parent's. <br>
* - The number of words that this section should offset from the previous one.
* <br>
* - The previous word bump for this section, if any. <br>
* - The word bump to apply at the end of each sentence. <br>
* - The relevance boost to apply to words in this section. <br><br>
*
* This class is then used as the entry for a
* {@link org.cdlib.xtf.textIndexer.SectionInfoStack }
* that maintains the current stacking order within the source text being
* processed. <br><br>
*
*/
public class SectionInfo
{
/** Index/No-Index Flag Value: Use parent section index/no-index state.
* <br><br>
*
* @.notes
* This index flag value is never actually stored in the index flag attribute
* for a <code>SectionInfo</code> instance. It is only passed as an argument
* to the
* explicit section push
* method defined by the {@link org.cdlib.xtf.textIndexer.SectionInfoStack}
* class. That method in turn uses the parent section's index flag value,
* which will be either
* {@link org.cdlib.xtf.textIndexer.SectionInfo#index index} or
* {@link org.cdlib.xtf.textIndexer.SectionInfo#noIndex noIndex}.
* <br><br>
*/
public final static int parentIndex = -1;
/** Index/No-Index Flag Value: Index the current section.
* <br><br>
*
* This value is used for the
* {@link org.cdlib.xtf.textIndexer.SectionInfo#indexFlag indexFlag} field
* to indicate that the current section should not be indexed.
* <br><br>
*/
public final static int noIndex = 0;
/** Index/No-Index Flag Value: Index the current section.
* <br><br>
*
* This value is used for the
* {@link org.cdlib.xtf.textIndexer.SectionInfo#indexFlag indexFlag} field
* to indicate that the current section should be indexed.
* <br><br>
*/
public final static int index = 1;
/** Special Section Bump: Value = Use parent's section bump.
* <br><br>
*
* This special value when used for the
* {@link org.cdlib.xtf.textIndexer.SectionInfo#sectionBump sectionBump}
* field indicates that the parent's section bump value should be used.
* <br><br>
*
* @.notes
* This section bump value is never actually stored in the section bump
* attribute for a <code>SectionInfo</code> instance. It is only passed as
* an argument to the
* explicit section push
* method defined by the {@link org.cdlib.xtf.textIndexer.SectionInfoStack}
* class. That method in turn uses the parent section's bump value for the
* new entry on the stack.<br><br>
*/
public final static int parentSectionBump = -1;
/** Default state for Index/No-Index Flag. Value = index.
* <br><br>
*
* This is the default value applied to the
* {@link org.cdlib.xtf.textIndexer.SectionInfo#indexFlag indexFlag}
* field whenever a <code>SectionInfo</code> class is constructed.
* <br><br>
*/
public final static int defaultIndexFlag = index;
/** Default section type name: Value = {@value}.
* <br><br>
*
* This is the default value applied to the
* {@link org.cdlib.xtf.textIndexer.SectionInfo#sectionType sectionType}
* field whenever a <code>SectionInfo</code> class is constructed.
* <br><br>
*/
public final static String defaultSectionType = "";
/** Default subdocument: Value = null.
* <br><br>
*
* This is the default value applied to the
* {@link org.cdlib.xtf.textIndexer.SectionInfo#subDocument subDocument}
* field whenever a <code>SectionInfo</code> class is constructed.
* <br><br>
*/
public final static String defaultSubDocument = null;
/** Default word bump for a section: Value = {@value}.
* <br><br>
*
* This is the default value applied to the
* {@link org.cdlib.xtf.textIndexer.SectionInfo#sectionBump sectionBump}
* field whenever a <code>SectionInfo</code> class is constructed. <br><br>
*/
public final static int defaultSectionBump = 0;
/** Default word boost for a section: Value = {@value}.
* <br><br>
*
* This is the default value applied to the
* {@link org.cdlib.xtf.textIndexer.SectionInfo#wordBoost wordBoost}
* field whenever a <code>SectionInfo</code> class is constructed.
* <br><br>
*/
public final static float defaultWordBoost = 1;
/** Default sentence bump for a section: Value = {@value}.
* <br><br>
*
* This is the default value applied to the
* {@link org.cdlib.xtf.textIndexer.SectionInfo#sentenceBump sentenceBump}
* field whenever a <code>SectionInfo</code> class is constructed.
* <br><br>
*/
public final static int defaultSentenceBump = 5;
/** Default depth for a section: Value = {@value}.
* <br><br>
*
* This is the default value applied to the
* {@link org.cdlib.xtf.textIndexer.SectionInfo#depth depth}
* field whenever a <code>SectionInfo</code> class is constructed.
* <br><br>
*/
public final static int defaultDepth = 0;
/** Spell/No-Spell Flag Value: Use parent section spell/no-spell state.
* <br><br>
*
* @.notes
* This spell flag value is never actually stored in the spell flag attribute
* for a <code>SectionInfo</code> instance. It is only passed as an argument
* to the
* explicit section push
* method defined by the {@link org.cdlib.xtf.textIndexer.SectionInfoStack}
* class. That method in turn uses the parent section's spell flag value,
* which will be either
* {@link org.cdlib.xtf.textIndexer.SectionInfo#spell spell} or
* {@link org.cdlib.xtf.textIndexer.SectionInfo#noSpell noSpell}.
* <br><br>
*/
public final static int parentSpell = -1;
/** No-Spell Flag Value: Do not add words from the current section to the
* spelling correction dictionary.
* <br><br>
*
* This value is used for the
* {@link org.cdlib.xtf.textIndexer.SectionInfo#spellFlag spellFlag} field
* to indicate that words from the current section should not be added to
* the spelling correction dictionary.
* <br><br>
*/
public final static int noSpell = 0;
/** Spell Flag Value: Add words from the current section to the
* spelling correction dictionary.
* <br><br>
*
* This value is used for the
* {@link org.cdlib.xtf.textIndexer.SectionInfo#spellFlag spellFlag} field
* to indicate that words from the current section should be added to the
* spelling correction dictionary.
* <br><br>
*/
public final static int spell = 1;
/** Default state for Spell/No-Spell Flag. Value = spell.
* <br><br>
*
* This is the default value applied to the
* {@link org.cdlib.xtf.textIndexer.SectionInfo#spellFlag spellFlag}
* field whenever a <code>SectionInfo</code> class is constructed.
* <br><br>
*/
public final static int defaultSpellFlag = spell;
/** Depth count for a section. <br><br>
*
* This field is used to count the depth of a section when more than one
* section with the same attributes nests inside another. Using a depth
* count saves having to add an entire duplicate entry to the stack. <br><br>
*/
public int depth;
/** Index flag for a section. <br><br>
*
* This field indicates whether the associated section should be indexed
* or not. There are three valid values for this flag:
* {@link org.cdlib.xtf.textIndexer.SectionInfo#parentIndex parentIndex},
* {@link org.cdlib.xtf.textIndexer.SectionInfo#noIndex noIndex},
* and {@link org.cdlib.xtf.textIndexer.SectionInfo#index index}.
*
* @.notes
* The value {@link org.cdlib.xtf.textIndexer.SectionInfo#parentIndex parentIndex}
* is never actually stored in the index flag attribute for a
* <code>SectionInfo</code> instance. It is only passed as an argument to the
* explicit section push
* method defined by the {@link org.cdlib.xtf.textIndexer.SectionInfoStack}
* class. That method in turn uses the parent section's index flag value,
* which will be either
* {@link org.cdlib.xtf.textIndexer.SectionInfo#index index} or
* {@link org.cdlib.xtf.textIndexer.SectionInfo#index noIndex}.
* <br><br>
*/
public int indexFlag;
/** Type name for a section. <br><br>
*
* This field indentifies the name of the associated section. This field
* can be an empty string (""), in which case the parent section name (if
* any) is inherited, or an arbitrary string. <br><br>
*/
public String sectionType;
/** Word bump to add for a section. <br><br>
*
* This field specifies how far in words a section is from the previous or
* containing section, and is used to adjust the likelyhood of a proximity
* match being found across section boundaries as compared to within a
* single section. <br><br>
*/
public int sectionBump;
/** Previous section bump for this section. <br><br>
*
* This field is used correctly accumulate section bump values when multiple
* nested sections starts are encountered with no intervening text.
*
* @.notes
* The value {@link org.cdlib.xtf.textIndexer.SectionInfo#parentSectionBump parentSectionBump}
* is never actually stored in the sectionBump attribute for a
* <code>SectionInfo</code> instance. It is only passed as an argument to the
* explicit section push
* method defined by the {@link org.cdlib.xtf.textIndexer.SectionInfoStack}
* class. That method in turn uses the parent section's bump value. <br><br>
*/
public int prevSectionBump;
/** Word boost value for this section. <br><br>
*
* This field is identifies a relevance multiplier for words found in this
* section. If greater than 1.0, words in this section are considered better
* matches for searches when added to the index. If less than 1.0, words in
* this section are considered poorer matches.
*/
public float wordBoost;
/** Sentence bump value for this section. <br><br>
*
* This field is identifies the distance (in number of words) that occurs
* between the end of one sentence and the beginning of the next. This value
* is used to adjust the likelyhood that a proximity match is found across
* multiple sentences as compared to within a single sentence. <br><br>
*.
*/
public int sentenceBump;
/** Spell flag for a section. <br><br>
*
* This field indicates whether words from the associated section should be
* added to the spelling correction dictionary or not.
* There are three valid values for this flag:
* {@link org.cdlib.xtf.textIndexer.SectionInfo#parentSpell parentSpell},
* {@link org.cdlib.xtf.textIndexer.SectionInfo#noSpell noSpell},
* and {@link org.cdlib.xtf.textIndexer.SectionInfo#spell spell}.
*
* @.notes
* The value {@link org.cdlib.xtf.textIndexer.SectionInfo#parentSpell parentSpell}
* is never actually stored in the spell flag attribute for a
* <code>SectionInfo</code> instance. It is only passed as an argument to the
* explicit section push
* method defined by the {@link org.cdlib.xtf.textIndexer.SectionInfoStack}
* class. That method in turn uses the parent section's spell flag value,
* which will be either
* {@link org.cdlib.xtf.textIndexer.SectionInfo#spell spell} or
* {@link org.cdlib.xtf.textIndexer.SectionInfo#spell noSpell}.
* <br><br>
*/
public int spellFlag;
/** Name for a subdocument. <br><br>
*
* This field indicates a section of the document that should be treated
* as an individual searchable unit, but should be viewed in the context
* of its containing document. If null, the section is simply considered
* part of the document with no subdocument distinction.
*/
public String subDocument;
/** Meta-data collection list for a subdocument. <br><br>
*
* This field contains a list of meta-data that will be added to when
* xtf:meta attributes are encountered while indexing the current
* subdocument. Since only a subdocument can have unique meta-data, this
* attribute should only be pushed when a new subdocument is begun.
*/
public LinkedList metaInfo;
//////////////////////////////////////////////////////////////////////////////
/** Default Constructor. <br><br>
*
* Initializes all the fields in a <code>SectionInfo</code> instance to
* reasonable default values. <br><br>
*
* @.notes
* See the {@link org.cdlib.xtf.textIndexer.SectionInfo#defaultDepth},
* {@link org.cdlib.xtf.textIndexer.SectionInfo#defaultIndexFlag},
* {@link org.cdlib.xtf.textIndexer.SectionInfo#defaultSectionType},
* {@link org.cdlib.xtf.textIndexer.SectionInfo#defaultSectionBump},
* {@link org.cdlib.xtf.textIndexer.SectionInfo#defaultWordBoost},
* and
* {@link org.cdlib.xtf.textIndexer.SectionInfo#defaultSentenceBump}
* constants for more on the actual values set. <br><br>
*/
public SectionInfo()
{
// Set up some initial defaults.
this.depth = defaultDepth;
this.indexFlag = defaultIndexFlag;
this.sectionType = defaultSectionType;
this.sectionBump = defaultSectionBump;
this.prevSectionBump = 0;
this.wordBoost = defaultWordBoost;
this.sentenceBump = defaultSentenceBump;
this.spellFlag = defaultSpellFlag;
this.subDocument = null;
this.metaInfo = null;
} // SectionInfo()
//////////////////////////////////////////////////////////////////////////////
/** Explicit Constructor. <br><br>
*
* Initializes all the fields in a <code>SectionInfo</code> instance to
* values passed by the caller. <br><br>
*/
public SectionInfo(int depth, int indexFlag, String sectionType,
int sectionBump, float wordBoost, int sentenceBump,
int spellFlag, String subDocument, LinkedList metaInfo)
{
this.depth = depth;
this.indexFlag = indexFlag;
this.sectionType = sectionType;
this.prevSectionBump = 0;
this.sectionBump = sectionBump;
this.wordBoost = wordBoost;
this.sentenceBump = sentenceBump;
this.spellFlag = spellFlag;
this.subDocument = subDocument;
assert subDocument == null || subDocument.length() > 0;
this.metaInfo = metaInfo;
} // sectionBump()
//////////////////////////////////////////////////////////////////////////////
/** Saves the section bump value for later restore.<br><br>
*
* This method is used to save the specific bump value assigned to a section
* when accumulating nested section bumps with no intervening text.<br><br>
*
* @return The previous section bump value saved.<br><br>
*
* @.notes
* Once saved, the
* {@link org.cdlib.xtf.textIndexer.SectionInfo#sectionBump sectionBump}
* field is reset to zero in anticipation of accumulating bump values
* from previous sections. <br><br>
*/
public int saveSectionBump() {
prevSectionBump = sectionBump;
sectionBump = 0;
return prevSectionBump;
}
//////////////////////////////////////////////////////////////////////////////
/** Restore a previously saved section bump value.<br><br>
*
* This method is a convenience method for restoring the section bump value
* previously saved via
* {@link org.cdlib.xtf.textIndexer.SectionInfo#saveSectionBump() saveSectionBump()}.
* <br><br>
*/
public void restoreSectionBump() {
sectionBump = prevSectionBump;
}
} // class SectionInfo