package org.cdlib.xtf.textEngine;
/*
* Copyright (c) 2004, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.mark.BasicWordIter;
import org.apache.lucene.mark.MarkPos;
import org.apache.lucene.mark.WordIter;
/**
 * A word iterator that behaves like {@link BasicWordIter} but additionally
 * honors "soft" boundaries wherever the source text contains an XTF "bump
 * marker" of at least a given size. The effect is that snippets are kept
 * from spanning section boundaries, or the boundaries between different
 * fields that share the same name.
 *
 * @author Martin Haye
 */
class BoundedWordIter extends BasicWordIter
{
    /**
     * Smallest token position increment that is treated as a soft boundary
     * by the non-forced forms of next() and prev().
     */
    int boundSize;

    /**
     * Build a bounded word iterator over the given text. The tokens supplied
     * by the stream must refer to that same text. Soft boundaries are
     * enforced only by next() and prev(), for any token whose position
     * increment meets or exceeds boundSize.
     *
     * @param text      text the tokens refer to
     * @param stream    source of tokens for the text
     * @param boundSize minimum position increment considered a boundary
     * @throws IOException declared because the superclass constructor,
     *                     which receives the text and stream, may throw it
     */
    public BoundedWordIter(String text, TokenStream stream, int boundSize)
        throws IOException
    {
        super(text, stream);
        this.boundSize = boundSize;
    }

    /**
     * Advance to the next token, stopping short of any soft boundary unless
     * the move is forced.
     *
     * @param force true to skip the boundary checks and delegate straight
     *              to the superclass
     * @return false if a boundary blocks the move (in which case the
     *         superclass is not asked to advance); otherwise whatever the
     *         superclass next() returns
     */
    public final boolean next(boolean force)
    {
        if (!force) {
            // A following token whose position increment reaches boundSize
            // marks a separation within the field value; stay on this side.
            boolean hasFollowing = tokNum < tokens.length - 1;
            if (hasFollowing &&
                tokens[tokNum + 1].getPositionIncrement() >= boundSize)
            {
                return false;
            }

            // An 'end-of-field' marker directly after the current token
            // also blocks the move.
            int wordEnd = tokens[tokNum].endOffset();
            if (wordEnd < text.length() &&
                text.charAt(wordEnd) == Constants.FIELD_END_MARKER)
            {
                return false;
            }
        }

        return super.next(force);
    }

    /**
     * Step back to the previous token, stopping short of any soft boundary
     * unless the move is forced.
     *
     * @param force true to skip the boundary checks and delegate straight
     *              to the superclass
     * @return false if a boundary blocks the move (in which case the
     *         superclass is not asked to step back); otherwise whatever the
     *         superclass prev() returns
     */
    public final boolean prev(boolean force)
    {
        if (!force) {
            // The current token's own position increment records the gap
            // before it; a gap of boundSize or more is a separation.
            if (tokens[tokNum].getPositionIncrement() >= boundSize) {
                return false;
            }

            // A 'start-of-field' marker directly before the current token
            // also blocks the move.
            int wordStart = tokens[tokNum].startOffset();
            if (wordStart > 0 &&
                text.charAt(wordStart - 1) == Constants.FIELD_START_MARKER)
            {
                return false;
            }
        }

        return super.prev(force);
    }

    /**
     * Allocate a new BoundedMarkPos over this iterator's tokens and fill it
     * in for the requested kind of position.
     *
     * @param startOrEnd one of the WordIter position selectors
     * @return the newly created, filled-in position
     */
    public MarkPos getPos(int startOrEnd)
    {
        BoundedMarkPos created = new BoundedMarkPos(tokens);
        getPos(created, startOrEnd);
        return created;
    }

    /**
     * Fill in an existing position object for the requested kind of
     * position. The superclass records its information first; afterwards
     * the token number is stored: 0 for FIELD_START, the last token index
     * for FIELD_END, and the current token number for everything else. For
     * TERM_END_PLUS, stripMarkers() is also called with the current token's
     * end offset before the token number is stored.
     *
     * @param pos        position to fill in; it is cast to BoundedMarkPos,
     *                   so any other type results in a ClassCastException
     * @param startOrEnd one of the WordIter position selectors
     */
    public void getPos(MarkPos pos, int startOrEnd)
    {
        super.getPos(pos, startOrEnd);

        BoundedMarkPos bounded = (BoundedMarkPos) pos;
        if (startOrEnd == WordIter.FIELD_START) {
            bounded.setTokNum(0);
        }
        else if (startOrEnd == WordIter.FIELD_END) {
            bounded.setTokNum(tokens.length - 1);
        }
        else {
            if (startOrEnd == WordIter.TERM_END_PLUS) {
                bounded.stripMarkers(tokens[tokNum].endOffset());
            }
            bounded.setTokNum(tokNum);
        }
    }
} // class BoundedWordIter