package org.cdlib.xtf.textEngine;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.mark.BasicMarkPos;
import org.apache.lucene.mark.MarkPos;
/*
* Copyright (c) 2004, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
 * A mark position that knows which token it sits on, used when marking
 * fields whose text contains bump markers.
 *
 * In addition to the behavior inherited from {@link BasicMarkPos}, it can
 * refuse to produce text spanning XML element/attribute marker tokens, and
 * it can back its character position up over trailing bump markers,
 * whitespace and end-of-field markers.
 *
 * @author Martin Haye
 */
public class BoundedMarkPos extends BasicMarkPos 
{
  /** Token array that {@link #tokNum} indexes into. */
  private Token[] tokens;

  /** Index, within {@link #tokens}, of the token this position refers to. */
  private int tokNum;

  /**
   * Creates a new mark pos over the given tokens.
   *
   * @param tokens  token array consulted by
   *                {@link #getTextTo(MarkPos, boolean)}; the reference is
   *                stored as-is, not copied
   */
  BoundedMarkPos(Token[] tokens) {
    this.tokens = tokens;
  }

  /**
   * Records which token this mark pos refers to.
   *
   * @param tokNum  index into the token array given at construction
   */
  final void setTokNum(int tokNum) {
    this.tokNum = tokNum;
  }

  /**
   * Gets the text between this position and <code>other</code>, optionally
   * verifying first that no XML element or attribute marker token lies
   * between them (both end tokens included). Marking across XML tags is not
   * supported at the moment, so such spans are rejected rather than marked.
   *
   * @param other            the ending position. When checking is enabled
   *                         and this is non-null, it is cast to
   *                         <code>BoundedMarkPos</code>.
   * @param checkUnmarkable  true to scan the tokens in the span before
   *                         fetching the text; false to skip the scan
   * @return                 whatever the superclass's single-argument
   *                         <code>getTextTo</code> returns for
   *                         <code>other</code>
   * @throws UnmarkableException if checking is enabled and a token in the
   *                         span starts with the element or attribute marker
   */
  public String getTextTo(MarkPos other, boolean checkUnmarkable) 
  {
    if (checkUnmarkable && other != null) 
    {
      // The span runs from our own token through the other mark's token,
      // inclusive at both ends.
      final int lastTok = ((BoundedMarkPos)other).tokNum;
      int cur = tokNum;
      while (cur <= lastTok) 
      {
        if (startsWithXmlMarker(tokens[cur].termText()))
          throw new UnmarkableException();
        cur++;
      }
    }

    // Nothing objectionable found (or no check requested)... get the text.
    return super.getTextTo(other);
  } // getTextTo()

  /**
   * Tells whether a term begins with the element marker or the attribute
   * marker. An empty term never does.
   */
  private static boolean startsWithXmlMarker(String term) 
  {
    if (term.length() == 0)
      return false;

    final char first = term.charAt(0);
    return first == Constants.ELEMENT_MARKER ||
           first == Constants.ATTRIBUTE_MARKER;
  } // startsWithXmlMarker()

  /**
   * Called by BoundedWordIter when called to get the END_PLUS of a token.
   * Moves the character position backward over bump markers, whitespace,
   * and end-of-field markers, but never to a point before
   * <code>termEnd</code>.
   *
   * @param termEnd  lower limit for the character position
   */
  public void stripMarkers(int termEnd) 
  {
    // Back up to each bump marker found at or after termEnd, searching
    // leftward from just before the current position each time.
    int markerAt = fullText.lastIndexOf(Constants.BUMP_MARKER, charPos - 1);
    while (markerAt >= termEnd) 
    {
      charPos = markerAt;
      markerAt = fullText.lastIndexOf(Constants.BUMP_MARKER, charPos - 1);
    }

    // Then drop trailing whitespace and end-of-field markers, one
    // character at a time.
    while (charPos > termEnd) 
    {
      final char prev = fullText.charAt(charPos - 1);
      final boolean strippable = Character.isWhitespace(prev) ||
                                 prev == Constants.FIELD_END_MARKER;
      if (!strippable)
        break;
      charPos--;
    }
  } // stripMarkers()

  /** Exception thrown if asked to mark past XML elements or attributes */
  public static class UnmarkableException extends RuntimeException {
  }
} // class BoundedMarkPos