/*
* LingPipe v. 3.8
* Copyright (C) 2003-2009 Alias-i
*
* This program is licensed under the Alias-i Royalty Free License
* Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i
* Royalty Free License Version 1 for more details.
*
* You should have received a copy of the Alias-i Royalty Free License
* Version 1 along with this program; if not, visit
* http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
* Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
* +1 (718) 290-9170.
*/
package arkref.sent;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import arkref.parsestuff.U;
import arkref.parsestuff.RegexUtil.R;
import com.aliasi.sentences.AbstractSentenceModel;
/**
* A <code>HeuristicSentenceModel</code> determines sentence
* boundaries based on sets of tokens, a pair of flags, and an
* overridable method describing boundary conditions.
* <P>There are three sets of tokens specified for a heuristic model:
*
* <UL>
*
* <LI> <b>Possible Stops</b>: These are tokens that are allowed
* to be the final token in a sentence. This set typically includes
* sentence-final punctuation tokens such as periods (<code>.</code>)
* and double quotes (<code>"</code>).
*
* <LI> <b>Impossible Penultimates</b>: These are tokens that may
* <i>not</i> be the penultimate (second-to-last) token in a sentence.
* This set is typically made up of abbreviations or acronyms such as
* <code>"Mr"</code>.
*
* <LI> <b>Impossible Starts</b>: These are tokens that may <i>not</i>
* be the first token in a sentence. This set typically includes
* punctuation characters that should be attached to the previous
* sentence such as end quotes (<code>''</code>). Note that there is
* a method, described below, which may enforce additional conditions
* on start tokens.
*
* </UL>
*
* Note that all of these sets perform <i>case insensitive</i> tests.
*
* <P>There are also two flags in the constructor that determine
* aspects of sentence boundary detection:
*
* <UL>
* <LI> <b>Balance Parentheses</b>: If parentheses are being balanced,
* then as long as there are open parentheses that have not been
* closed, the current sentence may not end. Square brackets
* (<code>"[", "]"</code>) and round brackets (<code>"(", ")"</code>),
* are balanced separately. The brackets need not be nested, and
* extra close parentheses (<code>")"</code>) and brackets
* (<code>"]"</code>) are ignored.
*
* <LI> <b>Force Final Boundary</b>: If this flag is set to
* <code>true</code>, the final token in any input is taken to be a
* sentence terminator, whether or not it is a possible stop token. This
* is useful for dealing with truncated inputs, such as those in
* MEDLINE abstracts.
*
* </UL>
*
* A further condition is imposed on sentence-initial tokens by method
* {@link #possibleStart(String[],String[],int,int)}. This method
* checks a given token in a sequence of tokens and whitespaces to
* determine if it is a possible sentence start. The default
* implementation in this class is to rule out tokens that start with
* lowercase letters.
*
* <P>The final condition is that a token cannot be a stop unless it
* is followed by non-empty whitespace.
*
* <p> The resulting model will miss tokens as boundaries that act as
* both sentence boundaries and end-of-abbreviation markers for known
* abbreviations. It will add spurious sentence boundaries that
* appear after unknown abbreviations and are followed by whitespace
* and a capitalized word.
*
* <p>Our approach is loosely based on the article:
*
* <blockquote>
* Mikheev, Andrei. 2002.
* <a href="http://acl.ldc.upenn.edu/J/J02/J02-3002.pdf">Periods, Capitalized Words, etc.</a>
* <i>Computational Linguistics</i> <b>28</b>(3):289-318.
* </blockquote>
*
* @author Mitzi Morris
* @author Bob Carpenter
* @version 3.8
* @since LingPipe1.0
*/
public class MyHeuristicSentenceModel extends AbstractSentenceModel {

    // see original code for more comments

    /** Lowercased tokens that may, on their own, end a sentence. */
    Set<String> mPossibleStops;

    /** Lowercased tokens that may not immediately precede a sentence-final token. */
    Set<String> mBadPrevious;

    /** Lowercased tokens that may not immediately follow a sentence-final token. */
    Set<String> mBadFollowing;

    /**
     * Compiled patterns searched for in the string
     * <code>previous token + whitespace + current token</code>; a hit
     * makes the current token a possible stop.
     */
    Set<Pattern> mStopPatterns;

    private final boolean mForceFinalStop;
    private final boolean mBalanceParens;
    private final boolean mUsingCapitalizationConventions;

    /**
     * Construct a heuristic sentence model with the specified sets
     * of possible stop tokens, stop patterns, impossible penultimate
     * tokens, impossible start tokens, and flags for whether the final
     * token is forced to be a stop, whether parentheses are balanced,
     * and whether capitalization is used to validate sentence starts.
     * Note that the token sets are <i>case insensitive</i>; the stop
     * patterns are compiled and applied as given.
     *
     * @param possibleStops Possible tokens on which to stop a sentence.
     * @param stopPatterns Regular expressions searched for in the
     * concatenation of the previous token, the whitespace between, and
     * the current token; a match makes the current token a possible stop.
     * @param impossiblePenultimate Tokens that may not precede a stop.
     * @param impossibleStarts Tokens that may not follow a stop.
     * @param forceFinalStop Whether the last token of the input is
     * always treated as a sentence boundary.
     * @param balanceParens Whether sentence boundaries are suppressed
     * inside unclosed round or square brackets.
     * @param usingCapitalizationConventions Whether a token starting
     * with a lowercase letter is rejected as a sentence start.
     * @throws java.util.regex.PatternSyntaxException If any of the stop
     * patterns is not a valid regular expression.
     */
    public MyHeuristicSentenceModel(
            Set<String> possibleStops,
            Set<String> stopPatterns,
            Set<String> impossiblePenultimate,
            Set<String> impossibleStarts,
            boolean forceFinalStop,
            boolean balanceParens,
            boolean usingCapitalizationConventions
    ) {
        mPossibleStops = toLowerCase(possibleStops);
        mStopPatterns = convertStopPatterns(stopPatterns);
        mBadPrevious = toLowerCase(impossiblePenultimate);
        mBadFollowing = toLowerCase(impossibleStarts);
        mForceFinalStop = forceFinalStop;
        mBalanceParens = balanceParens;
        mUsingCapitalizationConventions = usingCapitalizationConventions;
    }

    /**
     * Compiles each regular expression string into a {@link Pattern}.
     *
     * @param stopPatterns Regular expression sources.
     * @return A new set holding one compiled pattern per source string.
     */
    private Set<Pattern> convertStopPatterns(Set<String> stopPatterns) {
        Set<Pattern> compiled = new HashSet<Pattern>();
        for (String regex : stopPatterns) {
            compiled.add(Pattern.compile(regex));
        }
        return compiled;
    }

    /**
     * Returns <code>true</code> if this model treats any input-final
     * token as a stop. This ensures that in truncated inputs, all
     * tokens are or are followed by a sentence boundary. For
     * instance, if the input is the array of tokens
     * <code>{"a", "b", ".",
     * "c", "d"}</code>, then if
     * <code>"d"</code> is <i>not</i> in the set of possible
     * stops, then the tokens <code>"c"</code> and
     * <code>"d"</code> will not be assigned to a sentence.
     * If the allow-any-final-token flag is <code>true</code>, then in
     * the case where the <code>"d"</code> is final in the
     * input, it will be taken to end a sentence.
     *
     * <P>The value is set in the constructor {@link
     * #MyHeuristicSentenceModel(Set,Set,Set,Set,boolean,boolean,boolean)}.
     * See the class documentation for more information.
     *
     * @return <code>true</code> if any token may be a stop if
     * it is final in the input.
     */
    public boolean forceFinalStop() {
        return mForceFinalStop;
    }

    /**
     * Returns <code>true</code> if this model does parenthesis
     * balancing. Note that the value is set in the constructor
     * {@link #MyHeuristicSentenceModel(Set,Set,Set,Set,boolean,boolean,boolean)}.
     * See the class documentation for more information.
     *
     * @return <code>true</code> if this model does parenthesis
     * balancing.
     */
    public boolean balanceParens() {
        return mBalanceParens;
    }

    /**
     * Returns <code>true</code> if the token at the specified position
     * may end a sentence. A token qualifies if its lowercased form is
     * in the set of possible stops, or if any stop pattern is found in
     * the concatenation of the previous token, the whitespace
     * <code>whites[tokPos]</code>, and the token itself.
     *
     * <P>The first token of the arrays has no previous token, so only
     * the possible-stops set is consulted for position <code>0</code>.
     *
     * @param tokPos Index of the token to test.
     * @param tokens Array of tokens.
     * @param whites Array of whitespaces; <code>whites[tokPos]</code>
     * is used as the separator between the previous token and the
     * token being tested.
     * @return <code>true</code> if the token is a possible stop.
     */
    public boolean isPossibleStop(int tokPos, String[] tokens, String[] whites) {
        if (mPossibleStops.contains(lower(tokens[tokPos]))) {
            return true;
        }
        // Pair patterns need a preceding token; without this guard the
        // lookup of tokens[tokPos - 1] fails for the very first token.
        if (tokPos < 1 || mStopPatterns.isEmpty()) {
            return false;
        }
        String tokPair = tokens[tokPos - 1] + whites[tokPos] + tokens[tokPos];
        for (Pattern p : mStopPatterns) {
            if (p.matcher(tokPair).find()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Adds the sentence final token indices as <code>Integer</code>
     * instances to the specified collection, only considering tokens
     * between index <code>start</code> and
     * <code>start+length-1</code> inclusive.
     *
     * <P>The first token of the range is never reported as a boundary
     * unless it is also the only token. The last token of the range is
     * reported if the force-final-stop flag is set, or if it is a
     * possible stop not preceded by an impossible penultimate token.
     *
     * @param tokens Array of tokens to annotate.
     * @param whitespaces Array of whitespaces to annotate.
     * @param start Index of first token to annotate.
     * @param length Number of tokens to annotate.
     * @param indices Collection into which to write the boundary
     * indices.
     */
    @Override
    public void boundaryIndices(String[] tokens, String[] whitespaces,
                                int start, int length,
                                Collection<Integer> indices) {
        if (length == 0) {
            return;
        }
        if (length == 1) {
            if (mForceFinalStop || isPossibleStop(start, tokens, whitespaces)) {
                indices.add(Integer.valueOf(start));
            }
            return;
        }

        // run from second to penultimate tag (first can't be stop)
        boolean inParens = tokens[start].equals("(");
        boolean inBrackets = tokens[start].equals("[");
        int end = start + length - 1;
        for (int i = start + 1; i < end; ++i) {
            // check paren balancing; bracket tokens themselves are never stops
            if (mBalanceParens) {
                if (tokens[i].equals("(")) {
                    inParens = true;
                    continue;
                }
                if (tokens[i].equals(")")) {
                    inParens = false;
                    continue;
                }
                if (tokens[i].equals("[")) {
                    inBrackets = true;
                    continue;
                }
                if (tokens[i].equals("]")) {
                    inBrackets = false;
                    continue;
                }
                // don't break if we're in parenthetical or bracketed
                if (inParens || inBrackets) {
                    continue;
                }
            }
            // check that token is good end of sentence token
            if (!isPossibleStop(i, tokens, whitespaces)) {
                continue;
            }
            // only break after whitespace
            if (whitespaces[i + 1].length() == 0) {
                continue;
            }
            // check that previous token is OK sentence end
            if (mBadPrevious.contains(lower(tokens[i - 1]))) {
                continue;
            }
            // check that following token is OK sentence start
            if (mBadFollowing.contains(lower(tokens[i + 1]))) {
                continue;
            }
            // check following tokens, as needed
            if (!possibleStart(tokens, whitespaces, i + 1, end)) {
                continue;
            }
            indices.add(Integer.valueOf(i));
        }

        // deal with case of last tag
        if (mForceFinalStop
                || (isPossibleStop(end, tokens, whitespaces)
                    && !mBadPrevious.contains(lower(tokens[end - 1])))) {
            indices.add(Integer.valueOf(end));
        }
    }

    /**
     * Return <code>true</code> if the specified start index can
     * be a sentence start in the specified array of tokens and
     * whitespaces running up to the end token.
     *
     * <P>The implementation in this class requires the first token to
     * be non-empty. If capitalization conventions are in use, it
     * additionally requires the first character not to be lower case
     * according to {@link Character#isLowerCase(char)}.
     *
     * <P>The start and end indices should be within range for the
     * tokens and whitespaces as a precondition to this method being
     * called. For a precise definition, see {@link
     * #verifyBounds(String[],String[],int,int)}. All calls from the
     * abstract sentence model obey this constraint.
     *
     * @param tokens Array of tokens to check.
     * @param whitespaces Array of whitespaces to check.
     * @param start Index of first token to check.
     * @param end Index of last token to check.
     * @return <code>true</code> if the token at <code>start</code> may
     * begin a sentence.
     */
    protected boolean possibleStart(String[] tokens, String[] whitespaces,
                                    int start, int end) {
        String tok = tokens[start];
        if (tok.length() == 0) {
            return false;
        }
        return !mUsingCapitalizationConventions
            || !Character.isLowerCase(tok.charAt(0));
    }

    /**
     * Returns a new set holding the lowercased form of every string in
     * the specified set.
     *
     * @param xs Strings to convert.
     * @return A new set of the lowercased strings.
     */
    static Set<String> toLowerCase(Set<String> xs) {
        Set<String> result = new HashSet<String>();
        for (String s : xs) {
            result.add(lower(s));
        }
        return result;
    }

    /**
     * Lowercases a token with the root locale, so that the stored sets
     * and the tokens tested against them are folded the same way
     * regardless of the JVM's default locale.
     */
    private static String lower(String s) {
        return s.toLowerCase(java.util.Locale.ROOT);
    }
}