MyHeuristicSentenceModel.java example

Explorer
arkref-master
- src
  - arkref
/*
 * LingPipe v. 3.8
 * Copyright (C) 2003-2009 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package arkref.sent;

import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import arkref.parsestuff.U;
import arkref.parsestuff.RegexUtil.R;

import com.aliasi.sentences.AbstractSentenceModel;

/**
 * A <code>HeuristicSentenceModel</code> determines sentence
 * boundaries based on sets of tokens, a pair of flags, and an
 * overridable method describing boundary conditions.

 * <P>There are three sets of tokens specified for a heuristic model:
 *
 * <UL>
 *
 * <LI> <b>Possible Stops</b>: These are tokens that are allowed
 * to be the final token in a sentence.  This set typically includes
 * sentence-final punctuation tokens such as periods (<code>.</code>)
 * and double quotes (<code>"</code>).
 *
 * <LI> <b>Impossible Penultimates</b>: These are tokens that may
 * <i>not</i> be the penultimate (second-to-last) token in a sentence.
 * This set is typically made up of abbreviations or acronyms such as
 * <code>"Mr"</code>.
 *
 * <LI> <b>Impossible Starts</b>: These are tokens that may <i>not</i>
 * be the first token in a sentence.  This set typically includes
 * punctuation characters that should be attached to the previous
 * sentence such as end quotes (<code>''</code>).  Note that there is
 * a method, described below, which may enforce additional conditions
 * on start tokens.
 *
 * </UL>
 *
 * Note that all of these sets perform <i>case insensitive</i> tests.
 *
 * <P>There are also two flags in the constructor that determine
 * aspects of sentence boundary detection:
 *
 * <UL>

 * <LI> <b>Balance Parentheses</b>: If parentheses are being balanced,
 * then as long as there are open parentheses that have not been
 * closed, the current sentence may not end.  Square brackets
 * (<code>"[", "]"</code>) and round brackets (<code>"(", ")"</code>),
 * are balanced separately.  The brackets need not be nested, and
 * extra close parentheses (<code>")"</code>) and brackets
 * (<code>"]"</code>) are ignored.
 *
 * <LI> <b>Force Final Boundary</b>: If this flag is set to
 * <code>true</code>, the final token in any input is taken to be a
 * sentence terminator, whether or not it is a possible stop token.  This
 * is useful for dealing with truncated inputs, such as those in
 * MEDLINE abstracts.
 *
 * </UL>
 *
 * A further condition is imposed on sentence initial tokens by method
 * {@link #possibleStart(String[],String[],int,int)}.  This method
 * checks a given token in a sequence of tokens and whitespaces to
 * determine if it is a possible sentence start.  The default
 * implementation in this class rules out empty tokens and, when the
 * capitalization-conventions flag is set, tokens that start with
 * lowercase letters.
 *
 * <P>The final condition is that a token cannot be a stop unless it
 * is followed by non-empty whitespace.
 *
 * <p> The resulting model will miss tokens as boundaries that act as
 * both sentence boundaries and end-of-abbreviation markers for known
 * abbreviations.  It will add spurious sentence boundaries that
 * appear after unknown abbreviations and are followed by whitespace
 * and a capitalized word.
 *
 * <p>Our approach is loosely based on the article:
 *
 * <blockquote>
 * Mikheev, Andrei. 2002.
 * <a href="http://acl.ldc.upenn.edu/J/J02/J02-3002.pdf">Periods, Capitalized Words, etc.</a>
 * <i>Computational Linguistics</i> <b>28</b>(3):289-318.
 * </blockquote>
 *
 * @author  Mitzi Morris
 * @author Bob Carpenter
 * @version 3.8
 * @since   LingPipe1.0
 */
public class MyHeuristicSentenceModel extends AbstractSentenceModel {
    // see original code for more comments

    /** Lowercased tokens that are allowed to end a sentence. */
    Set<String> mPossibleStops;
    /** Lowercased tokens that may not immediately precede a sentence-final token. */
    Set<String> mBadPrevious;
    /** Lowercased tokens that may not be the first token after a sentence boundary. */
    Set<String> mBadFollowing;
    /**
     * Compiled patterns searched for in the text formed by the previous
     * token, the whitespace between, and the candidate stop token.
     */
    Set<Pattern> mStopPatterns;
    private final boolean mForceFinalStop;
    private final boolean mBalanceParens;
    private final boolean mUsingCapitalizationConventions;

    /**
     * Construct a heuristic sentence model with the specified sets
     * of possible stop tokens, stop patterns, impossible penultimate
     * tokens, impossible start tokens, and flags for whether the final
     * token is forced to be a stop, whether parentheses are balanced,
     * and whether lowercase-initial tokens are rejected as sentence
     * starts.  Note that the token sets are <i>case insensitive</i>;
     * they are lowercased on construction.  The stop patterns are
     * compiled as given, without any case folding.
     *
     * @param possibleStops Possible tokens on which to stop a sentence.
     * @param stopPatterns Regular expressions; a token is also a
     * possible stop if any of them is found in the concatenation of
     * the previous token, the intervening whitespace and the token.
     * @param impossiblePenultimate Tokens that may not precede a stop.
     * @param impossibleStarts Tokens that may not follow a stop.
     * @param forceFinalStop Whether the last token of the input is
     * always treated as a sentence boundary.
     * @param balanceParens Whether sentence boundaries are suppressed
     * inside open round or square brackets.
     * @param usingCapitalizationConventions Whether a token starting
     * with a lowercase letter is rejected as a sentence start.
     * @throws java.util.regex.PatternSyntaxException If one of the
     * stop patterns is not a valid regular expression.
     */
    public MyHeuristicSentenceModel(
            Set<String> possibleStops,
            Set<String> stopPatterns,
            Set<String> impossiblePenultimate,
            Set<String> impossibleStarts,
            boolean forceFinalStop,
            boolean balanceParens,
            boolean usingCapitalizationConventions
          ) {
        mPossibleStops = toLowerCase(possibleStops);
        mStopPatterns = convertStopPatterns(stopPatterns);
        mBadPrevious = toLowerCase(impossiblePenultimate);
        mBadFollowing = toLowerCase(impossibleStarts);
        mForceFinalStop = forceFinalStop;
        mBalanceParens = balanceParens;
        mUsingCapitalizationConventions = usingCapitalizationConventions;
    }

    /**
     * Compiles each regular expression string into a {@link Pattern}.
     *
     * @param stopPatterns Regular expression sources.
     * @return A new set holding one compiled pattern per input string.
     */
    private static Set<Pattern> convertStopPatterns(Set<String> stopPatterns) {
        Set<Pattern> compiled = new HashSet<Pattern>();
        for (String pat : stopPatterns) {
            compiled.add(Pattern.compile(pat));
        }
        return compiled;
    }

    /**
     * Returns <code>true</code> if this model treats any input-final
     * token as a stop.  This ensures that in truncated inputs, all
     * tokens are or are followed by a sentence boundary.  For
     * instance, if the input is the array of tokens
     * <code>{"a", "b", ".", "c", "d"}</code>, then if
     * <code>"d"</code> is <i>not</i> a possible stop, the tokens
     * <code>"c"</code> and <code>"d"</code> will not be assigned to a
     * sentence.  If the flag is <code>true</code>, the input-final
     * <code>"d"</code> will be taken to end a sentence.
     *
     * <P>The value is set in the constructor {@link
     * #MyHeuristicSentenceModel(Set,Set,Set,Set,boolean,boolean,boolean)}.
     *
     * @return <code>true</code> if any token may be a stop if
     * it is final in the input.
     */
    public boolean forceFinalStop() {
        return mForceFinalStop;
    }

    /**
     * Returns <code>true</code> if this model does parenthesis
     * balancing.  The value is set in the constructor {@link
     * #MyHeuristicSentenceModel(Set,Set,Set,Set,boolean,boolean,boolean)}.
     *
     * @return <code>true</code> if this model does parenthesis
     * balancing.
     */
    public boolean balanceParens() {
        return mBalanceParens;
    }

    /**
     * Returns <code>true</code> if the token at the given position may
     * end a sentence.  A token qualifies if its lowercased form is in
     * the set of possible stops, or if any stop pattern is found in the
     * string made of the previous token, the whitespace stored at index
     * <code>tokPos</code>, and the token itself.
     *
     * <P>The pattern test needs a preceding token, so for
     * <code>tokPos == 0</code> only the token set is consulted.
     *
     * @param tokPos Index of the candidate token.
     * @param tokens Array of tokens.
     * @param whites Array of whitespaces.
     * @return <code>true</code> if the token is a possible stop.
     */
    public boolean isPossibleStop(int tokPos, String[] tokens, String[] whites) {
        if (mPossibleStops.contains(tokens[tokPos].toLowerCase())) {
            return true;
        }
        // No previous token to pair with: avoid indexing tokens[-1].
        if (tokPos == 0) {
            return false;
        }
        String tokPair = tokens[tokPos - 1] + whites[tokPos] + tokens[tokPos];
        for (Pattern p : mStopPatterns) {
            if (p.matcher(tokPair).find()) {
                return true;
            }
        }
        return false;
    }


    /**
     * Adds the sentence final token indices as <code>Integer</code>
     * instances to the specified collection, only considering tokens
     * between index <code>start</code> and <code>start+length-1</code>
     * inclusive.
     *
     * @param tokens Array of tokens to annotate.
     * @param whitespaces Array of whitespaces to annotate.
     * @param start Index of first token to annotate.
     * @param length Number of tokens to annotate.
     * @param indices Collection into which to write the boundary
     * indices.
     */
    @Override
    public void boundaryIndices(String[] tokens, String[] whitespaces,
                                int start, int length,
                                Collection<Integer> indices) {
        if (length == 0) {
            return;
        }

        if (length == 1) {
            if (mForceFinalStop || isPossibleStop(start, tokens, whitespaces)) {
                indices.add(Integer.valueOf(start));
            }
            return;
        }

        // run from second to penultimate token (first can't be stop)
        boolean inParens = tokens[start].equals("(");
        boolean inBrackets = tokens[start].equals("[");
        int end = start + length - 1;
        for (int i = start + 1; i < end; ++i) {
            // check paren balancing
            if (mBalanceParens) {
                if (tokens[i].equals("(")) {
                    inParens = true;
                    continue;
                }
                if (tokens[i].equals(")")) {
                    inParens = false;
                    continue;
                }
                if (tokens[i].equals("[")) {
                    inBrackets = true;
                    continue;
                }
                if (tokens[i].equals("]")) {
                    inBrackets = false;
                    continue;
                }
                // don't break if we're in parenthetical or bracketed
                if (inParens || inBrackets) {
                    continue;
                }
            }

            // check that token is good end of sentence token
            if (!isPossibleStop(i, tokens, whitespaces)) {
                continue;
            }

            // only break after whitespace
            if (whitespaces[i + 1].length() == 0) {
                continue;
            }

            // check that previous token is OK sentence end
            if (mBadPrevious.contains(tokens[i - 1].toLowerCase())) {
                continue;
            }

            // check that following token is OK sentence start
            if (mBadFollowing.contains(tokens[i + 1].toLowerCase())) {
                continue;
            }

            // check following tokens, as needed
            if (!possibleStart(tokens, whitespaces, i + 1, end)) {
                continue;
            }

            indices.add(Integer.valueOf(i));
        }

        // deal with case of last token: it has no following token or
        // whitespace requirement, only the previous-token restriction
        if (mForceFinalStop
            || (isPossibleStop(end, tokens, whitespaces)
                 && !mBadPrevious.contains(tokens[end - 1].toLowerCase()))) {
            indices.add(Integer.valueOf(end));
        }
    }

    /**
     * Return <code>true</code> if the specified start index can
     * be a sentence start in the specified array of tokens and
     * whitespaces running up to the end token.
     *
     * <P>The implementation in this class requires the first token to
     * be non-empty.  If the model was constructed to use
     * capitalization conventions, the token's first character must
     * also not be lower case according to
     * {@link Character#isLowerCase(char)}.
     *
     * <P>The start and end indices should be within range for the
     * tokens and whitespaces as a precondition to this method being
     * called.
     *
     * @param tokens Array of tokens to check.
     * @param whitespaces Array of whitespaces to check.
     * @param start Index of first token to check.
     * @param end Index of last token to check.
     * @return <code>true</code> if the token at <code>start</code>
     * may begin a sentence.
     */
    protected boolean possibleStart(String[] tokens, String[] whitespaces,
                                    int start, int end) {
        String tok = tokens[start];
        if (tok.length() == 0) {
            return false;
        }
        return !mUsingCapitalizationConventions
            || !Character.isLowerCase(tok.charAt(0));
    }

    /**
     * Returns a new set containing the lowercased form of every string
     * in the given set.  The input set is not modified.
     *
     * @param xs Strings to lowercase.
     * @return A fresh set of the lowercased strings.
     */
    static Set<String> toLowerCase(Set<String> xs) {
        Set<String> result = new HashSet<String>();
        for (String s : xs) {
            result.add(s.toLowerCase());
        }
        return result;
    }
}