package LBJ2.nlp;
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import LBJ2.parse.LineByLine;
/**
* Use this class to extract sentences from plain text. The user constructs
* an object of this class with the file name of a document written in
* natural English (i.e., with no annotations added or any type of
* preprocessing performed). <b>It should be noted that this class will
* interpret empty lines that appear in the input as paragraph
* boundaries.</b>
*
* <p> The user can then retrieve <code>Sentence</code>s one at a time with
* the <code>next()</code> method, or all at once with the
* <code>splitAll()</code> method. The returned <code>Sentence</code>s'
* <code>start</code> and <code>end</code> fields represent offsets into the
* file they were extracted from. Every character in between those two
* offsets inclusive, including extra spaces, newlines, etc., is included in
* the <code>Sentence</code> as it appeared in the paragraph.
*
* <p> A {@link #main(String[])} method is also implemented which applies
* this class to plain text in a straightforward way.
*
* @see Sentence
* @author Nick Rizzolo
**/
public class SentenceSplitter extends LineByLine
{
/**
 * Regular expression matching whitespace separated words including those
 * that are hyphenated and cross over a line boundary.
 *
 * <p> A match is one or more repetitions of either (a) a character that is
 * neither a hyphen nor whitespace, followed by a hyphen, a newline, and any
 * amount of whitespace that is itself followed by a non-whitespace
 * character, or (b) any single non-whitespace character.  Alternative (a)
 * is what allows a word to continue across a line break after a hyphen.
 **/
private static final Pattern wordMatcher =
Pattern.compile("([^-\\s]-\n\\s*(?=\\S)|\\S)+");
/*
 * DISABLED: the pattern below was meant to match an entire string if that
 * string contains no capital letters except for those within angled
 * brackets (<>).  It is kept for reference only; process(String) performs
 * a similar check with a hand-written scan because this expression was
 * observed to be extremely slow (see the comment in that method).
 *
 * private static final Pattern lowerCaseWithXML =
 *   Pattern.compile("^([^A-Z]*(<[^>]*>)?)*$");
 */
/**
 * Regular expression matching any single lower case letter in the range
 * <code>a</code> through <code>z</code>.
 **/
private static final Pattern lowerCaseLetter = Pattern.compile("[a-z]");
/**
 * Regular expression matching a sequence of capital letters and dots
 * ending with a capital letter: zero or more "capital letter followed by a
 * dot" pairs and then one final capital letter (for example
 * <code>A</code> or <code>U.S</code>).  Only the letters <code>A</code>
 * through <code>Z</code> count as capitals here.
 **/
private static final Pattern capitalsAndDots =
Pattern.compile("^([A-Z]\\.)*[A-Z]$");
/**
 * Run this program on a file containing plain text, and it will produce
 * the same text rearranged so that each line contains exactly one sentence
 * on <code>STDOUT</code>.  Newline, carriage return, and form feed
 * characters inside a sentence are replaced by single spaces before the
 * sentence is printed.
 *
 * <p> Usage:
 * <code> java LBJ2.nlp.SentenceSplitter &lt;file name&gt; </code>
 *
 * <p> If the number of arguments is not exactly one, a usage message is
 * written to <code>STDERR</code> and the program exits with status 1.
 *
 * @param args The command line arguments; exactly one is expected, the
 *             name of the file to sentence split.
 **/
public static void main(String[] args) {
  // Check the argument count directly rather than provoking an exception
  // and catching it.
  if (args == null || args.length != 1) {
    System.err.println("usage: java LBJ2.nlp.SentenceSplitter <file name>");
    System.exit(1);
    return;
  }

  SentenceSplitter splitter = new SentenceSplitter(args[0]);

  for (Sentence s = (Sentence) splitter.next(); s != null;
       s = (Sentence) splitter.next()) {
    // Flatten the sentence onto one line: every line-breaking character
    // becomes a space so that the output has one sentence per line.
    StringBuffer buffer = new StringBuffer(s.text);

    for (int i = 0; i < buffer.length(); ++i) {
      char c = buffer.charAt(i);
      if (c == '\n' || c == '\r' || c == '\f') buffer.setCharAt(i, ' ');
    }

    System.out.println(buffer);
  }
}
/**
 * Contains the offset of the paragraph currently being processed.  It
 * starts at zero and is advanced by the length of each paragraph (as
 * returned by <code>getParagraph()</code>) once that paragraph has been
 * handled; <code>process(String)</code> adds it to positions within the
 * paragraph to produce the offsets stored in each <code>Sentence</code>.
 **/
protected int currentOffset;
/**
 * Contains sentences ready to be returned to the user upon request.
 * <code>process(String)</code> appends to the end of this list and
 * <code>next()</code> removes from the front.
 **/
protected LinkedList sentences;
/**
 * When the constructor taking an array argument is used, this variable
 * keeps track of the element in the array currently being used, i.e. the
 * position of the next line <code>readLine()</code> will return.
 **/
protected int index;
/**
 * When the constructor taking an array argument is used, this variable
 * stores that array.  It is <code>null</code> when the file name
 * constructor was used, which is how <code>readLine()</code> tells the two
 * modes apart.
 **/
protected String[] input;
/**
 * Creates a splitter that takes its text from the named file.  The file
 * name is handed to the <code>LineByLine</code> constructor and the queue
 * of pending sentences starts out empty.
 *
 * @param file The name of the file to sentence split.
 **/
public SentenceSplitter(String file) {
  super(file);
  this.sentences = new LinkedList();
}
/**
 * Creates a splitter that takes its text from an array of lines instead of
 * a file.  The array is stored as given (it is not copied) and the queue
 * of pending sentences starts out empty.
 *
 * @param input Plain text. Each element of this array represents a line,
 *              with any line termination characters removed.
 **/
public SentenceSplitter(String[] input) {
  this.sentences = new LinkedList();
  this.input = input;
}
/**
 * Supplies the next line of input from whichever source this object was
 * constructed with.  When no array was given, the call is forwarded to the
 * method of the same name in <code>LineByLine</code>; otherwise the next
 * unread element of the array is returned and the position advances.
 *
 * @return The next line of input, or <code>null</code> once every element
 *         of the array has been returned.
 **/
protected String readLine() {
  // File mode: no array was supplied, so defer to the superclass.
  if (input == null) return super.readLine();

  // Array mode: hand out elements one by one until they run out.
  return index < input.length ? input[index++] : null;
}
/**
 * This method is used to extract a paragraph at a time from the input.
 * The returned text consists of any blank lines that precede the
 * paragraph, the paragraph's own lines, and the single blank line that
 * terminates it (if the input did not end first).  Every line read is
 * followed by a <code>"\n"</code> in the result.
 *
 * @return The extracted paragraph, or a string containing only whitespace
 *         if no text remains in the input.
 **/
protected String getParagraph() {
  StringBuffer text = new StringBuffer();
  boolean seenText = false;
  String current;

  while ((current = readLine()) != null) {
    boolean blank = current.trim().length() == 0;
    text.append(current);
    text.append("\n");

    // The first blank line after some real text closes the paragraph; it
    // has already been appended above.
    if (blank && seenText) break;
    if (!blank) seenText = true;
  }

  return text.toString();
}
/**
 * Retrieves the next sentence off the queue and returns it.  When the
 * queue is empty, one more paragraph is read from the input and split
 * first, and <code>currentOffset</code> is advanced past it.
 *
 * @return The next sentence found or <code>null</code> if there are no
 *         more sentences.
 **/
public Object next() {
  if (sentences.isEmpty()) {
    String paragraph = getParagraph();
    boolean hasText = paragraph.trim().length() != 0;
    if (hasText) process(paragraph);
    currentOffset += paragraph.length();
  }

  return sentences.isEmpty() ? null : sentences.removeFirst();
}
/**
 * Reads and splits every remaining paragraph of the input, then returns
 * the whole contents of the sentence queue in array form.  Sentences that
 * were already queued but not yet handed out by <code>next()</code> are
 * included, and the queue itself is left untouched by this call.
 *
 * @return All sentences currently in the queue after the remaining input
 *         has been processed.
 **/
public Sentence[] splitAll() {
  String paragraph = getParagraph();

  // A paragraph holding nothing but whitespace signals the end of input.
  while (paragraph.trim().length() != 0) {
    process(paragraph);
    currentOffset += paragraph.length();
    paragraph = getParagraph();
  }

  Sentence[] result = new Sentence[sentences.size()];
  sentences.toArray(result);
  return result;
}
/**
 * This method does the actual work, deciding where sentences begin and end
 * and populating the <code>sentences</code> member variable.
 *
 * <p> The paragraph is cut into words with <code>wordMatcher</code>.  Each
 * word containing a <code>.</code>, <code>?</code> or <code>!</code> is
 * then examined at the last such character: if the paragraph contains no
 * upper case characters outside of <code>&lt;...&gt;</code> tags a simple
 * length based rule is tried first, and otherwise (or if that rule does
 * not fire) the decision is delegated to
 * {@link #boundary(int,Word,Word,Word)}.  Words remaining after the last
 * detected boundary form one final sentence.  Sentence offsets are the
 * word positions within the paragraph plus <code>currentOffset</code>.
 *
 * @param paragraph The paragraph to process.  A paragraph consisting only
 *                  of whitespace is ignored.
 **/
protected void process(String paragraph) {
  if (paragraph.trim().length() == 0) return;

  Matcher m = wordMatcher.matcher(paragraph);
  LinkedList found = new LinkedList();
  // Word positions are inclusive on both ends, hence the "- 1".
  while (m.find()) found.add(new Word(m.group(), m.start(), m.end() - 1));
  Word[] words = (Word[]) found.toArray(new Word[found.size()]);

  int sentenceStart = words[0].start;
  boolean dumpTrailingWords = true;

  // A regular expression (see the disabled lowerCaseWithXML pattern) was
  // originally used for this test, but it was observed to take time
  // exponential in the distance from the start of the paragraph to the
  // first capital letter, so a hand-written scan is used instead.
  boolean allLowerCase = noCapitalsOutsideTags(paragraph);

  for (int i = 0; i < words.length; ++i) {
    String form = words[i].form;

    // Position of the last '.', '?' or '!' in the word, or -1 if none.
    int punctuationIndex =
      Math.max(form.lastIndexOf('.'),
               Math.max(form.lastIndexOf('?'), form.lastIndexOf('!')));
    if (punctuationIndex == -1) continue;

    Word next1 = (i + 1 < words.length) ? words[i + 1] : null;
    Word next2 = (i + 2 < words.length) ? words[i + 2] : null;

    // Rule for text without capitals: a word longer than five characters
    // whose only dot (if any) is the punctuation under consideration, and
    // which has no lower case letter from that punctuation onwards, ends
    // the sentence.
    boolean lowerCaseBreak = false;
    if (allLowerCase && form.length() > 5) {
      int firstDot = form.indexOf('.');
      lowerCaseBreak =
        (firstDot == -1 || firstDot == punctuationIndex)
        && !lowerCaseLetter.matcher(form.substring(punctuationIndex)).find();
    }

    // boundary() is consulted only when the rule above did not fire.
    if (lowerCaseBreak
        || boundary(punctuationIndex, words[i], next1, next2)) {
      sentences.add(
          new Sentence(paragraph.substring(sentenceStart, words[i].end + 1),
                       currentOffset + sentenceStart,
                       currentOffset + words[i].end));

      if (i + 1 < words.length) sentenceStart = words[i + 1].start;
      // The paragraph's last word closed a sentence; nothing is left over.
      else dumpTrailingWords = false;
    }
  }

  // Whatever follows the last detected boundary becomes a sentence too.
  if (dumpTrailingWords)
    sentences.add(
        new Sentence(paragraph.substring(sentenceStart,
                                         words[words.length - 1].end + 1),
                     currentOffset + sentenceStart,
                     currentOffset + words[words.length - 1].end));
}

/**
 * Reports whether the given text is free of upper case characters, not
 * counting anything between a <code>&lt;</code> and the next
 * <code>&gt;</code>.  A <code>&lt;</code> with no <code>&gt;</code>
 * anywhere after it is treated as ordinary text.
 *
 * @param text The text to scan.
 * @return <code>false</code> as soon as an upper case character is found
 *         outside of a tag, <code>true</code> otherwise.
 **/
private static boolean noCapitalsOutsideTags(String text) {
  // Located once so that each '<' can be classified in constant time
  // instead of searching forward for a '>' again and again.
  int lastClose = text.lastIndexOf('>');
  boolean insideTag = false;

  for (int i = 0; i < text.length(); ++i) {
    char c = text.charAt(i);
    if (insideTag) insideTag = c != '>';
    else if (c == '<') insideTag = lastClose > i;
    else if (Character.isUpperCase(c)) return false;
  }

  return true;
}
/**
 * Determines whether the given punctuation represents the end of a
 * sentence based on elements of the paragraph immediately surrounding the
 * punctuation.
 *
 * <p> The word is split around the punctuation into a prefix (everything
 * before it) and a suffix (everything after it); the "root" is the prefix
 * with leading quotes and opening brackets removed.  The rules below are
 * tried in order and the first one that applies decides the result.
 *
 * @param index The index of the punctuation in question in its word.
 * @param word The word containing the punctuation.
 * @param next1 The word one after the word containing the
 * punctuation, or <code>null</code> if there is none.
 * @param next2 The word two after the word containing the
 * punctuation, or <code>null</code> if there is none.
 * @return <code>true</code> if and only if the punctuation is judged to
 * end a sentence.
 **/
protected boolean boundary(int index, Word word, Word next1, Word next2) {
char punctuation = word.form.charAt(index);
Word prefix = new Word(word.form.substring(0, index));
Word suffix = new Word(word.form.substring(index + 1));
Word root = new Word(prefix.form);
// Strip leading quotes and opening brackets from the root.
while (root.form.length() > 0
&& "\"'`{[(".indexOf(root.form.charAt(0)) != -1)
root.form = root.form.substring(1);
// "Yahoo!" (in any letter case) never ends a sentence.
if ("yahoo!".equalsIgnoreCase(root.form + punctuation)) return false;
// '?' and '!' end a sentence when (a) no word follows, (b) the mark is the
// last character of its word and the next word is capitalized, starts with
// a quote, is a lone ".", or is "--" / "-RBR-" followed by a capitalized
// word, or (c) the mark is followed only by a closing bracket or quote and
// the next word carries a start marker.
if (punctuation == '?' || punctuation == '!')
return next1 == null
|| suffix.form.length() == 0
&& (next1.capitalized || startsWithQuote(next1)
|| next1.form.equals(".")
|| next2 != null && next2.capitalized
&& (next1.form.equals("--")
|| next1.form.equals("-RBR-")))
|| isClose(suffix) && hasStartMarker(next1);
// From here on the mark is neither '?' nor '!'.  On the last word of the
// paragraph it always ends the sentence.
if (next1 == null) return true;
// The mark is the last character of its word.
if (suffix.form.length() == 0) {
// A following quote or opening bracket starts something new.
if (startsWithQuote(next1) || startsWithOpenBracket(next1)) return true;
// "-RBR-" followed by "--" is not a boundary; any other closing bracket
// as the next word is.
if (next1.form.equals("-RBR-") && next2 != null
&& next2.form.equals("--"))
return false;
if (isClosingBracket(next1)) return true;
// A lone mark followed by a lone "." is not a boundary, but a word ending
// in the mark followed by a lone "." is.
if (prefix.form.length() == 0 && next1.form.equals("."))
return false;
if (next1.form.equals(".")) return true;
// Before "--": not a boundary when the prefix ends with a quote and a
// capitalized word follows the dash; otherwise a boundary when the word
// after the dash is capitalized or starts with a quote.
if (next1.form.equals("--") && next2 != null && next2.capitalized
&& endsWithQuote(prefix))
return false;
if (next1.form.equals("--")
&& next2 != null && (next2.capitalized || startsWithQuote(next2)))
return true;
// A capitalized word or a number follows.  This is a boundary if the root
// is a terminal abbreviation, or if none of these exceptions holds:
//   - the root is "a.m" or "p.m" and a time zone follows,
//   - the root is an honorific,
//   - the prefix starts with a quote,
//   - the prefix starts with an open bracket and does not end with a
//     closing one,
//   - the prefix consists of capitals and dots and the next word is not a
//     reliable sentence beginner.
if (next1.capitalized || Character.isDigit(next1.form.charAt(0)))
return isTerminal(root)
|| !((root.form.equals("p.m")
|| root.form.equals("a.m"))
&& isTimeZone(next1)
|| isHonorific(root) || startsWithQuote(prefix)
|| startsWithOpenBracket(prefix)
&& !endsWithCloseBracket(prefix)
|| capitalsAndDots.matcher(prefix.form).find()
&& !sentenceBeginner(next1));
}
// Fallback: a boundary only when the mark is followed by exactly a closing
// bracket or quote, the next word carries a start marker, and the root is
// not an honorific.
return isClose(suffix) && hasStartMarker(next1) && !isHonorific(root);
}
/**
 * Simple check to see if the given word can reliably be identified as the
 * first word of a sentence.  Currently only the exact form
 * <code>The</code> qualifies.
 *
 * @param word The word in question.
 * @return <code>true</code> if and only if the word's form is exactly
 *         <code>The</code>.
 **/
protected boolean sentenceBeginner(Word word) {
  String form = word.form;
  return form.equals("The");
}
/**
 * Determines whether the first character of the argument is any of the
 * three varieties of quotes: ' " `.
 *
 * @param w The word in question.
 * @return <code>true</code> if and only if the first character of the
 *         argument is any of the three varieties of quotes; an empty form
 *         yields <code>false</code>.
 **/
protected boolean startsWithQuote(Word w) {
  String form = w.form;
  if (form.length() == 0) return false;

  switch (form.charAt(0)) {
    case '\'':
    case '"':
    case '`':
      return true;
    default:
      return false;
  }
}
/**
 * Determines whether the argument ends with any of the following varieties
 * of closing quote: ' '' ''' " '" .
 *
 * <p> Every one of those varieties ends in either a single quote or a
 * double quote character, so testing the final character for those two is
 * sufficient to cover all of them.
 *
 * @param w The word in question.
 * @return <code>true</code> if and only if the argument ends with any of
 *         the varieties of quotes named above.
 **/
protected boolean endsWithQuote(Word w) {
  String form = w.form;
  return form.endsWith("'") || form.endsWith("\"");
}
/**
 * Determines whether the argument represents a closing bracket or a
 * closing quote, as decided by {@link #isClosingBracket(Word)} and
 * {@link #isClosingQuote(Word)} respectively.
 *
 * @param w The word in question.
 * @return <code>true</code> if and only if the argument represents either
 *         a closing bracket or a closing quote.
 **/
protected boolean isClose(Word w) {
  if (isClosingBracket(w)) return true;
  return isClosingQuote(w);
}
/**
 * Determines whether the argument is exactly equal to any of the following
 * varieties of closing bracket: ) } ] -RBR- .
 *
 * @param w The word in question.
 * @return <code>true</code> if and only if the argument is exactly equal
 *         to any of the above varieties of closing bracket.
 **/
protected boolean isClosingBracket(Word w) {
  String form = w.form;
  if (form.equals(")")) return true;
  if (form.equals("}")) return true;
  if (form.equals("]")) return true;
  return form.equals("-RBR-");
}
/**
 * Determines whether the argument is exactly equal to any of the following
 * varieties of closing quote: ' '' ''' " '" .
 *
 * @param w The word in question.
 * @return <code>true</code> if and only if the argument is exactly equal
 *         to any of the above varieties of closing quote.
 **/
protected boolean isClosingQuote(Word w) {
  String form = w.form;

  // One, two, or three single quotes.
  if (form.equals("'") || form.equals("''") || form.equals("'''"))
    return true;

  // A double quote, optionally preceded by a single quote.
  return form.equals("\"") || form.equals("'\"");
}
/**
 * Determines whether the argument contains any of the following varieties
 * of "start marker" at its beginning: an open quote, an open bracket, or
 * a capital letter (the latter as recorded in the word's
 * <code>capitalized</code> field).
 *
 * @param w The word in question.
 * @return <code>true</code> if and only if the argument starts with a
 *         "start marker".
 **/
protected boolean hasStartMarker(Word w) {
  if (w.capitalized) return true;
  if (startsWithOpenQuote(w)) return true;
  return startsWithOpenBracket(w);
}
/**
 * Determines whether the argument starts with any of the following
 * varieties of open quote: ` `` ``` " "` .
 *
 * <p> Every one of those varieties begins with either a backquote or a
 * double quote character, so testing the first character for those two is
 * sufficient to cover all of them.
 *
 * @param w The word in question.
 * @return <code>true</code> if and only if the argument starts with one of
 *         the varieties of open quote named above.
 **/
protected boolean startsWithOpenQuote(Word w) {
  String form = w.form;
  return form.startsWith("`") || form.startsWith("\"");
}
/**
 * Determines whether the argument starts with any of the following
 * varieties of open bracket: ( { [ -LBR- .
 *
 * @param w The word in question.
 * @return <code>true</code> if and only if the argument starts with any of
 *         the varieties of open bracket named above.
 **/
protected boolean startsWithOpenBracket(Word w) {
  String form = w.form;
  if (form.startsWith("(")) return true;
  if (form.startsWith("{")) return true;
  if (form.startsWith("[")) return true;
  return form.startsWith("-LBR-");
}
/**
 * Determines whether the argument ends with any of the following
 * varieties of closing bracket: ) } ] -RBR- .
 *
 * @param w The word in question.
 * @return <code>true</code> if and only if the argument ends with any of
 *         the varieties of closing bracket named above.
 **/
protected boolean endsWithCloseBracket(Word w) {
  String form = w.form;
  if (form.endsWith(")")) return true;
  if (form.endsWith("}")) return true;
  if (form.endsWith("]")) return true;
  return form.endsWith("-RBR-");
}
/**
 * Determines whether the argument is exactly equal to one of the time zone
 * abbreviations recognized by this class: AST, CST, EST, HST, MST, PST,
 * ADT, CDT, EDT, HDT, MDT, PDT, UTC, or UTC-11.
 *
 * @param w The word in question.
 * @return <code>true</code> if and only if the argument matches any of the
 *         above time zone abbreviations.
 **/
protected boolean isTimeZone(Word w) {
  String form = w.form;

  // Abbreviations ending in "ST".
  if (form.equals("AST") || form.equals("CST") || form.equals("EST")
      || form.equals("HST") || form.equals("MST") || form.equals("PST"))
    return true;

  // Abbreviations ending in "DT".
  if (form.equals("ADT") || form.equals("CDT") || form.equals("EDT")
      || form.equals("HDT") || form.equals("MDT") || form.equals("PDT"))
    return true;

  return form.equals("UTC") || form.equals("UTC-11");
}
/**
 * Determines whether the argument is exactly equal to any of the following
 * terminal abbreviations: Esq Jr Sr M.D Ph.D .  The forms are written
 * without their final period because callers pass the text that precedes
 * the period being examined.
 *
 * @param w The word in question.
 * @return <code>true</code> if and only if the argument matches any of the
 *         above terminal abbreviations.
 **/
protected boolean isTerminal(Word w) {
  String form = w.form;
  if (form.equals("Esq")) return true;
  if (form.equals("Jr")) return true;
  if (form.equals("Sr")) return true;
  if (form.equals("M.D")) return true;
  return form.equals("Ph.D");
}
/**
 * Determines whether the argument is exactly equal to any of the
 * honorifics (and other abbreviations treated like honorifics) listed
 * below.  The comparison is case sensitive.
 *
 * <ul>
 *   <li> APR, AUG, Adj, Adm, Adv, Apr, Asst, Aug
 *   <li> Bart, Bldg, Brig, Bros
 *   <li> Capt, Cmdr, Col, Comdr, Con, Cpl
 *   <li> DEC, DR, Dec, Dr
 *   <li> Ens
 *   <li> FEB, Feb
 *   <li> Gen, Gov
 *   <li> Hon, Hosp
 *   <li> Insp
 *   <li> JAN, JUL, JUN, Jan, Jul, Jun
 *   <li> Lt
 *   <li> MAR, MM, MR, MRS, MS, MT, Maj, Mar, Messrs, Mlle, Mme, Mr, Mrs,
 *        Ms, Msgr, Mt
 *   <li> NO, NOV, No, Nov
 *   <li> OCT, Oct, Op, Ord
 *   <li> Pfc, Ph, Prof, Pvt
 *   <li> Rep, Reps, Res, Rev, Rt
 *   <li> SEP, SEPT, ST, Sen, Sens, Sep, Sept, Sfc, Sgt, Sr, St, Supt, Surg
 *   <li> U.S
 *   <li> apr, aug, dec, feb, jan, jul, jun, nov, oct, sep, sept
 *   <li> v, vs
 * </ul>
 *
 * <p> Lower case <code>mar</code> is deliberately absent: it's a word, so
 * it must be capitalized to be considered an honorific.
 *
 * @param w The word in question.
 * @return <code>true</code> if and only if the argument is exactly equal
 *         to any of the honorifics listed above.
 **/
protected boolean isHonorific(Word w) {
  String form = w.form;

  // A
  if (form.equals("APR") || form.equals("AUG") || form.equals("Adj")
      || form.equals("Adm") || form.equals("Adv") || form.equals("Apr")
      || form.equals("Asst") || form.equals("Aug"))
    return true;

  // B
  if (form.equals("Bart") || form.equals("Bldg") || form.equals("Brig")
      || form.equals("Bros"))
    return true;

  // C
  if (form.equals("Capt") || form.equals("Cmdr") || form.equals("Col")
      || form.equals("Comdr") || form.equals("Con") || form.equals("Cpl"))
    return true;

  // D, E, F
  if (form.equals("DEC") || form.equals("DR") || form.equals("Dec")
      || form.equals("Dr") || form.equals("Ens") || form.equals("FEB")
      || form.equals("Feb"))
    return true;

  // G, H, I
  if (form.equals("Gen") || form.equals("Gov") || form.equals("Hon")
      || form.equals("Hosp") || form.equals("Insp"))
    return true;

  // J, L
  if (form.equals("JAN") || form.equals("JUL") || form.equals("JUN")
      || form.equals("Jan") || form.equals("Jul") || form.equals("Jun")
      || form.equals("Lt"))
    return true;

  // M
  if (form.equals("MAR") || form.equals("MM") || form.equals("MR")
      || form.equals("MRS") || form.equals("MS") || form.equals("MT")
      || form.equals("Maj") || form.equals("Mar") || form.equals("Messrs")
      || form.equals("Mlle") || form.equals("Mme") || form.equals("Mr")
      || form.equals("Mrs") || form.equals("Ms") || form.equals("Msgr")
      || form.equals("Mt"))
    return true;

  // N, O
  if (form.equals("NO") || form.equals("NOV") || form.equals("No")
      || form.equals("Nov") || form.equals("OCT") || form.equals("Oct")
      || form.equals("Op") || form.equals("Ord"))
    return true;

  // P, R
  if (form.equals("Pfc") || form.equals("Ph") || form.equals("Prof")
      || form.equals("Pvt") || form.equals("Rep") || form.equals("Reps")
      || form.equals("Res") || form.equals("Rev") || form.equals("Rt"))
    return true;

  // S, U
  if (form.equals("SEP") || form.equals("SEPT") || form.equals("ST")
      || form.equals("Sen") || form.equals("Sens") || form.equals("Sep")
      || form.equals("Sept") || form.equals("Sfc") || form.equals("Sgt")
      || form.equals("Sr") || form.equals("St") || form.equals("Supt")
      || form.equals("Surg") || form.equals("U.S"))
    return true;

  // Lower case month abbreviations ("mar" intentionally excluded).
  if (form.equals("apr") || form.equals("aug") || form.equals("dec")
      || form.equals("feb") || form.equals("jan") || form.equals("jul")
      || form.equals("jun") || form.equals("nov") || form.equals("oct")
      || form.equals("sep") || form.equals("sept"))
    return true;

  return form.equals("v") || form.equals("vs");
}
}