package LBJ2.nlp; import java.util.LinkedList; import java.util.regex.Matcher; import java.util.regex.Pattern; import LBJ2.parse.LineByLine; /** * Use this class to extract sentences from plain text. The user constructs * an object of this class with the file name of a document written in * natural English (i.e., with no annotations added or any type of * preprocessing performed). <b>It should be noted that this class will * interpret empty lines that appear in the input as paragraph * boundaries.</b> * * <p> The user can then retrieve <code>Sentence</code>s one at a time with * the <code>next()</code> method, or all at once with the * <code>splitAll()</code> method. The returned <code>Sentence</code>s' * <code>start</code> and <code>end</code> fields represent offsets into the * file they were extracted from. Every character in between those two * offsets inclusive, including extra spaces, newlines, etc., is included in * the <code>Sentence</code> as it appeared in the paragraph. * * <p> A {@link #main(String[])} method is also implemented which applies * this class to plain text in a straight-forward way. * * @see Sentence * @author Nick Rizzolo **/ public class SentenceSplitter extends LineByLine { /** * Regular expression matching whitespace separated words including those * that are hyphenated and cross over a line boundary. **/ private static final Pattern wordMatcher = Pattern.compile("([^-\\s]-\n\\s*(?=\\S)|\\S)+"); /** * Regular expression matching an entire string if that string contains no * capital letters except for those within angled brackets (<>). ** private static final Pattern lowerCaseWithXML = Pattern.compile("^([^A-Z]*(<[^>]*>)?)*$"); */ /** Regular expression matching any lower case letter. */ private static final Pattern lowerCaseLetter = Pattern.compile("[a-z]"); /** * Regular expression matching a sequence of capital letters and dots * ending with a capital letter. **/ private static final Pattern capitalsAndDots = Pattern.compile("^([A-Z]\\.)*[A-Z]$"); /** * Run this program on a file containing plain text, and it will produce * the same text rearranged so that each line contains exactly one sentence * on <code>STDOUT</code>. * * <p> Usage: * <code> java LBJ2.nlp.SentenceSplitter <file name> </code> * * @param args The command line arguments. **/ public static void main(String[] args) { String filename = null; try { filename = args[0]; if (args.length > 1) throw new Exception(); } catch (Exception e) { System.err.println("usage: java LBJ2.nlp.SentenceSplitter <file name>"); System.exit(1); } SentenceSplitter splitter = new SentenceSplitter(filename); for (Sentence s = (Sentence) splitter.next(); s != null; s = (Sentence) splitter.next()) { StringBuffer buffer = new StringBuffer(s.text); for (int i = 0; i < buffer.length(); ++i) { char c = buffer.charAt(i); if (c == '\n' || c == '\r' || c == '\f') buffer.setCharAt(i, ' '); } System.out.println(buffer); } } /** Contains the offset of a paragraph currently being processed. */ protected int currentOffset; /** Contains sentences ready to be returned to the user upon request. */ protected LinkedList sentences; /** * When the constructor taking an array argument is used, this variable * keeps track of the element in the array currently being used. **/ protected int index; /** * When the constructor taking an array argument is used, this variable * stores that array. **/ protected String[] input; /** * Sentence splits the given file. * * @param file The name of the file to sentence split. **/ public SentenceSplitter(String file) { super(file); sentences = new LinkedList(); } /** * Sentence splits the given input. * * @param input Plain text. Each element of this array represents a line, * with any line termination characters removed. **/ public SentenceSplitter(String[] input) { this.input = input; sentences = new LinkedList(); } /** * If constructor taking a file name as input was used, this method simply * calls the method of the same name in <code>LineByLine</code>; otherwise, * it returns the next element of the array. * * @return The next line of input. **/ protected String readLine() { if (input != null) { if (index < input.length) return input[index++]; return null; } return super.readLine(); } /** * This method is used to extract a paragraph at a time from the input. * * @return The extracted paragraph, or a string containing only whitespace * if no text remains in the input. **/ protected String getParagraph() { StringBuffer paragraph = new StringBuffer(); String line; for (line = readLine(); line != null && line.trim().length() == 0; line = readLine()) { paragraph.append(line); paragraph.append("\n"); } for (; line != null && line.trim().length() != 0; line = readLine()) { paragraph.append(line); paragraph.append("\n"); } if (line != null) { paragraph.append(line); paragraph.append("\n"); } return paragraph.toString(); } /** * Retrieves the next sentence off the queue and returns it. * * @return The next sentence found or <code>null</code> if there are no * more sentences. **/ public Object next() { if (sentences.size() == 0) { String paragraph = getParagraph(); if (paragraph.trim().length() != 0) process(paragraph); currentOffset += paragraph.length(); } if (sentences.size() == 0) return null; return sentences.removeFirst(); } /** * Retrieves every sentence found in the input paragraphs that have been * provided so far in array form. * * @return All sentences in the input paragraphs. **/ public Sentence[] splitAll() { for (String paragraph = getParagraph(); paragraph.trim().length() != 0; paragraph = getParagraph()) { if (paragraph.trim().length() != 0) process(paragraph); currentOffset += paragraph.length(); } return (Sentence[]) sentences.toArray(new Sentence[sentences.size()]); } /** * This method does the actual work, deciding where sentences begin and end * and populating the <code>sentences</code> member variable. * * @param paragraph The paragraph to process. **/ protected void process(String paragraph) { if (paragraph.trim().length() == 0) return; Matcher m = wordMatcher.matcher(paragraph); LinkedList w = new LinkedList(); while (m.find()) w.add(new Word(m.group(), m.start(), m.end() - 1)); Word[] words = (Word[]) w.toArray(new Word[w.size()]); int sentenceStart = words[0].start; boolean dumpTrailingWords = true; //boolean allLowerCase = lowerCaseWithXML.matcher(paragraph).matches(); // The line of code commented above seems to take time exponential in the // distance from the start of the paragraph to the first capital letter. // I don't get it. But since it does, we replace it with the code below. boolean allLowerCase = true; { boolean insideTag = false; char[] chars = paragraph.toCharArray(); for (int i = 0; i < paragraph.length() && allLowerCase; ++i) { if (insideTag) insideTag = chars[i] != '>'; else { if (chars[i] == '<') insideTag = paragraph.indexOf('>', i) != -1; else allLowerCase = !Character.isUpperCase(chars[i]); } } } for (int i = 0; i < words.length; ++i) { int punctuationIndex = words[i].form.lastIndexOf('.'); int index = words[i].form.lastIndexOf('?'); if (index > punctuationIndex) punctuationIndex = index; index = words[i].form.lastIndexOf('!'); if (index > punctuationIndex) punctuationIndex = index; if (punctuationIndex != -1) { Word next1 = (i + 1 < words.length) ? words[i + 1] : null; Word next2 = (i + 2 < words.length) ? words[i + 2] : null; int length = words[i].form.length(); if (allLowerCase) index = words[i].form.indexOf('.'); if (allLowerCase && length > 5 && (index == -1 || index == punctuationIndex) && !lowerCaseLetter.matcher( words[i].form.substring(punctuationIndex)).find() || boundary(punctuationIndex, words[i], next1, next2)) { sentences.add( new Sentence(paragraph.substring(sentenceStart, words[i].end + 1), currentOffset + sentenceStart, currentOffset + words[i].end)); if (i + 1 < words.length) sentenceStart = words[i + 1].start; else dumpTrailingWords = false; } } } if (dumpTrailingWords) sentences.add( new Sentence(paragraph.substring(sentenceStart, words[words.length - 1].end + 1), currentOffset + sentenceStart, currentOffset + words[words.length - 1].end)); } /** * Determines whether the given punctuation represents the end of a * sentence based on elements of the paragraph immediately surrounding the * punctuation. * * @param index The index of the punctuation in question in its word. * @param word The word containing the punctuation. * @param next1 The word one after the word containing the * punctuation. * @param next2 The word two after the word containing the * punctuation. **/ protected boolean boundary(int index, Word word, Word next1, Word next2) { char punctuation = word.form.charAt(index); Word prefix = new Word(word.form.substring(0, index)); Word suffix = new Word(word.form.substring(index + 1)); Word root = new Word(prefix.form); while (root.form.length() > 0 && "\"'`{[(".indexOf(root.form.charAt(0)) != -1) root.form = root.form.substring(1); if ("yahoo!".equalsIgnoreCase(root.form + punctuation)) return false; if (punctuation == '?' || punctuation == '!') return next1 == null || suffix.form.length() == 0 && (next1.capitalized || startsWithQuote(next1) || next1.form.equals(".") || next2 != null && next2.capitalized && (next1.form.equals("--") || next1.form.equals("-RBR-"))) || isClose(suffix) && hasStartMarker(next1); if (next1 == null) return true; if (suffix.form.length() == 0) { if (startsWithQuote(next1) || startsWithOpenBracket(next1)) return true; if (next1.form.equals("-RBR-") && next2 != null && next2.form.equals("--")) return false; if (isClosingBracket(next1)) return true; if (prefix.form.length() == 0 && next1.form.equals(".")) return false; if (next1.form.equals(".")) return true; if (next1.form.equals("--") && next2 != null && next2.capitalized && endsWithQuote(prefix)) return false; if (next1.form.equals("--") && next2 != null && (next2.capitalized || startsWithQuote(next2))) return true; if (next1.capitalized || Character.isDigit(next1.form.charAt(0))) return isTerminal(root) || !((root.form.equals("p.m") || root.form.equals("a.m")) && isTimeZone(next1) || isHonorific(root) || startsWithQuote(prefix) || startsWithOpenBracket(prefix) && !endsWithCloseBracket(prefix) || capitalsAndDots.matcher(prefix.form).find() && !sentenceBeginner(next1)); } return isClose(suffix) && hasStartMarker(next1) && !isHonorific(root); } /** * Simple check to see if the given word can reliably be identified as the * first word of a sentence. * * @param word The word in question. **/ protected boolean sentenceBeginner(Word word) { return word.form.equals("The"); } /** * Determines whether the first character of the argument is any of the * three varieties of quotes: ' " `. * * @param w The word in question. * @return <code>true</code> if and only if the first character of the * argument is any of the three varieties of quotes. **/ protected boolean startsWithQuote(Word w) { if (w.form.length() == 0) return false; return w.form.charAt(0) == '\'' || w.form.charAt(0) == '"' || w.form.charAt(0) == '`'; } /** * Determines whether the argument ends with any of the following varieties * of closing quote: ' '' ''' " '" . * * @param w The word in question. * @return <code>true</code> if and only if the argument ends with any of * the varieties of quotes named above. **/ protected boolean endsWithQuote(Word w) { return w.form.endsWith("'") || w.form.endsWith("''") || w.form.endsWith("'''") || w.form.endsWith("\"") || w.form.endsWith("'\""); } /** * Determines whether the argument represents a closing bracket or a * closing quote. * * @param w The word in question. * @return <code>true</code> if and only if the argument represents either * a closing bracket or a closing quote. **/ protected boolean isClose(Word w) { return isClosingBracket(w) || isClosingQuote(w); } /** * Determines whether the argument is exactly equal to any of the following * varieties of closing bracket: ) } ] -RBR- . * * @param w The word in question. * @return <code>true</code> if and only if the argument is exactly equal * to any of the above varieties of closing bracket. **/ protected boolean isClosingBracket(Word w) { return w.form.equals(")") || w.form.equals("}") || w.form.equals("]") || w.form.equals("-RBR-"); } /** * Determines whether the argument is exactly equal to any of the following * varieties of closing quote: ' '' ''' " '" . * * @param w The word in question. * @return <code>true</code> if and only if the argument is exactly equal * to any of the above varieties of closing quote. **/ protected boolean isClosingQuote(Word w) { return w.form.equals("'") || w.form.equals("''") || w.form.equals("'''") || w.form.equals("\"") || w.form.equals("'\""); } /** * Determines whether the argument contains any of the following varieties * of "start marker" at its beginning: an open quote, and open bracket, or * a capital letter. * * @param w The word in question. * @return <code>true</code> if and only if the argument starts with a * "start marker". **/ protected boolean hasStartMarker(Word w) { return w.capitalized || startsWithOpenQuote(w) || startsWithOpenBracket(w); } /** * Determines whether the argument starts with any of the following * varieties of open quote: ` `` ``` " "` . * * @param w The word in question. * @return <code>true</code> if and only if the argument starts with one of * the varieties of open quote named above. **/ protected boolean startsWithOpenQuote(Word w) { return w.form.startsWith("`") || w.form.startsWith("``") || w.form.startsWith("```") || w.form.startsWith("\"") || w.form.startsWith("\"`"); } /** * Determines whether the argument starts with any of the following * varieties of open bracket: ( { [ -LBR- . * * @param w The word in question. * @return <code>true</code> if and only if the argument starts with any of * the varieties of open bracket named above. **/ protected boolean startsWithOpenBracket(Word w) { return w.form.startsWith("(") || w.form.startsWith("{") || w.form.startsWith("[") || w.form.startsWith("-LBR-"); } /** * Determines whether the argument ends with any of the following * varieties of open bracket: ) } ] -RBR- . * * @param w The word in question. * @return <code>true</code> if and only if the argument starts with any of * the varieties of open bracket named above. **/ protected boolean endsWithCloseBracket(Word w) { return w.form.endsWith(")") || w.form.endsWith("}") || w.form.endsWith("]") || w.form.endsWith("-RBR-"); } /** * Determines whether the argument is a United States time zone * abbreviation (AST, CST, EST, HST, MST, PST, ADT, CDT, EDT, HDT, MDT, * PDT, or UTC-11). * * @param w The word in question. * @return <code>true</code> if and only if the argument matches any of the * above time zone abbreviations. **/ protected boolean isTimeZone(Word w) { return w.form.equals("AST") || w.form.equals("CST") || w.form.equals("EST") || w.form.equals("HST") || w.form.equals("MST") || w.form.equals("PST") || w.form.equals("ADT") || w.form.equals("CDT") || w.form.equals("EDT") || w.form.equals("HDT") || w.form.equals("MDT") || w.form.equals("PDT") || w.form.equals("UTC") || w.form.equals("UTC-11"); } /** * Determines whether the argument is exactly equal to any of the following * terminal abbreviations: Esq Jr Sr M.D Ph.D . * * @param w The word in question. * @return <code>true</code> if and only if the argument matches any of the * above terminal abbreviations. **/ protected boolean isTerminal(Word w) { return w.form.equals("Esq") || w.form.equals("Jr") || w.form.equals("Sr") || w.form.equals("M.D") || w.form.equals("Ph.D"); } /** * Determines wheter the argument is exactly equal to any of the honorifics * listed below. * * <ul> * <li> APR <li> AUG <li> Adj <li> Adm <li> Adv <li> Apr <li> Asst * <li> Aug <li> Bart <li> Bldg <li> Brig <li> Bros <li> Capt <li> Cmdr * <li> Col <li> Comdr <li> Con <li> Cpl <li> DEC <li> DR <li> Dec * <li> Dr <li> Ens <li> FEB <li> Feb <li> Gen <li> Gov <li> Hon * <li> Hosp <li> Insp <li> JAN <li> JUL <li> JUN <li> Jan <li> Jul * <li> Jun <li> Lt <li> MAR <li> MM <li> MR <li> MRS <li> MS <li> MT * <li> Maj <li> Mar <li> Messrs <li> Mlle <li> Mme <li> Mr <li> Mrs * <li> Ms <li> Msgr <li> Mt <li> NO <li> NOV <li> Nov <li> OCT <li> Oct * <li> Op <li> Ord <li> Pfc <li> Ph <li> Prof <li> Pvt <li> Rep * <li> Reps <li> Res <li> Rev <li> Rt <li> SEP <li> SEPT <li> Sen * <li> Sens <li> Sep <li> Sept <li> Sfc <li> Sgt <li> Sr <li> St * <li> Supt <li> Surg <li> U.S <li> apr <li> aug <li> dec <li> feb * <li> jan <li> jul <li> jun * <li> * <strike>mar</strike> -- It's a word, so it must be capitalized to be * considered an honorific. * <li> nov <li> oct <li> sep <li> sept <li> v <li> vs * </ul> * * @param w The word in question. * @return <code>true</code> if and only if the argument is exactly equal * to any of the honorifics listed above. **/ protected boolean isHonorific(Word w) { return w.form.equals("APR") || w.form.equals("AUG") || w.form.equals("Adj") || w.form.equals("Adm") || w.form.equals("Adv") || w.form.equals("Apr") || w.form.equals("Asst") || w.form.equals("Aug") || w.form.equals("Bart") || w.form.equals("Bldg") || w.form.equals("Brig") || w.form.equals("Bros") || w.form.equals("Capt") || w.form.equals("Cmdr") || w.form.equals("Col") || w.form.equals("Comdr") || w.form.equals("Con") || w.form.equals("Cpl") || w.form.equals("DEC") || w.form.equals("DR") || w.form.equals("Dec") || w.form.equals("Dr") || w.form.equals("Ens") || w.form.equals("FEB") || w.form.equals("Feb") || w.form.equals("Gen") || w.form.equals("Gov") || w.form.equals("Hon") || w.form.equals("Hosp") || w.form.equals("Insp") || w.form.equals("JAN") || w.form.equals("JUL") || w.form.equals("JUN") || w.form.equals("Jan") || w.form.equals("Jul") || w.form.equals("Jun") || w.form.equals("Lt") || w.form.equals("MAR") || w.form.equals("MM") || w.form.equals("MR") || w.form.equals("MRS") || w.form.equals("MS") || w.form.equals("MT") || w.form.equals("Maj") || w.form.equals("Mar") || w.form.equals("Messrs") || w.form.equals("Mlle") || w.form.equals("Mme") || w.form.equals("Mr") || w.form.equals("Mrs") || w.form.equals("Ms") || w.form.equals("Msgr") || w.form.equals("Mt") || w.form.equals("NO") || w.form.equals("NOV") || w.form.equals("No") || w.form.equals("Nov") || w.form.equals("OCT") || w.form.equals("Oct") || w.form.equals("Op") || w.form.equals("Ord") || w.form.equals("Pfc") || w.form.equals("Ph") || w.form.equals("Prof") || w.form.equals("Pvt") || w.form.equals("Rep") || w.form.equals("Reps") || w.form.equals("Res") || w.form.equals("Rev") || w.form.equals("Rt") || w.form.equals("SEP") || w.form.equals("SEPT") || w.form.equals("ST") || w.form.equals("Sen") || w.form.equals("Sens") || w.form.equals("Sep") || w.form.equals("Sept") || w.form.equals("Sfc") || w.form.equals("Sgt") || w.form.equals("Sr") || w.form.equals("St") || w.form.equals("Supt") || w.form.equals("Surg") || w.form.equals("U.S") || w.form.equals("apr") || w.form.equals("aug") || w.form.equals("dec") || w.form.equals("feb") || w.form.equals("jan") || w.form.equals("jul") || w.form.equals("jun") // || w.form.equals("mar") || w.form.equals("nov") || w.form.equals("oct") || w.form.equals("sep") || w.form.equals("sept") || w.form.equals("v") || w.form.equals("vs"); } }