package LBJ2.nlp;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import LBJ2.parse.LinkedChild;
import LBJ2.parse.LinkedVector;
/**
* This representation of a sentence simply stores the entire text of the
* sentence in a string. This may include any newlines present in the input,
* depending on the parser (e.g., {@link SentenceSplitter} will leave them
* in). However, this class also provides methods to convert that string to
* other representations.
*
* @author Nick Rizzolo
**/
public class Sentence extends LinkedChild
{
/**
* URL prefixes; used by {@link #partOfURL(int)}. The values in this array
* need to be sorted by decreasing order of length to make the regular
* expressions that use them work properly.
**/
private static final String[] protocols =
{ "telnet", "https", "file", "http", "nntp", "smtp", };
/**
* Domain name suffixes; used by {@link #partOfURL(int)}. The values in
* this array need to be sorted by decreasing order of length to make the
* regular expressions that use them work properly.
**/
private static final String[] topLevelDomains =
{
"museum", "travel", "aero", "arpa", "coop", "info", "jobs", "name",
"biz", "com", "edu", "gov", "int", "mil", "net", "org", "pro", "ac",
"ad", "ae", "af", "ag", "ai", "al", "am", "an", "ao", "aq", "ar", "as",
"at", "au", "aw", "az", "ba", "bb", "bd", "be", "bf", "bg", "bh", "bi",
"bj", "bm", "bn", "bo", "br", "bs", "bt", "bv", "bw", "by", "bz", "ca",
"cc", "cd", "cf", "cg", "ch", "ci", "ck", "cl", "cm", "cn", "co", "cr",
"cu", "cv", "cx", "cy", "cz", "de", "dj", "dk", "dm", "do", "dz", "ec",
"ee", "eg", "er", "es", "et", "eu", "fi", "fj", "fk", "fm", "fo", "fr",
"ga", "gb", "gd", "ge", "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gp",
"gq", "gr", "gs", "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr", "ht",
"hu", "id", "ie", "il", "im", "in", "io", "iq", "ir", "is", "it", "je",
"jm", "jo", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kr", "kw", "ky",
"kz", "la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly",
"ma", "mc", "md", "mg", "mh", "mk", "ml", "mm", "mn", "mo", "mp", "mq",
"mr", "ms", "mt", "mu", "mv", "mw", "mx", "my", "mz", "na", "nc", "ne",
"nf", "ng", "ni", "nl", "no", "np", "nr", "nu", "nz", "om", "pa", "pe",
"pf", "pg", "ph", "pk", "pl", "pm", "pn", "pr", "ps", "pt", "pw", "py",
"qa", "re", "ro", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg", "sh",
"si", "sj", "sk", "sl", "sm", "sn", "so", "sr", "st", "su", "sv", "sy",
"sz", "tc", "td", "tf", "tg", "th", "tj", "tk", "tl", "tm", "tn", "to",
"tp", "tr", "tt", "tv", "tw", "tz", "ua", "ug", "uk", "um", "us", "uy",
"uz", "va", "vc", "ve", "vg", "vi", "vn", "vu", "wf", "ws", "ye", "yt",
"yu", "za", "zm", "zw"
};
/**
* Indicates whether the corresponding index in the text has been
* determined to be part of a URL; used by {@link #partOfURL(int)}.
**/
private boolean[] inURL = null;
/** The actual text of the sentence. */
public String text = "";
/**
* Constructs a sentence from its text.
*
* @param t The text of the sentence.
**/
public Sentence(String t) { text = t; }
/**
* Constructor that sets the character offsets of this sentence.
*
* @param t The text of the sentence.
* @param s The offset at which this child starts.
* @param e The offset at which this child ends.
**/
public Sentence(String t, int s, int e) {
super(s, e);
text = t;
}
/**
* For debugging purposes, it's useful to insert print statements here.
*
* @param l The list to add to.
* @param i The item to add.
* @param description A string describing why the addition is happening.
**/
private void myAdd(LinkedList l, int i, String description) {
l.add(new Integer(i));
//System.out.println("Adding boundary at " + i + ": " + description);
}
/**
* Creates and returns a <code>LinkedVector</code> representation of this
* sentence in which every <code>LinkedChild</code> is a <code>Word</code>.
* Offset information is respected and propagated.
*
* @see Word
* @return A <code>LinkedVector</code> representation of this sentence.
**/
public LinkedVector wordSplit() {
LinkedList boundaries = new LinkedList();
// Whitespace always signals a word boundary.
Matcher m = Pattern.compile("\\s+").matcher(text);
while (m.find()) {
myAdd(boundaries, m.start() - 1, ")whitespace");
myAdd(boundaries, m.end(), "(whitespace");
}
// The beginning and end of the text are also word boundaries, unless
// there's whitespace there.
if (boundaries.size() > 0
&& ((Integer) boundaries.getLast()).intValue() >= text.length())
boundaries.removeLast();
else myAdd(boundaries, text.length() - 1, ")$");
if (boundaries.size() > 1
&& ((Integer) boundaries.getFirst()).intValue() == -1)
boundaries.removeFirst();
else myAdd(boundaries, 0, "(^");
Pattern pNoSpaceOrDigit = Pattern.compile("[^\\s\\d]");
Pattern pDigit = Pattern.compile("\\d");
Pattern pDigitCommaNoDigit = Pattern.compile("\\d,\\D");
Pattern pNoDigitCommaDigit = Pattern.compile("\\D,\\d");
// Commas are separate words unless they're part of a number.
for (int i = text.indexOf(','); i != -1; i = text.indexOf(',', i + 1)) {
if (i > 0 && text.charAt(i - 1) != ','
&& (pNoSpaceOrDigit.matcher(text.substring(i - 1, i)).find()
|| i + 1 == text.length()
&& pDigit.matcher(text.substring(i - 1, i)).find()
|| i + 1 < text.length()
&& pDigitCommaNoDigit
.matcher(text.substring(i - 1, i + 2)).find())) {
myAdd(boundaries, i - 1, ")comma1");
myAdd(boundaries, i, "(comma1");
}
if (i + 1 < text.length()
&& (pNoSpaceOrDigit.matcher(text.substring(i + 1, i + 2)).find()
|| i == 0 && pDigit.matcher(text.substring(i + 1, i + 2)).find()
|| i > 0
&& pNoDigitCommaDigit.matcher(text.substring(i - 1, i + 2))
.find())) {
myAdd(boundaries, i, ")comma2");
myAdd(boundaries, i + 1, "(comma2");
}
}
Pattern pApostropheMask = Pattern.compile("[^\\s,']");
//Pattern pAbbreviation = Pattern.compile("[A-Za-z]'[A-Za-z]");
//Pattern pPossessive = Pattern.compile("s[^A-Za-z']");
//Pattern pShortWill = Pattern.compile("ll[^A-Za-z']");
// Apostrophes are handled by making consecutive occurrences a single
// separate word and treating all other occurences as abbreviations which
// should not be separated, with the following exceptions which are
// considered contractions:
// ' Plural possessive (must follow the letter 's')
// 'd "I'd", "he'd", "they'd"
// 'll "I'll", "he'll", "they'll"
// 'm "I'm"
// 're "they're"
// 's Possessive
// 've "I've", "they've"
// n't "can't", "won't", "shouldn't", "aren't"
for (int i = text.indexOf('\''); i != -1; i = text.indexOf('\'', i + 1)) {
if (i - 1 > 0 && Character.isLetter(text.charAt(i - 2))
&& text.charAt(i - 1) == 'n' && i + 1 < text.length()
&& text.charAt(i + 1) == 't'
&& (i + 2 == text.length()
|| !Character.isLetter(text.charAt(i + 2))
&& text.charAt(i + 2) != '\'')) {
myAdd(boundaries, i - 2, ")n't");
myAdd(boundaries, i - 1, "(n't");
}
else
if (i > 0
&& (pApostropheMask.matcher(text.substring(i - 1, i)).find()
&& i + 1 < text.length() && text.charAt(i + 1) == '\''
|| text.charAt(i - 1) == 's'
&& (i + 1 == text.length()
|| !Character.isLetter(text.charAt(i + 1))
&& text.charAt(i + 1) != '\'')
|| Character.isLetter(text.charAt(i - 1))
&& (i + 1 < text.length()
&& (i + 2 == text.length()
|| !Character.isLetter(text.charAt(i + 2))
&& text.charAt(i + 2) != '\'')
&& (text.charAt(i + 1) == 'd'
|| text.charAt(i + 1) == 'm'
|| text.charAt(i + 1) == 's')
|| i + 2 < text.length()
&& (i + 3 == text.length()
|| !Character.isLetter(text.charAt(i + 3))
&& text.charAt(i + 3) != '\'')
&& (text.substring(i + 1, i + 3).equals("ll")
|| text.substring(i + 1, i + 3).equals("re")
|| text.substring(i + 1, i + 3).equals("ve")))
|| text.charAt(i - 1) == '.' && i - 1 > 0
&& Character.isLetter(text.charAt(i - 2))
&& i + 1 < text.length()
&& (i + 2 == text.length()
|| !Character.isLetter(text.charAt(i + 2))
&& text.charAt(i + 2) != '\'')
&& text.charAt(i + 1) == 's')) {
myAdd(boundaries, i - 1, ")contraction1");
myAdd(boundaries, i, "(contraction1");
}
if (i + 1 < text.length()
&& pApostropheMask.matcher(text.substring(i + 1, i + 2)).find()
&& (!Character.isLetter(text.charAt(i + 1))
|| i > 0 && text.charAt(i - 1) == '\'')) {
myAdd(boundaries, i, ")contraction2");
myAdd(boundaries, i + 1, "(contraction2");
}
}
Pattern pColonMask = Pattern.compile("[^\\s,':]");
Pattern pColonSeparator = Pattern.compile("\\d:\\d");
// Colons get separated into their own word unless it looks like they're
// part of a time (or some other useful structure involving digits) or a
// URL.
for (int i = text.indexOf(':'); i != -1; i = text.indexOf(':', i + 1))
if (!(i >= 2 && i + 2 < text.length()
&& pColonSeparator.matcher(text.substring(i - 2, i + 3)).find()
|| i > 2 && i + 2 < text.length()
&& (text.substring(i - 2, i + 3).equals("tp://")
|| text.substring(i - 2, i + 3).equals("TP://"))
|| partOfURL(i))) {
if (i >= 1 && pColonMask.matcher(text.substring(i - 1, i)).find()) {
myAdd(boundaries, i - 1, ")colon1");
myAdd(boundaries, i, "(colon1");
}
if (i + 1 < text.length()
&& pColonMask.matcher(text.substring(i + 1, i + 2)).find()) {
myAdd(boundaries, i, ")colon2");
myAdd(boundaries, i + 1, "(colon2");
}
}
Pattern pSlashMask = Pattern.compile("[^\\s,':/]");
Pattern pSlashSeparator = Pattern.compile("\\d/\\d");
// Slashes get separated into their own word unless it looks like they're
// part of a date (or some other useful structure involving digits) or a
// URL.
for (int i = text.indexOf('/'); i != -1; i = text.indexOf('/', i + 1))
if (!(i >= 2 && i + 2 < text.length()
&& pSlashSeparator.matcher(text.substring(i - 2, i + 3)).find()
|| i > 3 && i + 1 < text.length()
&& (text.substring(i - 3, i + 2).equals("tp://")
|| text.substring(i - 3, i + 2).equals("TP://"))
|| i > 4
&& (text.substring(i - 4, i + 1).equals("tp://")
|| text.substring(i - 4, i + 1).equals("TP://"))
|| partOfURL(i))) {
if (i >= 1 && pSlashMask.matcher(text.substring(i - 1, i)).find()) {
myAdd(boundaries, i - 1, ")slash1");
myAdd(boundaries, i, "(slash1");
}
if (i + 1 < text.length()
&& pSlashMask.matcher(text.substring(i + 1, i + 2)).find()) {
myAdd(boundaries, i, ")slash2");
myAdd(boundaries, i + 1, "(slash2");
}
}
Pattern pDashMask = Pattern.compile("[^\\s,':/-]");
Pattern pDashSeparator = Pattern.compile("\\w-\\w");
Pattern pNegative1 = Pattern.compile("-\\.?\\d");
Pattern pNegative2 = Pattern.compile("\\s-\\.?\\d");
// Dashes get separated into their own words unless it looks like they're
// part of some useful structure like a compound word, a number, or a URL.
for (int i = text.indexOf('-'); i != -1; i = text.indexOf('-', i + 1))
if (!(i + 1 < text.length() && i >= 1
&& pDashSeparator.matcher(text.substring(i - 1, i + 2)).find()
|| (i + 2 < text.length()
&& (i == 0
&& pNegative1.matcher(text.substring(i, i + 3)).find()
|| i > 0
&& pNegative2.matcher(text.substring(i - 1, i + 3))
.find()))
|| partOfURL(i))) {
if (i >= 1 && pDashMask.matcher(text.substring(i - 1, i)).find()) {
myAdd(boundaries, i - 1, ")dash1");
myAdd(boundaries, i, "(dash1");
}
if (i + 1 < text.length()
&& pDashMask.matcher(text.substring(i + 1, i + 2)).find()) {
myAdd(boundaries, i, ")dash2");
myAdd(boundaries, i + 1, "(dash2");
}
}
Pattern pDollarMask = Pattern.compile("[^\\s,':/\\$-]");
Pattern pMoney1 = Pattern.compile("\\$\\.?\\d");
Pattern pMoney2 = Pattern.compile("(\\s|-)\\$\\.?\\d");
// Dollar signs get separated into their own words unless it looks like
// they're in fact delimiting the start of a dollar amount, or are part of
// a URL.
for (int i = text.indexOf('$'); i != -1; i = text.indexOf('$', i + 1))
if (!(i == 0 && i + 2 < text.length()
&& pMoney1.matcher(text.substring(i, i + 3)).find()
|| i > 0 && i + 2 < text.length()
&& pMoney2.matcher(text.substring(i - 1, i + 3)).find()
|| partOfURL(i))) {
if (i >= 1 && pDollarMask.matcher(text.substring(i - 1, i)).find()) {
myAdd(boundaries, i - 1, ")dollar1");
myAdd(boundaries, i, "(dollar1");
}
if (i + 1 < text.length()
&& pDollarMask.matcher(text.substring(i + 1, i + 2)).find()) {
myAdd(boundaries, i, ")dollar2");
myAdd(boundaries, i + 1, "(dollar2");
}
}
Pattern pBeforeElipsis = Pattern.compile("[^\\s,':/\\$\\.-]\\.\\.\\.");
Pattern pAfterElipsis = Pattern.compile("\\.\\.\\.[^\\s,':/\\$\\.-]");
// Three or more consecutive periods form their own word.
for (int i = text.indexOf('.'); i != -1; i = text.indexOf('.', i + 1)) {
if (i > 0 && i + 2 < text.length()
&& pBeforeElipsis.matcher(text.substring(i - 1, i + 3)).find()) {
myAdd(boundaries, i - 1, ")ellipsis1");
myAdd(boundaries, i, "(ellipsis1");
}
if (i >= 2 && i + 1 < text.length()
&& pAfterElipsis.matcher(text.substring(i - 2, i + 2)).find()) {
myAdd(boundaries, i, ")ellipsis2");
myAdd(boundaries, i + 1, "(ellipsis2");
}
}
// If the last occurrence of a period in the sentence comes after all
// occurrences of letters and digits, it is an end of sentence marker
// which constitutes its own word, unless it appears immediately after two
// other periods.
int period = text.lastIndexOf('.');
if (period != -1) {
boolean endOfSentence = true;
for (int i = period + 1; i < text.length() && endOfSentence; ++i)
endOfSentence = !Character.isLetterOrDigit(text.charAt(i));
if (endOfSentence) {
if (period >= 1
&& (text.charAt(period - 1) != '.' || period == 1
|| text.charAt(period - 2) != '.')
&& pDollarMask.matcher(text.substring(period - 1, period)).find())
{
myAdd(boundaries, period - 1, ")period1");
myAdd(boundaries, period, "(period1");
}
if (period + 1 < text.length()
&& (period == 0 || text.charAt(period - 1) != '.' || period == 1
|| text.charAt(period - 2) != '.')
&& pDollarMask.matcher(text.substring(period + 1, period + 2))
.find()) {
myAdd(boundaries, period, ")period2");
myAdd(boundaries, period + 1, "(period2");
}
}
else period = -1;
}
// All other punctuation marks constitute their own words, unless they
// appear immediately after themselves (consecutive identical punctuation
// marks form a single word) or are part of a URL.
Pattern pPunctuation = Pattern.compile("[^\\s\\w,'\\.:/\\$-]");
m = pPunctuation.matcher(text);
while (m.find())
if (!partOfURL(m.start())) {
if (m.start() + 1 < text.length()
&& text.charAt(m.start()) != text.charAt(m.start() + 1)
&& m.start() + 1 != period
&& pPunctuation
.matcher(text.substring(m.start() + 1, m.start() + 2)).find())
{
myAdd(boundaries, m.start(), ")punctuation1");
myAdd(boundaries, m.start() + 1, "(punctuation1");
}
}
m = Pattern.compile("[^\\s\\w,'\\.:/\\$-]\\w").matcher(text);
while (m.find())
if (!partOfURL(m.start())) {
myAdd(boundaries, m.start(), ")punctuation2");
myAdd(boundaries, m.start() + 1, "(punctuation2");
}
m = Pattern.compile("\\w[^\\s\\w,'\\.:/\\$-]").matcher(text);
while (m.find())
if (!partOfURL(m.start())) {
myAdd(boundaries, m.start(), ")punctuation3");
myAdd(boundaries, m.start() + 1, "(punctuation3");
}
// Now we just have to create the LinkedVector.
Integer[] temp =
(Integer[]) boundaries.toArray(new Integer[boundaries.size()]);
int[] I = new int[temp.length];
for (int i = 0; i < I.length; ++i) I[i] = temp[i].intValue();
Arrays.sort(I);
Word w = new Word(text.substring(I[0], I[1] + 1), I[0] + start,
I[1] + start);
for (int i = 2; i < I.length; i += 2) {
w.next = new Word(text.substring(I[i], I[i + 1] + 1),
w,
I[i] + start,
I[i + 1] + start);
w = (Word) w.next;
}
inURL = null;
return new LinkedVector(w);
}
/**
* Does a simple check to determine if the symbol at the specified index in
* the specified string is likely to be part of a URL. If the specified
* text contains any of the following strings before the specified symbol,
* and there is no whitespace in between the two, the specified symbol is
* deemed likely to be part of a URL.
*
* @param index The index of the symbol in question.
* @return <code>true</code> if and only if the specified symbol appears to
* be part of a URL.
**/
private boolean partOfURL(int index) {
if (inURL != null) return inURL[index];
inURL = new boolean[text.length()];
StringBuffer pattern = new StringBuffer();
pattern.append("(?i)(");
pattern.append(protocols[0]);
for (int i = 1; i < protocols.length; ++i) {
pattern.append("|");
pattern.append(protocols[i]);
}
pattern.append(")://\\S+|[a-zA-Z0-9][a-zA-Z0-9-]*\\.(");
pattern.append(topLevelDomains[0]);
for (int i = 0; i < topLevelDomains.length; ++i) {
pattern.append("|");
pattern.append(topLevelDomains[i]);
}
pattern.append(")(/\\S+)?");
Matcher m = Pattern.compile(pattern.toString()).matcher(text);
while (m.find())
for (int i = m.start(); i < m.end(); ++i) inURL[i] = true;
return inURL[index];
}
/**
* The string representation of a <code>Sentence</code> is just its text.
*
* @return The text of this sentence.
**/
public String toString() { return text; }
}