// Sentence.java example
//
// Explorer
// MinorThird-master
package LBJ2.nlp;

import java.util.Arrays;
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import LBJ2.parse.LinkedChild;
import LBJ2.parse.LinkedVector;


/**
  * This representation of a sentence simply stores the entire text of the
  * sentence in a string.  This may include any newlines present in the input,
  * depending on the parser (e.g., {@link SentenceSplitter} will leave them
  * in).  However, this class also provides methods to convert that string to
  * other representations.
  *
  * @author Nick Rizzolo
 **/
public class Sentence extends LinkedChild
{
  /**
    * URL prefixes; used to build {@link #urlPattern}.  The values in this
    * array need to be sorted by decreasing order of length to make the
    * regular expressions that use them work properly.
   **/
  private static final String[] protocols =
    { "telnet", "https", "file", "http", "nntp", "smtp", };

  /**
    * Domain name suffixes; used to build {@link #urlPattern}.  The values in
    * this array need to be sorted by decreasing order of length to make the
    * regular expressions that use them work properly.
   **/
  private static final String[] topLevelDomains =
    {
      "museum", "travel", "aero", "arpa", "coop", "info", "jobs", "name",
      "biz", "com", "edu", "gov", "int", "mil", "net", "org", "pro", "ac",
      "ad", "ae", "af", "ag", "ai", "al", "am", "an", "ao", "aq", "ar", "as",
      "at", "au", "aw", "az", "ba", "bb", "bd", "be", "bf", "bg", "bh", "bi",
      "bj", "bm", "bn", "bo", "br", "bs", "bt", "bv", "bw", "by", "bz", "ca",
      "cc", "cd", "cf", "cg", "ch", "ci", "ck", "cl", "cm", "cn", "co", "cr",
      "cu", "cv", "cx", "cy", "cz", "de", "dj", "dk", "dm", "do", "dz", "ec",
      "ee", "eg", "er", "es", "et", "eu", "fi", "fj", "fk", "fm", "fo", "fr",
      "ga", "gb", "gd", "ge", "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gp",
      "gq", "gr", "gs", "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr", "ht",
      "hu", "id", "ie", "il", "im", "in", "io", "iq", "ir", "is", "it", "je",
      "jm", "jo", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kr", "kw", "ky",
      "kz", "la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly",
      "ma", "mc", "md", "mg", "mh", "mk", "ml", "mm", "mn", "mo", "mp", "mq",
      "mr", "ms", "mt", "mu", "mv", "mw", "mx", "my", "mz", "na", "nc", "ne",
      "nf", "ng", "ni", "nl", "no", "np", "nr", "nu", "nz", "om", "pa", "pe",
      "pf", "pg", "ph", "pk", "pl", "pm", "pn", "pr", "ps", "pt", "pw", "py",
      "qa", "re", "ro", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg", "sh",
      "si", "sj", "sk", "sl", "sm", "sn", "so", "sr", "st", "su", "sv", "sy",
      "sz", "tc", "td", "tf", "tg", "th", "tj", "tk", "tl", "tm", "tn", "to",
      "tp", "tr", "tt", "tv", "tw", "tz", "ua", "ug", "uk", "um", "us", "uy",
      "uz", "va", "vc", "ve", "vg", "vi", "vn", "vu", "wf", "ws", "ye", "yt",
      "yu", "za", "zm", "zw"
    };

  /**
    * Case-insensitive pattern recognizing URL-like substrings; used by
    * {@link #partOfURL(int)}.  It depends only on {@link #protocols} and
    * {@link #topLevelDomains}, so it is compiled a single time when the
    * class is initialized.  This field must stay declared after those two
    * arrays, since static initializers run in textual order.
   **/
  private static final Pattern urlPattern = buildURLPattern();

  /**
    * Indicates whether the corresponding index in the text has been
    * determined to be part of a URL; used by {@link #partOfURL(int)}.  It is
    * <code>null</code> until {@link #partOfURL(int)} fills it in, and
    * {@link #wordSplit()} clears it again.
   **/
  private boolean[] inURL = null;


  /** The actual text of the sentence. */
  public String text = "";


  /**
    * Constructs a sentence from its text.
    *
    * @param t  The text of the sentence.
   **/
  public Sentence(String t) { text = t; }

  /**
    * Constructor that sets the character offsets of this sentence.
    *
    * @param t  The text of the sentence.
    * @param s  The offset at which this child starts.
    * @param e  The offset at which this child ends.
   **/
  public Sentence(String t, int s, int e) {
    super(s, e);
    text = t;
  }


  /**
    * Assembles the regular expression stored in {@link #urlPattern}.  It
    * has two alternatives: one of the {@link #protocols} followed by
    * <code>://</code> and one or more non-whitespace characters; or a run of
    * letters, digits, and hyphens (starting with a letter or digit) followed
    * by a period, one of the {@link #topLevelDomains}, and optionally a slash
    * and more non-whitespace characters.
    *
    * @return The compiled, case-insensitive URL pattern.
   **/
  private static Pattern buildURLPattern() {
    StringBuffer pattern = new StringBuffer();
    pattern.append("(?i)(");
    pattern.append(protocols[0]);
    for (int i = 1; i < protocols.length; ++i) {
      pattern.append("|");
      pattern.append(protocols[i]);
    }

    pattern.append(")://\\S+|[a-zA-Z0-9][a-zA-Z0-9-]*\\.(");
    pattern.append(topLevelDomains[0]);
    // Start at 1: element 0 has already been appended just above.
    for (int i = 1; i < topLevelDomains.length; ++i) {
      pattern.append("|");
      pattern.append(topLevelDomains[i]);
    }
    pattern.append(")(/\\S+)?");

    return Pattern.compile(pattern.toString());
  }


  /**
    * For debugging purposes, it's useful to insert print statements here.
    *
    * @param l            The list to add to.
    * @param i            The item to add.
    * @param description  A string describing why the addition is happening.
    *                     By the convention used in {@link #wordSplit()}, it
    *                     starts with <code>)</code> when <code>i</code> is
    *                     the last index of a word and with <code>(</code>
    *                     when <code>i</code> is the first index of a word.
   **/
  private void myAdd(LinkedList l, int i, String description) {
    l.add(new Integer(i));
    //System.out.println("Adding boundary at " + i + ": " + description);
  }


  /**
    * Creates and returns a <code>LinkedVector</code> representation of this
    * sentence in which every <code>LinkedChild</code> is a <code>Word</code>.
    * Offset information is respected and propagated.
    *
    * <p> The method collects word boundary indexes into a list: every word
    * contributes the index of its first character and the index of its last
    * character (inclusive).  Once all rules have run, the indexes are sorted
    * and consumed two at a time as (first, last) pairs.  Each rule below
    * therefore always adds boundaries in pairs: the end of the word on the
    * left and the start of the word on the right.
    *
    * @see    Word
    * @return A <code>LinkedVector</code> representation of this sentence.
    * @throws StringIndexOutOfBoundsException If {@link #text} is empty or
    *         contains nothing but whitespace; the only boundaries collected
    *         in that case are -1 and 0, which do not delimit a valid
    *         substring.
   **/
  public LinkedVector wordSplit() {
    // Discard any URL information left behind by a previous invocation that
    // did not run to completion; it may describe a different text.
    inURL = null;

    LinkedList boundaries = new LinkedList();

    // Whitespace always signals a word boundary.
    Matcher m = Pattern.compile("\\s+").matcher(text);
    while (m.find()) {
      myAdd(boundaries, m.start() - 1, ")whitespace");
      myAdd(boundaries, m.end(), "(whitespace");
    }

    // The beginning and end of the text are also word boundaries, unless
    // there's whitespace there.  Trailing whitespace produced a word start
    // beyond the end of the text, and leading whitespace produced a word end
    // at index -1; those two are dropped instead.
    if (boundaries.size() > 0
        && ((Integer) boundaries.getLast()).intValue() >= text.length())
      boundaries.removeLast();
    else myAdd(boundaries, text.length() - 1, ")$");

    if (boundaries.size() > 1
        && ((Integer) boundaries.getFirst()).intValue() == -1)
      boundaries.removeFirst();
    else myAdd(boundaries, 0, "(^");

    Pattern pNoSpaceOrDigit = Pattern.compile("[^\\s\\d]");
    Pattern pDigit = Pattern.compile("\\d");
    Pattern pDigitCommaNoDigit = Pattern.compile("\\d,\\D");
    Pattern pNoDigitCommaDigit = Pattern.compile("\\D,\\d");

    // Commas are separate words unless they're part of a number.
    for (int i = text.indexOf(','); i != -1; i = text.indexOf(',', i + 1)) {
      // Boundary between the comma and the character before it.
      if (i > 0 && text.charAt(i - 1) != ','
          && (pNoSpaceOrDigit.matcher(text.substring(i - 1, i)).find()
              || i + 1 == text.length()
                 && pDigit.matcher(text.substring(i - 1, i)).find()
              || i + 1 < text.length()
                 && pDigitCommaNoDigit
                    .matcher(text.substring(i - 1, i + 2)).find())) {
        myAdd(boundaries, i - 1, ")comma1");
        myAdd(boundaries, i, "(comma1");
      }

      // Boundary between the comma and the character after it.
      if (i + 1 < text.length()
          && (pNoSpaceOrDigit.matcher(text.substring(i + 1, i + 2)).find()
              || i == 0 && pDigit.matcher(text.substring(i + 1, i + 2)).find()
              || i > 0
                 && pNoDigitCommaDigit.matcher(text.substring(i - 1, i + 2))
                                      .find())) {
        myAdd(boundaries, i, ")comma2");
        myAdd(boundaries, i + 1, "(comma2");
      }
    }

    // From here on, each rule's "mask" pattern excludes whitespace and the
    // symbols already treated by the rules above it.
    Pattern pApostropheMask = Pattern.compile("[^\\s,']");
    //Pattern pAbbreviation = Pattern.compile("[A-Za-z]'[A-Za-z]");
    //Pattern pPossessive = Pattern.compile("s[^A-Za-z']");
    //Pattern pShortWill = Pattern.compile("ll[^A-Za-z']");

    // Apostrophes are handled by making consecutive occurrences a single
    // separate word and treating all other occurrences as abbreviations which
    // should not be separated, with the following exceptions which are
    // considered contractions:
    //    '         Plural possessive (must follow the letter 's')
    //    'd        "I'd", "he'd", "they'd"
    //    'll       "I'll", "he'll", "they'll"
    //    'm        "I'm"
    //    're       "they're"
    //    's        Possessive
    //    've       "I've", "they've"
    //    n't       "can't", "won't", "shouldn't", "aren't"
    for (int i = text.indexOf('\''); i != -1; i = text.indexOf('\'', i + 1)) {
      // "n't" is split off before the 'n', not before the apostrophe.
      if (i - 1 > 0 && Character.isLetter(text.charAt(i - 2))
          && text.charAt(i - 1) == 'n' && i + 1 < text.length()
          && text.charAt(i + 1) == 't'
          && (i + 2 == text.length()
              || !Character.isLetter(text.charAt(i + 2))
                 && text.charAt(i + 2) != '\'')) {
        myAdd(boundaries, i - 2, ")n't");
        myAdd(boundaries, i - 1, "(n't");
      }
      else
        // Boundary immediately before the apostrophe: the start of a run of
        // apostrophes, a plural possessive, one of the one or two letter
        // contraction suffixes, or 's following a period.
        if (i > 0
            && (pApostropheMask.matcher(text.substring(i - 1, i)).find()
                   && i + 1 < text.length() && text.charAt(i + 1) == '\''
                || text.charAt(i - 1) == 's'
                   && (i + 1 == text.length()
                       || !Character.isLetter(text.charAt(i + 1))
                          && text.charAt(i + 1) != '\'')
                || Character.isLetter(text.charAt(i - 1))
                   && (i + 1 < text.length()
                          && (i + 2 == text.length()
                              || !Character.isLetter(text.charAt(i + 2))
                                 && text.charAt(i + 2) != '\'')
                          && (text.charAt(i + 1) == 'd'
                              || text.charAt(i + 1) == 'm'
                              || text.charAt(i + 1) == 's')
                       || i + 2 < text.length()
                          && (i + 3 == text.length()
                              || !Character.isLetter(text.charAt(i + 3))
                                 && text.charAt(i + 3) != '\'')
                          && (text.substring(i + 1, i + 3).equals("ll")
                              || text.substring(i + 1, i + 3).equals("re")
                              || text.substring(i + 1, i + 3).equals("ve")))
                || text.charAt(i - 1) == '.' && i - 1 > 0
                   && Character.isLetter(text.charAt(i - 2))
                   && i + 1 < text.length()
                   && (i + 2 == text.length()
                       || !Character.isLetter(text.charAt(i + 2))
                          && text.charAt(i + 2) != '\'')
                   && text.charAt(i + 1) == 's')) {
          myAdd(boundaries, i - 1, ")contraction1");
          myAdd(boundaries, i, "(contraction1");
        }

      // Boundary immediately after the apostrophe: it is followed by a
      // non-letter, or it ends a run of apostrophes.
      if (i + 1 < text.length()
          && pApostropheMask.matcher(text.substring(i + 1, i + 2)).find()
          && (!Character.isLetter(text.charAt(i + 1))
              || i > 0 && text.charAt(i - 1) == '\'')) {
        myAdd(boundaries, i, ")contraction2");
        myAdd(boundaries, i + 1, "(contraction2");
      }
    }

    Pattern pColonMask = Pattern.compile("[^\\s,':]");
    Pattern pColonSeparator = Pattern.compile("\\d:\\d");
    // Colons get separated into their own word unless it looks like they're
    // part of a time (or some other useful structure involving digits) or a
    // URL.
    for (int i = text.indexOf(':'); i != -1; i = text.indexOf(':', i + 1))
      if (!(i >= 2 && i + 2 < text.length()
              && pColonSeparator.matcher(text.substring(i - 2, i + 3)).find()
            || i > 2 && i + 2 < text.length()
               && (text.substring(i - 2, i + 3).equals("tp://")
                   || text.substring(i - 2, i + 3).equals("TP://"))
            || partOfURL(i))) {
        if (i >= 1 && pColonMask.matcher(text.substring(i - 1, i)).find()) {
          myAdd(boundaries, i - 1, ")colon1");
          myAdd(boundaries, i, "(colon1");
        }

        if (i + 1 < text.length()
            && pColonMask.matcher(text.substring(i + 1, i + 2)).find()) {
          myAdd(boundaries, i, ")colon2");
          myAdd(boundaries, i + 1, "(colon2");
        }
      }

    Pattern pSlashMask = Pattern.compile("[^\\s,':/]");
    Pattern pSlashSeparator = Pattern.compile("\\d/\\d");

    // Slashes get separated into their own word unless it looks like they're
    // part of a date (or some other useful structure involving digits) or a
    // URL.  The two "tp://" tests cover the first and the second slash of
    // that prefix respectively.
    for (int i = text.indexOf('/'); i != -1; i = text.indexOf('/', i + 1))
      if (!(i >= 2 && i + 2 < text.length()
              && pSlashSeparator.matcher(text.substring(i - 2, i + 3)).find()
            || i > 3 && i + 1 < text.length()
               && (text.substring(i - 3, i + 2).equals("tp://")
                   || text.substring(i - 3, i + 2).equals("TP://"))
            || i > 4
               && (text.substring(i - 4, i + 1).equals("tp://")
                   || text.substring(i - 4, i + 1).equals("TP://"))
            || partOfURL(i))) {
        if (i >= 1 && pSlashMask.matcher(text.substring(i - 1, i)).find()) {
          myAdd(boundaries, i - 1, ")slash1");
          myAdd(boundaries, i, "(slash1");
        }

        if (i + 1 < text.length()
            && pSlashMask.matcher(text.substring(i + 1, i + 2)).find()) {
          myAdd(boundaries, i, ")slash2");
          myAdd(boundaries, i + 1, "(slash2");
        }
      }

    Pattern pDashMask = Pattern.compile("[^\\s,':/-]");
    Pattern pDashSeparator = Pattern.compile("\\w-\\w");
    Pattern pNegative1 = Pattern.compile("-\\.?\\d");
    Pattern pNegative2 = Pattern.compile("\\s-\\.?\\d");

    // Dashes get separated into their own words unless it looks like they're
    // part of some useful structure like a compound word, a number, or a URL.
    for (int i = text.indexOf('-'); i != -1; i = text.indexOf('-', i + 1))
      if (!(i + 1 < text.length() && i >= 1
              && pDashSeparator.matcher(text.substring(i - 1, i + 2)).find()
            || (i + 2 < text.length()
                && (i == 0
                      && pNegative1.matcher(text.substring(i, i + 3)).find()
                    || i > 0
                       && pNegative2.matcher(text.substring(i - 1, i + 3))
                                    .find()))
            || partOfURL(i))) {
        if (i >= 1 && pDashMask.matcher(text.substring(i - 1, i)).find()) {
          myAdd(boundaries, i - 1, ")dash1");
          myAdd(boundaries, i, "(dash1");
        }

        if (i + 1 < text.length()
            && pDashMask.matcher(text.substring(i + 1, i + 2)).find()) {
          myAdd(boundaries, i, ")dash2");
          myAdd(boundaries, i + 1, "(dash2");
        }
      }

    Pattern pDollarMask = Pattern.compile("[^\\s,':/\\$-]");
    Pattern pMoney1 = Pattern.compile("\\$\\.?\\d");
    Pattern pMoney2 = Pattern.compile("(\\s|-)\\$\\.?\\d");

    // Dollar signs get separated into their own words unless it looks like
    // they're in fact delimiting the start of a dollar amount, or are part of
    // a URL.
    for (int i = text.indexOf('$'); i != -1; i = text.indexOf('$', i + 1))
      if (!(i == 0 && i + 2 < text.length()
              && pMoney1.matcher(text.substring(i, i + 3)).find()
            || i > 0 && i + 2 < text.length()
               && pMoney2.matcher(text.substring(i - 1, i + 3)).find()
            || partOfURL(i))) {
        if (i >= 1 && pDollarMask.matcher(text.substring(i - 1, i)).find()) {
          myAdd(boundaries, i - 1, ")dollar1");
          myAdd(boundaries, i, "(dollar1");
        }

        if (i + 1 < text.length()
            && pDollarMask.matcher(text.substring(i + 1, i + 2)).find()) {
          myAdd(boundaries, i, ")dollar2");
          myAdd(boundaries, i + 1, "(dollar2");
        }
      }

    Pattern pBeforeElipsis = Pattern.compile("[^\\s,':/\\$\\.-]\\.\\.\\.");
    Pattern pAfterElipsis = Pattern.compile("\\.\\.\\.[^\\s,':/\\$\\.-]");

    // Three or more consecutive periods form their own word.
    for (int i = text.indexOf('.'); i != -1; i = text.indexOf('.', i + 1)) {
      // This period is the first of a run of at least three.
      if (i > 0 && i + 2 < text.length()
          && pBeforeElipsis.matcher(text.substring(i - 1, i + 3)).find()) {
        myAdd(boundaries, i - 1, ")ellipsis1");
        myAdd(boundaries, i, "(ellipsis1");
      }

      // This period is the last of a run of at least three.
      if (i >= 2 && i + 1 < text.length()
          && pAfterElipsis.matcher(text.substring(i - 2, i + 2)).find()) {
        myAdd(boundaries, i, ")ellipsis2");
        myAdd(boundaries, i + 1, "(ellipsis2");
      }
    }

    // If the last occurrence of a period in the sentence comes after all
    // occurrences of letters and digits, it is an end of sentence marker
    // which constitutes its own word, unless it appears immediately after two
    // other periods.
    int period = text.lastIndexOf('.');
    if (period != -1) {
      boolean endOfSentence = true;
      for (int i = period + 1; i < text.length() && endOfSentence; ++i)
        endOfSentence = !Character.isLetterOrDigit(text.charAt(i));

      if (endOfSentence) {
        if (period >= 1
            && (text.charAt(period - 1) != '.' || period == 1
                || text.charAt(period - 2) != '.')
            && pDollarMask.matcher(text.substring(period - 1, period)).find())
        {
          myAdd(boundaries, period - 1, ")period1");
          myAdd(boundaries, period, "(period1");
        }

        if (period + 1 < text.length()
            && (period == 0 || text.charAt(period - 1) != '.' || period == 1
                || text.charAt(period - 2) != '.')
            && pDollarMask.matcher(text.substring(period + 1, period + 2))
                          .find()) {
          myAdd(boundaries, period, ")period2");
          myAdd(boundaries, period + 1, "(period2");
        }
      }
      // Not an end of sentence marker; make sure the test against "period"
      // in the punctuation loop below can never succeed.
      else period = -1;
    }

    // All other punctuation marks constitute their own words, unless they
    // appear immediately after themselves (consecutive identical punctuation
    // marks form a single word) or are part of a URL.
    Pattern pPunctuation = Pattern.compile("[^\\s\\w,'\\.:/\\$-]");
    m = pPunctuation.matcher(text);

    // Boundary between two adjacent, different punctuation marks.
    while (m.find())
      if (!partOfURL(m.start())) {
        if (m.start() + 1 < text.length()
            && text.charAt(m.start()) != text.charAt(m.start() + 1)
            && m.start() + 1 != period
            && pPunctuation
               .matcher(text.substring(m.start() + 1, m.start() + 2)).find())
        {
          myAdd(boundaries, m.start(), ")punctuation1");
          myAdd(boundaries, m.start() + 1, "(punctuation1");
        }
      }

    // Boundary between a punctuation mark and the word character after it.
    m = Pattern.compile("[^\\s\\w,'\\.:/\\$-]\\w").matcher(text);
    while (m.find())
      if (!partOfURL(m.start())) {
        myAdd(boundaries, m.start(), ")punctuation2");
        myAdd(boundaries, m.start() + 1, "(punctuation2");
      }

    // Boundary between a word character and the punctuation mark after it.
    m = Pattern.compile("\\w[^\\s\\w,'\\.:/\\$-]").matcher(text);
    while (m.find())
      if (!partOfURL(m.start())) {
        myAdd(boundaries, m.start(), ")punctuation3");
        myAdd(boundaries, m.start() + 1, "(punctuation3");
      }

    // Now we just have to create the LinkedVector.  After sorting, the
    // indexes at even positions are word starts and the indexes at odd
    // positions are the corresponding (inclusive) word ends.
    Integer[] temp =
      (Integer[]) boundaries.toArray(new Integer[boundaries.size()]);
    int[] I = new int[temp.length];
    for (int i = 0; i < I.length; ++i) I[i] = temp[i].intValue();
    Arrays.sort(I);

    Word w = new Word(text.substring(I[0], I[1] + 1), I[0] + start,
                      I[1] + start);
    for (int i = 2; i < I.length; i += 2) {
      w.next = new Word(text.substring(I[i], I[i + 1] + 1),
                        w,
                        I[i] + start,
                        I[i + 1] + start);
      w = (Word) w.next;
    }

    // The URL information is only valid for the text that was just split.
    inURL = null;
    // Note that w refers to the last word created at this point.
    return new LinkedVector(w);
  }


  /**
    * Does a simple check to determine if the symbol at the specified index in
    * the text is likely to be part of a URL.  The first call after the cache
    * has been cleared scans the entire text with {@link #urlPattern} and
    * records every index covered by a match in {@link #inURL}; subsequent
    * calls merely look the answer up.  A substring is considered a URL if it
    * starts with one of the {@link #protocols} followed by <code>://</code>
    * and at least one non-whitespace character, or if it consists of a run
    * of letters, digits, and hyphens followed by a period and one of the
    * {@link #topLevelDomains}, optionally followed by a slash and further
    * non-whitespace characters.  Matching is case insensitive.
    *
    * @param index  The index of the symbol in question; must be a valid index
    *               into {@link #text}.
    * @return <code>true</code> if and only if the specified symbol appears to
    *         be part of a URL.
   **/
  private boolean partOfURL(int index) {
    if (inURL == null) {
      inURL = new boolean[text.length()];

      Matcher m = urlPattern.matcher(text);
      while (m.find())
        for (int i = m.start(); i < m.end(); ++i) inURL[i] = true;
    }

    return inURL[index];
  }


  /**
    * The string representation of a <code>Sentence</code> is just its text.
    *
    * @return The text of this sentence.
   **/
  public String toString() { return text; }
}