EnglishUnknownWordModel.java example

Explorer
CoreNLP-master
// Stanford Parser -- a probabilistic lexicalized NL CFG parser
// Copyright (c) 2002, 2003, 2004, 2005, 2008 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//    parser-support@lists.stanford.edu
//    http://nlp.stanford.edu/software/lex-parser.shtml

package edu.stanford.nlp.parser.lexparser; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.process.DistSimClassifier;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.util.Index;


/**
 * This is a basic unknown word model for English.  It supports 5 different
 * types of feature modeling; see {@link #getSignature(String, int)}.
 *
 * <i>Implementation note: the contents of this class tend to overlap somewhat
 * with {@link ArabicUnknownWordModel} and were originally included in {@link BaseLexicon}.
 *
 * @author Dan Klein
 * @author Galen Andrew
 * @author Christopher Manning
 * @author Anna Rafferty
 */
public class EnglishUnknownWordModel extends BaseUnknownWordModel  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(EnglishUnknownWordModel.class);

  private static final long serialVersionUID = 4825624957364628770L;

  private static final boolean DEBUG_UWM = false;

  protected final boolean smartMutation;

  protected final int unknownSuffixSize;
  protected final int unknownPrefixSize;

  protected final String wordClassesFile;

  private static final int MIN_UNKNOWN = 0;
  private static final int MAX_UNKNOWN = 8;

  public EnglishUnknownWordModel(Options op, Lexicon lex,
                                 Index<String> wordIndex,
                                 Index<String> tagIndex,
                                 ClassicCounter<IntTaggedWord> unSeenCounter) {
    super(op, lex, wordIndex, tagIndex, unSeenCounter, null, null, null);
    if (unknownLevel < MIN_UNKNOWN || unknownLevel > MAX_UNKNOWN) {
      throw new IllegalArgumentException("Invalid value for useUnknownWordSignatures: " + unknownLevel);
    }
    this.smartMutation = op.lexOptions.smartMutation;
    this.unknownSuffixSize = op.lexOptions.unknownSuffixSize;
    this.unknownPrefixSize = op.lexOptions.unknownPrefixSize;
    wordClassesFile = op.lexOptions.wordClassesFile;
  }

  /**
   * This constructor creates an UWM with empty data structures.  Only
   * use if loading in the data separately, such as by reading in text
   * lines containing the data.
   */
  public EnglishUnknownWordModel(Options op, Lexicon lex,
                                 Index<String> wordIndex,
                                 Index<String> tagIndex) {
    this(op, lex, wordIndex, tagIndex, new ClassicCounter<>());
  }

  @Override
  public float score(IntTaggedWord iTW, int loc, double c_Tseen, double total, double smooth, String word) {
    double pb_T_S = scoreProbTagGivenWordSignature(iTW, loc, smooth, word);
    double p_T = (c_Tseen / total);
    double p_W = 1.0 / total;
    double pb_W_T = Math.log(pb_T_S * p_W / p_T);

    if (pb_W_T > -100.0) {
      if (DEBUG_UWM) {
        log.info(iTW + " tagging has probability " + pb_W_T);
      }
      return (float) pb_W_T;
    }
    if (DEBUG_UWM) {
      log.info(iTW + " tagging is impossible.");
    }
    return Float.NEGATIVE_INFINITY;
  } // end score()


  /** Calculate P(Tag|Signature) with Bayesian smoothing via just P(Tag|Unknown) */
  @Override
  public double scoreProbTagGivenWordSignature(IntTaggedWord iTW, int loc, double smooth, String word) {
    // iTW.tag = nullTag;
    // double c_W = ((BaseLexicon) l).getCount(iTW);
    // iTW.tag = tag;

    // unknown word model for P(T|S)

    int wordSig = getSignatureIndex(iTW.word, loc, word);
    IntTaggedWord temp = new IntTaggedWord(wordSig, iTW.tag);
    double c_TS = unSeenCounter.getCount(temp);
    temp = new IntTaggedWord(wordSig, nullTag);
    double c_S = unSeenCounter.getCount(temp);
    double c_U = unSeenCounter.getCount(NULL_ITW);
    temp = new IntTaggedWord(nullWord, iTW.tag);
    double c_T = unSeenCounter.getCount(temp);

    double p_T_U = c_T / c_U;
    if (unknownLevel == 0) {
      c_TS = 0;
      c_S = 0;
    }
    return (c_TS + smooth * p_T_U) / (c_S + smooth);
  }


  /**
   * Returns the index of the signature of the word numbered wordIndex, where
   * the signature is the String representation of unknown word features.
   */
  @Override
  public int getSignatureIndex(int index, int sentencePosition, String word) {
    String uwSig = getSignature(word, sentencePosition);
    int sig = wordIndex.addToIndex(uwSig);
    if (DEBUG_UWM) {
      log.info("Signature (" + unknownLevel + "): mapped " + word +
                         " (" + index + ") to " + uwSig + " (" + sig + ")");
    }
    return sig;
  }

  /**
   * This routine returns a String that is the "signature" of the class of a
   * word. For, example, it might represent whether it is a number of ends in
   * -s. The strings returned by convention matches the pattern UNK(-.+)? ,
   * which is just assumed to not match any real word. Behavior depends on the
   * unknownLevel (-uwm flag) passed in to the class. The recognized numbers are
   * 1-5: 5 is fairly English-specific; 4, 3, and 2 look for various word
   * features (digits, dashes, etc.) which are only vaguely English-specific; 1
   * uses the last two characters combined with a simple classification by
   * capitalization.
   *
   * @param word The word to make a signature for
   * @param loc Its position in the sentence (mainly so sentence-initial
   *          capitalized words can be treated differently)
   * @return A String that is its signature (equivalence class)
   */
  @Override
  public String getSignature(String word, int loc) {
    StringBuilder sb = new StringBuilder("UNK");
    switch (unknownLevel) {
      case 8:
        getSignature8(word, sb);
        break;
      case 7:
        getSignature7(word, loc, sb);
        break;
      case 6:
        getSignature6(word, loc, sb);
        break;
      case 5:
        getSignature5(word, loc, sb);
        break;
      case 4:
        getSignature4(word, loc, sb);
        break;
      case 3:
        getSignature3(word, loc, sb);
        break;
      case 2:
        getSignature2(word, loc, sb);
        break;
      case 1:
        getSignature1(word, loc, sb);
        break;
      default:
        // 0 = do nothing so it just stays as "UNK"
    } // end switch (unknownLevel)
    // log.info("Summarized " + word + " to " + sb.toString());
    return sb.toString();
  } // end getSignature()


  private static void getSignature7(String word, int loc, StringBuilder sb) {
    // New Sep 2008. Like 2 but rely more on Caps somewhere than initial Caps
    // {-ALLC, -INIT, -UC somewhere, -LC, zero} +
    // {-DASH, zero} +
    // {-NUM, -DIG, zero} +
    // {lowerLastChar, zeroIfShort}
    boolean hasDigit = false;
    boolean hasNonDigit = false;
    boolean hasLower = false;
    boolean hasUpper = false;
    boolean hasDash = false;
    int wlen = word.length();
    for (int i = 0; i < wlen; i++) {
      char ch = word.charAt(i);
      if (Character.isDigit(ch)) {
        hasDigit = true;
      } else {
        hasNonDigit = true;
        if (Character.isLetter(ch)) {
          if (Character.isLowerCase(ch) || Character.isTitleCase(ch)) {
            hasLower = true;
          } else {
            hasUpper = true;
          }
        } else if (ch == '-') {
          hasDash = true;
        }
      }
    }
    if (wlen > 0 && hasUpper) {
      if ( ! hasLower) {
        sb.append("-ALLC");
      } else if (loc == 0) {
        sb.append("-INIT");
      } else {
        sb.append("-UC");
      }
    } else if (hasLower) { // if (Character.isLowerCase(word.charAt(0))) {
      sb.append("-LC");
    }
    // no suffix = no (lowercase) letters
    if (hasDash) {
      sb.append("-DASH");
    }
    if (hasDigit) {
      if (!hasNonDigit) {
        sb.append("-NUM");
      } else {
        sb.append("-DIG");
      }
    } else if (wlen > 3) {
      // don't do for very short words: "yes" isn't an "-es" word
      // try doing to lower for further densening and skipping digits
      char ch = word.charAt(word.length() - 1);
      sb.append(Character.toLowerCase(ch));
    }
    // no suffix = short non-number, non-alphabetic
  }


  private void getSignature6(String word, int loc, StringBuilder sb) {
    // New Sep 2008. Like 5 but rely more on Caps somewhere than initial Caps
    // { -INITC, -CAPS, (has) -CAP, -LC lowercase, 0 } +
    // { -KNOWNLC, 0 } + [only for INITC]
    // { -NUM, 0 } +
    // { -DASH, 0 } +
    // { -last lowered char(s) if known discriminating suffix, 0}
    int wlen = word.length();
    int numCaps = 0;
    boolean hasDigit = false;
    boolean hasDash = false;
    boolean hasLower = false;
    for (int i = 0; i < wlen; i++) {
      char ch = word.charAt(i);
      if (Character.isDigit(ch)) {
        hasDigit = true;
      } else if (ch == '-') {
        hasDash = true;
      } else if (Character.isLetter(ch)) {
        if (Character.isLowerCase(ch)) {
          hasLower = true;
        } else if (Character.isTitleCase(ch)) {
          hasLower = true;
          numCaps++;
        } else {
          numCaps++;
        }
      }
    }
    String lowered = word.toLowerCase();
    if (numCaps > 1) {
      sb.append("-CAPS");
    } else if (numCaps > 0) {
      if (loc == 0) {
        sb.append("-INITC");
        if (getLexicon().isKnown(lowered)) {
          sb.append("-KNOWNLC");
        }
      } else {
        sb.append("-CAP");
      }
    } else if (hasLower) { // (Character.isLowerCase(ch0)) {
      sb.append("-LC");
    }
    if (hasDigit) {
      sb.append("-NUM");
    }
    if (hasDash) {
      sb.append("-DASH");
    }
    if (lowered.endsWith("s") && wlen >= 3) {
      // here length 3, so you don't miss out on ones like 80s
      char ch2 = lowered.charAt(wlen - 2);
      // not -ess suffixes or greek/latin -us, -is
      if (ch2 != 's' && ch2 != 'i' && ch2 != 'u') {
        sb.append("-s");
      }
    } else if (word.length() >= 5 && !hasDash && !(hasDigit && numCaps > 0)) {
      // don't do for very short words;
      // Implement common discriminating suffixes
      if (lowered.endsWith("ed")) {
        sb.append("-ed");
      } else if (lowered.endsWith("ing")) {
        sb.append("-ing");
      } else if (lowered.endsWith("ion")) {
        sb.append("-ion");
      } else if (lowered.endsWith("er")) {
        sb.append("-er");
      } else if (lowered.endsWith("est")) {
        sb.append("-est");
      } else if (lowered.endsWith("ly")) {
        sb.append("-ly");
      } else if (lowered.endsWith("ity")) {
        sb.append("-ity");
      } else if (lowered.endsWith("y")) {
        sb.append("-y");
      } else if (lowered.endsWith("al")) {
        sb.append("-al");
        // } else if (lowered.endsWith("ble")) {
        // sb.append("-ble");
        // } else if (lowered.endsWith("e")) {
        // sb.append("-e");
      }
    }
  }


  private void getSignature5(String word, int loc, StringBuilder sb) {
    // Reformed Mar 2004 (cdm); hopefully better now.
    // { -CAPS, -INITC ap, -LC lowercase, 0 } +
    // { -KNOWNLC, 0 } + [only for INITC]
    // { -NUM, 0 } +
    // { -DASH, 0 } +
    // { -last lowered char(s) if known discriminating suffix, 0}
    int wlen = word.length();
    int numCaps = 0;
    boolean hasDigit = false;
    boolean hasDash = false;
    boolean hasLower = false;
    for (int i = 0; i < wlen; i++) {
      char ch = word.charAt(i);
      if (Character.isDigit(ch)) {
        hasDigit = true;
      } else if (ch == '-') {
        hasDash = true;
      } else if (Character.isLetter(ch)) {
        if (Character.isLowerCase(ch)) {
          hasLower = true;
        } else if (Character.isTitleCase(ch)) {
          hasLower = true;
          numCaps++;
        } else {
          numCaps++;
        }
      }
    }
    char ch0 = word.charAt(0);
    String lowered = word.toLowerCase();
    if (Character.isUpperCase(ch0) || Character.isTitleCase(ch0)) {
      if (loc == 0 && numCaps == 1) {
        sb.append("-INITC");
        if (getLexicon().isKnown(lowered)) {
          sb.append("-KNOWNLC");
        }
      } else {
        sb.append("-CAPS");
      }
    } else if (!Character.isLetter(ch0) && numCaps > 0) {
      sb.append("-CAPS");
    } else if (hasLower) { // (Character.isLowerCase(ch0)) {
      sb.append("-LC");
    }
    if (hasDigit) {
      sb.append("-NUM");
    }
    if (hasDash) {
      sb.append("-DASH");
    }
    if (lowered.endsWith("s") && wlen >= 3) {
      // here length 3, so you don't miss out on ones like 80s
      char ch2 = lowered.charAt(wlen - 2);
      // not -ess suffixes or greek/latin -us, -is
      if (ch2 != 's' && ch2 != 'i' && ch2 != 'u') {
        sb.append("-s");
      }
    } else if (word.length() >= 5 && !hasDash && !(hasDigit && numCaps > 0)) {
      // don't do for very short words;
      // Implement common discriminating suffixes
      if (lowered.endsWith("ed")) {
        sb.append("-ed");
      } else if (lowered.endsWith("ing")) {
        sb.append("-ing");
      } else if (lowered.endsWith("ion")) {
        sb.append("-ion");
      } else if (lowered.endsWith("er")) {
        sb.append("-er");
      } else if (lowered.endsWith("est")) {
        sb.append("-est");
      } else if (lowered.endsWith("ly")) {
        sb.append("-ly");
      } else if (lowered.endsWith("ity")) {
        sb.append("-ity");
      } else if (lowered.endsWith("y")) {
        sb.append("-y");
      } else if (lowered.endsWith("al")) {
        sb.append("-al");
        // } else if (lowered.endsWith("ble")) {
        // sb.append("-ble");
        // } else if (lowered.endsWith("e")) {
        // sb.append("-e");
      }
    }
  }


  private static void getSignature4(String word, int loc, StringBuilder sb) {
    boolean hasDigit = false;
    boolean hasNonDigit = false;
    boolean hasLetter = false;
    boolean hasLower = false;
    boolean hasDash = false;
    boolean hasPeriod = false;
    boolean hasComma = false;
    for (int i = 0; i < word.length(); i++) {
      char ch = word.charAt(i);
      if (Character.isDigit(ch)) {
        hasDigit = true;
      } else {
        hasNonDigit = true;
        if (Character.isLetter(ch)) {
          hasLetter = true;
          if (Character.isLowerCase(ch) || Character.isTitleCase(ch)) {
            hasLower = true;
          }
        } else {
          if (ch == '-') {
            hasDash = true;
          } else if (ch == '.') {
            hasPeriod = true;
          } else if (ch == ',') {
            hasComma = true;
          }
        }
      }
    }
    // 6 way on letters
    if (Character.isUpperCase(word.charAt(0)) || Character.isTitleCase(word.charAt(0))) {
      if (!hasLower) {
        sb.append("-AC");
      } else if (loc == 0) {
        sb.append("-SC");
      } else {
        sb.append("-C");
      }
    } else if (hasLower) {
      sb.append("-L");
    } else if (hasLetter) {
      sb.append("-U");
    } else {
      // no letter
      sb.append("-S");
    }
    // 3 way on number
    if (hasDigit && !hasNonDigit) {
      sb.append("-N");
    } else if (hasDigit) {
      sb.append("-n");
    }
    // binary on period, dash, comma
    if (hasDash) {
      sb.append("-H");
    }
    if (hasPeriod) {
      sb.append("-P");
    }
    if (hasComma) {
      sb.append("-C");
    }
    if (word.length() > 3) {
      // don't do for very short words: "yes" isn't an "-es" word
      // try doing to lower for further densening and skipping digits
      char ch = word.charAt(word.length() - 1);
      if (Character.isLetter(ch)) {
        sb.append('-');
        sb.append(Character.toLowerCase(ch));
      }
    }
  }


  private static void getSignature3(String word, int loc, StringBuilder sb) {
    // This basically works right, except note that 'S' is applied to all
    // capitalized letters in first word of sentence, not just first....
    sb.append('-');
    char lastClass = '-'; // i.e., nothing
    int num = 0;
    for (int i = 0; i < word.length(); i++) {
      char ch = word.charAt(i);
      char newClass;
      if (Character.isUpperCase(ch) || Character.isTitleCase(ch)) {
        if (loc == 0) {
          newClass = 'S';
        } else {
          newClass = 'L';
        }
      } else if (Character.isLetter(ch)) {
        newClass = 'l';
      } else if (Character.isDigit(ch)) {
        newClass = 'd';
      } else if (ch == '-') {
        newClass = 'h';
      } else if (ch == '.') {
        newClass = 'p';
      } else {
        newClass = 's';
      }
      if (newClass != lastClass) {
        lastClass = newClass;
        sb.append(lastClass);
        num = 1;
      } else {
        if (num < 2) {
          sb.append('+');
        }
        num++;
      }
    }
    if (word.length() > 3) {
      // don't do for very short words: "yes" isn't an "-es" word
      // try doing to lower for further densening and skipping digits
      char ch = Character.toLowerCase(word.charAt(word.length() - 1));
      sb.append('-');
      sb.append(ch);
    }
  }


  private static void getSignature2(String word, int loc, StringBuilder sb) {
    // {-ALLC, -INIT, -UC, -LC, zero} +
    // {-DASH, zero} +
    // {-NUM, -DIG, zero} +
    // {lowerLastChar, zeroIfShort}
    boolean hasDigit = false;
    boolean hasNonDigit = false;
    boolean hasLower = false;
    int wlen = word.length();
    for (int i = 0; i < wlen; i++) {
      char ch = word.charAt(i);
      if (Character.isDigit(ch)) {
        hasDigit = true;
      } else {
        hasNonDigit = true;
        if (Character.isLetter(ch)) {
          if (Character.isLowerCase(ch) || Character.isTitleCase(ch)) {
            hasLower = true;
          }
        }
      }
    }
    if (wlen > 0
            && (Character.isUpperCase(word.charAt(0)) || Character.isTitleCase(word.charAt(0)))) {
      if (!hasLower) {
        sb.append("-ALLC");
      } else if (loc == 0) {
        sb.append("-INIT");
      } else {
        sb.append("-UC");
      }
    } else if (hasLower) { // if (Character.isLowerCase(word.charAt(0))) {
      sb.append("-LC");
    }
    // no suffix = no (lowercase) letters
    if (word.indexOf('-') >= 0) {
      sb.append("-DASH");
    }
    if (hasDigit) {
      if (!hasNonDigit) {
        sb.append("-NUM");
      } else {
        sb.append("-DIG");
      }
    } else if (wlen > 3) {
      // don't do for very short words: "yes" isn't an "-es" word
      // try doing toLower for further densening and skipping digits
      char ch = word.charAt(word.length() - 1);
      sb.append(Character.toLowerCase(ch));
    }
    // no suffix = short non-number, non-alphabetic
  }


  private static void getSignature1(String word, int loc, StringBuilder sb) {
    sb.append('-');
    sb.append(word.substring(Math.max(word.length() - 2, 0), word.length()));
    sb.append('-');
    if (Character.isLowerCase(word.charAt(0))) {
      sb.append("LOWER");
    } else {
      if (Character.isUpperCase(word.charAt(0))) {
        if (loc == 0) {
          sb.append("INIT");
        } else {
          sb.append("UPPER");
        }
      } else {
        sb.append("OTHER");
      }
    }
  }


  private void getSignature8(String word, StringBuilder sb) {
    sb.append('-');
    boolean digit = true;
    for (int i = 0; i < word.length(); i++) {
      char c = word.charAt(i);
      if ( ! (Character.isDigit(c) || c == '.' || c == ',' || (i == 0 && (c == '-' || c == '+')))) {
        digit = false;
      }
    }
    // digit = false;  // todo: Just turned off while we test it.
    if (digit) {
      sb.append("NUMBER");
    } else {
      if (distSim == null) {
        distSim = new DistSimClassifier(wordClassesFile, false, true);
        // todo XXXX booleans depend on distsim file; need more options
      }

      String cluster = distSim.distSimClass(word);
      if (cluster == null) {
        cluster = "NULL";
      }
      sb.append(cluster);
    }
  }

  private transient DistSimClassifier distSim;

} // end class