ChineseUnknownWordModel.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.parser.lexparser; 
import edu.stanford.nlp.util.logging.Redwood;

import java.util.Map;
import java.util.Set;

import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.WordTag;
import edu.stanford.nlp.ling.Tag;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Index;

/**
 * Stores, trains, and scores with an unknown word model.  A couple
 * of filters deterministically force rewrites for certain proper
 * nouns, dates, and cardinal and ordinal numbers; when none of these
 * filters are met, either the distribution of terminals with the same
 * first character is used, or Good-Turing smoothing is used. Although
 * this is developed for Chinese, the training and storage methods
 * could be used cross-linguistically.
 *
 * @author Roger Levy
 */
public class ChineseUnknownWordModel extends BaseUnknownWordModel  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ChineseUnknownWordModel.class);

  private static final String encoding = "GB18030"; // used only for debugging

  private final boolean useUnicodeType;


  /* These strings are stored in ascii-type Unicode encoding.  To
   * edit them, either use the Unicode codes or use native2ascii or a
   * similar program to convert the file into a Chinese encoding, then
   * convert back. */
  private static final String numberMatch = ".*[0-9\uff10-\uff19\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e\u5343\u4e07\u4ebf\u96F6\u3007\u25cb\u25ef].*";
  private static final String dateMatch = numberMatch + "[\u5e74\u6708\u65e5\u53f7]";
  private static final String ordinalMatch = "\u7b2c.*";
  // uses midDot characters as one clue of being proper name
  private static final String properNameMatch = ".*[\u00b7\u0387\u2022\u2024\u2027\u2219\u22C5\u30FB].*";

  private final Set<String> seenFirst;


  public ChineseUnknownWordModel(Options op, Lexicon lex,
                                 Index<String> wordIndex,
                                 Index<String> tagIndex,
                                 ClassicCounter<IntTaggedWord> unSeenCounter,
                                 Map<Label,ClassicCounter<String>> tagHash,
                                 Map<String,Float> unknownGT,
                                 boolean useGT,
                                 Set<String> seenFirst) {
    super(op, lex, wordIndex, tagIndex,
          unSeenCounter, tagHash, unknownGT, null);
    this.useFirst = !useGT;
    this.useGT = useGT;
    this.useUnicodeType = op.lexOptions.useUnicodeType;
    this.seenFirst = seenFirst;
  }

  /**
   * This constructor creates an UWM with empty data structures.  Only
   * use if loading in the data separately, such as by reading in text
   * lines containing the data.
   * TODO: would need to set useGT correctly if you saved a model with
   * useGT and then wanted to recover it from text.
   */
  public ChineseUnknownWordModel(Options op, Lexicon lex,
                                 Index<String> wordIndex,
                                 Index<String> tagIndex) {
    this(op, lex, wordIndex, tagIndex,
            new ClassicCounter<>(),
         Generics.<Label,ClassicCounter<String>>newHashMap(),
         Generics.<String,Float>newHashMap(),
         false, Generics.<String>newHashSet());
  }

  @Override
  public float score(IntTaggedWord itw, String word) {
    // Label tagL = itw.tagLabel();
    // String tag = tagL.value();
    String tag = itw.tagString(tagIndex);
    Label tagL = new Tag(tag);

    float logProb;

    if (VERBOSE) EncodingPrintWriter.out.println("Scoring unknown word |" + word + "| with tag " + tag, encoding);

    if (word.matches(dateMatch)) {
      //EncodingPrintWriter.out.println("Date match for " + word,encoding);
      if (tag.equals("NT")) {
        logProb = 0.0f;
      } else {
        logProb = Float.NEGATIVE_INFINITY;
      }
    } else if (word.matches(numberMatch)) {
      //EncodingPrintWriter.out.println("Number match for " + word,encoding);
      if (tag.equals("CD") && (!word.matches(ordinalMatch))) {
        logProb = 0.0f;
      } else if (tag.equals("OD") && word.matches(ordinalMatch)) {
        logProb = 0.0f;
      } else {
        logProb = Float.NEGATIVE_INFINITY;
      }
    } else if (word.matches(properNameMatch)) {
      //EncodingPrintWriter.out.println("Proper name match for " + word,encoding);
      if (tag.equals("NR")) {
        logProb = 0.0f;
      } else {
        logProb = Float.NEGATIVE_INFINITY;
      }
    /* -------------
      // this didn't seem to work -- too categorical
      int type = Character.getType(word.charAt(0));
      // the below may not normalize probs over options, but is probably okay
      if (type == Character.START_PUNCTUATION) {
        if (tag.equals("PU-LPAREN") || tag.equals("PU-PAREN") ||
            tag.equals("PU-LQUOTE") || tag.equals("PU-QUOTE") ||
            tag.equals("PU")) {
          // if (VERBOSE) log.info("ChineseUWM: unknown L Punc");
          logProb = 0.0f;
        } else {
          logProb = Float.NEGATIVE_INFINITY;
        }
      } else if (type == Character.END_PUNCTUATION) {
        if (tag.equals("PU-RPAREN") || tag.equals("PU-PAREN") ||
            tag.equals("PU-RQUOTE") || tag.equals("PU-QUOTE") ||
            tag.equals("PU")) {
          // if (VERBOSE) log.info("ChineseUWM: unknown R Punc");
          logProb = 0.0f;
        } else {
          logProb = Float.NEGATIVE_INFINITY;
        }
      } else {
        if (tag.equals("PU-OTHER") || tag.equals("PU-ENDSENT") ||
            tag.equals("PU")) {
          // if (VERBOSE) log.info("ChineseUWM: unknown O Punc");
          logProb = 0.0f;
        } else {
          logProb = Float.NEGATIVE_INFINITY;
        }
      }
    ------------- */
    } else {
      first:
        if (useFirst) {
          String first = word.substring(0, 1);
          if (useUnicodeType) {
            char ch = word.charAt(0);
            int type = Character.getType(ch);
            if (type != Character.OTHER_LETTER) {
              // standard Chinese characters are of type "OTHER_LETTER"!!
              first = Integer.toString(type);
            }
          }
          if ( ! seenFirst.contains(first)) {
            if (useGT) {
              logProb = scoreGT(tag);
              break first;
            } else {
              first = unknown;
            }
          }

          /* get the Counter of terminal rewrites for the relevant tag */
          ClassicCounter<String> wordProbs = tagHash.get(tagL);

          /* if the proposed tag has never been seen before, issue a
             warning and return probability 0. */
          if (wordProbs == null) {
            if (VERBOSE) log.info("Warning: proposed tag is unseen in training data!");
            logProb = Float.NEGATIVE_INFINITY;
          } else if (wordProbs.containsKey(first)) {
            logProb = (float) wordProbs.getCount(first);
          } else {
            logProb = (float) wordProbs.getCount(unknown);
          }
        } else if (useGT) {
          logProb = scoreGT(tag);
        } else {
          if (VERBOSE) log.info("Warning: no unknown word model in place!\nGiving the combination " + word + " " + tag + " zero probability.");
          logProb = Float.NEGATIVE_INFINITY; // should never get this!
        }
    }

    if (VERBOSE) EncodingPrintWriter.out.println("Unknown word estimate for " + word + " as " + tag + ": " + logProb, encoding);
    return logProb;
  }



  public static void main(String[] args) {
    System.out.println("Testing unknown matching");
    String s = "\u5218\u00b7\u9769\u547d";
    if (s.matches(properNameMatch)) {
      System.out.println("hooray names!");
    } else {
      System.out.println("Uh-oh names!");
    }
    String s1 = "\uff13\uff10\uff10\uff10";
    if (s1.matches(numberMatch)) {
      System.out.println("hooray numbers!");
    } else {
      System.out.println("Uh-oh numbers!");
    }
    String s11 = "\u767e\u5206\u4e4b\u56db\u5341\u4e09\u70b9\u4e8c";
    if (s11.matches(numberMatch)) {
      System.out.println("hooray numbers!");
    } else {
      System.out.println("Uh-oh numbers!");
    }
    String s12 = "\u767e\u5206\u4e4b\u4e09\u5341\u516b\u70b9\u516d";
    if (s12.matches(numberMatch)) {
      System.out.println("hooray numbers!");
    } else {
      System.out.println("Uh-oh numbers!");
    }
    String s2 = "\u4e09\u6708";
    if (s2.matches(dateMatch)) {
      System.out.println("hooray dates!");
    } else {
      System.out.println("Uh-oh dates!");
    }

    System.out.println("Testing tagged word");
    ClassicCounter<TaggedWord> c = new ClassicCounter<>();
    TaggedWord tw1 = new TaggedWord("w", "t");
    c.incrementCount(tw1);
    TaggedWord tw2 = new TaggedWord("w", "t2");
    System.out.println(c.containsKey(tw2));
    System.out.println(tw1.equals(tw2));

    WordTag wt1 = toWordTag(tw1);
    WordTag wt2 = toWordTag(tw2);
    WordTag wt3 = new WordTag("w", "t2");
    System.out.println(wt1.equals(wt2));
    System.out.println(wt2.equals(wt3));
  }

  private static WordTag toWordTag(TaggedWord tw) {
    return new WordTag(tw.word(), tw.tag());
  }

  private static final long serialVersionUID = 221L;


  @Override
  public String getSignature(String word, int loc) {
    throw new UnsupportedOperationException();
  }

}