// UnknownWordModel.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.parser.lexparser;

import java.io.Serializable;

import edu.stanford.nlp.stats.Counter;


/** This interface defines the runtime contract for unknown word models
 *  in lexparser. See UnknownWordModelTrainer for how unknown
 *  word models are built from training data.
 *  <p>
 *  The interface extends {@link Serializable}, so implementations are
 *  expected to be serializable.
 *
 *  @author Anna Rafferty
 *  @author Christopher Manning
 */
public interface UnknownWordModel extends Serializable {

  /**
   * Get the level of equivalence classing for the model.
   * One unknown word model may allow different options to be set; for example,
   * several models of unknown words for a given language could be included in
   * one class. The unknown level can be queried with this method.
   *
   * @return The current level of unknown word equivalence classing
   */
  int getUnknownLevel();


  /**
   * Returns the lexicon used by this unknown word model. The
   * lexicon is used to check information about words being seen/unseen.
   *
   * @return The lexicon used by this unknown word model
   */
  Lexicon getLexicon();


  /**
   * Get the score of this word with this tag (as an IntTaggedWord) at this
   * location loc in a sentence.
   * (Presumably an estimate of P(word | tag), usually calculated as
   * P(signature | tag).)
   * Assumes the word is unknown.
   *
   * @param iTW An IntTaggedWord pairing a word and POS tag
   * @param loc The position in the sentence. In the default implementation
   *               this is used only for unknown words to change their
   *               probability distribution when sentence initial.
   *               NOTE(review): the original comment also began a remark
   *               about negative values but left it unfinished; the meaning
   *               of a negative position is not specified here - confirm
   *               against the implementations.
   * @param c_Tseen Total count of this tag (on seen words) in training
   * @param total Total count of word tokens in training
   * @param smooth Weighting on prior P(T|U) in estimate
   * @param word The word itself; useful so we don't look it up in the index
   * @return A float valued score, usually - log P(word|tag)
   */
  float score(IntTaggedWord iTW, int loc, double c_Tseen, double total, double smooth, String word);


  /**
   * Calculate P(Tag|Signature) with Bayesian smoothing via just P(Tag|Unknown).
   *
   * @param iTW An IntTaggedWord pairing a word and POS tag
   * @param loc The position in the sentence (presumably used as in
   *          {@code score} and {@code getSignature}; confirm against
   *          the implementations)
   * @param smooth Weighting used for the smoothing (presumably the weight on
   *          the prior P(Tag|Unknown), as for {@code score}; confirm)
   * @param word The word itself, as a String
   * @return The smoothed estimate as a double
   */
  double scoreProbTagGivenWordSignature(IntTaggedWord iTW, int loc, double smooth, String word);


  /**
   * This routine returns a String that is the "signature" of the class of a
   * word. For example, it might represent whether it is a number or ends in
   * -s. The strings returned by convention match the pattern UNK or UNK-.* ,
   * which is just assumed to not match any real word. Behavior depends on the
   * unknownLevel (-uwm flag) passed in to the class.
   *
   * @param word The word to make a signature for
   * @param loc Its position in the sentence (mainly so sentence-initial
   *          capitalized words can be treated differently)
   * @return A String that is its signature (equivalence class)
   */
  String getSignature(String word, int loc);

  /**
   * Returns an unknown word signature as an integer index rather than as a String.
   *
   * @param wordIndex Integer index of the word (presumably its index in the
   *          word index used by the parser; confirm against the implementations)
   * @param sentencePosition The position of the word in the sentence
   * @param word The word itself, as a String
   * @return The signature of the word as an integer index
   */
  int getSignatureIndex(int wordIndex, int sentencePosition, String word);


  /**
   * Adds the tagging with count to the data structures in this unknown
   * word model.
   *
   * @param seen Whether tagging is seen
   * @param itw The tagging
   * @param count Its weight
   */
  void addTagging(boolean seen, IntTaggedWord itw, double count);

  /**
   * Returns a Counter from IntTaggedWord to how often they have been seen.
   * NOTE(review): the method name suggests these counts relate to unseen
   * (unknown) words; the exact contents are determined by the implementation -
   * confirm.
   *
   * @return A Counter keyed by IntTaggedWord
   */
  Counter<IntTaggedWord> unSeenCounter();

}