package edu.stanford.nlp.parser.lexparser;

import java.io.Serializable;

import edu.stanford.nlp.stats.Counter;

/**
 * Runtime interface for the treatment of unknown words in lexparser.
 * How unknown word models are built from training data is described by
 * UnknownWordModelTrainer.
 *
 * @author Anna Rafferty
 * @author Christopher Manning
 */
public interface UnknownWordModel extends Serializable {

  /**
   * Reports the level of equivalence classing this model is using.
   * A single unknown word model class may support several options (for
   * example, several models of unknown words for one language bundled in
   * the same class); this method tells which one is in effect.
   *
   * @return The current level of unknown word equivalence classing
   */
  int getUnknownLevel();

  /**
   * Gives access to the lexicon this unknown word model works with. The
   * lexicon is consulted for information about whether words have been
   * seen or not.
   *
   * @return The lexicon used by this unknown word model
   */
  Lexicon getLexicon();

  /**
   * Scores a word with a tag (packaged together as an IntTaggedWord) at
   * position {@code loc} of a sentence. The word is assumed to be unknown.
   * The score is presumably an estimate of P(word | tag), usually computed
   * as P(signature | tag).
   *
   * @param iTW An IntTaggedWord pairing a word and POS tag
   * @param loc The position in the sentence. In the default implementation
   *            this is used only for unknown words, to change their
   *            probability distribution when sentence initial.
   *            NOTE(review): the earlier documentation hinted that a
   *            negative value has a special meaning but did not say what;
   *            confirm against the implementations.
   * @param c_Tseen Total count of this tag (on seen words) in training
   * @param total Total count of word tokens in training
   * @param smooth Weighting on prior P(T|U) in estimate
   * @param word The word itself; useful so we don't look it up in the index
   * @return A float valued score, usually - log P(word|tag)
   */
  float score(IntTaggedWord iTW, int loc, double c_Tseen, double total, double smooth, String word);

  /**
   * Computes P(Tag|Signature), using Bayesian smoothing via just
   * P(Tag|Unknown).
   *
   * @param iTW An IntTaggedWord pairing a word and POS tag
   * @param loc The position of the word in the sentence
   * @param smooth Weighting applied in the smoothing (presumably the weight
   *               on the prior P(Tag|Unknown), as for
   *               {@link #score}; confirm against the implementations)
   * @param word The word itself
   * @return The smoothed estimate of P(Tag|Signature)
   */
  double scoreProbTagGivenWordSignature(IntTaggedWord iTW, int loc, double smooth, String word);

  /**
   * Produces the "signature" of a word, that is, a String naming the class
   * of words it belongs to. For example, the signature might record whether
   * the word is a number or ends in -s. By convention the returned strings
   * match the pattern UNK or UNK-.* , which is simply assumed not to match
   * any real word. The behavior depends on the unknownLevel (-uwm flag)
   * that was passed in to the class.
   *
   * @param word The word to make a signature for
   * @param loc Its position in the sentence (mainly so sentence-initial
   *            capitalized words can be treated differently)
   * @return A String that is its signature (equivalence class)
   */
  String getSignature(String word, int loc);

  /**
   * Variant of signature lookup that yields an integer index for the
   * unknown word signature instead of a String.
   *
   * @param wordIndex Integer identifying the word (presumably its index in
   *                  the word index; confirm against the implementations)
   * @param sentencePosition The position of the word in the sentence
   * @param word The word itself
   * @return The unknown word signature as an integer index
   */
  int getSignatureIndex(int wordIndex, int sentencePosition, String word);

  /**
   * Records a tagging, together with its count, in the data structures
   * maintained by this model.
   *
   * @param seen Whether tagging is seen
   * @param itw The tagging
   * @param count Its weight
   */
  void addTagging(boolean seen, IntTaggedWord itw, double count);

  /**
   * Exposes the counts kept per IntTaggedWord.
   *
   * @return A Counter from IntTaggedWord to how often they have been seen
   */
  Counter<IntTaggedWord> unSeenCounter();

}