package edu.stanford.nlp.parser.lexparser;

import java.util.Collection;

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Index;

/**
 * Contract for objects that train an {@link UnknownWordModel}.
 *
 * <p>Typical usage: call {@link #initializeTraining} once, feed the trainer
 * trees through the {@code train} methods, and then call
 * {@link #finishTraining()} to obtain the resulting model.
 *
 * @author John Bauer
 */
public interface UnknownWordModelTrainer {

  /**
   * Hands the trainer some of the data structures it needs before any
   * training data is supplied.
   *
   * <p>An estimate of the number of trees that will be supplied is required
   * because many of the unknown word models switch training modes after
   * seeing a fraction of the trees.
   *
   * <p>This is a separate initialization method rather than a constructor
   * because trainers are generally loaded by reflection; keeping it a method
   * lets the compiler catch silly errors.
   *
   * @param op the {@code Options} made available to the trainer
   * @param lex the {@code Lexicon} made available to the trainer
   * @param wordIndex the index of word strings
   * @param tagIndex the index of tag strings
   * @param totalTrees the estimated number of trees the trainer will be given
   */
  void initializeTraining(Options op, Lexicon lex,
                          Index<String> wordIndex,
                          Index<String> tagIndex,
                          double totalTrees);

  /**
   * Tallies statistics for the given collection of trees.
   * May be called multiple times.
   *
   * @param trees the trees to collect statistics from
   */
  void train(Collection<Tree> trees);

  /**
   * Tallies statistics for a weighted collection of trees.
   * May be called multiple times.
   *
   * @param trees the trees to collect statistics from
   * @param weight the weight given to these trees
   */
  void train(Collection<Tree> trees, double weight);

  /**
   * Tallies statistics for a single tree.
   * May be called multiple times.
   *
   * @param tree the tree to collect statistics from
   * @param weight the weight given to this tree
   */
  void train(Tree tree, double weight);

  /**
   * Tallies statistics for a single word.
   * May be called multiple times.
   *
   * @param tw the tagged word to collect statistics from
   * @param loc presumably the position of the word within its sentence;
   *            not stated here, confirm against implementations
   * @param weight the weight given to this word
   */
  void train(TaggedWord tw, int loc, double weight);

  /**
   * Updates the running (real-valued) count of how many (weighted) trees
   * have been read in. May be called multiple times.
   *
   * @param weight the weight of the trees additionally trained on
   */
  void incrementTreesRead(double weight);

  /**
   * Produces the trained unknown word model.
   *
   * <p>Call this only once: many implementations build exactly one model,
   * and some {@code finishTraining} implementations manipulate their data
   * in permanent ways.
   *
   * @return the trained {@link UnknownWordModel}
   */
  UnknownWordModel finishTraining();

  /** The string {@code "UNK"}; presumably the token used for unknown words. */
  String unknown = "UNK";

  /** Integer value {@code -1}, used as the word component of {@link #NULL_ITW}. */
  int nullWord = -1;

  /** Short value {@code -1}, used as the tag component of {@link #NULL_ITW}. */
  short nullTag = -1;

  /** An {@code IntTaggedWord} built from {@link #nullWord} and {@link #nullTag}. */
  IntTaggedWord NULL_ITW = new IntTaggedWord(nullWord, nullTag);

}