package edu.stanford.nlp.parser.lexparser;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Serializable;
import java.io.Writer;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.function.Function;

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.trees.Tree;

/**
 * An interface for lexicons used by the lexparser. Its primary
 * responsibility is to provide a conditional probability
 * P(word|tag), which is fulfilled by the {@link #score} method.
 * Inside the lexparser, Strings are interned and tags and words
 * are usually represented as integers.
 *
 * @author Galen Andrew
 */
public interface Lexicon extends Serializable {

  String UNKNOWN_WORD = "UNK";  // if UNK were a word, counts would merge
  String BOUNDARY = ".$.";      // boundary word -- assumed not a real word
  String BOUNDARY_TAG = ".$$."; // boundary tag -- assumed not a real tag

  /**
   * Set the model via which unknown words should be scored by this lexicon
   */
  //void setUnknownWordModel(UnknownWordModel uwModel);

  /**
   * Returns the number of times this word/tag pair has been seen;
   * 0 returned if never previously seen
   */
  // double getCount(IntTaggedWord w);

  /**
   * Checks whether a word is in the lexicon.
   *
   * @param word The word as an int
   * @return Whether the word is in the lexicon
   */
  boolean isKnown(int word);

  /**
   * Checks whether a word is in the lexicon.
   *
   * @param word The word as a String
   * @return Whether the word is in the lexicon
   */
  boolean isKnown(String word);

  /**
   * Return the Set of tags used by this lexicon (available after training).
   *
   * @return The Set of tags used by this lexicon
   */
  Set<String> tagSet(Function<String,String> basicCategoryFunction);

  /**
   * Get an iterator over all rules (pairs of (word, POS)) for this word.
   *
   * @param word The word, represented as an integer in Index
   * @param loc The position of the word in the sentence (counting from 0).
   *          <i>Implementation note: The BaseLexicon class doesn't
   *          actually make use of this position information.</i>
   * @param featureSpec Additional word features like morphosyntactic information.
   * @return An Iterator over a List of IntTaggedWord objects, which pair the word
   *         with possible taggings as integer pairs. (Each can be
   *         thought of as a <code>tag -&gt; word</code> rule.)
   */
  Iterator<IntTaggedWord> ruleIteratorByWord(int word, int loc, String featureSpec);

  /**
   * Same as above, but with a String that needs to be translated by the
   * lexicon's word index.
   */
  Iterator<IntTaggedWord> ruleIteratorByWord(String word, int loc, String featureSpec);

  /**
   * Returns the number of rules (tag rewrites as word) in the Lexicon.
   * This method assumes that the lexicon has been initialized.
   *
   * @return The number of rules (tag rewrites as word) in the Lexicon.
   */
  int numRules();

  /**
   * Start training this lexicon on the expected number of trees.
   * (Some UnknownWordModels use the number of trees to know when to
   * start counting statistics.)
   */
  void initializeTraining(double numTrees);

  /**
   * Trains this lexicon on the Collection of trees.
   * Can be called more than once with different collections of trees.
   *
   * @param trees Trees to train on
   */
  void train(Collection<Tree> trees);

  void train(Collection<Tree> trees, double weight);

  // WSGDEBUG
  // Binarizer converts everything to CategoryWordTag, so we lose additional
  // lexical annotations. RawTrees should be the same size as trees.
  void train(Collection<Tree> trees, Collection<Tree> rawTrees);

  void train(Tree tree, double weight);

  /**
   * Not all subclasses support this particular method. Those that
   * don't will barf...
   */
  void train(List<TaggedWord> sentence, double weight);

  /**
   * Not all subclasses support this particular method. Those that
   * don't will barf...
   */
  void train(TaggedWord tw, int loc, double weight);

  /**
   * If training on a per-word basis instead of on a per-tree basis,
   * we will want to increment the tree count as this happens.
   */
  void incrementTreesRead(double weight);

  /**
   * Sometimes we might have a sentence of tagged words which we would
   * like to add to the lexicon, but they weren't part of a binarized,
   * markovized, or otherwise annotated tree.
   */
  void trainUnannotated(List<TaggedWord> sentence, double weight);

  /**
   * Done collecting statistics for the lexicon.
   */
  void finishTraining();

  /**
   * Add additional words with expansion of subcategories.
   */
  // void trainWithExpansion(Collection<TaggedWord> taggedWords);

  /**
   * Get the score of this word with this tag (as an IntTaggedWord) at this
   * loc. (Presumably an estimate of P(word | tag).)
   *
   * @param iTW An IntTaggedWord pairing a word and POS tag
   * @param loc The position in the sentence. <i>In the default implementation
   *          this is used only for unknown words to change their
   *          probability distribution when sentence initial.</i>
   * @param word The word itself; useful so we don't have to look it
   *          up in an index
   * @param featureSpec Additional word features like morphosyntactic information.
   * @return A score, usually log P(word|tag)
   */
  float score(IntTaggedWord iTW, int loc, String word, String featureSpec);

  /**
   * Write the lexicon in human-readable format to the Writer.
   * (An optional operation.)
   *
   * @param w The Writer to output to
   * @throws IOException If any I/O problem
   */
  void writeData(Writer w) throws IOException;

  /**
   * Read the lexicon from the BufferedReader in the format written by
   * writeData. (An optional operation.)
   *
   * @param in The BufferedReader to read from
   * @throws IOException If any I/O problem
   */
  void readData(BufferedReader in) throws IOException;

  UnknownWordModel getUnknownWordModel();

  // todo [cdm Sep 2013]: It seems like we could easily remove this from the interface
  void setUnknownWordModel(UnknownWordModel uwm);

}
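// ---------------------------------------------------------------------------
// Usage sketch (illustrative only; not part of the original interface).
// Assuming a concrete implementation such as BaseLexicon, together with word
// and tag indices and a collection of training trees -- "wordIndex",
// "tagIndex", and "trainTrees" are hypothetical names introduced here --
// training and then scoring word/tag pairs might look roughly like this:
//
//   Index<String> wordIndex = new HashIndex<>();
//   Index<String> tagIndex = new HashIndex<>();
//   Lexicon lex = new BaseLexicon(new Options(), wordIndex, tagIndex);
//
//   lex.initializeTraining(trainTrees.size());
//   lex.train(trainTrees);          // a Collection<Tree> of annotated training trees
//   lex.finishTraining();
//
//   // Enumerate the possible taggings of a word at sentence position 0 and score each.
//   int w = wordIndex.addToIndex("dog");
//   for (Iterator<IntTaggedWord> it = lex.ruleIteratorByWord(w, 0, null); it.hasNext(); ) {
//     IntTaggedWord itw = it.next();
//     float logProb = lex.score(itw, 0, "dog", null);  // roughly log P(word | tag)
//   }
// ---------------------------------------------------------------------------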