// Extractor.java example
//
// Explorer
// CoreNLP-master
/**
 * Title:        StanfordMaxEnt<p>
 * Description:  A Maximum Entropy Toolkit<p>
 * Copyright:    Copyright (c) Kristina Toutanova<p>
 * Company:      Stanford University<p>
 */

package edu.stanford.nlp.tagger.maxent; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.Serializable;



/**
 * This class serves as the base class for classes which extract relevant
 * information from a history to give it to the features. Every feature has
 * an associated extractor or maybe more.  GlobalHolder keeps all the
 * extractors; two histories are considered equal if all extractors return
 * equal values for them.  The main functionality of the Extractors is
 * provided by the method extract which takes a History as an argument.
 * The Extractor looks at the history and takes out something important for
 * the features - e.g. specific words and tags at specific positions or
 * some function of the History. The histories are effectively vectors
 * of values, with each dimension being the output of some extractor.
 * <p>
 * New extractors are created in either ExtractorFrames or
 * ExtractorFramesRare; those are the places you want to consider
 * adding your new extractor.  For a new Extractor, typically the things
 * that you have to define are:
 * <ul>
 * <li>leftContext() and/or rightContext() if the extractor uses the tag
 * sequence to the left or right (so that dynamic programming will be done
 * correctly).
 * <li>isLocal() Return true iff the function is only of the current word
 * (for efficiency)
 * <li>isDynamic() Return true if a function of any tags (for efficiency)
 * <li>extract(History, PairsHolder) The actual function that returns the
 * value for the feature.
 * </ul>
 * <p>
 * Note that some extractors can be reused across multiple taggers,
 * but many cannot.  Any extractor that uses information from the
 * tagger such as its dictionary, for example, cannot.  For the
 * moment, some of the extractors in ExtractorFrames and
 * ExtractorFramesRare are static; those are all reusable at the
 * moment, but if you change them in any way to make them not
 * reusable, make sure to change the way they are constructed as well.
 *
 * @author Kristina Toutanova
 * @version 1.0
 */
public class Extractor implements Serializable  {

  /** A logger for this class. */
  private static final Redwood.RedwoodChannels log = Redwood.channels(Extractor.class);

  private static final long serialVersionUID = -4694133872973560083L;

  static final String zeroSt = "0";

  /** Position used by the no-argument constructor; means "position and isTag are not in use". */
  private static final int UNUSED_POSITION = Integer.MAX_VALUE;

  /** Value returned by the extractLV methods when no verb is found. */
  private static final String NO_VERB = "NA";

  /**
   * Separator used to break an arch component such as "name(arg,arg,arg)"
   * into pieces: a comma or parenthesis, with any surrounding whitespace.
   */
  private static final String ARG_SEPARATOR_REGEX = "\\s*[,()]\\s*";

  final int position;
  private final boolean isTag;

  /**
   * Creates an extractor that does not use the position / isTag arguments.
   * The position is set to Integer.MAX_VALUE and isTag to false.
   */
  public Extractor() {
    this(UNUSED_POSITION, false);
  }


  /**
   * This constructor creates an extractor which extracts either the tag or
   * the word from position position in the history.
   *
   * @param position The position of the thing to be extracted. This is
   *                 relative to the current word. For example, position 0
   *                 will be the current word, -1 will be
   *                 the word before, +1 will be the word after, etc.
   * @param isTag    If true this means that the POS tag is extracted from
   *                 position, otherwise the word is extracted.
   */
  protected Extractor(int position, boolean isTag) {
    this.position = position;
    this.isTag = isTag;
  }

  /**
   * Subclasses should override this method and keep only the data
   * they want about the tagger.  Note that such data should also be
   * declared "transient" if it is already available in the tagger.
   * This is because, when we save the tagger to disk, we do so by
   * writing out objects, and there is no need to write the same
   * object more than once.  setGlobalHolder will be called both after
   * construction when building a new tagger and when loading existing
   * taggers from disk, so the same data will be available then as well.
   * <p>
   * The default implementation does nothing.
   *
   * @param tagger The tagger this extractor belongs to
   */
  protected void setGlobalHolder(MaxentTagger tagger) {}


  /** This evaluates any precondition for a feature being applicable based
   *  on a certain tag. It returns true if the feature is applicable.
   *  By default an Extractor is applicable everywhere, but some
   *  subclasses limit application.
   *
   *  @param tag The possible tag that the feature will be generated for
   *  @return Whether the feature extractor is applicable (true) or not (false)
   */
  @SuppressWarnings({"MethodMayBeStatic", "UnusedDeclaration"})
  public boolean precondition(String tag) {
    return true;
  }


  /**
   * @return the number of positions to the left the extractor looks at (only tags, because words are fixed.)
   *         This is -position for a tag extractor with a negative position, and 0 otherwise.
   */
  public int leftContext() {
    return (isTag && position < 0) ? -position : 0;
  }


  /**
   * @return the number of positions to the right the extractor looks at (only tags, because words are fixed.)
   *         This is position for a tag extractor with a positive position, and 0 otherwise.
   */
  public int rightContext() {
    return (isTag && position > 0) ? position : 0;
  }

  // CDM May 2007: This feature is currently never used. Maybe we should
  // change things so it is, and each feature template has a threshold, but
  // need to then work out what a TaggerFeature is and whether we should still
  // be using one of those to index with.
  // At present real threshold check happens in TaggerExperiments with
  // the populated(int, int) method.
  //  public boolean isPopulated(TaggerFeature f) {
  //    return (f.indexedValues.length > GlobalHolder.minFeatureThresh);
  //  }

  /** Subclasses should only override the two argument version
   *  of this method. This version simply calls it with the history's
   *  own PairsHolder.
   *
   *  @param h The history to extract from
   *  @return The feature value
   */
  final String extract(History h) {
    return extract(h, h.pairs);
  }

  /**
   * @return Returns true if extractor is a function of POS tags; if it returns false,
   * features are pre-computed.
   */
  public boolean isDynamic() {
    return isTag;
  }

  /**
   * @return Returns true if extractor is not a function of POS tags, and only
   * depends on current word.
   */
  public boolean isLocal() {
    return !isTag && position == 0;
  }

  /**
   * Extracts the tag (if this is a tag extractor) or the word (otherwise)
   * found at this extractor's position relative to the history.
   *
   * @param h  The history to extract from
   * @param pH The holder of the word/tag pairs that is queried
   * @return The feature value
   */
  String extract(History h, PairsHolder pH) {
    return isTag ? pH.getTag(h, position) : pH.getWord(h, position);
  }

  /**
   * Finds the last verbal word before the current word. Scans leftwards from
   * the position just before h.current down to h.start and returns the word
   * at the first position whose tag starts with "VB". The scan is abandoned
   * as soon as a tag starting with "," is met.
   *
   * @param h  The history giving the start and current positions
   * @param pH The holder of the word/tag pairs that is queried
   * @return The last verbal word, or "NA" if none is found
   */
  @SuppressWarnings({"MethodMayBeStatic"})
  String extractLV(History h, PairsHolder pH) {
    return lastVerbBefore(h, pH, h.start);
  }

  /**
   * Like {@link #extractLV(History, PairsHolder)}, but looks back at most
   * bound positions from the current word (and never before h.start).
   *
   * @param h     The history giving the start and current positions
   * @param pH    The holder of the word/tag pairs that is queried
   * @param bound The maximum number of positions to look back
   * @return The last verbal word, or "NA" if none is found
   */
  String extractLV(History h, PairsHolder pH, int bound) {
    return lastVerbBefore(h, pH, Math.max(h.start, h.current - bound));
  }

  /**
   * Shared scan for the extractLV methods: walks from h.current - 1 down to
   * lowest (inclusive) looking for a tag that starts with "VB".
   */
  private static String lastVerbBefore(History h, PairsHolder pH, int lowest) {
    for (int index = h.current - 1; index >= lowest; index--) {
      String tag = pH.getTag(index);
      if (tag.startsWith("VB")) {
        return pH.getWord(index);
      }
      if (tag.startsWith(",")) {
        // a comma tag ends the search without a result
        break;
      }
    }
    return NO_VERB;
  }


  /**
   * Extracts the feature value, with an additional bound argument.
   * By default the bound is ignored, but a few subclasses make use of it.
   *
   * @param h     The history to extract from
   * @param pH    The holder of the word/tag pairs that is queried
   * @param bound Ignored by this implementation
   * @return The same value as {@link #extract(History, PairsHolder)}
   */
  @SuppressWarnings({"UnusedDeclaration"})
  String extract(History h, PairsHolder pH, int bound) {
    return extract(h, pH);
  }


  /**
   * @return The simple class name followed by the arguments in parentheses,
   *         e.g. "Extractor(-1,tag)"; the parentheses are empty when the
   *         position is the unused default.
   */
  @Override
  public String toString() {
    String cl = getClass().getName();
    int ind = cl.lastIndexOf('.');
    // MAX_VALUE is the default value and means we aren't using these two arguments
    String args = (position == UNUSED_POSITION) ? "" : (position + "," + (isTag ? "tag" : "word"));
    return cl.substring(ind + 1) + '(' + args + ')';
  }


  /** This is used for argument parsing in arch variable.
   *  It can extract a comma separated argument.
   *  Assumes the input format is "name(arg,arg,arg)", with possible
   *  spaces around the parentheses and comma(s). The string is split at
   *  the parentheses and commas, so piece 0 is the name and piece 1 is
   *  the first argument.
   *
   *  @param str arch variable component input
   *  @param num Number of argument
   *  @return The requested piece, or null if there is no piece with that number
   */
  static String getParenthesizedArg(String str, int num) {
    return pieceAt(str, num);
  }

  /** This is used for argument parsing in arch variable.
   *  It can extract a comma separated argument.
   *  Assumes the input format is "name(arg,arg,arg)", with possible
   *  spaces around the parentheses and comma(s).
   *
   *  @param str arch variable component input
   *  @param num Number of argument
   *  @return The int value of the arg or 0 if missing, empty, or not a valid int
   */
  static int getParenthesizedNum(String str, int num) {
    String arg = pieceAt(str, num);
    if (arg == null) {
      return 0;
    }
    try {
      return Integer.parseInt(arg);
    } catch (NumberFormatException ignored) {
      // an empty or non-numeric argument deliberately counts as 0
      return 0;
    }
  }

  /**
   * Splits an arch component at its parentheses and commas and returns
   * piece number num, or null if num is outside the range of pieces.
   */
  private static String pieceAt(String str, int num) {
    String[] pieces = str.split(ARG_SEPARATOR_REGEX);
    if (num < 0 || num >= pieces.length) {
      return null;
    }
    return pieces[num];
  }

}