/**
* Title: StanfordMaxEnt<p>
* Description: A Maximum Entropy Toolkit<p>
* Copyright: Copyright (c) Kristina Toutanova<p>
* Company: Stanford University<p>
*/
package edu.stanford.nlp.tagger.maxent;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.Serializable;
/**
* This class serves as the base class for classes which extract relevant
* information from a history to give it to the features. Every feature has
* an associated extractor or maybe more. GlobalHolder keeps all the
* extractors; two histories are considered equal if all extractors return
* equal values for them. The main functionality of the Extractors is
* provided by the method extract which takes a History as an argument.
* The Extractor looks at the history and takes out something important for
* the features - e.g. specific words and tags at specific positions or
* some function of the History. The histories are effectively vectors
* of values, with each dimension being the output of some extractor.
* <p>
* New extractors are created in either ExtractorFrames or
* ExtractorFramesRare; those are the places you want to consider
* adding your new extractor. For a new Extractor, typically the things
* that you have to define are:
* <ul>
* <li>leftContext() and/or rightContext() if the extractor uses the tag
* sequence to the left or right (so that dynamic programming will be done
* correctly).
* <li>isLocal() Return true iff the function is only of the current word
* (for efficiency)
* <li>isDynamic() Return true if a function of any tags (for efficiency)
* <li>extract(History, PairsHolder) The actual function that returns the
* value for the feature.
* </ul>
* <p>
* Note that some extractors can be reused across multiple taggers,
* but many cannot. Any extractor that uses information from the
* tagger such as its dictionary, for example, cannot. For the
* moment, some of the extractors in ExtractorFrames and
* ExtractorFramesRare are static; those are all reusable at the
* moment, but if you change them in any way to make them not
* reusable, make sure to change the way they are constructed as well.
*
* @author Kristina Toutanova
* @version 1.0
*/
public class Extractor implements Serializable {

  /** A logger for this class. */
  private static final Redwood.RedwoodChannels log = Redwood.channels(Extractor.class);

  private static final long serialVersionUID = -4694133872973560083L;

  /**
   * Delimiter regex used to tokenize an arch component of the form
   * "name(arg,arg,arg)": a comma or parenthesis together with any
   * whitespace directly around it.
   */
  private static final String ARG_DELIMITER_REGEX = "\\s*[,()]\\s*";

  /** Value returned by the extractLV methods when no verb-tagged word is found. */
  private static final String NO_VERB = "NA";

  static final String zeroSt = "0";

  /** Offset, relative to the current word, of the word or tag that is extracted. */
  final int position;

  /** If true the tag at {@code position} is extracted, otherwise the word. */
  private final boolean isTag;

  /**
   * Creates an extractor without a meaningful position: position is set to
   * Integer.MAX_VALUE (the "arguments unused" marker checked in
   * {@link #toString()}) and isTag to false.
   */
  public Extractor() {
    this(Integer.MAX_VALUE, false);
  }

  /**
   * This constructor creates an extractor which extracts either the tag or
   * the word from position position in the history.
   *
   * @param position The position of the thing to be extracted. This is
   *                 relative to the current word. For example, position 0
   *                 will be the current word, -1 will be
   *                 the word before, +1 will be the word after, etc.
   * @param isTag    If true this means that the POS tag is extracted from
   *                 position, otherwise the word is extracted.
   */
  protected Extractor(int position, boolean isTag) {
    this.position = position;
    this.isTag = isTag;
  }

  /**
   * Subclasses should override this method and keep only the data
   * they want about the tagger.  Note that such data should also be
   * declared "transient" if it is already available in the tagger.
   * This is because, when we save the tagger to disk, we do so by
   * writing out objects, and there is no need to write the same
   * object more than once.  setGlobalHolder will be called both after
   * construction when building a new tagger and when loading existing
   * taggers from disk, so the same data will be available then as well.
   * <p>
   * The implementation in this class does nothing.
   *
   * @param tagger The tagger this extractor is being attached to
   */
  protected void setGlobalHolder(MaxentTagger tagger) {}

  /**
   * This evaluates any precondition for a feature being applicable based
   * on a certain tag. It returns true if the feature is applicable.
   * By default an Extractor is applicable everywhere, but some
   * subclasses limit application.
   *
   * @param tag The possible tag that the feature will be generated for
   * @return Whether the feature extractor is applicable (true) or not (false)
   */
  @SuppressWarnings({"MethodMayBeStatic", "UnusedDeclaration"})
  public boolean precondition(String tag) {
    return true;
  }

  /**
   * @return the number of positions to the left the extractor looks at
   *     (only tags, because words are fixed); 0 unless this extractor reads
   *     a tag at a negative position.
   */
  public int leftContext() {
    return (isTag && position < 0) ? -position : 0;
  }

  /**
   * @return the number of positions to the right the extractor looks at
   *     (only tags, because words are fixed); 0 unless this extractor reads
   *     a tag at a positive position.
   */
  public int rightContext() {
    return (isTag && position > 0) ? position : 0;
  }

  // CDM May 2007: This feature is currently never used.  Maybe we should
  // change things so it is, and each feature template has a threshold, but
  // need to then work out what a TaggerFeature is and whether we should still
  // be using one of those to index with.
  // At present real threshold check happens in TaggerExperiments with
  // the populated(int, int) method.
  //  public boolean isPopulated(TaggerFeature f) {
  //    return (f.indexedValues.length > GlobalHolder.minFeatureThresh);
  //  }

  /**
   * Convenience form of {@link #extract(History, PairsHolder)} that uses the
   * history's own {@code pairs} holder.
   * Subclasses should only override the two argument version
   * of this method.
   *
   * @param h The history to extract from
   * @return The feature value
   */
  final String extract(History h) {
    return extract(h, h.pairs);
  }

  /**
   * @return Returns true if extractor is a function of POS tags; if it returns false,
   *     features are pre-computed.
   */
  public boolean isDynamic() {
    return isTag;
  }

  /**
   * @return Returns true if extractor is not a function of POS tags, and only
   *     depends on current word.
   */
  public boolean isLocal() {
    return !isTag && position == 0;
  }

  /**
   * The actual extraction function: asks the PairsHolder for the tag (if
   * this is a tag extractor) or the word at this extractor's position
   * relative to the history.
   *
   * @param h  The history to extract from
   * @param pH The holder that is queried for the word or tag
   * @return The feature value
   */
  String extract(History h, PairsHolder pH) {
    return isTag ? pH.getTag(h, position) : pH.getWord(h, position);
  }

  /**
   * Finds the last verbal word before the current word. Walks backwards
   * from the position just before {@code h.current} down to {@code h.start}
   * and returns the word at the first position whose tag starts with "VB".
   * The search is abandoned when a tag starting with "," is met first.
   *
   * @param h  The history giving the start and current positions
   * @param pH The holder queried for tags and words by absolute index
   * @return The word found, or "NA" if there is none
   */
  @SuppressWarnings({"MethodMayBeStatic"})
  String extractLV(History h, PairsHolder pH) {
    return lastVerbBefore(h, pH, h.start);
  }

  /**
   * Like {@link #extractLV(History, PairsHolder)}, but looks back at most
   * {@code bound} positions from the current word (and never before
   * {@code h.start}).
   *
   * @param h     The history giving the start and current positions
   * @param pH    The holder queried for tags and words by absolute index
   * @param bound Maximum number of positions to look back
   * @return The word found, or "NA" if there is none
   */
  String extractLV(History h, PairsHolder pH, int bound) {
    // index >= start && index >= current - bound  <=>  index >= max(start, current - bound)
    return lastVerbBefore(h, pH, Math.max(h.start, h.current - bound));
  }

  /**
   * Shared backwards scan for the extractLV methods: examines indices from
   * {@code h.current - 1} down to {@code lowestIndex} inclusive.
   */
  private static String lastVerbBefore(History h, PairsHolder pH, int lowestIndex) {
    for (int index = h.current - 1; index >= lowestIndex; index--) {
      String tag = pH.getTag(index);
      if (tag.startsWith("VB")) {
        return pH.getWord(index);
      }
      if (tag.startsWith(",")) {
        // a comma-tagged token ends the search without a result
        break;
      }
    }
    return NO_VERB;
  }

  /**
   * Bounded variant of {@link #extract(History, PairsHolder)}.
   * By default the bound is ignored, but a few subclasses make use of it.
   *
   * @param h     The history to extract from
   * @param pH    The holder that is queried for the word or tag
   * @param bound Ignored by this implementation
   * @return The feature value
   */
  @SuppressWarnings({"UnusedDeclaration"})
  String extract(History h, PairsHolder pH, int bound) {
    return extract(h, pH);
  }

  /**
   * @return The class name without its package, followed by the arguments
   *     in parentheses, e.g. "Extractor(-1,tag)"; the parentheses are empty
   *     when position has its default value.
   */
  @Override
  public String toString() {
    String cl = getClass().getName();
    int ind = cl.lastIndexOf('.');
    // MAX_VALUE is the default value and means we aren't using these two arguments
    String args = (position == Integer.MAX_VALUE) ? "" : (position + "," + (isTag ? "tag" : "word"));
    return cl.substring(ind + 1) + '(' + args + ')';
  }

  /**
   * This is used for argument parsing in arch variable.
   * It can extract a comma separated argument.
   * Assumes the input format is "name(arg,arg,arg)", with possible
   * spaces around the parentheses and comma(s). For such input, piece 0
   * is the name and piece 1 is the first argument.
   *
   * @param str arch variable component input
   * @param num Number of argument
   * @return The parenthesized String, or null if none (including when
   *     num is negative).
   */
  static String getParenthesizedArg(String str, int num) {
    String[] args = str.split(ARG_DELIMITER_REGEX);
    if (num < 0 || num >= args.length) {
      return null;
    }
    return args[num];
  }

  /**
   * This is used for argument parsing in arch variable.
   * It can extract a comma separated argument.
   * Assumes the input format is "name(arg,arg,arg)", with possible
   * spaces around the parentheses and comma(s).
   *
   * @param str arch variable component input
   * @param num Number of argument
   * @return The int value of the arg or 0 if missing, empty, or not
   *     parseable as an int
   */
  static int getParenthesizedNum(String str, int num) {
    String arg = getParenthesizedArg(str, num);
    if (arg == null) {
      // no such argument: checked explicitly instead of relying on an
      // ArrayIndexOutOfBoundsException
      return 0;
    }
    try {
      return Integer.parseInt(arg);
    } catch (NumberFormatException ignored) {
      // an empty or non-numeric argument deliberately falls back to 0
      return 0;
    }
  }

}