ExtractorFrames.java example

Explorer
CoreNLP-master
//ExtractorFrames -- StanfordMaxEnt, A Maximum Entropy Toolkit
//Copyright (c) 2002-2011 Leland Stanford Junior University


//This program is free software; you can redistribute it and/or
//modify it under the terms of the GNU General Public License
//as published by the Free Software Foundation; either version 2
//of the License, or (at your option) any later version.

//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU General Public License for more details.

//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

//For more information, bug reports, fixes, contact:
//Christopher Manning
//Dept of Computer Science, Gates 1A
//Stanford CA 94305-9010
//USA
//    Support/Questions: java-nlp-user@lists.stanford.edu
//    Licensing: java-nlp-support@lists.stanford.edu
//http://www-nlp.stanford.edu/software/tagger.shtml


package edu.stanford.nlp.tagger.maxent;
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.process.WordShapeClassifier;
import edu.stanford.nlp.util.StringUtils;

import java.util.*;


/**
 * This class contains the basic feature extractors used for all words and
 * tag sequences (and interaction terms) for the MaxentTagger, but not the
 * feature extractors explicitly targeting generalization for rare or unknown
 * words.
 * The following options are supported:
 * <table>
 * <tr><td>Name</td><td>Args</td><td>Effect</td></tr>
 * <tr><td>words</td><td>begin, end</td>
 *     <td>Individual features for words begin ... end.
 *     If just one argument words(-2) is given, then end is taken as 0. If
 *     begin is not less than or equal to end, no features are made.</td></tr>
 * <tr><td>tags</td><td>begin, end</td>
 *     <td>Individual features for tags begin ... end</td></tr>
 * <tr><td>biword</td><td>w1, w2</td>
 *     <td>One feature for the pair of words w1, w2</td></tr>
 * <tr><td>biwords</td><td>begin, end</td>
 *     <td>One feature for each sequential pair of words
 *         from begin to end</td></tr>
 * <tr><td>twoTags</td><td>t1, t2</td>
 *     <td>One feature for the pair of tags t1, t2</td></tr>
 * <tr><td>lowercasewords</td><td>begin, end</td>
 *     <td>One feature for each word begin ... end, lowercased</td></tr>
 * <tr><td>order</td><td>left, right</td>
 *     <td>A feature for tags left through 0 and a feature for
 *         tags 0 through right.  Lower order left and right features are
 *         also added.
 *         This gets very expensive for higher order terms.</td></tr>
 * <tr><td>wordTag</td><td>w, t</td>
 *     <td>A feature combining word w and tag t.</td></tr>
 * <tr><td>wordTwoTags</td><td>w, t1, t2</td>
 *     <td>A feature combining word w and tags t1, t2.</td></tr>
 * <tr><td>threeTags</td><td>t1, t2, t3</td>
 *     <td>A feature combining tags t1, t2, t3.</td></tr>
 * <tr><td>vbn</td><td>length</td>
 *     <td>A feature that looks at the left length words for something that
 *         appears to be a VBN (in English) without looking at the actual tags.
 *         It is zeroeth order, as it does not look at the tag predictions.
 *         It also is never used, since it doesn't seem to help.</td></tr>
 * <tr><td>allwordshapes</td><td>left, right</td>
 *     <td>Word shape features, eg transform Foo5 into Xxx#
 *         (not exactly like that, but that general idea).
 *         Creates individual features for each word left ... right.
 *         Compare with the feature "wordshapes" in ExtractorFramesRare,
 *         which is only applied to rare words. Fairly English-specific.
 *         Slightly increases accuracy.</td></tr>
 * <tr><td>allunicodeshapes</td><td>left, right</td>
 *     <td>Same thing, but works for unicode characters more generally.</td></tr>
 * <tr><td>allunicodeshapeconjunction</td><td>left, right</td>
 *     <td>Instead of individual word shape features, combines several
 *         word shapes into one feature.</td></tr>
 * </table>
 *
 * See {@link ExtractorFramesRare} for more options.
 * <br>
 * There are also macro features:
 * <br>
 * left3words = words(-1,1),order(2) <br>
 * left5words = words(-2,2),order(2) <br>
 * generic = words(-1,1),order(2),biwords(-1,0),wordTag(0,-1) <br>
 * bidirectional5words =
 *   words(-2,2),order(-2,2),twoTags(-1,1),
 *   wordTag(0,-1),wordTag(0,1),biwords(-1,1) <br>
 * bidirectional =
 *   words(-1,1),order(-2,2),twoTags(-1,1),
 *   wordTag(0,-1),wordTag(0,1),biwords(-1,1) <br>
 * german = some random stuff <br>
 * sighan2005 = some other random stuff <br>
 * The left3words architectures are faster, but slightly less
 * accurate, than the bidirectional architectures.
 * 'naacl2003unknowns' was our traditional set of unknown word
 * features, but you can now specify features more flexibility via the
 * various other supported keywords.
 * <br>
 * @author Kristina Toutanova
 * @author Michel Galley
 * @version 1.0
 */
public class ExtractorFrames  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ExtractorFrames.class);

  // all features are implicitly conjoined with the current tag
  static final Extractor cWord = new Extractor(0, false);
  private static final Extractor prevWord = new Extractor(-1, false);
  private static final Extractor prevTag = new Extractor(-1, true);
  // prev tag and current word!
  private static final Extractor prevTagWord = new ExtractorWordTag(0, -1);

  private static final Extractor prevWord2 = new Extractor(-2,false);
  private static final Extractor prevTwoTag = new Extractor(-2,true);
  private static final Extractor nextWord = new Extractor(1, false);
  private static final Extractor nextWord2 = new Extractor(2,false);
  private static final Extractor nextTag = new Extractor(1, true);


  // features for 2005 SIGHAN tagger
  private static final Extractor[] eFrames_sighan2005 = { cWord, prevWord, prevWord2, nextWord, nextWord2, prevTag, prevTwoTag, new ExtractorContinuousTagConjunction(-2) };

  // features for a german-language bidirectional tagger
  private static final Extractor[] eFrames_german ={ cWord, prevWord, nextWord, nextTag,
      prevTag, new ExtractorContinuousTagConjunction(-2), prevTagWord, new ExtractorTwoWords(-1,0) };

  /**
   * This class is not meant to be instantiated.
   */
  private ExtractorFrames() {
  }


  protected static Extractor[] getExtractorFrames(String arch) {
    // handle some traditional macro options
    // left3words: a simple trigram CMM tagger (similar to the baseline EMNLP 2000 tagger)
    // left5words: a simple trigram CMM tagger, like left3words, with 5 word context
    // generic: our standard multilingual CMM baseline

    arch = arch.replaceAll("left3words", "words(-1,1),order(2)");
    arch = arch.replaceAll("left5words", "words(-2,2),order(2)");
    arch = arch.replaceAll("generic", "words(-1,1),order(2),biwords(-1,0),wordTag(0,-1)");
    arch = arch.replaceAll("bidirectional5words", "words(-2,2),order(-2,2),twoTags(-1,1),wordTag(0,-1),wordTag(0,1),biwords(-1,1)");
    arch = arch.replaceAll("bidirectional", "words(-1,1),order(-2,2),twoTags(-1,1),wordTag(0,-1),wordTag(0,1),biwords(-1,1)");

    ArrayList<Extractor> extrs = new ArrayList<>();
    List<String> args = StringUtils.valueSplit(arch, "[a-zA-Z0-9]*(?:\\([^)]*\\))?", "\\s*,\\s*");
    for (String arg : args) {
      if (arg.equals("sighan2005")) {
        extrs.addAll(Arrays.asList(eFrames_sighan2005));
      } else if (arg.equalsIgnoreCase("german")) {
        extrs.addAll(Arrays.asList(eFrames_german));
      } else if (arg.startsWith("words(")) {
        // non-sequence features with just a certain number of words to the
        // left and right; e.g., words(-2,2) or words(-2,-1)
        int lWindow = Extractor.getParenthesizedNum(arg, 1);
        int rWindow = Extractor.getParenthesizedNum(arg, 2);
        for (int i = lWindow; i <= rWindow; i++) {
          extrs.add(new Extractor(i, false));
        }
      } else if (arg.startsWith("tags(")) {
        // non-sequence features with just a certain number of words to the
        // left and right; e.g., tags(-2,2) or tags(-2,-1)
        int lWindow = Extractor.getParenthesizedNum(arg, 1);
        int rWindow = Extractor.getParenthesizedNum(arg, 2);
        for (int i = lWindow; i <= rWindow; i++) {
          extrs.add(new Extractor(i, true));
        }
      } else if (arg.startsWith("biwords(")) {
        // non-sequence features of word pairs.
        // biwords(-2,1) would give you 3 extractors for w-2w-1, w-1,w0, w0w1
        int lWindow = Extractor.getParenthesizedNum(arg, 1);
        int rWindow = Extractor.getParenthesizedNum(arg, 2);
        for (int i = lWindow; i < rWindow; i++) {
          extrs.add(new ExtractorTwoWords(i));
        }
      } else if (arg.startsWith("biword(")) {
        // non-sequence feature of a word pair.
        // biwords(-2,1) would give you 1 extractor for w-2, w+1
        int left = Extractor.getParenthesizedNum(arg, 1);
        int right = Extractor.getParenthesizedNum(arg, 2);
        extrs.add(new ExtractorTwoWords(left, right));
      } else if (arg.startsWith("twoTags(")) {
        // non-sequence feature of a tag pair.
        // twoTags(-2,1) would give you 1 extractor for t-2, t+1
        int left = Extractor.getParenthesizedNum(arg, 1);
        int right = Extractor.getParenthesizedNum(arg, 2);
        extrs.add(new ExtractorTwoTags(left, right));
      } else if (arg.startsWith("lowercasewords(")) {
        // non-sequence features with just a certain number of lowercase words
        // to the left and right
        int lWindow = Extractor.getParenthesizedNum(arg, 1);
        int rWindow = Extractor.getParenthesizedNum(arg, 2);
        for (int i = lWindow; i <= rWindow; i++) {
          extrs.add(new ExtractorWordLowerCase(i));
        }
      } else if (arg.startsWith("order(")) {
        // anything like order(2), order(-4), order(0,3), or
        // order(-2,1) are okay.
        int leftOrder = Extractor.getParenthesizedNum(arg, 1);
        int rightOrder = Extractor.getParenthesizedNum(arg, 2);
        if (leftOrder > 0) { leftOrder = -leftOrder; }
	if (rightOrder < 0) { throw new IllegalArgumentException("Right order must be non-negative, not " + rightOrder); }
        // cdm 2009: We only add successively higher order tag k-grams
        // ending adjacent to t0.  Adding lower order features at a distance
        // appears not to help (Dec 2009). But they can now be added with tags().

        for (int idx = leftOrder ; idx <= rightOrder; idx++) {
          if (idx == 0) {
            // do nothing
          } else if (idx == -1 || idx == 1) {
            extrs.add(new Extractor(idx, true));
          } else {
            extrs.add(new ExtractorContinuousTagConjunction(idx));
          }
        }
      } else if (arg.startsWith("wordTag(")) {
        // sequence feature of a word and a tag: wordTag(-1,1)
        int posW = Extractor.getParenthesizedNum(arg, 1);
        int posT = Extractor.getParenthesizedNum(arg, 2);
        extrs.add(new ExtractorWordTag(posW, posT));
      } else if (arg.startsWith("wordTwoTags(")) {
        int word = Extractor.getParenthesizedNum(arg, 1);
        int tag1 = Extractor.getParenthesizedNum(arg, 2);
        int tag2 = Extractor.getParenthesizedNum(arg, 3);
        extrs.add(new ExtractorWordTwoTags(word,tag1,tag2));
      } else if (arg.startsWith("threeTags(")) {
        int pos1 = Extractor.getParenthesizedNum(arg, 1);
        int pos2 = Extractor.getParenthesizedNum(arg, 2);
        int pos3 = Extractor.getParenthesizedNum(arg, 3);
        extrs.add(new ExtractorThreeTags(pos1,pos2,pos3));
      } else if (arg.startsWith("vbn(")) {
        int order = Extractor.getParenthesizedNum(arg, 1);
        extrs.add(new ExtractorVerbalVBNZero(order));
      } else if (arg.startsWith("allwordshapes(")) {
        int lWindow = Extractor.getParenthesizedNum(arg, 1);
        int rWindow = Extractor.getParenthesizedNum(arg, 2);
        String wsc = Extractor.getParenthesizedArg(arg, 3);
        if (wsc == null) {
          wsc = "chris2";
        }
        for (int i = lWindow; i <= rWindow; i++) {
          extrs.add(new ExtractorWordShapeClassifier(i, wsc));
        }
      } else if (arg.startsWith("allwordshapeconjunction(")) {
        int lWindow = Extractor.getParenthesizedNum(arg, 1);
        int rWindow = Extractor.getParenthesizedNum(arg, 2);
        String wsc = Extractor.getParenthesizedArg(arg, 3);
        if (wsc == null) {
          wsc = "chris2";
        }
        extrs.add(new ExtractorWordShapeConjunction(lWindow, rWindow, wsc));
      } else if (arg.startsWith("allunicodeshapes(")) {
        int lWindow = Extractor.getParenthesizedNum(arg, 1);
        int rWindow = Extractor.getParenthesizedNum(arg, 2);
        for (int i = lWindow; i <= rWindow; i++) {
          extrs.add(new ExtractorWordShapeClassifier(i, "chris4"));
        }
      } else if (arg.startsWith("allunicodeshapeconjunction(")) {
        int lWindow = Extractor.getParenthesizedNum(arg, 1);
        int rWindow = Extractor.getParenthesizedNum(arg, 2);
        extrs.add(new ExtractorWordShapeConjunction(lWindow, rWindow, "chris4"));
      } else if (arg.equalsIgnoreCase("spanishauxiliaries")) {
        extrs.add(new ExtractorSpanishAuxiliaryTag());
        extrs.add(new ExtractorSpanishSemiauxiliaryTag());
      } else if (arg.equalsIgnoreCase("naacl2003unknowns") ||
                 arg.equalsIgnoreCase("lnaacl2003unknowns") ||
                 arg.equalsIgnoreCase("caselessnaacl2003unknowns") ||
                 arg.equalsIgnoreCase("naacl2003conjunctions") ||
                 arg.equalsIgnoreCase("frenchunknowns") ||
                 arg.equalsIgnoreCase("spanishunknowns") ||
                 arg.startsWith("wordshapes(") ||
                 arg.startsWith("wordshapeconjunction(") ||
                 arg.equalsIgnoreCase("motleyUnknown") ||
                 arg.startsWith("suffix(") ||
                 arg.startsWith("prefix(") ||
                 arg.startsWith("prefixsuffix") ||
                 arg.startsWith("capitalizationsuffix(") ||
                 arg.startsWith("distsim(") ||
                 arg.startsWith("distsimconjunction(") ||
                 arg.equalsIgnoreCase("lctagfeatures") ||
                 arg.startsWith("unicodeshapes(") ||
                 arg.startsWith("chinesedictionaryfeatures(") ||
                 arg.startsWith("unicodeshapeconjunction(")) {
        // okay; known unknown keyword
      } else {
        log.info("Unrecognized ExtractorFrames identifier (ignored): " + arg);
      }
    } // end for
    return extrs.toArray(new Extractor[extrs.size()]);
  }


  /**
   * This extractor extracts a word and tag in conjunction.
   */
  static class ExtractorWordTag extends Extractor {

    private static final long serialVersionUID = 3L;

    private final int wordPosition;
    public ExtractorWordTag(int posW, int posT) {
      super(posT, true);
      wordPosition = posW;
    }

    @Override
    String extract(History h, PairsHolder pH) {
      return pH.getTag(h, position) + '!' + pH.getWord(h, wordPosition);
    }

    @Override
    public String toString() {
      return (getClass().getName() + "(w" + wordPosition +
              ",t" + position + ')');
    }
  }


  /**
   * The word in lower-cased version.
   * Always uses Locale.ENGLISH.
   */
  static class ExtractorWordLowerCase extends Extractor {

    private static final long serialVersionUID = -7847524200422095441L;

    public ExtractorWordLowerCase(int position) {
      super(position, false);
    }

    @Override
      String extract(History h, PairsHolder pH) {
      return pH.getWord(h, position).toLowerCase(Locale.ENGLISH);
    }

  }

  /**
   * The current word if it is capitalized, zero otherwise.
   * Always uses Locale.ENGLISH.
   */
  static class ExtractorCWordCapCase extends Extractor {

    private static final long serialVersionUID = -2393096135964969744L;

    @Override
      String extract(History h, PairsHolder pH) {
      String cw = pH.getWord(h, 0);
      String lk = cw.toLowerCase(Locale.ENGLISH);
      if (lk.equals(cw)) {
        return zeroSt;
      }
      return cw;
    }

    @Override public boolean isLocal() { return true; }
    @Override public boolean isDynamic() { return false; }
  }


  /**
   * This extractor extracts two words in conjunction.
   * The one argument constructor gives you leftPosition and
   * leftPosition+1, but with the two argument constructor,
   * they can be any pair of word positions.
   */
  static class ExtractorTwoWords extends Extractor {

    private static final long serialVersionUID = -1034112287022504917L;

    private final int leftPosition;
    private final int rightPosition;

    public ExtractorTwoWords(int leftPosition) {
      this(leftPosition, leftPosition+1);
    }

    public ExtractorTwoWords(int position1, int position2) {
      super(0, false);
      if (position1 > position2) {
        leftPosition = position1;
        rightPosition = position2;
      } else {
        leftPosition = position2;
        rightPosition = position1;
      }
    }

    @Override
      String extract(History h, PairsHolder pH) {
      // I ran a bunch of timing tests that seem to indicate it is
      // cheaper to simply add string + char + string than use a
      // StringBuilder or go through the StringBuildMemoizer -horatio
      return pH.getWord(h, leftPosition) + '!' + pH.getWord(h, rightPosition);
    }

    @Override public boolean isLocal() { return false; }

    // isDynamic --> false, but no need to override


    @Override
    public String toString() {
      return (getClass().getName() + "(w" + leftPosition +
              ",w" + rightPosition + ')');
    }
  }



  /**
   * This extractor extracts two tags in conjunction.
   * The one argument constructor gives you leftPosition and
   * leftPosition+1, but with the two argument constructor,
   * they can be any pair of tag positions.
   */
  static class ExtractorTwoTags extends Extractor {

    private static final long serialVersionUID = -7342144764725605134L;

    private final int leftPosition;
    private final int rightPosition;
    private final int leftContext, rightContext;

    public ExtractorTwoTags(int position1, int position2) {
      leftPosition = Math.min(position1, position2);
      rightPosition = Math.max(position1, position2);

      leftContext = -Math.min(leftPosition, 0);
      rightContext = Math.max(rightPosition, 0);
    }

    @Override
    public int rightContext() {
      return rightContext;
    }

    @Override
    public int leftContext() {
      return leftContext;
    }

    @Override
    String extract(History h, PairsHolder pH) {
      // I ran a bunch of timing tests that seem to indicate it is
      // cheaper to simply add string + char + string than use a
      // StringBuilder or go through the StringBuildMemoizer -horatio
      return pH.getTag(h, leftPosition) + '!' + pH.getTag(h, rightPosition);
    }

    @Override public boolean isLocal() { return false; }
    @Override public boolean isDynamic() { return true; }

    @Override
    public String toString() {
      return (getClass().getName() + "(t" + leftPosition +
              ",t" + rightPosition + ')');
    }
  }


  /**
   * This extractor extracts two words and a tag in conjunction.
   */
  static class ExtractorTwoWordsTag extends Extractor {

    private static final long serialVersionUID = 277004119652781188L;

    private final int leftWord, rightWord, tag;
    private final int rightContext, leftContext;

    public ExtractorTwoWordsTag(int leftWord, int rightWord, int tag) {
      this.leftWord = Math.min(leftWord, rightWord);
      this.rightWord = Math.max(leftWord, rightWord);
      this.tag = tag;

      this.rightContext = Math.max(tag, 0);
      this.leftContext = -Math.min(tag, 0);
    }

    @Override
    public int rightContext() {
      return rightContext;
    }

    @Override
    public int leftContext() {
      return leftContext;
    }

    @Override
      String extract(History h, PairsHolder pH) {
      return (pH.getWord(h, leftWord) + '!' + pH.getTag(h, tag) + '!' +
              pH.getWord(h, rightWord));
    }

    @Override public boolean isLocal() { return false; }
    @Override public boolean isDynamic() { return true; }

    @Override
    public String toString() {
      return (getClass().getName() + "(w" + leftWord +
              ",t" + tag + ",w" + rightWord + ')');
    }
  }



  /**
   * This extractor extracts several contiguous tags only on one side of position 0.
   * E.g., use constructor argument -3 for an order 3 predictor on the left.
   * isLocal=false, isDynamic=true (through super call)
   */
  static class ExtractorContinuousTagConjunction extends Extractor {

    private static final long serialVersionUID = 3;

    public ExtractorContinuousTagConjunction(int maxPosition) {
      super(maxPosition, true);
    }

    @Override
    String extract(History h, PairsHolder pH) {
      StringBuilder sb = new StringBuilder();
      if (position < 0) {
        for (int idx = position; idx < 0; idx++) {
          if (idx != position) {
            sb.append('!');
          }
          sb.append(pH.getTag(h, idx));
        }
      } else {
        for (int idx = position; idx > 0; idx--) {
          if (idx != position) {
            sb.append('!');
          }
          sb.append(pH.getTag(h, idx));
        }
      }
      return sb.toString();
    }

  }


  /**
   * This extractor extracts three tags.
   */
  static class ExtractorThreeTags extends Extractor {

    private static final long serialVersionUID = 8563584394721620568L;

    private int position1;
    private int position2;
    private int position3;

    public ExtractorThreeTags(int position1, int position2, int position3) {
      // bubblesort them!
      int x;
      if (position1 > position2) {
        x = position2;
        position2 = position1;
        position1 = x;
      }
      if (position2 > position3) {
        x = position3;
        position3 = position2;
        position2 = x;
      }
      if (position1 > position2) {
        x = position2;
        position2 = position1;
        position1 = x;
      }
      this.position1 = position1;
      this.position2 = position2;
      this.position3 = position3;
    }

    @Override
      public int rightContext() {
      if (position3 > 0) {
        return position3;
      } else {
        return 0;
      }
    }

    @Override
      public int leftContext() {
      if (position1 < 0) {
        return -position1;
      } else {
        return 0;
      }
    }

    @Override
      String extract(History h, PairsHolder pH) {
      return pH.getTag(h, position1) + '!' + pH.getTag(h, position2) + '!' + pH.getTag(h, position3);
    }

    @Override public boolean isLocal() { return false; }
    @Override public boolean isDynamic() { return true; }

    @Override
    public String toString() {
      return (getClass().getName() + "(t" + position1 +
              ",t" + position2 + ",t" + position3 + ')');
    }
  }


  /**
   * This extractor extracts two tags and the a word in conjunction.
   */
  static class ExtractorWordTwoTags extends Extractor {

    private static final long serialVersionUID = -4942654091455804176L;

    // We sort so that position1 <= position2 and then rely on that.
    private int position1;
    private int position2;
    private int word;

    public ExtractorWordTwoTags(int word, int position1, int position2) {
      if (position1 < position2) {
        this.position1 = position1;
        this.position2 = position1;
      } else {
        this.position1 = position2;
        this.position2 = position1;
      }
      this.word = word;
    }

    @Override
      public int leftContext() {
      if (position1 < 0) {
        return  -position1;
      } else {
        return 0;
      }
    }

    @Override
      public int rightContext() {
      if (position2 > 0) {
        return position2;
      } else {
        return 0;
      }
    }

    @Override
      String extract(History h, PairsHolder pH) {
      return pH.getTag(h, position1) + '!' + pH.getWord(h, word) + '!' + pH.getTag(h, position2);
    }

    @Override public boolean isLocal() { return false; }
    @Override public boolean isDynamic() { return true; }

    @Override
    public String toString() {
      return (getClass().getName() + "(t" + position1 +
              ",t" + position2 + ",w" + word + ')');
    }
  }


} // end class ExtractorFrames


class ExtractorWordShapeClassifier extends Extractor {

  private final int wordShaper;
  private final String name;

  // This cache speeds things up a little bit.  I used
  // -Xrunhprof:cpu=samples,interval=1 when using the "distsim" tagger
  // on the training set to measure roughly how much time was spent in
  // this method.  I concluded that with the cache, 1.24% of the time
  // is spent here, and without the cache, 1.26% of the time is spent
  // here.  This is a very small savings, which would be even smaller
  // if we make the cache thread safe.  It turns out that, as written,
  // the cache is not thread safe for various reasons.  In particular,
  // it assumes only one wordshape classifier is ever used, which
  // might not be true even with just one tagger, and has an even
  // higher chance of not being true if there are multiple taggers.
  // Furthermore, access to the cache should really be synchronized
  // regardless.  The easiest solution is to comment out the cache and
  // note that if you want to bring it back, make it a map from wsc to
  // cache rather than just a single cache.  -- horatio
  //private static final Map<String, String> shapes =
  //  Generics.newHashMap();
  // --- should be:
  //private static final Map<String, Map<String, String>> ...

  ExtractorWordShapeClassifier(int position, String wsc) {
    super(position, false);
    wordShaper = WordShapeClassifier.lookupShaper(wsc);
    name = "ExtractorWordShapeClassifier(" + position+ ',' + wsc + ')';
  }

  @Override
  String extract(History h, PairsHolder pH) {
    String s = super.extract(h, pH);
    String shape = WordShapeClassifier.wordShape(s, wordShaper);
    return shape;
  }

  private static final long serialVersionUID = 101L;

  @Override
  public String toString() {
    return name;
  }

  @Override public boolean isLocal() { return position == 0; }
  @Override public boolean isDynamic() { return false; }
}


/**
 * This extractor extracts a conjunction of word shapes.
 */
class ExtractorWordShapeConjunction extends Extractor {

  private static final long serialVersionUID = -49L;

  private final int wordShaper;
  private final int left;
  private final int right;
  private final String name;

  ExtractorWordShapeConjunction(int left, int right, String wsc) {
    super();
    this.left = left;
    this.right = right;
    wordShaper = WordShapeClassifier.lookupShaper(wsc);
    name = "ExtractorWordShapeConjunction(" + left + ',' + right + ',' + wsc + ')';
  }

  @Override
  String extract(History h, PairsHolder pH) {
    StringBuilder sb = new StringBuilder();
    for (int j = left; j <= right; j++) {
      String s = pH.getWord(h, j);
      sb.append(WordShapeClassifier.wordShape(s, wordShaper));
      if (j < right) {
        sb.append('|');
      }
    }
    return sb.toString();
  }

  @Override
  public String toString() {
    return name;
  }

  @Override public boolean isLocal() { return false; }
  @Override public boolean isDynamic() { return false; }

}


/**
 * Extracts a boolean indicating whether the given word is preceded by
 * an auxiliary verb.
 */
class ExtractorSpanishAuxiliaryTag extends Extractor {

  private static final long serialVersionUID = -3352770856914897103L;

  public ExtractorSpanishAuxiliaryTag() {
    super(-1, true);
  }

  @Override
  String extract(History h, PairsHolder pH) {
    String tag = super.extract(h, pH);
    boolean isAux = tag.length() >= 2 && tag.substring(0, 2).equals("va");

    return isAux ? "1" : "0";
  }

  @Override
  public String toString() {
    return "ExtractorSpanishAuxiliaryTag";
  }

}


/**
 * Extracts a boolean indicating whether the given word is preceded by
 * a semi-auxiliary verb.
 */
class ExtractorSpanishSemiauxiliaryTag extends Extractor {

  private static final long serialVersionUID = -164942945521643734L;

  public ExtractorSpanishSemiauxiliaryTag() {
    super(-1, true);
  }

  @Override
  String extract(History h, PairsHolder pH) {
    String tag = super.extract(h, pH);
    boolean isSemiAux = tag.length() >= 2 && tag.substring(0, 2).equals("vs");

    return isSemiAux ? "1" : "0";
  }

  @Override
  public String toString() {
    return "ExtractorSpanishSemiauxiliaryTag";
  }

}