//ExtractorFrames -- StanfordMaxEnt, A Maximum Entropy Toolkit //Copyright (c) 2002-2011 Leland Stanford Junior University //This program is free software; you can redistribute it and/or //modify it under the terms of the GNU General Public License //as published by the Free Software Foundation; either version 2 //of the License, or (at your option) any later version. //This program is distributed in the hope that it will be useful, //but WITHOUT ANY WARRANTY; without even the implied warranty of //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //GNU General Public License for more details. //You should have received a copy of the GNU General Public License //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. //For more information, bug reports, fixes, contact: //Christopher Manning //Dept of Computer Science, Gates 1A //Stanford CA 94305-9010 //USA // Support/Questions: java-nlp-user@lists.stanford.edu // Licensing: java-nlp-support@lists.stanford.edu //http://www-nlp.stanford.edu/software/tagger.shtml package edu.stanford.nlp.tagger.maxent; import edu.stanford.nlp.util.logging.Redwood; import edu.stanford.nlp.process.WordShapeClassifier; import edu.stanford.nlp.util.StringUtils; import java.util.*; /** * This class contains the basic feature extractors used for all words and * tag sequences (and interaction terms) for the MaxentTagger, but not the * feature extractors explicitly targeting generalization for rare or unknown * words. * The following options are supported: * <table> * <tr><td>Name</td><td>Args</td><td>Effect</td></tr> * <tr><td>words</td><td>begin, end</td> * <td>Individual features for words begin ... end. * If just one argument words(-2) is given, then end is taken as 0. If * begin is not less than or equal to end, no features are made.</td></tr> * <tr><td>tags</td><td>begin, end</td> * <td>Individual features for tags begin ... end</td></tr> * <tr><td>biword</td><td>w1, w2</td> * <td>One feature for the pair of words w1, w2</td></tr> * <tr><td>biwords</td><td>begin, end</td> * <td>One feature for each sequential pair of words * from begin to end</td></tr> * <tr><td>twoTags</td><td>t1, t2</td> * <td>One feature for the pair of tags t1, t2</td></tr> * <tr><td>lowercasewords</td><td>begin, end</td> * <td>One feature for each word begin ... end, lowercased</td></tr> * <tr><td>order</td><td>left, right</td> * <td>A feature for tags left through 0 and a feature for * tags 0 through right. Lower order left and right features are * also added. * This gets very expensive for higher order terms.</td></tr> * <tr><td>wordTag</td><td>w, t</td> * <td>A feature combining word w and tag t.</td></tr> * <tr><td>wordTwoTags</td><td>w, t1, t2</td> * <td>A feature combining word w and tags t1, t2.</td></tr> * <tr><td>threeTags</td><td>t1, t2, t3</td> * <td>A feature combining tags t1, t2, t3.</td></tr> * <tr><td>vbn</td><td>length</td> * <td>A feature that looks at the left length words for something that * appears to be a VBN (in English) without looking at the actual tags. * It is zeroeth order, as it does not look at the tag predictions. * It also is never used, since it doesn't seem to help.</td></tr> * <tr><td>allwordshapes</td><td>left, right</td> * <td>Word shape features, eg transform Foo5 into Xxx# * (not exactly like that, but that general idea). * Creates individual features for each word left ... right. * Compare with the feature "wordshapes" in ExtractorFramesRare, * which is only applied to rare words. Fairly English-specific. * Slightly increases accuracy.</td></tr> * <tr><td>allunicodeshapes</td><td>left, right</td> * <td>Same thing, but works for unicode characters more generally.</td></tr> * <tr><td>allunicodeshapeconjunction</td><td>left, right</td> * <td>Instead of individual word shape features, combines several * word shapes into one feature.</td></tr> * </table> * * See {@link ExtractorFramesRare} for more options. * <br> * There are also macro features: * <br> * left3words = words(-1,1),order(2) <br> * left5words = words(-2,2),order(2) <br> * generic = words(-1,1),order(2),biwords(-1,0),wordTag(0,-1) <br> * bidirectional5words = * words(-2,2),order(-2,2),twoTags(-1,1), * wordTag(0,-1),wordTag(0,1),biwords(-1,1) <br> * bidirectional = * words(-1,1),order(-2,2),twoTags(-1,1), * wordTag(0,-1),wordTag(0,1),biwords(-1,1) <br> * german = some random stuff <br> * sighan2005 = some other random stuff <br> * The left3words architectures are faster, but slightly less * accurate, than the bidirectional architectures. * 'naacl2003unknowns' was our traditional set of unknown word * features, but you can now specify features more flexibility via the * various other supported keywords. * <br> * @author Kristina Toutanova * @author Michel Galley * @version 1.0 */ public class ExtractorFrames { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(ExtractorFrames.class); // all features are implicitly conjoined with the current tag static final Extractor cWord = new Extractor(0, false); private static final Extractor prevWord = new Extractor(-1, false); private static final Extractor prevTag = new Extractor(-1, true); // prev tag and current word! private static final Extractor prevTagWord = new ExtractorWordTag(0, -1); private static final Extractor prevWord2 = new Extractor(-2,false); private static final Extractor prevTwoTag = new Extractor(-2,true); private static final Extractor nextWord = new Extractor(1, false); private static final Extractor nextWord2 = new Extractor(2,false); private static final Extractor nextTag = new Extractor(1, true); // features for 2005 SIGHAN tagger private static final Extractor[] eFrames_sighan2005 = { cWord, prevWord, prevWord2, nextWord, nextWord2, prevTag, prevTwoTag, new ExtractorContinuousTagConjunction(-2) }; // features for a german-language bidirectional tagger private static final Extractor[] eFrames_german ={ cWord, prevWord, nextWord, nextTag, prevTag, new ExtractorContinuousTagConjunction(-2), prevTagWord, new ExtractorTwoWords(-1,0) }; /** * This class is not meant to be instantiated. */ private ExtractorFrames() { } protected static Extractor[] getExtractorFrames(String arch) { // handle some traditional macro options // left3words: a simple trigram CMM tagger (similar to the baseline EMNLP 2000 tagger) // left5words: a simple trigram CMM tagger, like left3words, with 5 word context // generic: our standard multilingual CMM baseline arch = arch.replaceAll("left3words", "words(-1,1),order(2)"); arch = arch.replaceAll("left5words", "words(-2,2),order(2)"); arch = arch.replaceAll("generic", "words(-1,1),order(2),biwords(-1,0),wordTag(0,-1)"); arch = arch.replaceAll("bidirectional5words", "words(-2,2),order(-2,2),twoTags(-1,1),wordTag(0,-1),wordTag(0,1),biwords(-1,1)"); arch = arch.replaceAll("bidirectional", "words(-1,1),order(-2,2),twoTags(-1,1),wordTag(0,-1),wordTag(0,1),biwords(-1,1)"); ArrayList<Extractor> extrs = new ArrayList<>(); List<String> args = StringUtils.valueSplit(arch, "[a-zA-Z0-9]*(?:\\([^)]*\\))?", "\\s*,\\s*"); for (String arg : args) { if (arg.equals("sighan2005")) { extrs.addAll(Arrays.asList(eFrames_sighan2005)); } else if (arg.equalsIgnoreCase("german")) { extrs.addAll(Arrays.asList(eFrames_german)); } else if (arg.startsWith("words(")) { // non-sequence features with just a certain number of words to the // left and right; e.g., words(-2,2) or words(-2,-1) int lWindow = Extractor.getParenthesizedNum(arg, 1); int rWindow = Extractor.getParenthesizedNum(arg, 2); for (int i = lWindow; i <= rWindow; i++) { extrs.add(new Extractor(i, false)); } } else if (arg.startsWith("tags(")) { // non-sequence features with just a certain number of words to the // left and right; e.g., tags(-2,2) or tags(-2,-1) int lWindow = Extractor.getParenthesizedNum(arg, 1); int rWindow = Extractor.getParenthesizedNum(arg, 2); for (int i = lWindow; i <= rWindow; i++) { extrs.add(new Extractor(i, true)); } } else if (arg.startsWith("biwords(")) { // non-sequence features of word pairs. // biwords(-2,1) would give you 3 extractors for w-2w-1, w-1,w0, w0w1 int lWindow = Extractor.getParenthesizedNum(arg, 1); int rWindow = Extractor.getParenthesizedNum(arg, 2); for (int i = lWindow; i < rWindow; i++) { extrs.add(new ExtractorTwoWords(i)); } } else if (arg.startsWith("biword(")) { // non-sequence feature of a word pair. // biwords(-2,1) would give you 1 extractor for w-2, w+1 int left = Extractor.getParenthesizedNum(arg, 1); int right = Extractor.getParenthesizedNum(arg, 2); extrs.add(new ExtractorTwoWords(left, right)); } else if (arg.startsWith("twoTags(")) { // non-sequence feature of a tag pair. // twoTags(-2,1) would give you 1 extractor for t-2, t+1 int left = Extractor.getParenthesizedNum(arg, 1); int right = Extractor.getParenthesizedNum(arg, 2); extrs.add(new ExtractorTwoTags(left, right)); } else if (arg.startsWith("lowercasewords(")) { // non-sequence features with just a certain number of lowercase words // to the left and right int lWindow = Extractor.getParenthesizedNum(arg, 1); int rWindow = Extractor.getParenthesizedNum(arg, 2); for (int i = lWindow; i <= rWindow; i++) { extrs.add(new ExtractorWordLowerCase(i)); } } else if (arg.startsWith("order(")) { // anything like order(2), order(-4), order(0,3), or // order(-2,1) are okay. int leftOrder = Extractor.getParenthesizedNum(arg, 1); int rightOrder = Extractor.getParenthesizedNum(arg, 2); if (leftOrder > 0) { leftOrder = -leftOrder; } if (rightOrder < 0) { throw new IllegalArgumentException("Right order must be non-negative, not " + rightOrder); } // cdm 2009: We only add successively higher order tag k-grams // ending adjacent to t0. Adding lower order features at a distance // appears not to help (Dec 2009). But they can now be added with tags(). for (int idx = leftOrder ; idx <= rightOrder; idx++) { if (idx == 0) { // do nothing } else if (idx == -1 || idx == 1) { extrs.add(new Extractor(idx, true)); } else { extrs.add(new ExtractorContinuousTagConjunction(idx)); } } } else if (arg.startsWith("wordTag(")) { // sequence feature of a word and a tag: wordTag(-1,1) int posW = Extractor.getParenthesizedNum(arg, 1); int posT = Extractor.getParenthesizedNum(arg, 2); extrs.add(new ExtractorWordTag(posW, posT)); } else if (arg.startsWith("wordTwoTags(")) { int word = Extractor.getParenthesizedNum(arg, 1); int tag1 = Extractor.getParenthesizedNum(arg, 2); int tag2 = Extractor.getParenthesizedNum(arg, 3); extrs.add(new ExtractorWordTwoTags(word,tag1,tag2)); } else if (arg.startsWith("threeTags(")) { int pos1 = Extractor.getParenthesizedNum(arg, 1); int pos2 = Extractor.getParenthesizedNum(arg, 2); int pos3 = Extractor.getParenthesizedNum(arg, 3); extrs.add(new ExtractorThreeTags(pos1,pos2,pos3)); } else if (arg.startsWith("vbn(")) { int order = Extractor.getParenthesizedNum(arg, 1); extrs.add(new ExtractorVerbalVBNZero(order)); } else if (arg.startsWith("allwordshapes(")) { int lWindow = Extractor.getParenthesizedNum(arg, 1); int rWindow = Extractor.getParenthesizedNum(arg, 2); String wsc = Extractor.getParenthesizedArg(arg, 3); if (wsc == null) { wsc = "chris2"; } for (int i = lWindow; i <= rWindow; i++) { extrs.add(new ExtractorWordShapeClassifier(i, wsc)); } } else if (arg.startsWith("allwordshapeconjunction(")) { int lWindow = Extractor.getParenthesizedNum(arg, 1); int rWindow = Extractor.getParenthesizedNum(arg, 2); String wsc = Extractor.getParenthesizedArg(arg, 3); if (wsc == null) { wsc = "chris2"; } extrs.add(new ExtractorWordShapeConjunction(lWindow, rWindow, wsc)); } else if (arg.startsWith("allunicodeshapes(")) { int lWindow = Extractor.getParenthesizedNum(arg, 1); int rWindow = Extractor.getParenthesizedNum(arg, 2); for (int i = lWindow; i <= rWindow; i++) { extrs.add(new ExtractorWordShapeClassifier(i, "chris4")); } } else if (arg.startsWith("allunicodeshapeconjunction(")) { int lWindow = Extractor.getParenthesizedNum(arg, 1); int rWindow = Extractor.getParenthesizedNum(arg, 2); extrs.add(new ExtractorWordShapeConjunction(lWindow, rWindow, "chris4")); } else if (arg.equalsIgnoreCase("spanishauxiliaries")) { extrs.add(new ExtractorSpanishAuxiliaryTag()); extrs.add(new ExtractorSpanishSemiauxiliaryTag()); } else if (arg.equalsIgnoreCase("naacl2003unknowns") || arg.equalsIgnoreCase("lnaacl2003unknowns") || arg.equalsIgnoreCase("caselessnaacl2003unknowns") || arg.equalsIgnoreCase("naacl2003conjunctions") || arg.equalsIgnoreCase("frenchunknowns") || arg.equalsIgnoreCase("spanishunknowns") || arg.startsWith("wordshapes(") || arg.startsWith("wordshapeconjunction(") || arg.equalsIgnoreCase("motleyUnknown") || arg.startsWith("suffix(") || arg.startsWith("prefix(") || arg.startsWith("prefixsuffix") || arg.startsWith("capitalizationsuffix(") || arg.startsWith("distsim(") || arg.startsWith("distsimconjunction(") || arg.equalsIgnoreCase("lctagfeatures") || arg.startsWith("unicodeshapes(") || arg.startsWith("chinesedictionaryfeatures(") || arg.startsWith("unicodeshapeconjunction(")) { // okay; known unknown keyword } else { log.info("Unrecognized ExtractorFrames identifier (ignored): " + arg); } } // end for return extrs.toArray(new Extractor[extrs.size()]); } /** * This extractor extracts a word and tag in conjunction. */ static class ExtractorWordTag extends Extractor { private static final long serialVersionUID = 3L; private final int wordPosition; public ExtractorWordTag(int posW, int posT) { super(posT, true); wordPosition = posW; } @Override String extract(History h, PairsHolder pH) { return pH.getTag(h, position) + '!' + pH.getWord(h, wordPosition); } @Override public String toString() { return (getClass().getName() + "(w" + wordPosition + ",t" + position + ')'); } } /** * The word in lower-cased version. * Always uses Locale.ENGLISH. */ static class ExtractorWordLowerCase extends Extractor { private static final long serialVersionUID = -7847524200422095441L; public ExtractorWordLowerCase(int position) { super(position, false); } @Override String extract(History h, PairsHolder pH) { return pH.getWord(h, position).toLowerCase(Locale.ENGLISH); } } /** * The current word if it is capitalized, zero otherwise. * Always uses Locale.ENGLISH. */ static class ExtractorCWordCapCase extends Extractor { private static final long serialVersionUID = -2393096135964969744L; @Override String extract(History h, PairsHolder pH) { String cw = pH.getWord(h, 0); String lk = cw.toLowerCase(Locale.ENGLISH); if (lk.equals(cw)) { return zeroSt; } return cw; } @Override public boolean isLocal() { return true; } @Override public boolean isDynamic() { return false; } } /** * This extractor extracts two words in conjunction. * The one argument constructor gives you leftPosition and * leftPosition+1, but with the two argument constructor, * they can be any pair of word positions. */ static class ExtractorTwoWords extends Extractor { private static final long serialVersionUID = -1034112287022504917L; private final int leftPosition; private final int rightPosition; public ExtractorTwoWords(int leftPosition) { this(leftPosition, leftPosition+1); } public ExtractorTwoWords(int position1, int position2) { super(0, false); if (position1 > position2) { leftPosition = position1; rightPosition = position2; } else { leftPosition = position2; rightPosition = position1; } } @Override String extract(History h, PairsHolder pH) { // I ran a bunch of timing tests that seem to indicate it is // cheaper to simply add string + char + string than use a // StringBuilder or go through the StringBuildMemoizer -horatio return pH.getWord(h, leftPosition) + '!' + pH.getWord(h, rightPosition); } @Override public boolean isLocal() { return false; } // isDynamic --> false, but no need to override @Override public String toString() { return (getClass().getName() + "(w" + leftPosition + ",w" + rightPosition + ')'); } } /** * This extractor extracts two tags in conjunction. * The one argument constructor gives you leftPosition and * leftPosition+1, but with the two argument constructor, * they can be any pair of tag positions. */ static class ExtractorTwoTags extends Extractor { private static final long serialVersionUID = -7342144764725605134L; private final int leftPosition; private final int rightPosition; private final int leftContext, rightContext; public ExtractorTwoTags(int position1, int position2) { leftPosition = Math.min(position1, position2); rightPosition = Math.max(position1, position2); leftContext = -Math.min(leftPosition, 0); rightContext = Math.max(rightPosition, 0); } @Override public int rightContext() { return rightContext; } @Override public int leftContext() { return leftContext; } @Override String extract(History h, PairsHolder pH) { // I ran a bunch of timing tests that seem to indicate it is // cheaper to simply add string + char + string than use a // StringBuilder or go through the StringBuildMemoizer -horatio return pH.getTag(h, leftPosition) + '!' + pH.getTag(h, rightPosition); } @Override public boolean isLocal() { return false; } @Override public boolean isDynamic() { return true; } @Override public String toString() { return (getClass().getName() + "(t" + leftPosition + ",t" + rightPosition + ')'); } } /** * This extractor extracts two words and a tag in conjunction. */ static class ExtractorTwoWordsTag extends Extractor { private static final long serialVersionUID = 277004119652781188L; private final int leftWord, rightWord, tag; private final int rightContext, leftContext; public ExtractorTwoWordsTag(int leftWord, int rightWord, int tag) { this.leftWord = Math.min(leftWord, rightWord); this.rightWord = Math.max(leftWord, rightWord); this.tag = tag; this.rightContext = Math.max(tag, 0); this.leftContext = -Math.min(tag, 0); } @Override public int rightContext() { return rightContext; } @Override public int leftContext() { return leftContext; } @Override String extract(History h, PairsHolder pH) { return (pH.getWord(h, leftWord) + '!' + pH.getTag(h, tag) + '!' + pH.getWord(h, rightWord)); } @Override public boolean isLocal() { return false; } @Override public boolean isDynamic() { return true; } @Override public String toString() { return (getClass().getName() + "(w" + leftWord + ",t" + tag + ",w" + rightWord + ')'); } } /** * This extractor extracts several contiguous tags only on one side of position 0. * E.g., use constructor argument -3 for an order 3 predictor on the left. * isLocal=false, isDynamic=true (through super call) */ static class ExtractorContinuousTagConjunction extends Extractor { private static final long serialVersionUID = 3; public ExtractorContinuousTagConjunction(int maxPosition) { super(maxPosition, true); } @Override String extract(History h, PairsHolder pH) { StringBuilder sb = new StringBuilder(); if (position < 0) { for (int idx = position; idx < 0; idx++) { if (idx != position) { sb.append('!'); } sb.append(pH.getTag(h, idx)); } } else { for (int idx = position; idx > 0; idx--) { if (idx != position) { sb.append('!'); } sb.append(pH.getTag(h, idx)); } } return sb.toString(); } } /** * This extractor extracts three tags. */ static class ExtractorThreeTags extends Extractor { private static final long serialVersionUID = 8563584394721620568L; private int position1; private int position2; private int position3; public ExtractorThreeTags(int position1, int position2, int position3) { // bubblesort them! int x; if (position1 > position2) { x = position2; position2 = position1; position1 = x; } if (position2 > position3) { x = position3; position3 = position2; position2 = x; } if (position1 > position2) { x = position2; position2 = position1; position1 = x; } this.position1 = position1; this.position2 = position2; this.position3 = position3; } @Override public int rightContext() { if (position3 > 0) { return position3; } else { return 0; } } @Override public int leftContext() { if (position1 < 0) { return -position1; } else { return 0; } } @Override String extract(History h, PairsHolder pH) { return pH.getTag(h, position1) + '!' + pH.getTag(h, position2) + '!' + pH.getTag(h, position3); } @Override public boolean isLocal() { return false; } @Override public boolean isDynamic() { return true; } @Override public String toString() { return (getClass().getName() + "(t" + position1 + ",t" + position2 + ",t" + position3 + ')'); } } /** * This extractor extracts two tags and the a word in conjunction. */ static class ExtractorWordTwoTags extends Extractor { private static final long serialVersionUID = -4942654091455804176L; // We sort so that position1 <= position2 and then rely on that. private int position1; private int position2; private int word; public ExtractorWordTwoTags(int word, int position1, int position2) { if (position1 < position2) { this.position1 = position1; this.position2 = position1; } else { this.position1 = position2; this.position2 = position1; } this.word = word; } @Override public int leftContext() { if (position1 < 0) { return -position1; } else { return 0; } } @Override public int rightContext() { if (position2 > 0) { return position2; } else { return 0; } } @Override String extract(History h, PairsHolder pH) { return pH.getTag(h, position1) + '!' + pH.getWord(h, word) + '!' + pH.getTag(h, position2); } @Override public boolean isLocal() { return false; } @Override public boolean isDynamic() { return true; } @Override public String toString() { return (getClass().getName() + "(t" + position1 + ",t" + position2 + ",w" + word + ')'); } } } // end class ExtractorFrames class ExtractorWordShapeClassifier extends Extractor { private final int wordShaper; private final String name; // This cache speeds things up a little bit. I used // -Xrunhprof:cpu=samples,interval=1 when using the "distsim" tagger // on the training set to measure roughly how much time was spent in // this method. I concluded that with the cache, 1.24% of the time // is spent here, and without the cache, 1.26% of the time is spent // here. This is a very small savings, which would be even smaller // if we make the cache thread safe. It turns out that, as written, // the cache is not thread safe for various reasons. In particular, // it assumes only one wordshape classifier is ever used, which // might not be true even with just one tagger, and has an even // higher chance of not being true if there are multiple taggers. // Furthermore, access to the cache should really be synchronized // regardless. The easiest solution is to comment out the cache and // note that if you want to bring it back, make it a map from wsc to // cache rather than just a single cache. -- horatio //private static final Map<String, String> shapes = // Generics.newHashMap(); // --- should be: //private static final Map<String, Map<String, String>> ... ExtractorWordShapeClassifier(int position, String wsc) { super(position, false); wordShaper = WordShapeClassifier.lookupShaper(wsc); name = "ExtractorWordShapeClassifier(" + position+ ',' + wsc + ')'; } @Override String extract(History h, PairsHolder pH) { String s = super.extract(h, pH); String shape = WordShapeClassifier.wordShape(s, wordShaper); return shape; } private static final long serialVersionUID = 101L; @Override public String toString() { return name; } @Override public boolean isLocal() { return position == 0; } @Override public boolean isDynamic() { return false; } } /** * This extractor extracts a conjunction of word shapes. */ class ExtractorWordShapeConjunction extends Extractor { private static final long serialVersionUID = -49L; private final int wordShaper; private final int left; private final int right; private final String name; ExtractorWordShapeConjunction(int left, int right, String wsc) { super(); this.left = left; this.right = right; wordShaper = WordShapeClassifier.lookupShaper(wsc); name = "ExtractorWordShapeConjunction(" + left + ',' + right + ',' + wsc + ')'; } @Override String extract(History h, PairsHolder pH) { StringBuilder sb = new StringBuilder(); for (int j = left; j <= right; j++) { String s = pH.getWord(h, j); sb.append(WordShapeClassifier.wordShape(s, wordShaper)); if (j < right) { sb.append('|'); } } return sb.toString(); } @Override public String toString() { return name; } @Override public boolean isLocal() { return false; } @Override public boolean isDynamic() { return false; } } /** * Extracts a boolean indicating whether the given word is preceded by * an auxiliary verb. */ class ExtractorSpanishAuxiliaryTag extends Extractor { private static final long serialVersionUID = -3352770856914897103L; public ExtractorSpanishAuxiliaryTag() { super(-1, true); } @Override String extract(History h, PairsHolder pH) { String tag = super.extract(h, pH); boolean isAux = tag.length() >= 2 && tag.substring(0, 2).equals("va"); return isAux ? "1" : "0"; } @Override public String toString() { return "ExtractorSpanishAuxiliaryTag"; } } /** * Extracts a boolean indicating whether the given word is preceded by * a semi-auxiliary verb. */ class ExtractorSpanishSemiauxiliaryTag extends Extractor { private static final long serialVersionUID = -164942945521643734L; public ExtractorSpanishSemiauxiliaryTag() { super(-1, true); } @Override String extract(History h, PairsHolder pH) { String tag = super.extract(h, pH); boolean isSemiAux = tag.length() >= 2 && tag.substring(0, 2).equals("vs"); return isSemiAux ? "1" : "0"; } @Override public String toString() { return "ExtractorSpanishSemiauxiliaryTag"; } }