// ExtractorFramesRare -- StanfordMaxEnt, A Maximum Entropy Toolkit
// Copyright (c) 2002-2008 The Board of Trustees of
// Leland Stanford Junior University. All rights reserved.
//This program is free software; you can redistribute it and/or
//modify it under the terms of the GNU General Public License
//as published by the Free Software Foundation; either version 2
//of the License, or (at your option) any later version.
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//For more information, bug reports, fixes, contact:
//Christopher Manning
//Dept of Computer Science, Gates 1A
//Stanford CA 94305-9010
//USA
// Support/Questions: java-nlp-user@lists.stanford.edu
// Licensing: java-nlp-support@lists.stanford.edu
//http://www-nlp.stanford.edu/software/tagger.shtml
package edu.stanford.nlp.tagger.maxent;
import edu.stanford.nlp.international.french.FrenchUnknownWordSignatures;
import edu.stanford.nlp.international.spanish.SpanishUnknownWordSignatures;
import edu.stanford.nlp.international.spanish.SpanishVerbStripper;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;
import java.util.*;
/**
* This class contains feature extractors for the MaxentTagger that are only
* applied to rare (low frequency/unknown) words.
* The following options are supported:
* <table>
* <tr><td>Name</td><td>Args</td><td>Effect</td></tr>
* <tr><td>wordshapes</td><td>left, right</td>
* <td>Word shape features, e.g., transform Foo5 into Xxx#
* (not exactly like that, but that general idea).
* Creates individual features for each word left ... right.
* If just one argument wordshapes(-2) is given, then end is taken as 0.
* If left is not less than or equal to right, no features are made.
* Fairly English-specific.</td></tr>
* <tr><td>unicodeshapes</td><td>left, right</td>
* <td>Same thing, but works for unicode characters generally.</td></tr>
* <tr><td>unicodeshapeconjunction</td><td>left, right</td>
* <td>Instead of individual word shape features, combines several
* word shapes into one feature.</td></tr>
* <tr><td>suffix</td><td>length, position</td>
* <td>Features for suffixes of the word position. One feature for
* each suffix of length 1 ... length.</td></tr>
* <tr><td>prefix</td><td>length, position</td>
* <td>Features for prefixes of the word position. One feature for
* each prefix of length 1 ... length.</td></tr>
* <tr><td>prefixsuffix</td><td>length</td>
* <td>Features for concatenated prefix and suffix. One feature for
* each of length 1 ... length.</td></tr>
* <tr><td>capitalizationsuffix</td><td>length</td>
* <td>Current word only. Combines character suffixes up to size length with a
* binary value for whether the word contains any capital letters.</td></tr>
* <tr><td>distsim</td><td>filename, left, right</td>
* <td>Individual features for each position left ... right.
* Compares that word with the dictionary in filename.</td></tr>
* <tr><td>distsimconjunction</td><td>filename, left, right</td>
* <td>A concatenation of distsim features from left ... right.</td></tr>
* </table>
* Also available are the macros "naacl2003unknowns",
* "lnaacl2003unknowns", and "naacl2003conjunctions".
* naacl2003unknowns and lnaacl2003unknowns include suffix extractors
* and extractors for specific word shape features, such as containing
* or not containing a digit.
* <br>
* The macro "frenchunknowns" is a macro for five extractors specific
* to French, which test the end of the word to see if it matches
* common suffixes for various POS classes and plural words. Adding
* this experiment did not improve accuracy over the regular
* naacl2003unknowns extractor macro, though.
* <br>
* @author Kristina Toutanova
* @author Christopher Manning
* @author Michel Galley
* @version 2.0
*/
public class ExtractorFramesRare {
/**
 * Suffix extractors for the last 1-4 characters of the current word
 * (position 0). Used to build the conjunction features in
 * naacl2003Conjunctions().
 */
private static final Extractor cWordSuff1 = new ExtractorWordSuff(1, 0);
private static final Extractor cWordSuff2 = new ExtractorWordSuff(2, 0);
private static final Extractor cWordSuff3 = new ExtractorWordSuff(3, 0);
private static final Extractor cWordSuff4 = new ExtractorWordSuff(4, 0);
/**
 * "1" iff word contains 1 or more upper case characters (somewhere)
 */
private static final Extractor cWordUppCase = new ExtractorUCase();
/**
 * "1" iff word contains 1 or more digit characters (somewhere)
 */
private static final Extractor cWordNumber = new ExtractorCNumber();
/**
 * "1" iff word contains 1 or more dash characters (somewhere)
 */
private static final Extractor cWordDash = new ExtractorDash();
/**
 * "1" if token has no lower case letters
 */
private static final Extractor cNoLower = new ExtractorAllCap();
/**
 * "1" if every character of the token is an upper case letter
 */
private static final Extractor cAllCapitalized = new ExtractorAllCapitalized();
/**
 * "1" if the current word is capitalized and the current word or one of
 * the following 3 words is a company name ending such as Inc., Co., or Corp.
 */
private static final Extractor cCompany = new CompanyNameDetector();
/**
 * "1" if the current word or one of the following 3 words is a lowercased
 * company name ending (inc., co., corp., or similar words). Unlike
 * cCompany, no capitalization check is made on the current word.
 */
private static final Extractor cCaselessCompany =
new CaselessCompanyNameDetector();
/**
 * "1" if word contains letter, digit, and dash, in any position and case
 */
private static final Extractor cLetterDigitDash = new ExtractorLetterDigitDash();
/**
 * "1" if word contains uppercase letter, digit, and dash
 */
private static final Extractor cUpperDigitDash = new ExtractorUpperDigitDash();
/**
 * Distance from a capitalized word back to the nearest lowercase word.
 * Only used inside the conjunctions built by naacl2003Conjunctions().
 */
private static final Extractor cCapDist = new ExtractorCapDistLC();
// Word-form features used by the "naacl2003unknowns" and "motleyUnknown" keywords.
private static final Extractor[] eFrames_motley_naacl2003 = { cWordUppCase, cWordNumber, cWordDash, cNoLower, cLetterDigitDash, cCompany, cAllCapitalized, cUpperDigitDash};
// Same as eFrames_motley_naacl2003 but without cCompany; used by "lnaacl2003unknowns".
private static final Extractor[] eFrames_motley_naacl2003_left = { cWordUppCase, cWordNumber, cWordDash, cNoLower, cLetterDigitDash, cAllCapitalized, cUpperDigitDash};
// Only the features that do not look at letter case; used by "caselessnaacl2003unknowns".
private static final Extractor[] eFrames_motley_caseless_naacl2003 = { cWordNumber, cWordDash, cLetterDigitDash, cCaselessCompany};
/**
 * Whether it has a typical French noun suffix.
 */
private static final ExtractorFrenchNounSuffix cWordFrenchNounSuffix =
new ExtractorFrenchNounSuffix();
/**
 * Whether it has a typical French adverb suffix.
 */
private static final ExtractorFrenchAdvSuffix cWordFrenchAdvSuffix =
new ExtractorFrenchAdvSuffix();
/**
 * Whether it has a typical French verb suffix.
 */
private static final ExtractorFrenchVerbSuffix cWordFrenchVerbSuffix =
new ExtractorFrenchVerbSuffix();
/**
 * Whether it has a typical French adjective suffix.
 */
private static final ExtractorFrenchAdjSuffix cWordFrenchAdjSuffix =
new ExtractorFrenchAdjSuffix();
/**
 * Whether it has a typical French plural suffix.
 */
private static final ExtractorFrenchPluralSuffix cWordFrenchPluralSuffix =
new ExtractorFrenchPluralSuffix();
// The five French extractors added by the "frenchunknowns" keyword.
private static final Extractor[] french_unknown_extractors = { cWordFrenchNounSuffix, cWordFrenchAdvSuffix, cWordFrenchVerbSuffix, cWordFrenchAdjSuffix, cWordFrenchPluralSuffix };
/**
 * Extracts Spanish gender patterns.
 */
private static final ExtractorSpanishGender cWordSpanishGender =
new ExtractorSpanishGender();
/**
 * Matches conditional-tense verb suffixes.
 */
private static final ExtractorSpanishConditionalSuffix cWordSpanishConditionalSuffix =
new ExtractorSpanishConditionalSuffix();
/**
 * Matches imperfect-tense verb suffixes (-er, -ir verbs).
 */
private static final ExtractorSpanishImperfectErIrSuffix cWordSpanishImperfectErIrSuffix =
new ExtractorSpanishImperfectErIrSuffix();
// The Spanish-specific rare word extractors, grouped the same way as the French ones.
private static final Extractor[] spanish_unknown_extractors = {
cWordSpanishGender, cWordSpanishConditionalSuffix,
cWordSpanishImperfectErIrSuffix
};
/** Not instantiable: this class only exposes static members. */
private ExtractorFramesRare() {
}
/**
 * Appends the extractors shared by the "naacl2003unknowns" and
 * "lnaacl2003unknowns" macros: three capitalization-in-sentence extractors
 * followed by the suffix and prefix extractors of length 1 to 10 for the
 * current word.
 *
 * @param extrs The list that the extractors are appended to
 */
private static void getNaaclExtractors(ArrayList<Extractor> extrs) {
  extrs.add(new ExtractorStartSentenceCap());
  extrs.add(new ExtractorMidSentenceCapC());
  extrs.add(new ExtractorMidSentenceCap());
  // The affix features are exactly the ones the caseless macro uses
  // (suffixes 1..10, then prefixes 1..10, all at position 0).
  getCaselessNaaclExtractors(extrs);
}
/**
 * Appends the case-insensitive part of the "naacl2003unknowns" features:
 * suffix extractors of length 1 to 10 for the current word, followed by
 * prefix extractors of length 1 to 10.
 *
 * @param extrs The list that the extractors are appended to
 */
private static void getCaselessNaaclExtractors(ArrayList<Extractor> extrs) {
  final int longestAffix = 10;
  int suffixLength = 1;
  while (suffixLength <= longestAffix) {
    extrs.add(new ExtractorWordSuff(suffixLength, 0));
    suffixLength++;
  }
  int prefixLength = 1;
  while (prefixLength <= longestAffix) {
    extrs.add(new ExtractorWordPref(prefixLength, 0));
    prefixLength++;
  }
}
/**
 * Get an array of rare word feature Extractors identified by a name.
 * The identifier is split into comma-separated keywords, each optionally
 * followed by a parenthesized argument list. Every recognized keyword
 * appends its extractors, in order, to the result. Keywords that are not
 * recognized here are silently skipped.
 * <br>
 * Note: Names used here must also be known in getExtractorFrames, so we
 * can appropriately add error messages. So if you add a keyword here,
 * add it there as one to be ignored, too. (In the next iteration, this
 * class and ExtractorFrames should probably just be combined).
 *
 * @param identifier Describes a set of extractors for rare word features
 * @param ttags The tag set; only consulted for the "lctagfeatures" keyword
 * @return A set of extractors for rare word features
 * @throws RuntimeException if the unsupported "chinesedictionaryfeatures("
 *         keyword is requested
 */
protected static Extractor[] getExtractorFramesRare(String identifier, TTags ttags) {
  ArrayList<Extractor> extrs = new ArrayList<>();
  List<String> args = StringUtils.valueSplit(identifier, "[a-zA-Z0-9]*(?:\\([^)]*\\))?", "\\s*,\\s*");
  for (String arg : args) {
    if ("naacl2003unknowns".equalsIgnoreCase(arg)) {
      extrs.addAll(Arrays.asList(eFrames_motley_naacl2003));
      getNaaclExtractors(extrs);
    } else if (("lnaacl2003unknowns").equalsIgnoreCase(arg)) {
      extrs.addAll(Arrays.asList(eFrames_motley_naacl2003_left));
      getNaaclExtractors(extrs);
    } else if ("caselessnaacl2003unknowns".equalsIgnoreCase(arg)) {
      extrs.addAll(Arrays.asList(eFrames_motley_caseless_naacl2003));
      getCaselessNaaclExtractors(extrs);
      // TODO: test this next one
    } else if ("naacl2003conjunctions".equalsIgnoreCase(arg)) {
      extrs.addAll(Arrays.asList(naacl2003Conjunctions()));
    } else if ("frenchunknowns".equalsIgnoreCase(arg)) {
      extrs.addAll(Arrays.asList(french_unknown_extractors));
    } else if ("spanishunknowns".equalsIgnoreCase(arg)) {
      // Makes the Spanish extractor group reachable, parallel to "frenchunknowns".
      // NOTE(review): confirm that getExtractorFrames also knows this keyword.
      extrs.addAll(Arrays.asList(spanish_unknown_extractors));
    } else if (arg.startsWith("wordshapes(")) {
      int lWindow = Extractor.getParenthesizedNum(arg, 1);
      int rWindow = Extractor.getParenthesizedNum(arg, 2);
      String wsc = Extractor.getParenthesizedArg(arg, 3);
      if (wsc == null) {
        wsc = "chris2";
      }
      for (int i = lWindow; i <= rWindow; i++) {
        extrs.add(new ExtractorWordShapeClassifier(i, wsc));
      }
    } else if (arg.startsWith("wordshapeconjunction(")) {
      int lWindow = Extractor.getParenthesizedNum(arg, 1);
      int rWindow = Extractor.getParenthesizedNum(arg, 2);
      String wsc = Extractor.getParenthesizedArg(arg, 3);
      if (wsc == null) {
        wsc = "chris2";
      }
      // One conjunction already spans the whole window, so add it once.
      // Looping over the window only produced identical duplicate features.
      // As before, an empty window (left > right) adds nothing.
      if (lWindow <= rWindow) {
        extrs.add(new ExtractorWordShapeConjunction(lWindow, rWindow, wsc));
      }
    } else if (arg.startsWith("unicodeshapes(")) {
      int lWindow = Extractor.getParenthesizedNum(arg, 1);
      int rWindow = Extractor.getParenthesizedNum(arg, 2);
      for (int i = lWindow; i <= rWindow; i++) {
        extrs.add(new ExtractorWordShapeClassifier(i, "chris4"));
      }
    } else if (arg.startsWith("unicodeshapeconjunction(")) {
      int lWindow = Extractor.getParenthesizedNum(arg, 1);
      int rWindow = Extractor.getParenthesizedNum(arg, 2);
      extrs.add(new ExtractorWordShapeConjunction(lWindow, rWindow, "chris4"));
    } else if (arg.startsWith("chinesedictionaryfeatures(")) {
      // The former implementation read (path, lWindow, rWindow), called
      // ASBCDict.setPathPrefix(path) and then, for each position in the
      // window, added ctbPreFeatures, ctbSufFeatures, ctbUnkDictFeatures
      // and asbcUnkFeatures. It is disabled for the reasons given below.
      throw new RuntimeException("These features are no longer supported." +
                                 " The paths and data files associated " +
                                 "with this material are out of date, and " +
                                 "the classes used are not thread-safe. " +
                                 "Those problems would need to be fixed " +
                                 "to use this feature.");
      // There is no longer a "generic" keyword adding prefix and suffix
      // features up to 6 grams, now that "prefix(" and "suffix(" can be
      // added more flexibly on their own.
    } else if (arg.equalsIgnoreCase("motleyUnknown")) { // This is naacl2003unknown minus prefix and suffix features.
      extrs.addAll(Arrays.asList(eFrames_motley_naacl2003));
    } else if (arg.startsWith("suffix(")) {
      int max = Extractor.getParenthesizedNum(arg, 1);
      // will conveniently be 0 if not specified
      int position = Extractor.getParenthesizedNum(arg, 2);
      for (int i = 1; i <= max; i++) {
        extrs.add(new ExtractorWordSuff(i, position));
      }
    } else if (arg.startsWith("prefix(")) {
      int max = Extractor.getParenthesizedNum(arg, 1);
      // will conveniently be 0 if not specified
      int position = Extractor.getParenthesizedNum(arg, 2);
      for (int i = 1; i <= max; i++) {
        extrs.add(new ExtractorWordPref(i, position));
      }
    } else if (arg.startsWith("prefixsuffix(")) {
      int max = Extractor.getParenthesizedNum(arg, 1);
      for (int i = 1; i <= max; i++) {
        extrs.add(new ExtractorsConjunction(new ExtractorWordPref(i, 0),
                                            new ExtractorWordSuff(i, 0)));
      }
    } else if (arg.startsWith("capitalizationsuffix(")) {
      int max = Extractor.getParenthesizedNum(arg, 1);
      for (int i = 1; i <= max; i++) {
        extrs.add(new ExtractorsConjunction(cWordUppCase, new ExtractorWordSuff(i,0)));
      }
    } else if (arg.startsWith("distsim(")) {
      String path = Extractor.getParenthesizedArg(arg, 1);
      // traditional nlp filesystem location is: /u/nlp/data/pos_tags_are_useless/egw.bnc.200.pruned
      int lWindow = Extractor.getParenthesizedNum(arg, 2);
      int rWindow = Extractor.getParenthesizedNum(arg, 3);
      for (int i = lWindow; i <= rWindow; i++) {
        extrs.add(new ExtractorDistsim(path, i));
      }
    } else if (arg.startsWith("distsimconjunction(")) {
      String path = Extractor.getParenthesizedArg(arg, 1);
      int lWindow = Extractor.getParenthesizedNum(arg, 2);
      int rWindow = Extractor.getParenthesizedNum(arg, 3);
      extrs.add(new ExtractorDistsimConjunction(path, lWindow, rWindow));
    } else if (arg.equalsIgnoreCase("lctagfeatures")) {
      extrs.addAll(Arrays.asList(lcTagFeatures(ttags)));
    }
  }
  return extrs.toArray(new Extractor[extrs.size()]);
}
/**
 * Builds the rare word conjunction features: each of six capitalization
 * related extractors is paired with each of the suffix extractors of
 * length 1 to 4, giving 24 conjunctions. The result is grouped by the
 * capitalization extractor, with suffix length increasing inside a group.
 *
 * @return An array of feature conjunctions
 */
private static Extractor[] naacl2003Conjunctions() {
  // Each entry is shared by the four conjunctions of its group.
  Extractor[] capFeatures = {
    cWordUppCase,
    cNoLower,
    new ExtractorMidSentenceCap(),
    new ExtractorStartSentenceCap(),
    new ExtractorMidSentenceCapC(),
    cCapDist
  };
  Extractor[] suffixFeatures = { cWordSuff1, cWordSuff2, cWordSuff3, cWordSuff4 };
  Extractor[] conjunctions = new Extractor[capFeatures.length * suffixFeatures.length];
  int next = 0;
  for (Extractor capFeature : capFeatures) {
    for (Extractor suffixFeature : suffixFeatures) {
      conjunctions[next++] = new ExtractorsConjunction(capFeature, suffixFeature);
    }
  }
  return conjunctions;
}
/**
 * Creates one ExtractorCapLCSeen per tag of the tag set, in tag index order.
 *
 * @param ttags The tag set to enumerate
 * @return An array with one extractor per tag
 */
private static Extractor[] lcTagFeatures(TTags ttags) {
  Extractor[] perTag = new Extractor[ttags.getSize()];
  for (int tagIndex = 0; tagIndex < perTag.length; tagIndex++) {
    perTag[tagIndex] = new ExtractorCapLCSeen(ttags.getTag(tagIndex));
  }
  return perTag;
}
/* private ExtractorFramesRare() {
// this is now a statics only class!
} */
/*
ArrayList<Extractor> v = new ArrayList<Extractor>();
GlobalHolder.ySize = GlobalHolder.tags.getSize();
for (int i = 1; i < 5; i++) {
for (int y = 0; y < GlobalHolder.tags.getSize(); y++) {
if (!GlobalHolder.tags.isClosed(GlobalHolder.tags.getTag(y))) {
ExtractorMorpho extr = new ExtractorMorpho(i, y);
v.add(extr);
}// if open
}
}// for i
for (int y = 0; y < GlobalHolder.ySize; y++) {
for (int y1 = 0; y1 < GlobalHolder.ySize; y1++) {
if (!GlobalHolder.tags.isClosed(GlobalHolder.tags.getTag(y)) && (!GlobalHolder.tags.isClosed(GlobalHolder.tags.getTag(y)))) {
ExtractorMorpho extr = new ExtractorMorpho(5, y, y1);
v.add(extr);
}// if open
}
}
int vSize = v.size();
Extractor[] eFramestemp = new Extractor[eFrames.length + vSize];
System.arraycopy(eFrames, 0, eFramestemp, 0, eFrames.length);
for (int i = 0; i < vSize; i++) {
eFramestemp[i + eFrames.length] = v.get(i);
}
eFrames = eFramestemp;
*/
/**
 * Creates one CtbPreDetector per tag of the tag list below.
 *
 * @param n Passed through unchanged as the second constructor argument
 * @return One detector per tag, in the order of the tag list
 */
private static Extractor[] ctbPreFeatures(int n) {
  String[] tagsets = {"AD", "AS", "BA", "CC", "CD", "CS", "DEC", "DEG", "DER", "DEV", "DT", "ETC", "FW", "IJ", "JJ", "LB", "LC", "M", "MSP", "NN", "NP", "NR", "NT", "OD", "P", "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VV" };
  List<Extractor> detectors = new ArrayList<>(tagsets.length);
  for (String tag : tagsets) {
    detectors.add(new CtbPreDetector(tag, n));
  }
  return detectors.toArray(new Extractor[detectors.size()]);
} // end ctbPreFeatures
/**
 * Creates one CtbSufDetector per tag of the tag list below.
 *
 * @param n Passed through unchanged as the second constructor argument
 * @return One detector per tag, in the order of the tag list
 */
private static Extractor[] ctbSufFeatures(int n) {
  String[] tagsets = {"AD", "AS", "BA", "CC", "CD", "CS", "DEC", "DEG", "DER", "DEV", "DT", "ETC", "FW", "IJ", "JJ", "LB", "LC", "M", "MSP", "NN", "NP", "NR", "NT", "OD", "P", "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VV" };
  List<Extractor> detectors = new ArrayList<>(tagsets.length);
  for (String tag : tagsets) {
    detectors.add(new CtbSufDetector(tag, n));
  }
  return detectors.toArray(new Extractor[detectors.size()]);
} // end ctbSuffFeatures
/*
public static Extractor[] asbcPreFeatures(int n) {
String[] tagsets = {"A", "Caa", "Cab", "Cba", "Cbb", "D", "DE", "DK", "Da", "Dd", "De", "Des", "Dfa", "Dfb", "Di", "Dk", "FW", "I", " Na", "Nb", " Nc", "Ncb", "Ncd", " Nd", "Neaq", "Nep", "Neqa", "Neqb", "Nes", "Neu", "Nf", "Ng", "Nh", "P", "PU", "SHI", "T", "VA", "VAC", "VB", "VC", "VCL", "VD", "VE", "VF", "VG", "VH", "VHC", "VI", "VJ", "VK", "VL", "V_2" };
Extractor[] newW=new Extractor[tagsets.length];
for(int k=0;k<tagsets.length;k++){
newW[k] = new ASBCPreDetector(tagsets[k], n);
}
return newW;
}
public static Extractor[] asbcSufFeatures(int n) {
String[] tagsets = {"A", "Caa", "Cab", "Cba", "Cbb", "D", "DE", "DK", "Da", "Dd", "De", "Des", "Dfa", "Dfb", "Di", "Dk", "FW", "I", " Na", "Nb", " Nc", "Ncb", "Ncd", " Nd", "Neaq", "Nep", "Neqa", "Neqb", "Nes", "Neu", "Nf", "Ng", "Nh", "P", "PU", "SHI", "T", "VA", "VAC", "VB", "VC", "VCL", "VD", "VE", "VF", "VG", "VH", "VHC", "VI", "VJ", "VK", "VL", "V_2" };
Extractor[] newW=new Extractor[tagsets.length];
for(int k=0;k<tagsets.length;k++){
newW[k] = new ASBCSufDetector(tagsets[k], n);
}
return newW;
}
*/
/**
 * Creates one ASBCunkDetector per tag of the tag list below. Some tag
 * strings carry a leading space (" Na", " Nc", " Nd"); they are passed on
 * exactly as written.
 *
 * @param n Passed through unchanged as the second constructor argument
 * @return One detector per tag, in the order of the tag list
 */
private static Extractor[] asbcUnkFeatures(int n) {
  String[] tagsets = {"A", "Caa", "Cab", "Cba", "Cbb", "D", "DE", "DK", "Da", "Dd", "De", "Des", "Dfa", "Dfb", "Di", "Dk", "FW", "I", " Na", "Nb", " Nc", "Ncb", "Ncd", " Nd", "Neaq", "Nep", "Neqa", "Neqb", "Nes", "Neu", "Nf", "Ng", "Nh", "P", "PU", "SHI", "T", "VA", "VAC", "VB", "VC", "VCL", "VD", "VE", "VF", "VG", "VH", "VHC", "VI", "VJ", "VK", "VL", "V_2" };
  List<Extractor> detectors = new ArrayList<>(tagsets.length);
  for (String tag : tagsets) {
    detectors.add(new ASBCunkDetector(tag, n));
  }
  return detectors.toArray(new Extractor[detectors.size()]);
}
/**
 * Creates one CTBunkDictDetector per tag of the tag list below.
 * NOTE(review): the tag list is the same one asbcUnkFeatures uses, not the
 * one used by ctbPreFeatures and ctbSufFeatures - confirm this is intended.
 *
 * @param n Passed through unchanged as the second constructor argument
 * @return One detector per tag, in the order of the tag list
 */
private static Extractor[] ctbUnkDictFeatures(int n) {
  String[] tagsets = {"A", "Caa", "Cab", "Cba", "Cbb", "D", "DE", "DK", "Da", "Dd", "De", "Des", "Dfa", "Dfb", "Di", "Dk", "FW", "I", " Na", "Nb", " Nc", "Ncb", "Ncd", " Nd", "Neaq", "Nep", "Neqa", "Neqb", "Nes", "Neu", "Nf", "Ng", "Nh", "P", "PU", "SHI", "T", "VA", "VAC", "VB", "VC", "VCL", "VD", "VE", "VF", "VG", "VH", "VHC", "VI", "VJ", "VK", "VL", "V_2" };
  List<Extractor> detectors = new ArrayList<>(tagsets.length);
  for (String tag : tagsets) {
    detectors.add(new CTBunkDictDetector(tag, n));
  }
  return detectors.toArray(new Extractor[detectors.size()]);
}
} // end class ExtractorFramesRare
/**
 * Superclass for rare word feature frames. Provides some common functions.
 * Designed to be extended.
 * <br>
 * All the static character tests below accept {@code null} and answer
 * {@code false} for it.
 */
class RareExtractor extends Extractor {

  /** The value compared against to recognize a position outside the sentence. */
  static final String naTag = "NA";

  RareExtractor() {
    super();
  }

  RareExtractor(int position) {
    super(position, false);
  }

  /**
   * Tests whether the first character is an upper case letter.
   *
   * @param s The String to check
   * @return true iff s is non-empty and its first character is upper case
   */
  static boolean startsUpperCase(String s) {
    if (s == null || s.length() == 0) {
      return false;
    }
    char ch = s.charAt(0);
    return Character.isUpperCase(ch);
  }

  /**
   * A string is lowercase if it starts with a lowercase letter
   * such as one from a to z.
   * Should we include numbers?
   *
   * @param s The String to check
   * @return true iff s is non-empty and its first character is lower case
   */
  protected static boolean startsLowerCase(String s) {
    // The empty string has no first character: answer false rather than
    // letting charAt(0) throw, matching startsUpperCase.
    if (s == null || s.length() == 0) {
      return false;
    }
    char ch = s.charAt(0);
    return Character.isLowerCase(ch);
  }

  /**
   * @param s The String to check
   * @return true iff s contains the character '-'
   */
  protected static boolean containsDash(String s) {
    return s != null && s.indexOf('-') >= 0;
  }

  /**
   * @param s The String to check
   * @return true iff at least one character of s is a digit
   */
  protected static boolean containsNumber(String s) {
    if (s == null) {
      return false;
    }
    for (int i = 0, len = s.length(); i < len; i++) {
      if (Character.isDigit(s.charAt(i))) {
        return true;
      }
    }
    return false;
  }

  /**
   * @param s The String to check
   * @return true iff at least one character of s is a letter
   */
  protected static boolean containsLetter(String s) {
    if (s == null) {
      return false;
    }
    for (int i = 0, len = s.length(); i < len; i++) {
      if (Character.isLetter(s.charAt(i))) {
        return true;
      }
    }
    return false;
  }

  /**
   * @param s The String to check
   * @return true iff at least one character of s is upper case
   */
  protected static boolean containsUpperCase(String s) {
    if (s == null) {
      return false;
    }
    for (int i = 0, len = s.length(); i < len; i++) {
      if (Character.isUpperCase(s.charAt(i))) {
        return true;
      }
    }
    return false;
  }

  /**
   * @param s The String to check
   * @return true iff every character of s is upper case; note that this
   *         is also true for the empty string
   */
  protected static boolean allUpperCase(String s) {
    if (s == null) {
      return false;
    }
    for (int i = 0, len = s.length(); i < len; i++) {
      if (!Character.isUpperCase(s.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /**
   * @param s The String to check
   * @return true iff no character of s is lower case; digits and
   *         punctuation are allowed, and the empty string also gives true
   */
  static boolean noneLowerCase(String s) {
    if (s == null) {
      return false;
    }
    for (int i = 0, len = s.length(); i < len; i++) {
      if (Character.isLowerCase(s.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  private static final long serialVersionUID = -7682607870855426599L;

} // end class RareExtractor
/**
 * English-specific crude company name NER. The feature is "1" when the
 * current word starts with an upper case letter and the current word or one
 * of the next COMPANY_NAME_WINDOW words is a known company name ending.
 */
class CompanyNameDetector extends RareExtractor {

  private static final long serialVersionUID = 21L;

  /** How many words after the current one are inspected. */
  static final int COMPANY_NAME_WINDOW = 3;

  /** The exact (case-sensitive) word forms that count as a company name ending. */
  final Set<String> companyNameEnds;

  public CompanyNameDetector() {
    String[] endings = {
      "Company", "COMPANY",
      "Co.",
      "Co", // at end of sentence in PTB
      "Cos.", "CO.", "COS.",
      "Corporation", "CORPORATION",
      "Corp.",
      "Corp", // at end of sentence in PTB
      "CORP.",
      "Incorporated", "INCORPORATED",
      "Inc.",
      "Inc", // at end of sentence in PTB
      "INC.",
      "Association", "ASSOCIATION",
      "Assn", "ASSN",
      "Limited", "LIMITED",
      "Ltd.", "LTD.",
      "L.P."
      // "PLC" is deliberately left out. // Other thing added at same time.
    };
    companyNameEnds = Generics.newHashSet();
    companyNameEnds.addAll(Arrays.asList(endings));
  }

  @Override
  String extract(History h, PairsHolder pH) {
    String current = pH.getWord(h, 0);
    if (startsUpperCase(current)) {
      int offset = 0;
      while (offset <= COMPANY_NAME_WINDOW) {
        if (companyNameEnds.contains(pH.getWord(h, offset))) {
          return "1";
        }
        offset++;
      }
    }
    return "0";
  }

  @Override public boolean isLocal() { return false; }

  @Override public boolean isDynamic() { return false; }

} // end class CompanyNameDetector
/**
 * Caseless variant of CompanyNameDetector. The feature is "1" when the
 * current word or one of the next CompanyNameDetector.COMPANY_NAME_WINDOW
 * words equals a lowercased company name ending. The current word is not
 * required to be capitalized, and the words looked up are not lowercased
 * here, so they only match if they are already in lower case.
 */
class CaselessCompanyNameDetector extends RareExtractor {

  /** The lowercased endings of CompanyNameDetector. */
  private final Set<String> companyNameEnds;

  public CaselessCompanyNameDetector() {
    companyNameEnds = Generics.newHashSet();
    CompanyNameDetector cased = new CompanyNameDetector();
    for (String name : cased.companyNameEnds) {
      // Locale.ROOT keeps the result independent of the default locale;
      // under a Turkish locale "Inc." would otherwise get a dotless i.
      companyNameEnds.add(name.toLowerCase(Locale.ROOT));
    }
  }

  private boolean companyNameEnd(String s) {
    return companyNameEnds.contains(s);
  }

  @Override
  String extract(History h, PairsHolder pH) {
    // Offset 0 is the current word, so the window includes it.
    for (int i = 0; i <= CompanyNameDetector.COMPANY_NAME_WINDOW; i++) {
      String s1 = pH.getWord(h, i);
      if (companyNameEnd(s1)) {
        return "1";
      }
    }
    return "0";
  }

  @Override public boolean isLocal() { return false; }

  @Override public boolean isDynamic() { return false; }

  private static final long serialVersionUID = 21L;

}
/** "1" if the current word contains at least one upper case character, else "0". */
class ExtractorUCase extends RareExtractor {

  private static final long serialVersionUID = 22L;

  public ExtractorUCase() {
  }

  @Override
  String extract(History h, PairsHolder pH) {
    return containsUpperCase(pH.getWord(h, 0)) ? "1" : "0";
  }

  @Override public boolean isLocal() { return true; }

  @Override public boolean isDynamic() { return false; }

}
/**
 * "1" if the current word contains a letter, a dash and a digit, in any
 * order; else "0".
 */
class ExtractorLetterDigitDash extends RareExtractor {

  private static final long serialVersionUID = 23L;

  public ExtractorLetterDigitDash() {
  }

  @Override
  String extract(History h, PairsHolder pH) {
    String word = pH.getWord(h, 0);
    boolean hasAllThree = containsLetter(word) && containsDash(word) && containsNumber(word);
    return hasAllThree ? "1" : "0";
  }

  @Override public boolean isLocal() { return true; }

  @Override public boolean isDynamic() { return false; }

}
/**
 * "1" if the current word contains an upper case character, a dash and a
 * digit, in any order; else "0".
 */
class ExtractorUpperDigitDash extends RareExtractor {

  private static final long serialVersionUID = 33L;

  public ExtractorUpperDigitDash() {
  }

  @Override
  String extract(History h, PairsHolder pH) {
    String word = pH.getWord(h, 0);
    boolean hasAllThree = containsUpperCase(word) && containsDash(word) && containsNumber(word);
    return hasAllThree ? "1" : "0";
  }

  @Override public boolean isLocal() { return true; }

  @Override public boolean isDynamic() { return false; }

}
/**
 * This requires the 3 character classes in order: a letter, then somewhere
 * after it a dash, then somewhere after that a digit. This was worse than
 * ExtractorLetterDigitDash (Oct 2009).
 */
class ExtractorLetterDashDigit extends RareExtractor {

  private static final long serialVersionUID = 33L;

  public ExtractorLetterDashDigit() {
  }

  @Override
  String extract(History h, PairsHolder pH) {
    String word = pH.getWord(h, 0);
    if (word == null) {
      return "0";
    }
    boolean letterSeen = false;
    boolean dashSeen = false;
    for (char ch : word.toCharArray()) {
      if (Character.isLetter(ch)) {
        letterSeen = true;
        continue;
      }
      if (letterSeen && ch == '-') {
        dashSeen = true;
        continue;
      }
      if (dashSeen && Character.isDigit(ch)) {
        // letter ... dash ... digit is complete
        return "1";
      }
    }
    return "0";
  }

  @Override public boolean isLocal() { return true; }

  @Override public boolean isDynamic() { return false; }

}
/**
 * Creates features for a current word that starts with an upper case
 * letter, encoding the distance to the nearest word to the left that starts
 * with a lower case letter. The distance is 1 for adjacent, 2 for one
 * across, 3 for ... and so on. It is "infinity" if no such lowercase word is
 * found before the start of the sentence or an opening quote ``.
 * The value is prefixed with "all:" when the whole word is upper case and
 * with "start" otherwise. Words not starting with upper case give "0".
 */
class ExtractorCapDistLC extends RareExtractor {

  /** When true, the decisions made by extract are printed to System.out. */
  boolean verbose = false;

  public ExtractorCapDistLC() {
  }

  @Override
  String extract(History h, PairsHolder pH) {
    String word = pH.getWord(h, 0);
    String ret;
    if (!startsUpperCase(word)) {
      if (verbose) {
        System.out.println("did not apply because not start with upper case");
      }
      return "0";
    }
    if (allUpperCase(word)) {
      ret = "all:";
    } else {
      ret = "start";
    }
    //now find the distance
    int current = -1;
    int distance = 1;
    while (true) {
      String prevWord = pH.getWord(h, current);
      // The emptiness check keeps an empty token from reaching charAt(0)
      // inside startsLowerCase; such a token is simply stepped over.
      if (prevWord != null && !prevWord.isEmpty() && startsLowerCase(prevWord)) {
        if (verbose) {
          // Report the value that is actually returned (the distance).
          System.out.println("returning " + ret + distance + " for " + word + ' ' + prevWord);
        }
        return ret + distance;
      }
      // A missing word is treated like the sentence boundary. Without this
      // the equals call below would throw a NullPointerException.
      if (prevWord == null || prevWord.equals(naTag) || prevWord.equals("``")) {
        if (verbose) {
          System.out.println("returning " + ret + "infinity for " + word + ' ' + prevWord);
        }
        return ret + "infinity";
      }
      current--;
      distance++;
    }
  }

  @Override public boolean isDynamic() { return false; }

  @Override public boolean isLocal() { return false; }

  private static final long serialVersionUID = 34L;

}
/**
 * This feature applies when the ExtractorCapDistLC feature of the current
 * word is not "0" (the word starts with an upper case letter) and the word
 * has been seen with tag t more than cutoff times, i.e. 2 or more times
 * with the default cutoff. The value is then the ExtractorCapDistLC value
 * followed by the tag; otherwise it is "0".
 * Features are only created for tags that are the same as the tag t.
 * NOTE(review): the dictionary is queried with the word as it occurs, not
 * with a lowercased version - confirm that this is intended.
 */
class ExtractorCapLCSeen extends RareExtractor {

  private static final long serialVersionUID = 35L;

  /** The tag this extractor is restricted to. */
  final String tag;

  /** The dictionary count of (word, tag) must be greater than this. */
  int cutoff = 1;

  private final Extractor cCapDist = new ExtractorCapDistLC();

  /** Taken from the tagger in setGlobalHolder. */
  private transient Dictionary dict;

  ExtractorCapLCSeen(String tag) {
    this.tag = tag;
  }

  @Override
  protected void setGlobalHolder(MaxentTagger tagger) {
    this.dict = tagger.dict;
  }

  @Override
  public boolean precondition(String tag1) {
    return tag.equals(tag1);
  }

  @Override
  String extract(History h, PairsHolder pH) {
    String capDistance = cCapDist.extract(h, pH);
    if (capDistance.equals("0")) {
      // not capitalized
      return capDistance;
    }
    String word = pH.getWord(h, 0);
    boolean seenOftenEnough = dict.getCount(word, tag) > cutoff;
    return seenOftenEnough ? capDistance + tag : "0";
  }

  @Override public boolean isLocal() { return false; }

  @Override public boolean isDynamic() { return false; }

}
/**
 * "1" if not first word of sentence and _some_ letter is uppercase.
 * The word counts as first in the sentence when the previous tag is
 * missing or is the NA tag.
 */
class ExtractorMidSentenceCap extends RareExtractor {

  private static final long serialVersionUID = 24L;

  public ExtractorMidSentenceCap() {
  }

  @Override
  String extract(History h, PairsHolder pH) {
    String prevTag = pH.getTag(h, -1);
    boolean midSentence = prevTag != null && !prevTag.equals(naTag);
    if (!midSentence) {
      return "0";
    }
    return containsUpperCase(pH.getWord(h, 0)) ? "1" : "0";
  }

  @Override public boolean isLocal() { return false; }

  @Override public boolean isDynamic() { return true; }

}
/**
 * "0" if not 1st word of sentence (the previous tag is missing or is not
 * the NA tag) or not starting with upper case, or lowercased version
 * not in dictionary. Else first tag of word lowercased.
 */
class ExtractorStartSentenceCap extends RareExtractor {

  private static final long serialVersionUID = 25L;

  /** Taken from the tagger in setGlobalHolder. */
  private transient Dictionary dict;

  public ExtractorStartSentenceCap() {
  }

  @Override
  protected void setGlobalHolder(MaxentTagger tagger) {
    this.dict = tagger.dict;
  }

  @Override
  String extract(History h, PairsHolder pH) {
    String prevTag = pH.getTag(h, -1);
    if (prevTag == null || !prevTag.equals(naTag)) {
      return zeroSt;
    }
    String word = pH.getWord(h, 0);
    if (!startsUpperCase(word)) {
      return zeroSt;
    }
    String lowered = word.toLowerCase();
    return dict.isUnknown(lowered) ? zeroSt : dict.getFirstTag(lowered);
  }

  @Override public boolean isLocal() { return false; }

  @Override public boolean isDynamic() { return true; }

}
/**
 * "0" if first word of sentence (the previous tag is missing or is the NA
 * tag) or not first letter uppercase or if lowercase version isn't in
 * dictionary. Otherwise first tag of lowercase equivalent.
 */
class ExtractorMidSentenceCapC extends RareExtractor {

  private static final long serialVersionUID = 26L;

  /** Taken from the tagger in setGlobalHolder. */
  private transient Dictionary dict;

  public ExtractorMidSentenceCapC() {
  }

  @Override
  protected void setGlobalHolder(MaxentTagger tagger) {
    this.dict = tagger.dict;
  }

  @Override
  String extract(History h, PairsHolder pH) {
    String prevTag = pH.getTag(h, -1);
    if (prevTag == null || prevTag.equals(naTag)) {
      return zeroSt;
    }
    String word = pH.getWord(h, 0);
    if (!startsUpperCase(word)) {
      return zeroSt;
    }
    String lowered = word.toLowerCase();
    return dict.isUnknown(lowered) ? zeroSt : dict.getFirstTag(lowered);
  }

  @Override public boolean isLocal() { return false; }

  @Override public boolean isDynamic() { return true; }

}
/**
 * Capitalization feature that ignores the surrounding tags.
 * Returns {@code zeroSt} unless the current word satisfies
 * {@code startsUpperCase} and its lowercased form is known to the tagger
 * dictionary; in that case returns the dictionary's first tag for the
 * lowercased form.
 */
class ExtractorCapC extends RareExtractor {

  private static final long serialVersionUID = 26L;

  /** Tagger dictionary; transient, assigned in {@link #setGlobalHolder(MaxentTagger)}. */
  private transient Dictionary dict;

  public ExtractorCapC() {
  }

  /**
   * Picks up the dictionary of the given tagger for later lookups.
   *
   * @param tagger the tagger whose {@code dict} field is used by {@code extract}
   */
  @Override
  protected void setGlobalHolder(MaxentTagger tagger) {
    this.dict = tagger.dict;
  }

  /**
   * Computes the feature value for the current position of {@code h}.
   *
   * @param h the history identifying the current position
   * @param pH the holder used to read the current word
   * @return the first dictionary tag of the lowercased word, or {@code zeroSt}
   */
  @Override
  String extract(History h, PairsHolder pH) {
    String s = pH.getWord(h, 0);
    if (!startsUpperCase(s)) {
      return zeroSt;
    }
    // Lowercase with a fixed locale so the dictionary key does not depend on
    // the JVM default locale (e.g. "I" must map to "i" on every machine).
    String s1 = s.toLowerCase(Locale.ROOT);
    if (dict.isUnknown(s1)) {
      return zeroSt;
    }
    return dict.getFirstTag(s1);
  }

  @Override public boolean isLocal() { return true; }

  @Override public boolean isDynamic() { return false; }

}
// TODO: the next time we have to rebuild the tagger files anyway, we
// should change this class's name to something like
// "ExtractorNoLowercase" to distinguish it from
// ExtractorAllCapitalized
/**
 * Emits "1" when {@code noneLowerCase} holds for the current word and "0"
 * otherwise.
 */
class ExtractorAllCap extends RareExtractor {

  private static final long serialVersionUID = 27L;

  public ExtractorAllCap() {
  }

  @Override
  String extract(History h, PairsHolder pH) {
    final String word = pH.getWord(h, 0);
    return noneLowerCase(word) ? "1" : "0";
  }

  @Override
  public boolean isLocal() {
    return true;
  }

  @Override
  public boolean isDynamic() {
    return false;
  }

}
/**
 * Emits "1" when {@code allUpperCase} holds for the current word and "0"
 * otherwise.
 */
class ExtractorAllCapitalized extends RareExtractor {

  private static final long serialVersionUID = 32L;

  public ExtractorAllCapitalized() {
  }

  @Override
  String extract(History h, PairsHolder pH) {
    final String word = pH.getWord(h, 0);
    return allUpperCase(word) ? "1" : "0";
  }

  @Override
  public boolean isLocal() {
    return true;
  }

  @Override
  public boolean isDynamic() {
    return false;
  }

}
/**
 * Emits "1" when {@code containsNumber} holds for the current word and "0"
 * otherwise.
 */
class ExtractorCNumber extends RareExtractor {

  private static final long serialVersionUID = 28L;

  public ExtractorCNumber() {
  }

  @Override
  String extract(History h, PairsHolder pH) {
    final String word = pH.getWord(h, 0);
    return containsNumber(word) ? "1" : "0";
  }

  @Override
  public boolean isLocal() {
    return true;
  }

  @Override
  public boolean isDynamic() {
    return false;
  }

}
/**
 * Emits "1" when {@code containsDash} holds for the current word and "0"
 * otherwise.
 */
class ExtractorDash extends RareExtractor {

  private static final long serialVersionUID = 29L;

  public ExtractorDash() {
  }

  @Override
  String extract(History h, PairsHolder pH) {
    final String word = pH.getWord(h, 0);
    return containsDash(word) ? "1" : "0";
  }

  @Override
  public boolean isLocal() {
    return true;
  }

  @Override
  public boolean isDynamic() {
    return false;
  }

}
/**
 * Suffix feature: the last {@code num} characters of the word at offset
 * {@code position}, or the placeholder "######" when the word is shorter
 * than {@code num}.
 */
class ExtractorWordSuff extends RareExtractor {

  private static final long serialVersionUID = 724767436530L;

  // todo [cdm 2013]: position field in this class could be deleted and use super's position. But will break
  private final int num, position;

  ExtractorWordSuff(int num, int position) {
    this.num = num;
    this.position = position;
  }

  @Override
  String extract(History h, PairsHolder pH) {
    // String word = TestSentence.toNice(pH.getWord(h, 0));
    final String token = pH.getWord(h, position);
    final int length = token.length();
    return (length >= num) ? token.substring(length - num) : "######";
  }

  @Override
  public boolean isLocal() {
    return position == 0;
  }

  @Override
  public boolean isDynamic() {
    return false;
  }

  @Override
  public String toString() {
    final String shortName = StringUtils.getShortClassName(this);
    return shortName + "(len" + num + ",w" + position + ")";
  }

}
/**
 * Prefix feature: the first {@code num} characters of the word at offset
 * {@code position}, or the placeholder "######" when the word is shorter
 * than {@code num}.
 */
class ExtractorWordPref extends RareExtractor {

  private static final long serialVersionUID = 724767436531L;

  // todo [cdm 2013]: position field in this class could be deleted and use super's position. But will break
  private final int num, position;

  ExtractorWordPref(int num, int position) {
    this.num = num;
    this.position = position;
  }

  @Override
  String extract(History h, PairsHolder pH) {
    // String word = TestSentence.toNice(pH.getWord(h, 0));
    final String token = pH.getWord(h, position);
    if (token.length() >= num) {
      return token.substring(0, num);
    }
    return "######";
  }

  @Override
  public boolean isLocal() {
    return position == 0;
  }

  @Override
  public boolean isDynamic() {
    return false;
  }

  @Override
  public String toString() {
    final String shortName = StringUtils.getShortClassName(this);
    return shortName + "(len" + num + ",w" + position + ")";
  }

} // end class ExtractorWordPref
/**
 * Combines two extractors into one feature. The result is {@code zeroSt} as
 * soon as either wrapped extractor yields {@code zeroSt}; otherwise it is the
 * two values joined by ':'. The second extractor is only consulted when the
 * first one did not yield {@code zeroSt}.
 */
class ExtractorsConjunction extends RareExtractor {

  private static final long serialVersionUID = 36L;

  private final Extractor extractor1;
  private final Extractor extractor2;

  // Computed once in the constructor from the two wrapped extractors.
  volatile boolean isLocal, isDynamic;

  ExtractorsConjunction(Extractor e1, Extractor e2) {
    this.extractor1 = e1;
    this.extractor2 = e2;
    this.isLocal = e1.isLocal() && e2.isLocal();
    this.isDynamic = e1.isDynamic() || e2.isDynamic();
  }

  /** Forwards the tagger to both wrapped extractors, first then second. */
  @Override
  protected void setGlobalHolder(MaxentTagger tagger) {
    extractor1.setGlobalHolder(tagger);
    extractor2.setGlobalHolder(tagger);
  }

  @Override
  String extract(History h, PairsHolder pH) {
    final String first = extractor1.extract(h, pH);
    if (!first.equals(zeroSt)) {
      final String second = extractor2.extract(h, pH);
      if (!second.equals(zeroSt)) {
        return first + ':' + second;
      }
    }
    return zeroSt;
  }

  @Override
  public boolean isLocal() {
    return isLocal;
  }

  @Override
  public boolean isDynamic() {
    return isDynamic;
  }

  @Override
  public String toString() {
    final String shortName = StringUtils.getShortClassName(this);
    return shortName + '(' + extractor1 + ',' + extractor2 + ')';
  }

}
/**
 * Emits "1" when the current word ends in a lowercase 's' and every
 * character before that final 's' is uppercase; emits "0" otherwise.
 */
class PluralAcronymDetector extends RareExtractor {

  private static final long serialVersionUID = 33L;

  public PluralAcronymDetector() {
  }

  /**
   * Tests whether {@code s} consists of uppercase characters followed by a
   * final 's'.
   *
   * @param s the word to test
   * @return {@code true} if the last character is 's' and all preceding
   *         characters are uppercase; {@code false} for a null or empty word
   */
  private static boolean pluralAcronym(String s) {
    // Guard: without it an empty word would call charAt(-1) and throw
    // StringIndexOutOfBoundsException.
    if (s == null || s.isEmpty()) {
      return false;
    }
    int last = s.length() - 1;
    if (s.charAt(last) != 's') {
      return false;
    }
    // NOTE: the single-character word "s" has no preceding characters and so
    // still yields true, as before; kept unchanged so feature values stay
    // compatible with already-trained models.
    for (int i = 0; i < last; i++) {
      if (!Character.isUpperCase(s.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Computes the feature value for the current position of {@code h}.
   *
   * @param h the history identifying the current position
   * @param pH the holder used to read the current word
   * @return "1" if the current word passes {@code pluralAcronym}, else "0"
   */
  @Override
  String extract(History h, PairsHolder pH) {
    String s = pH.getWord(h, 0);
    if (pluralAcronym(s)) {
      return "1";
    }
    return "0";
  }

  @Override public boolean isLocal() { return true; }

  @Override public boolean isDynamic() { return false; }

}
/**
 * Feature built from {@code CtbDict.getTagPre} for tag {@code t1} and the
 * first character of the word at the superclass {@code position}. The value
 * is "1:" or "0:" followed by the tag; an empty word always gives "0:".
 */
class CtbPreDetector extends RareExtractor {

  private static final long serialVersionUID = 43L;

  private String t1;

  CtbPreDetector(String t2, int n2) {
    super(n2);
    this.t1 = t2;
  }

  @Override
  String extract(History h, PairsHolder pH) {
    final String word = TestSentence.toNice(pH.getWord(h, position));
    if (word.isEmpty()) {
      return "0:" + t1;
    }
    final String firstChar = word.substring(0, 1);
    if (CtbDict.getTagPre(t1, firstChar).equals("1")) {
      return "1:" + t1;
    }
    return "0:" + t1;
  }

  @Override
  public boolean isLocal() {
    return false;
  }

  @Override
  public boolean isDynamic() {
    return false;
  }

  @Override
  public String toString() {
    return super.toString() + " tag=" + t1;
  }

} // end class ctbPreDetector
/**
 * Feature built from {@code CtbDict.getTagSuf} for tag {@code t1} and the
 * last character of the word at the superclass {@code position}. The value
 * is "1:" or "0:" followed by the tag; an empty word always gives "0:".
 */
class CtbSufDetector extends RareExtractor {

  private static final long serialVersionUID = 44L;

  private String t1;

  CtbSufDetector(String t2, int n2) {
    super(n2);
    this.t1 = t2;
  }

  @Override
  String extract(History h, PairsHolder pH) {
    final String word = TestSentence.toNice(pH.getWord(h, position));
    if (word.isEmpty()) {
      return "0:" + t1;
    }
    final String lastChar = word.substring(word.length() - 1);
    if (CtbDict.getTagSuf(t1, lastChar).equals("1")) {
      return "1:" + t1;
    }
    return "0:" + t1;
  }

  @Override
  public boolean isLocal() {
    return false;
  }

  @Override
  public boolean isDynamic() {
    return false;
  }

  @Override
  public String toString() {
    return super.toString() + " tag=" + t1;
  }

} // end class CtbSufDetector
/*
class ASBCPreDetector extends RareExtractor {
private String t1;
private int n1;
public ASBCPreDetector(String t2, int n2) {
t1=t2;
n1=n2;
}
@Override
String extract(History h, PairsHolder pH) {
String s=TestSentence.toNice(pH.get(h,n1,false));
if(!s.equals("") && ASBCDict.getTagPre(t1, s.substring(0, 1)).equals("1"))
return "1:"+t1;
return "0:"+t1;
}
private static final long serialVersionUID = 53L;
} // end class ASBCPreDetector
class ASBCSufDetector extends RareExtractor {
private String t1;
private int n1;
public ASBCSufDetector(String t2, int n2) {
t1=t2;
n1=n2;
}
@Override
String extract(History h, PairsHolder pH) {
String s=TestSentence.toNice(pH.get(h,n1,false));
if (!s.equals("") && ASBCDict.getTagSuf(t1, s.substring(s.length()-1, s.length())).equals("1"))
return "1:"+t1;
return "0:"+t1;
}
private static final long serialVersionUID = 54L;
} // end class ASBCSufDetector
*/
/**
 * Feature built from {@code ASBCunkDict.getTag} for tag {@code t1} and the
 * word at offset {@code n1}. The value is "1:" or "0:" followed by the tag.
 */
class ASBCunkDetector extends RareExtractor {

  private static final long serialVersionUID = 57L;

  private String t1;
  private int n1;

  ASBCunkDetector(String t2, int n2) {
    this.t1 = t2;
    this.n1 = n2;
  }

  @Override
  String extract(History h, PairsHolder pH) {
    final String word = TestSentence.toNice(pH.getWord(h, n1));
    final boolean listed = ASBCunkDict.getTag(t1, word).equals("1");
    if (listed) {
      return "1:" + t1;
    }
    return "0:" + t1;
  }

  @Override
  public boolean isLocal() {
    return false;
  }

  @Override
  public boolean isDynamic() {
    return false;
  }

} // end class ASBCunkDetector
/**
 * Feature built from {@code CTBunkDict.getTag} for tag {@code t1} and the
 * word at offset {@code n1}. The value is "1:" or "0:" followed by the tag.
 */
class CTBunkDictDetector extends RareExtractor {

  private static final long serialVersionUID = 80L;

  private String t1;
  private int n1;

  CTBunkDictDetector(String t2, int n2) {
    this.t1 = t2;
    this.n1 = n2;
  }

  @Override
  String extract(History h, PairsHolder pH) {
    final String word = TestSentence.toNice(pH.getWord(h, n1));
    final boolean listed = CTBunkDict.getTag(t1, word).equals("1");
    if (listed) {
      return "1:" + t1;
    }
    return "0:" + t1;
  }

  @Override
  public boolean isLocal() {
    return false;
  }

  @Override
  public boolean isDynamic() {
    return false;
  }

} // end class CTBunkDictDetector
/**
 * Base class for yes/no features of the current word. Subclasses supply the
 * predicate through {@link #extractFeature(String)}; this class maps its
 * result to "1" (true) or "0" (false).
 */
abstract class CWordBooleanExtractor extends RareExtractor {

  /**
   * Decides the feature for one word.
   *
   * @param cword the word at offset 0 of the current history
   * @return whether the feature holds for {@code cword}
   */
  abstract boolean extractFeature(String cword);

  @Override
  String extract(History h, PairsHolder pH) {
    if (extractFeature(pH.getWord(h, 0))) {
      return "1";
    }
    return "0";
  }

  @Override
  public boolean isLocal() {
    return true;
  }

  @Override
  public boolean isDynamic() {
    return false;
  }

}
/**
 * Boolean feature backed by {@code FrenchUnknownWordSignatures.hasNounSuffix}.
 */
class ExtractorFrenchNounSuffix extends CWordBooleanExtractor {

  @Override
  boolean extractFeature(String cword) {
    final boolean nounSuffix = FrenchUnknownWordSignatures.hasNounSuffix(cword);
    return nounSuffix;
  }

  private static final long serialVersionUID = 848772358776880060L;

}
/**
 * Boolean feature backed by {@code FrenchUnknownWordSignatures.hasAdvSuffix}.
 */
class ExtractorFrenchAdvSuffix extends CWordBooleanExtractor {

  @Override
  boolean extractFeature(String cword) {
    final boolean advSuffix = FrenchUnknownWordSignatures.hasAdvSuffix(cword);
    return advSuffix;
  }

  private static final long serialVersionUID = 9141591417435848689L;

}
/**
 * Boolean feature backed by {@code FrenchUnknownWordSignatures.hasVerbSuffix}.
 */
class ExtractorFrenchVerbSuffix extends CWordBooleanExtractor {

  @Override
  boolean extractFeature(String cword) {
    final boolean verbSuffix = FrenchUnknownWordSignatures.hasVerbSuffix(cword);
    return verbSuffix;
  }

  private static final long serialVersionUID = -1762307766086637191L;

}
/**
 * Boolean feature backed by {@code FrenchUnknownWordSignatures.hasAdjSuffix}.
 */
class ExtractorFrenchAdjSuffix extends CWordBooleanExtractor {

  @Override
  boolean extractFeature(String cword) {
    final boolean adjSuffix = FrenchUnknownWordSignatures.hasAdjSuffix(cword);
    return adjSuffix;
  }

  private static final long serialVersionUID = -5838046941039275411L;

}
/**
 * Boolean feature backed by
 * {@code FrenchUnknownWordSignatures.hasPossiblePlural}.
 */
class ExtractorFrenchPluralSuffix extends CWordBooleanExtractor {

  @Override
  boolean extractFeature(String cword) {
    final boolean possiblePlural = FrenchUnknownWordSignatures.hasPossiblePlural(cword);
    return possiblePlural;
  }

  private static final long serialVersionUID = 1139695807527192176L;

}
/**
 * Gender-suffix feature for the current word: "m" when
 * {@code SpanishUnknownWordSignatures.hasMasculineSuffix} holds, otherwise
 * "f" when {@code hasFeminineSuffix} holds, otherwise the empty string.
 * The masculine check takes precedence.
 */
class ExtractorSpanishGender extends RareExtractor {

  private static final long serialVersionUID = -7359312929174070404L;

  @Override
  String extract(History h, PairsHolder pH) {
    final String word = pH.getWord(h, 0);
    if (SpanishUnknownWordSignatures.hasMasculineSuffix(word)) {
      return "m";
    }
    return SpanishUnknownWordSignatures.hasFeminineSuffix(word) ? "f" : "";
  }

}
/**
 * Boolean feature backed by
 * {@code SpanishUnknownWordSignatures.hasConditionalSuffix}.
 */
class ExtractorSpanishConditionalSuffix extends CWordBooleanExtractor {

  @Override
  boolean extractFeature(String cword) {
    final boolean conditional = SpanishUnknownWordSignatures.hasConditionalSuffix(cword);
    return conditional;
  }

  private static final long serialVersionUID = 4383251116043848632L;

}
/**
 * Boolean feature backed by
 * {@code SpanishUnknownWordSignatures.hasImperfectErIrSuffix}.
 */
class ExtractorSpanishImperfectErIrSuffix extends CWordBooleanExtractor {

  @Override
  boolean extractFeature(String cword) {
    final boolean imperfect = SpanishUnknownWordSignatures.hasImperfectErIrSuffix(cword);
    return imperfect;
  }

  private static final long serialVersionUID = -5804047931816433075L;

}