/**
*
*/
package edu.berkeley.nlp.discPCFG;
import java.util.List;
import java.util.Locale;
import edu.berkeley.nlp.classify.FeatureExtractor;
import edu.berkeley.nlp.util.Counter;
/**
 * Extracts lexical features for one word of a sentence: a feature for the
 * word string itself plus coarse morphological cues (capitalization, presence
 * of digits or dashes, and a small set of common English suffixes).
 *
 * @author adpauls
 */
public class LexiconFeatureExtractor implements
        FeatureExtractor<WordInSentence, LexiconFeature> {

    /**
     * Builds the feature counter for the word selected by {@code sentence}.
     *
     * <p>The word is read as {@code sentence.getFirst().get(sentence.getSecond())}.
     * Every feature that fires is incremented by {@code 1.0}:
     * <ul>
     * <li>the word identity, always;</li>
     * <li>at most one capitalization feature (INIT_CAP, ALL_CAPS or
     * LOWER_CASE);</li>
     * <li>HAS_DIGIT and/or HAS_DASH;</li>
     * <li>at most one suffix feature.</li>
     * </ul>
     *
     * <p>An empty word yields only the word-identity feature.
     *
     * @param sentence the word-in-sentence instance to featurize
     * @return a counter holding the features that fired for the word
     * @see edu.berkeley.nlp.classify.FeatureExtractor#extractFeatures(java.lang.Object)
     */
    public Counter<LexiconFeature> extractFeatures(
            WordInSentence sentence) {
        int loc = sentence.getSecond();
        String word = sentence.getFirst().get(loc);
        Counter<LexiconFeature> counter = new Counter<LexiconFeature>();
        counter.incrementCount(new LexiconFeature(word), 1.0);

        int wlen = word.length();
        if (wlen == 0) {
            // Nothing to inspect; also avoids charAt(0) failing on an empty token.
            return counter;
        }

        // Single pass over the characters collecting the shape flags.
        int numCaps = 0;
        boolean hasDigit = false;
        boolean hasDash = false;
        boolean hasLower = false;
        for (int i = 0; i < wlen; i++) {
            char ch = word.charAt(i);
            if (Character.isDigit(ch)) {
                hasDigit = true;
            } else if (ch == '-') {
                hasDash = true;
            } else if (Character.isLetter(ch)) {
                if (Character.isLowerCase(ch)) {
                    hasLower = true;
                } else if (Character.isTitleCase(ch)) {
                    // A title-case letter counts as both a lowercase letter
                    // and a capital.
                    hasLower = true;
                    numCaps++;
                } else {
                    // Any other letter is counted as a capital.
                    numCaps++;
                }
            }
        }

        addCaseFeature(counter, word.charAt(0), loc, numCaps, hasLower);

        if (hasDigit) {
            counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.HAS_DIGIT), 1.0);
        }
        if (hasDash) {
            counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.HAS_DASH), 1.0);
        }

        addSuffixFeature(counter, word, hasDash, hasDigit, numCaps);
        return counter;
    }

    /**
     * Adds at most one capitalization feature based on the first character,
     * the word's position and the number of capitals seen in the word.
     */
    private static void addCaseFeature(Counter<LexiconFeature> counter,
            char ch0, int loc, int numCaps, boolean hasLower) {
        if (Character.isUpperCase(ch0) || Character.isTitleCase(ch0)) {
            if (loc == 0 && numCaps == 1) {
                // Sentence-initial word whose only capital is its first letter.
                counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.INIT_CAP), 1.0);
            } else {
                // Every other capitalized word (not sentence-initial, or more
                // than one capital) is reported as ALL_CAPS.
                counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.ALL_CAPS), 1.0);
            }
        } else if (!Character.isLetter(ch0) && numCaps > 0) {
            // Starts with a non-letter but contains capitals further in.
            counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.ALL_CAPS), 1.0);
        } else if (hasLower) {
            counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.LOWER_CASE), 1.0);
        }
    }

    /**
     * Adds at most one suffix feature, matched against the lowercased word.
     * Words ending in "s" are only ever considered for SUFF_S; the remaining
     * suffixes are tried in a fixed order and the first match wins.
     */
    private static void addSuffixFeature(Counter<LexiconFeature> counter,
            String word, boolean hasDash, boolean hasDigit, int numCaps) {
        // Fixed locale: the suffixes below are ASCII, and default-locale
        // lowercasing (e.g. Turkish dotless i) would make them fail to match.
        String lowered = word.toLowerCase(Locale.ENGLISH);
        // Lowercasing may change the string length, so index into lowered
        // with its own length rather than the length of word.
        int loweredLen = lowered.length();

        if (lowered.endsWith("s") && loweredLen >= 3) {
            // Minimum length 3 so that tokens like "80s" are still covered.
            char beforeLast = lowered.charAt(loweredLen - 2);
            // Skip -ss endings and Greek/Latin -us, -is.
            if (beforeLast != 's' && beforeLast != 'i' && beforeLast != 'u') {
                counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_S), 1.0);
            }
        } else if (word.length() >= 5 && !hasDash && !(hasDigit && numCaps > 0)) {
            // Skip very short words, dashed words, and tokens that mix digits
            // with capitals; otherwise test the common discriminating suffixes.
            if (lowered.endsWith("ed")) {
                counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_ED), 1.0);
            } else if (lowered.endsWith("ing")) {
                counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_ING), 1.0);
            } else if (lowered.endsWith("ion")) {
                counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_ION), 1.0);
            } else if (lowered.endsWith("er")) {
                counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_ER), 1.0);
            } else if (lowered.endsWith("est")) {
                counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_EST), 1.0);
            } else if (lowered.endsWith("ly")) {
                counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_LY), 1.0);
            } else if (lowered.endsWith("ity")) {
                counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_ITY), 1.0);
            } else if (lowered.endsWith("y")) {
                counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_Y), 1.0);
            } else if (lowered.endsWith("al")) {
                counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_AL), 1.0);
            }
        }
    }
}