package hu.u_szeged.kpe.features;

import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.kpe.candidates.NGramStats;
import hu.u_szeged.kpe.readers.DocumentData;

import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;

/**
 * Emits features built from the trailing characters of each token of an NGram: the last two and the last three
 * characters of the lower-cased token text.
 * <p>
 * Suffix characters are collected from the end of the word backwards, so they appear in reversed order inside the
 * generated feature names (the last character of the word comes first). Collection for a token stops early at the
 * first non-letter character encountered while walking backwards from the second-to-last character; the last
 * character itself is always used, whatever it is.
 * <p>
 * Two naming schemes exist, selected by the {@code employBIESmarkup} flag:
 * <ul>
 * <li>flag set: one feature per token and per suffix length, named
 * {@code <class name>_<B|I|E|S>_<reversed suffix>};</li>
 * <li>flag not set: two features per NGram, each the concatenation of {@code _<reversed suffix>} over all tokens,
 * one built from the 2-character suffixes and one from the 3-character suffixes.</li>
 * </ul>
 */
public class SuffixFeature extends Feature {

  private static final long serialVersionUID = 4955319571139381492L;

  /**
   * Number of suffix variants tracked per token: index 0 collects the 2-character suffix, index 1 the 3-character
   * suffix. The longest suffix inspected is therefore {@code SUFFIX_VARIANTS + 1} characters.
   */
  private static final int SUFFIX_VARIANTS = 2;

  public SuffixFeature() {
    scale = Scale.BINARY;
    dummyValue = -1;
    canBeRepresentedAsSequential = true;
    collectionToStoreDocVals = HashSet.class;
  }

  /**
   * Registers the suffix features of the tokens of {@code ngramForm}'s key through {@code updateFeatureVals}, each
   * with the value {@code 1.0} for document {@code docToCheck}.
   * <p>
   * An NGram without tokens produces no features. A token whose text is {@code null} or empty contributes no feature
   * in BIES mode and only the {@code _} separator in the concatenated mode, instead of raising an exception.
   *
   * @param phrase not used by this feature
   * @param length not used by this feature
   * @param ngramForm entry whose key supplies the tokens to inspect; the value is not used
   * @param train not used by this feature
   * @param docToCheck document index forwarded unchanged to {@code updateFeatureVals}
   * @param listOfHashs not used by this feature
   * @param sentences not used by this feature
   * @param docs not used by this feature
   */
  public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck,
      List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) {
    NGram ngram = ngramForm.getKey();
    int ngramLength = ngram.size();
    if (ngramLength == 0) {
      // Nothing to derive a suffix from; without this guard the concatenated mode would dereference null buffers.
      return;
    }

    String featurePrefix = this.getClass().getName();

    // Only the concatenated (non-BIES) mode accumulates text across tokens.
    StringBuilder[] concatenatedSuffixes = null;
    if (!employBIESmarkup) {
      concatenatedSuffixes = new StringBuilder[SUFFIX_VARIANTS];
      for (int s = 0; s < SUFFIX_VARIANTS; ++s) {
        concatenatedSuffixes[s] = new StringBuilder();
      }
    }

    for (int tokenNumber = 0; tokenNumber < ngramLength; ++tokenNumber) {
      CoreLabel cl = ngram.get(tokenNumber);
      String rawWord = cl.word();
      // Locale.ROOT keeps the generated feature names independent of the platform's default locale.
      String word = rawWord == null ? "" : rawWord.toLowerCase(Locale.ROOT);
      int wordLength = word.length();

      if (wordLength == 0) {
        // No characters to read: keep the per-token separator so token positions stay aligned, emit nothing else.
        if (!employBIESmarkup) {
          for (StringBuilder suffix : concatenatedSuffixes) {
            suffix.append('_');
          }
        }
        continue;
      }

      char lastChar = word.charAt(wordLength - 1);
      String type = positionTag(tokenNumber, ngramLength);

      // Reversed suffix of the current token, used only in BIES mode.
      StringBuilder markedSuffix = new StringBuilder();
      markedSuffix.append(lastChar);
      if (!employBIESmarkup) {
        for (StringBuilder suffix : concatenatedSuffixes) {
          suffix.append('_').append(lastChar);
        }
      }

      int longestSuffix = Math.min(wordLength, SUFFIX_VARIANTS + 1);
      for (int suffixLength = 2; suffixLength <= longestSuffix; ++suffixLength) {
        char charToAdd = word.charAt(wordLength - suffixLength);
        if (!Character.isLetter(charToAdd)) {
          break;
        }
        if (employBIESmarkup) {
          markedSuffix.append(charToAdd);
          updateFeatureVals(featurePrefix + "_" + type + "_" + markedSuffix, 1.0d, docToCheck);
        } else {
          // The k-th character from the end belongs to every variant that is at least k characters long.
          for (int s = suffixLength - 2; s < SUFFIX_VARIANTS; ++s) {
            concatenatedSuffixes[s].append(charToAdd);
          }
        }
      }
    }

    if (!employBIESmarkup) {
      for (StringBuilder suffix : concatenatedSuffixes) {
        updateFeatureVals(featurePrefix + suffix.toString(), 1.0d, docToCheck);
      }
    }
  }

  /**
   * Returns the position tag of a token inside its NGram: {@code S} for a single-token NGram, otherwise {@code B} for
   * the first token, {@code E} for the last one and {@code I} for every token in between.
   */
  private static String positionTag(int tokenNumber, int ngramLength) {
    if (ngramLength == 1) {
      return "S";
    }
    if (tokenNumber == 0) {
      return "B";
    }
    return tokenNumber < ngramLength - 1 ? "I" : "E";
  }
}