package hu.u_szeged.kpe.features; import hu.u_szeged.kpe.candidates.NGram; import hu.u_szeged.kpe.candidates.NGramStats; import hu.u_szeged.kpe.readers.DocumentData; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.util.CoreMap; public class StrangeOrthographyFeature extends Feature { private static final long serialVersionUID = 2374136292284809751L; private static final Pattern charRunPattern = Pattern.compile("(?i)([a-z])\\1{2,}"); public StrangeOrthographyFeature() { scale = Scale.BINARY; dummyValue = -1; canBeRepresentedAsSequential = true; collectionToStoreDocVals = HashSet.class; } @Override public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck, List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) { int ngramSize = ngramForm.getKey().size(); StringBuffer concatenatedCharrunPattern = new StringBuffer(); StringBuffer concatenatedStrangeCapitalizationPattern = new StringBuffer(); StringBuffer concatenatedNePattern = new StringBuffer(); for (int position = 0; position < ngramSize; ++position) { CoreLabel cl = ngramForm.getKey().get(position); String originalToken = cl.word(); boolean hasUpperCase = false; Set<Integer> charRuns = new HashSet<Integer>(); Matcher match = charRunPattern.matcher(originalToken); while (match.find()) { charRuns.add(match.end() - match.start()); } for (int i = 1; i < originalToken.length(); ++i) { if (Character.isUpperCase(originalToken.charAt(i))) { hasUpperCase = true; break; } } if (employBIESmarkup) { String type = ngramSize == 1 ? "S" : (position == 0 ? "B" : (position < ngramSize - 1 ? "I" : "E")); updateFeatureVals(type + "_STRANGE_CAPITALIZATION", hasUpperCase ? 1.0d : 0.0d, docToCheck); updateFeatureVals(type + "_NE_" + cl.getString(NamedEntityTagAnnotation.class), 1.0d, docToCheck); for (Integer charRun : charRuns) { if (charRun > 3) updateFeatureVals(type + "_CHARRUN_" + charRun, 1.0d, docToCheck); } } else { concatenatedStrangeCapitalizationPattern.append(hasUpperCase ? "1" : "0"); concatenatedNePattern.append("_" + cl.getString(NamedEntityTagAnnotation.class)); for (Integer charRun : charRuns) { concatenatedCharrunPattern.append("_"); if (charRun > 3) concatenatedCharrunPattern.append(Integer.toString(charRun) + "|"); } } } if (!employBIESmarkup) { updateFeatureVals("CHARRUN_PATTERN" + concatenatedCharrunPattern, 1.0d, docToCheck); updateFeatureVals("NE_PATTERN" + concatenatedNePattern, 1.0d, docToCheck); updateFeatureVals("STRANGE_CAPITALIZATION_PATTERN_" + concatenatedStrangeCapitalizationPattern, 1.0d, docToCheck); } } }