package hu.u_szeged.kpe.features; import hu.u_szeged.kpe.candidates.NGram; import hu.u_szeged.kpe.candidates.NGram.SequenceType; import hu.u_szeged.kpe.candidates.NGramStats; import hu.u_szeged.kpe.readers.DocumentData; import hu.u_szeged.utils.NLPUtils; import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.Stack; import edu.stanford.nlp.util.CoreMap; /** * Decides whether a given keyphrase aspirant is an extended for of an acronym present in its document. */ public class AcronymFeature extends Feature { private static final long serialVersionUID = -499383355487365213L; private Set<String> abbreviations; public AcronymFeature() { scale = Scale.BINARY; collectionToStoreDocVals = HashSet.class; } /** * Method that checks whether a long term (of more than one token) could be an extended form of of a possible acronym.<br /> * Initials of the individual tokens of the longer form must be consistent with the abbreviation in the sense that they must not be such characters * that are not a successive character of the substring of the abbreviation that is not covered.<br /> * E.g. for pairs * <ul> * <li>("UN", "United Nations") the result would be 1.0d</li> * <li>("UN", "United Kingdom") 0.0d</li> * <li>("UNO", "United Organization") gives the result of 1.0 as well</li> * </ul> * * @param acronym * abbreviation or acronym to check * @param longTerm * possible extension of the shorter form of the other parameter * @return double value indicating when returning 1.0 that the longer term could be an extended form of the shorter form in question */ public static double checkForAcronymity(String acronym, String longTerm) { acronym = acronym.toLowerCase().replaceAll("\\p{Punct}", ""); longTerm = NLPUtils.join(longTerm.split("-")).toLowerCase(); longTerm = longTerm.replaceAll("\\s+and\\s+", " "); if (longTerm.split(" ").length < 2 || longTerm.startsWith(acronym)) return 0.0; ArrayList<Integer> tokenBorders = new ArrayList<Integer>(); Stack<Integer> lastlyInvolved = new Stack<Integer>(); lastlyInvolved.push(0); int whitespace = longTerm.indexOf(" ") + 1; tokenBorders.add(0); tokenBorders.add(whitespace); while ((whitespace = longTerm.indexOf(" ", whitespace)) != -1) tokenBorders.add(++whitespace); int[] matchingChars = new int[longTerm.length()]; boolean[] notForInvolving = new boolean[longTerm.length()]; for (int c = 0; c < matchingChars.length; ++c) { if (c > 0) matchingChars[c] = matchingChars[c - 1]; if (tokenBorders.contains(c)) { if (longTerm.charAt(c) != acronym.charAt(matchingChars[c])) { int wronglyInvolvedIndex; if ((wronglyInvolvedIndex = lastlyInvolved.pop()) == 0) return 0.0; notForInvolving[wronglyInvolvedIndex] = true; for (int index = wronglyInvolvedIndex; index <= c; ++index) { matchingChars[index]--; } c = wronglyInvolvedIndex; } else { matchingChars[c]++; lastlyInvolved.push(c); } } else { if (longTerm.charAt(c) == acronym.charAt(matchingChars[c]) && !notForInvolving[c]) { matchingChars[c]++; lastlyInvolved.push(c); } } if (matchingChars[c] == acronym.length()) if (c < tokenBorders.get(tokenBorders.size() - 1)) { int wronglyInvolvedIndex = lastlyInvolved.pop(); notForInvolving[wronglyInvolvedIndex] = true; for (int index = wronglyInvolvedIndex; index <= c; ++index) { matchingChars[index]--; } c = wronglyInvolvedIndex; } else return 1.0; } return 0.0; } public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck, List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) { if (abbreviations == null) { abbreviations = new HashSet<String>(); for (DocumentData doc : docs) abbreviations.addAll(doc.getAcronyms().keySet()); } String original = ngramForm.getKey().getSequenceAsString(SequenceType.ORIGINAL).replaceAll("-?s$", ""); int matchingAcronyms = 0; for (String acronym : abbreviations) { if (checkForAcronymity(acronym, original) == 1.0) { matchingAcronyms++; break; } } updateFeatureVals(matchingAcronyms, docToCheck); } protected double aggregation(List<Collection<Number>> docVals, String phrase, boolean train, List<int[]> length) { abbreviations = null; return super.aggregation(docVals, phrase, train, length); } }