package hu.u_szeged.kpe.candidates;

import hu.u_szeged.kpe.readers.KpeReader;
import hu.u_szeged.utils.NLPUtils;
import hu.u_szeged.utils.stemmer.PorterStemmer;

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import edu.smu.tspell.wordnet.Synset;
import edu.smu.tspell.wordnet.SynsetType;
import edu.smu.tspell.wordnet.WordNetDatabase;
import edu.smu.tspell.wordnet.WordSense;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.OriginalTextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.NormalizerAnnotator.NormalizerAnnotation;
import edu.stanford.nlp.pipeline.StopWordAnnotator.StopWordAnnotation;

/**
 * An extension of {@code ArrayList<CoreLabel>} that stores words in a lexicographical ordering (based on
 * CoreLabelComparator), after removing stopword components from the representation.
*/ public class NGram extends ArrayList<CoreLabel> implements Cloneable { private static PorterStemmer ps = new PorterStemmer(); private static final long serialVersionUID = 3797853353962652098l; private static final CoreLabelComparator coreLabelComparator = new CoreLabelComparator(); private static WordNetDatabase wn_database; private String normalizedForm; public enum SequenceType { TAG, STEM, LEMMA, WIKI_FROM, ORIGINAL, NORMALIZED } public NGram() { this(4); } public NGram(int n) { super(n); } public NGram(String toAnnotate) { Annotation annotation = new Annotation(toAnnotate); KpeReader.sentenceAnalyzer.annotate(annotation); for (CoreLabel cl : annotation.get(TokensAnnotation.class)) { add(cl); } setNormalizedForm(); } public NGram(String[] originalForms, String[] lemmas) { for (int i = 0; i < originalForms.length; ++i) { CoreLabel cl = new CoreLabel(); cl.set(TextAnnotation.class, originalForms[i]); cl.set(OriginalTextAnnotation.class, originalForms[i]); cl.set(LemmaAnnotation.class, lemmas[i]); cl.set(PartOfSpeechAnnotation.class, "dummy"); add(cl); } setNormalizedForm(); } public NGram(CoreLabel ew) { this(); add(ew); setNormalizedForm(); } public NGram(Collection<CoreLabel> words) { super(words); setNormalizedForm(); } public boolean add(CoreLabel element) { boolean returnValue = super.add(element); setNormalizedForm(); return returnValue; } public void add(int index, CoreLabel element) { super.add(index, element); setNormalizedForm(); } public boolean equals(Object o) { if (o instanceof CoreLabel) { return equals(new NGram((CoreLabel) o)); // return new NGram((CoreLabel) o).normalizedForm.equals(this.normalizedForm); } else if (o instanceof NGram) { // return ((NGram) o).normalizedForm.equals(this.normalizedForm); NGram n = (NGram) o; if (n.size() != size()) return false; for (int i = 0; i < size(); ++i) { CoreLabel clThis = get(i); CoreLabel other = n.get(i); int comparison = coreLabelComparator.compareForNGramEquality(clThis, other); if (comparison != 0) 
return false; } return true; } else return false; } public int compareTo(NGram ngram) { return this.normalizedForm.compareTo(ngram.normalizedForm); } public int hashCode() { return normalizedForm.hashCode(); } public Object clone() { NGram clone = new NGram(); for (CoreLabel ew : this) clone.add(new CoreLabel(ew)); clone.normalizedForm = normalizedForm; return clone; } public static boolean initWordNet(String wordNetDir) { if (wordNetDir != null && !wordNetDir.matches("(?i)[\\p{Punct}\\s]*false[\\p{Punct}\\s]*")) { if (new File(wordNetDir).isDirectory()) { System.setProperty("wordnet.database.dir", wordNetDir); wn_database = WordNetDatabase.getFileInstance(); return true; } else { System.err.println("The WordNet dictionary directory provided does not exist."); System.err.println("Either disable the usage of WordNet or set its correct location in the config file."); System.exit(1); } } wn_database = null; return false; } private static String getTransformedForm(CoreLabel cl) { String pos = cl.getString(PartOfSpeechAnnotation.class).toLowerCase(); Set<SynsetType> synsetTypes = new HashSet<SynsetType>(Arrays.asList(SynsetType.ALL_TYPES)); if (pos.startsWith("nn")) { synsetTypes = new HashSet<SynsetType>(Arrays.asList(new SynsetType[] { SynsetType.NOUN })); } else if (pos.startsWith("vb")) { synsetTypes = new HashSet<SynsetType>(Arrays.asList(new SynsetType[] { SynsetType.VERB })); } else if (pos.startsWith("jj")) { synsetTypes = new HashSet<SynsetType>(Arrays.asList(new SynsetType[] { SynsetType.ADJECTIVE, SynsetType.ADJECTIVE_SATELLITE })); } Map<String, Integer> aggrFreqs = new HashMap<String, Integer>(); String lemma = cl.get(LemmaAnnotation.class); Synset[] synsets = wn_database.getSynsets(lemma); // System.out.println(lemma + "\t" + pos); for (Synset synset : synsets) { SynsetType actualType = synset.getType(); if (!synsetTypes.contains(actualType)) { continue; } WordSense[] wss = actualType == SynsetType.VERB ? 
synset.getDerivationallyRelatedForms(lemma) : new WordSense[] { new WordSense(lemma, synset) }; for (WordSense w : wss) { Synset s = w.getSynset(); if (pos.startsWith("vb") && s.getType() != SynsetType.NOUN) { continue; } String[] forms = s.getWordForms(); for (String form : forms) { Integer prevdVal = aggrFreqs.get(form); aggrFreqs.put(form, (prevdVal == null ? 0 : prevdVal) + s.getTagCount(form)); } } } String argMax = lemma.replaceAll("([a-z])\\1{2,}", "$1$1"); int freqMax = 0; for (Entry<String, Integer> transformationEntry : aggrFreqs.entrySet()) { if (transformationEntry.getValue() > freqMax && transformationEntry.getKey().split(" ").length == 1) { freqMax = transformationEntry.getValue(); argMax = transformationEntry.getKey(); } } // just in case someone would be curious what kind of rewriting might happen there // if (!lemma.toLowerCase().equals(argMax.toLowerCase())) { // System.err.println(lemma + "-->" + argMax); // } return ps.stemString(argMax); } /** * Sets and returns the normalized representation of a CoreLabel object at the same time. 
* * @param cl * - the CoreLabel to determine the normalized form of * @return - the normalized representation of the CoreLabel parameter (or null if the word is known to be a * stopword) */ public static String getNormalizedCoreLabel(CoreLabel cl) { boolean isStopWord = cl.has(StopWordAnnotation.class) && cl.get(StopWordAnnotation.class); String normalization = ps.stem(cl.lemma().toLowerCase()); if (isStopWord) { cl.set(NormalizerAnnotation.class, cl.lemma().toLowerCase()); return null; } else if (wn_database != null) { normalization = getTransformedForm(cl); } else { normalization = ps.stemString(cl.lemma()); } cl.set(NormalizerAnnotation.class, normalization); return normalization; } private void setNormalizedForm() { List<String> normalizedTokens = new LinkedList<String>(); for (CoreLabel cl : this) { String normalized = getNormalizedCoreLabel(cl); if (normalized != null) { normalizedTokens.add(normalized); } } Collections.sort(normalizedTokens); StringBuilder sb = new StringBuilder(); for (String token : normalizedTokens) { sb.append(token + ' '); } normalizedForm = sb.toString().trim(); } private String[] getSequence(SequenceType type) { String[] sb = new String[size()]; for (int o = 0; o < size(); ++o) { CoreLabel ew = get(o); switch (type) { case TAG: sb[o] = ew.tag(); break; case NORMALIZED: sb[o] = ew.get(NormalizerAnnotation.class); break; case STEM: sb[o] = ps.stem(ew.word().toLowerCase()); break; case LEMMA: sb[o] = ew.get(LemmaAnnotation.class); break; case WIKI_FROM: sb[o] = (o == size() - 1) ? 
ew.get(LemmaAnnotation.class) : ew.word(); break; case ORIGINAL: sb[o] = ew.word(); break; } } return sb; } public String getSequenceAsString(SequenceType type) { return getSequenceAsString(type, ' '); } public String getSequenceAsString(SequenceType type, char joiner) { return NLPUtils.join(getSequence(type), joiner); } public String getCanonicalForm() { if (normalizedForm == null) { setNormalizedForm(); } return normalizedForm; } public String getStemmedStringFrom() { List<String> tokens = new LinkedList<String>(); for (CoreLabel cl : this) { tokens.add(cl.word().toLowerCase()); } Collections.sort(tokens); String ordered = NLPUtils.join(tokens.toArray(new String[tokens.size()])); return ps.stemString(ordered); } public String toString() { StringBuffer sb = new StringBuffer(); for (CoreLabel cl : this) { sb.append(cl.word() + ' '); } return sb.toString().trim(); } }