package hu.u_szeged.kpe.features;

import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.kpe.candidates.NGramStats;
import hu.u_szeged.kpe.main.KPEFilter;
import hu.u_szeged.kpe.readers.DocumentData;
import hu.u_szeged.utils.NLPUtils;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.NormalizerAnnotator.NormalizerAnnotation;
import edu.stanford.nlp.util.CoreMap;

/**
 * Calculates SentiWordNet-based sentiment features for a candidate phrase: the mean and standard deviation of the
 * sentiment values of the tokens surrounding the candidate across the sentences of the containing document, together
 * with several per-token and per-phrase extrema.
 */
public class SentiWordnetFeature extends Feature {

  private static final long serialVersionUID = -2312477566056803372L;

  public SentiWordnetFeature() {
    scale = Scale.NUMERIC;
    collectionToStoreDocVals = LinkedList.class;
  }

  public void setFeatureField(KPEFilter kf) {
    if (KPEFilter.wordList == null) {
      kf.fillWordList("resources/swn/SentiWordNet_3.0.txt");
    }
  }

  public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck,
      List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) {
    // positive/negative score pairs of the tokens forming the candidate n-gram (frozen at its first occurrence)
    List<double[]> sentimentScores = new ArrayList<double[]>(ngramForm.getKey().size());
    boolean ngramSeen = false;
    for (CoreMap sentence : sentences) {
      NGram ngram = new NGram();
      for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
        // normalize the token; tokens that still lack a NormalizerAnnotation are skipped
        NGram.getNormalizedCoreLabel(token);
        if (!token.has(NormalizerAnnotation.class)) {
          continue;
        }
        // avgTokenScore[0]/avgTokenScore[1]: positive/negative scores averaged over the SentiWordNet entries of the token
        List<double[]> vals = KPEFilter.wordList.get(token);
        double[] avgTokenScore = new double[2];
        if (vals != null) {
          for (double[] d : vals) {
            avgTokenScore = new double[] { avgTokenScore[0] + d[0] / vals.size(), avgTokenScore[1] + d[1] / vals.size() };
          }
        }
        // maintain a sliding window of tokens of the same length as the candidate n-gram
        if (ngram.size() == ngramForm.getKey().size()) {
          ngram.remove(0);
          if (!ngramSeen) {
            sentimentScores.remove(0);
          }
        }
        ngram.add(token);
        if (!ngramSeen) {
          sentimentScores.add(avgTokenScore);
        }
        ngramSeen = ngramSeen || ngram.equals(ngramForm.getKey());
        if (avgTokenScore[0] + avgTokenScore[1] >= 0.5) {
          // strongly sentiment-laden token: record its normalized form together with a two-letter POS prefix
          String id = token.tag();
          id = token.getString(NormalizerAnnotation.class) + "_" + id.substring(0, Math.min(id.length(), 2)).toLowerCase();
          updateFeatureVals(this.getClass().getName() + "_" + id, 1.0d, docToCheck);
        }
        updateFeatureVals(avgTokenScore[0] + avgTokenScore[1], docToCheck);
      }
      // inserting null indicates the end of a sentence and is needed to compute some stats (such as stdev)
      updateFeatureVals(null, docToCheck);
    }
    for (double[] sentimentScore : sentimentScores) {
      for (double d : sentimentScore) {
        updateFeatureVals(this.getClass().getName() + "_NGRAM", d, docToCheck);
      }
    }
  }

  public Map<String, Double> aggregateVals(boolean train, String token, List<int[]> length, double[] dedicatedFeatures) {
    Map<String, Double> aggregatedVals = new HashMap<String, Double>();
    String className = this.getClass().getName();
    double maximalSentenceMean = 0.0d;
    double maxPositiveTokenScore = 0.0d;
    double maxNegativeTokenScore = 0.0d;
    double maxTotalTokenScore = 0.0d;
    double[] perDocSentenceMeans = null;
    double[] perDocSentenceStDev = null;
    double[] tokenMeanScores = null;
    int sentences = 0;
    for (Entry<String, List<Collection<Number>>> entry : featureVals.entrySet()) {
      int size = entry.getValue().size();
      for (int doc = 0; doc < size; ++doc) {
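        // featureVals holds three kinds of keys written by value(): the key ending in "Feature" (the scores stored
        // without an explicit key) contains the per-sentence token scores separated by nulls, the "_NGRAM" key
        // contains the positive/negative score pairs of the candidate's own tokens, and the remaining
        // "<normalized form>_<POS prefix>" keys count the strongly sentiment-laden tokens.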
        Collection<Number> docVals = entry.getValue().get(doc);
        if (entry.getKey().endsWith("Feature")) {
          if (perDocSentenceMeans == null) {
            // allocated once per aggregation so that the values of previously processed documents are kept
            perDocSentenceMeans = new double[size];
            perDocSentenceStDev = new double[size];
          }
          List<Number> sentenceAvgs = new LinkedList<Number>();
          List<Number> sentence = new LinkedList<Number>();
          for (Number docVal : docVals) {
            if (docVal == null) {
              // a null marks the end of a sentence
              sentences++;
              sentenceAvgs.add(NLPUtils.mean(sentence));
              sentence = new LinkedList<Number>();
            } else {
              sentence.add(docVal);
            }
          }
          // mean and standard deviation of the per-sentence averages of this document
          double sentenceAvg = NLPUtils.mean(sentenceAvgs);
          perDocSentenceMeans[doc] = sentenceAvg;
          double summ = 0.0;
          for (Number sentAvg : sentenceAvgs) {
            double sentVal = sentAvg.doubleValue();
            if (sentVal > maximalSentenceMean) {
              maximalSentenceMean = sentVal;
            }
            summ += (sentVal - sentenceAvg) * (sentVal - sentenceAvg);
          }
          perDocSentenceStDev[doc] = Math.sqrt(summ / sentenceAvgs.size());
        } else if (entry.getKey().endsWith("_NGRAM")) {
          if (tokenMeanScores == null) {
            tokenMeanScores = new double[size];
          }
          // the values alternate between the positive and the negative score of the candidate's tokens
          int i = 0;
          double tokenSum = 0.0d, pos = 0.0d, neg = 0.0d, totalSum = 0.0d;
          for (Number docVal : docVals) {
            if (i++ % 2 == 0) {
              pos = docVal.doubleValue();
              totalSum += pos;
              if (pos > maxPositiveTokenScore) {
                maxPositiveTokenScore = pos;
              }
            } else {
              neg = docVal.doubleValue();
              totalSum += neg;
              tokenSum = pos + neg;
              if (neg > maxNegativeTokenScore) {
                maxNegativeTokenScore = neg;
              }
              if (tokenSum > maxTotalTokenScore) {
                maxTotalTokenScore = tokenSum;
              }
            }
          }
          tokenMeanScores[doc] = totalSum / docVals.size();
        } else {
          // count the occurrences of each strongly sentiment-laden token id
          String key = entry.getKey().substring(entry.getKey().indexOf(className) + className.length()) + "_";
          Double prevVal = aggregatedVals.get(className + key);
          aggregatedVals.put(className + key, (prevVal == null ? 0 : prevVal) + docVals.size());
        }
      }
    }
    aggregatedVals.put(className + "_MAX_SENT_VAL", maximalSentenceMean);
    aggregatedVals.put(className + "_SENT_MEAN", NLPUtils.mean(perDocSentenceMeans));
    aggregatedVals.put(className + "_SENT_STDEV", NLPUtils.mean(perDocSentenceStDev));
    aggregatedVals.put(className + "_PHRASE_MEAN", NLPUtils.mean(tokenMeanScores));
    aggregatedVals.put(className + "_MAX_POS_TOKEN", maxPositiveTokenScore);
    aggregatedVals.put(className + "_MAX_NEG_TOKEN", maxNegativeTokenScore);
    aggregatedVals.put(className + "_MAX_TOKEN", maxTotalTokenScore);
    // normalize the count-type features (those whose keys end with '_') by the number of sentences seen
    for (Entry<String, Double> entry : aggregatedVals.entrySet()) {
      if (entry.getKey().endsWith("_")) {
        entry.setValue(entry.getValue() / sentences);
      }
    }
    // reset these so that counting can start from the beginning the next time a set of documents is processed
    documentToExamine = -1;
    featureVals = new HashMap<String, List<Collection<Number>>>();
    return aggregatedVals;
  }
}