package hu.u_szeged.kpe.features;
import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.kpe.candidates.NGramStats;
import hu.u_szeged.kpe.main.KPEFilter;
import hu.u_szeged.kpe.readers.DocumentData;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.NormalizerAnnotator.NormalizerAnnotation;
import edu.stanford.nlp.trees.EnglishGrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.trees.TreeGraphNode;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
/**
 * Feature relating a candidate phrase to "indicator" tokens that occur in the same sentences.
 * <p>
 * An indicator token is a token for which {@code nominalVals.contains(token)} holds or whose surface form
 * shows "strange orthography" (a letter repeated three or more times, e.g. {@code "sooo"}).
 * <ul>
 * <li>For sentences that have a parse tree, the feature looks for the lowest ancestor of the indicator token
 * whose leaf span contains the candidate n-gram. Per normalized indicator form the largest such ancestor
 * height is kept and {@code 1 / height} is reported.</li>
 * <li>For sentences without a parse tree, the number of indicator tokens (from {@code nominalVals} only) is
 * counted per normalized form and reported.</li>
 * </ul>
 */
public class IndicatingVerbsFeature extends NominalFeature {

  private static final long serialVersionUID = -6214178223025494415L;

  /** Creates the feature and marks its scale as numeric. */
  public IndicatingVerbsFeature() {
    scale = Scale.NUMERIC;
  }

  /**
   * Initializes {@code nominalVals} from {@link KPEFilter#determineIndicatingVerbs}.
   *
   * @param kf filter that supplies the indicating verbs; the argument 20 is presumably the number of verbs
   *          to keep -- TODO confirm against KPEFilter
   */
  public void setFeatureField(KPEFilter kf) {
    nominalVals = kf.determineIndicatingVerbs(20);
  }

  /**
   * Computes the feature values of one candidate for the given sentences and hands them to
   * {@code updateFeatureVals}, one value per normalized indicator form. The feature name is the class name
   * followed by {@code "_"} and the normalized form.
   *
   * @param phrase not used by this feature
   * @param length not used by this feature
   * @param ngramForm entry whose key is the candidate n-gram that is searched for in the sentences
   * @param train not used by this feature
   * @param docToCheck passed through unchanged to {@code updateFeatureVals}
   * @param listOfHashs not used by this feature
   * @param sentences sentences to inspect; a sentence without a {@link TreeAnnotation} is handled by simple
   *          occurrence counting
   * @param docs not used by this feature
   */
  public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck,
      List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) {
    Map<String, Integer> indicatorHeights = new HashMap<String, Integer>();
    Map<String, Integer> indicatorOccurrences = new HashMap<String, Integer>();
    for (CoreMap sentence : sentences) {
      List<CoreLabel> sentenceTokens = sentence.get(TokensAnnotation.class);
      Tree sentenceTree = sentence.get(TreeAnnotation.class);
      if (sentenceTree != null) {
        List<Tree> leaves = sentenceTree.getLeaves();
        for (int token = 0; token < sentenceTokens.size(); ++token) {
          CoreLabel cl = sentenceTokens.get(token);
          String word = cl.word();
          // Masking a literal "www." keeps hyperlinks from being regarded as strange-orthography tokens. The
          // dot is escaped so that elongated words such as "awwww" are not masked by accident. The check is
          // applied to every document type.
          int difference = word.length()
              - word.replaceAll("(?i)www\\.", "????").replaceAll("(?i)([a-z])\\1{2,}", "$1").length();
          boolean strangeOrthography = difference > 0 && difference != word.length();
          // NOTE(review): nominalVals is queried with the CoreLabel itself, whereas the result maps below are
          // keyed by the token's NormalizerAnnotation string -- confirm the element type of nominalVals.
          if (!strangeOrthography && !nominalVals.contains(cl)) {
            continue;
          }
          NGram candidate = ngramForm.getKey();
          int candidateSize = candidate.size();
          Tree targetLeaf = leaves.get(token);
          // Start positions already examined for a lower ancestor of THIS leaf. The spans of successive
          // ancestors of one leaf are nested, so the interval must be reset for every token; reusing the
          // interval of a previous token could hide the candidate from the current one.
          int[] checkedInterval = { Integer.MAX_VALUE, Integer.MIN_VALUE };
          boolean containsNGram = false;
          for (int h = 2; !containsNGram && h <= sentenceTree.depth(targetLeaf); ++h) {
            Tree ancestor = targetLeaf.ancestor(h, sentenceTree);
            List<Tree> ancestorLeaves = ancestor.getLeaves();
            // NOTE(review): List.indexOf relies on Tree.equals; if that is value based rather than identity
            // based, a repeated word could resolve to an earlier leaf -- confirm.
            int startIndex = leaves.indexOf(ancestorLeaves.get(0));
            int endIndex = startIndex + ancestorLeaves.size() - candidateSize;
            for (int i = startIndex; i <= endIndex; ++i) {
              if (i >= checkedInterval[0] && i <= checkedInterval[1]) {
                continue;
              }
              List<CoreLabel> successiveTokens = sentenceTokens.subList(i, i + candidateSize);
              NGram dummyNGram = new NGram(successiveTokens);
              // a match that starts at the indicator token itself does not count
              if (i != token && dummyNGram.equals(candidate)) {
                containsNGram = true;
                break;
              }
            }
            checkedInterval = new int[] { startIndex, endIndex };
            if (!containsNGram) {
              continue;
            }
            // h is the height of the lowest ancestor covering both the indicator and the candidate. Look the
            // previous value up by the same normalized key that is used for storing it; looking it up by the
            // CoreLabel never finds anything in a String-keyed map.
            String normalizedForm = cl.get(NormalizerAnnotation.class);
            Integer prevDepth = indicatorHeights.get(normalizedForm);
            if (prevDepth == null || prevDepth < h) {
              indicatorHeights.put(normalizedForm, h);
            }
            if (strangeOrthography) {
              // NOTE(review): the traversal below walks the dependency structure of the ancestor, starting
              // from the nodes directly connected to the token, but nothing it collects is read afterwards
              // in this method. It is kept as is; confirm whether it can be removed.
              GrammaticalStructure depStruct = new EnglishGrammaticalStructure(ancestor);
              Collection<TypedDependency> deps = depStruct.allTypedDependencies();
              List<TreeGraphNode> seedNodes = new LinkedList<TreeGraphNode>();
              for (TypedDependency typedDep : deps) {
                if (typedDep.dep().label().word().equals(word)) {
                  seedNodes.add(typedDep.gov());
                } else if (typedDep.gov().label().word().equals(word)) {
                  seedNodes.add(typedDep.dep());
                }
              }
              Iterator<TreeGraphNode> nodeIt = seedNodes.iterator();
              while (nodeIt.hasNext()) {
                TreeGraphNode next = nodeIt.next();
                nodeIt.remove();
                Set<TreeGraphNode> newNodes = Generics.newHashSet();
                Set<Tree> nodes = depStruct.root().subTrees();
                for (Iterator<Tree> it = nodes.iterator(); it.hasNext();) {
                  TreeGraphNode node = (TreeGraphNode) it.next();
                  TreeGraphNode gov = node.getGovernor();
                  if (gov != null && gov == next) {
                    newNodes.add(node);
                  }
                }
                seedNodes.addAll(newNodes);
                // the list was modified, so a fresh iterator is needed
                nodeIt = seedNodes.iterator();
              }
            }
          }
        }
      } else {
        // TODO implement what should happen for Hungarian language
        // Without a parse tree only the occurrences of the indicator tokens are counted.
        for (CoreLabel ew : sentenceTokens) {
          if (nominalVals.contains(ew)) {
            // read and write the counter under the same normalized key so that it really accumulates
            String normalizedForm = ew.get(NormalizerAnnotation.class);
            Integer num = indicatorOccurrences.get(normalizedForm);
            indicatorOccurrences.put(normalizedForm, num == null ? 1 : num + 1);
          }
        }
      }
    }
    for (Entry<String, Integer> indicator : indicatorOccurrences.entrySet()) {
      updateFeatureVals(this.getClass().getName() + "_" + indicator.getKey(), indicator.getValue(), docToCheck);
    }
    for (Entry<String, Integer> indicator : indicatorHeights.entrySet()) {
      // the reported value is the reciprocal of the recorded height
      double d = 1 / (double) indicator.getValue();
      updateFeatureVals(this.getClass().getName() + "_" + indicator.getKey(), d, docToCheck, TreeSet.class);
    }
  }
}