package hu.u_szeged.kpe.features;
import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.kpe.candidates.NGramStats;
import hu.u_szeged.kpe.readers.DocumentData;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeSet;
import java.util.regex.Pattern;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.CoreMap;
/**
 * Feature that derives numeric values from the parse-tree position of a candidate phrase.
 *
 * <p>For every occurrence of the candidate n-gram in a parsed sentence a "height" is computed (see
 * {@link #syntactTree(CoreMap, NGram)}). Each height is reported under the feature name
 * {@code <class name>_MEAN}, and the smallest height seen across all sentences is reported under
 * {@code <class name>_MIN}.
 */
public class SyntactFeature extends Feature {

  private static final long serialVersionUID = 6730283683709835856L;

  /** Constituent labels treated as noun phrases: "NP" followed by at most two further characters. */
  private static final Pattern NP_LABEL = Pattern.compile("NP.{0,2}");

  /** Configures the feature as numeric, with per-document values kept in a {@link LinkedList}. */
  public SyntactFeature() {
    scale = Scale.NUMERIC;
    collectionToStoreDocVals = LinkedList.class;
  }

  /**
   * Computes one height per occurrence of {@code orthographicForm} in {@code sentence}.
   *
   * <p>The sentence tokens are scanned with a sliding window holding at most
   * {@code orthographicForm.size()} tokens. Whenever the window equals the target n-gram, the
   * height of every token in the window is determined by
   * {@link #heightBelowNounPhrase(Tree, Tree)} and the maximum of those heights (never less than
   * 0) is recorded for that occurrence.
   *
   * @param sentence sentence carrying a {@code TreeAnnotation} and a {@code TokensAnnotation}
   * @param orthographicForm the n-gram whose occurrences are looked for
   * @return the heights of all occurrences in token order; empty if the sentence has no parse
   *         tree, has no tokens, or the target n-gram is empty
   */
  private List<Integer> syntactTree(CoreMap sentence, NGram orthographicForm) {
    List<Integer> occurrenceHeights = new ArrayList<Integer>();
    Tree tree = sentence.get(TreeAnnotation.class);
    List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
    int targetSize = orthographicForm.size();
    // Nothing can be matched without a parse, without tokens, or against an empty target. The
    // empty-target guard also keeps remove(0) below from being called on an empty window.
    if (tree == null || tokens == null || targetSize == 0) {
      return occurrenceHeights;
    }

    List<Tree> leaves = tree.getLeaves();
    // Leaves and tokens are indexed in parallel below; stop at the shorter of the two so a
    // mismatch between them cannot cause an out-of-bounds access.
    int limit = Math.min(leaves.size(), tokens.size());

    NGram window = new NGram();
    for (int l = 0; l < limit; ++l) {
      // Keep the window at the target length by dropping its oldest token first.
      if (window.size() == targetSize) {
        window.remove(0);
      }
      window.add(tokens.get(l));
      if (!window.equals(orthographicForm)) {
        continue;
      }

      // The window ends at index l, so its t-th token counted from the end is leaf l - t.
      int combinedHeight = 0;
      for (int t = 0; t < window.size(); ++t) {
        combinedHeight = Math.max(combinedHeight, heightBelowNounPhrase(tree, leaves.get(l - t)));
      }
      occurrenceHeights.add(combinedHeight);
    }
    return occurrenceHeights;
  }

  /**
   * Determines the height value of a single leaf.
   *
   * <p>Ancestors of the leaf are inspected from 2 levels up to (but excluding) the leaf's depth
   * in {@code root}. For the first ancestor whose label matches {@link #NP_LABEL}, found
   * {@code h} levels up, the result is {@code h - 1}. If no such ancestor exists, the result is
   * the leaf's depth as reported by {@code root.depth(leaf)}.
   *
   * @param root the root of the sentence tree
   * @param leaf a leaf of {@code root}
   * @return {@code h - 1} for the closest matching ancestor, otherwise the leaf depth
   */
  private static int heightBelowNounPhrase(Tree root, Tree leaf) {
    int leafDepth = root.depth(leaf);
    for (int h = 2; h < leafDepth; ++h) {
      Tree ancestor = leaf.ancestor(h, root);
      if (ancestor == null) {
        // No ancestor at this level, so there is nothing higher up to inspect either.
        break;
      }
      String label = ancestor.value();
      if (label != null && NP_LABEL.matcher(label).matches()) {
        return h - 1;
      }
    }
    return leafDepth;
  }

  /**
   * Reports the syntactic heights of the candidate phrase for the given document.
   *
   * <p>Every height found in {@code sentences} is passed to {@code updateFeatureVals} under
   * {@code <class name>_MEAN}; the minimum over all of them is passed once under
   * {@code <class name>_MIN}, with {@link TreeSet} as the collection class argument.
   *
   * <p>Only {@code ngramForm}, {@code docToCheck} and {@code sentences} are read by this
   * implementation; the remaining parameters are unused here.
   *
   * @param phrase unused in this feature
   * @param length unused in this feature
   * @param ngramForm entry whose key is the n-gram to look for in the sentences
   * @param train unused in this feature
   * @param docToCheck document index forwarded to {@code updateFeatureVals}
   * @param listOfHashs unused in this feature
   * @param sentences the sentences to search for occurrences of the n-gram
   * @param docs unused in this feature
   */
  public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck,
      List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) {
    // getClass() (not a class literal) is kept so that subclasses report under their own name.
    String featureName = this.getClass().getName();
    String meanFeature = featureName + "_MEAN";
    NGram target = ngramForm.getKey();

    int minHeight = Integer.MAX_VALUE;
    for (CoreMap sentence : sentences) {
      for (int height : syntactTree(sentence, target)) {
        // Open question carried over from the original code: whether a "(double) vals[1]" term
        // belongs in a denominator here. No such variable exists in this class.
        updateFeatureVals(meanFeature, height, docToCheck);
        minHeight = Math.min(minHeight, height);
      }
    }
    // NOTE(review): when no occurrence is found in any sentence, Integer.MAX_VALUE is reported
    // as the minimum. Kept as-is because the intended fallback is not visible here; confirm.
    updateFeatureVals(featureName + "_MIN", minHeight, docToCheck, TreeSet.class);
  }
}