// Copyright 2013 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package marmot.morph.signature;
import java.util.ArrayList;
import java.util.List;
/**
 * A candidate binary partition of the words of a {@link Trie} according to
 * one {@link Feature}.
 *
 * <p>
 * Words for which the feature returns {@code true} go to the first child
 * (index 0), all other words to the second child (index 1). The split is
 * scored by the sum, over all components of the parent's entropy vector, of
 * the parent entropy minus the size-weighted child entropies, multiplied by
 * the number of words in the parent.
 *
 * <p>
 * Splits order by descending score (see {@link #compareTo(Split)}).
 */
public class Split implements Comparable<Split> {

    /** A child holding less than this fraction of the parent's words invalidates the split. */
    private static final double MIN_CHILD_FRACTION = 0.001;

    /** A child holding fewer words than this invalidates the split. */
    private static final int MIN_CHILD_WORDS = 50;

    /** Maximum number of elements returned by {@link #shorten(List)}. */
    private static final int SHORTEN_LIMIT = 5;

    /** Score of this split; only meaningful when {@link #valid_} is {@code true}. */
    protected double score_;

    /** The two children: index 0 = feature fired, index 1 = feature did not fire. */
    protected List<Trie> children_;

    /** Index (into the feature list) of the feature used for this split. */
    protected int feature_index_;

    /** The trie that is being split. */
    protected Trie trie_;

    /** {@code false} if at least one child is too small (absolutely or relatively). */
    protected boolean valid_;

    /**
     * Partitions the words of {@code trie} by the feature at
     * {@code feature_index} and computes the score of that partition.
     *
     * @param features      list of available features
     * @param trie          the trie whose words and tags are distributed
     * @param feature_index index into {@code features} of the splitting feature
     */
    public Split(List<Feature> features, Trie trie, int feature_index) {
        feature_index_ = feature_index;
        trie_ = trie;
        Feature feature = features.get(feature_index);

        children_ = new ArrayList<Trie>(2);
        children_.add(new Trie(trie, feature_index, 0));
        children_.add(new Trie(trie, feature_index, 1));

        // Distribute every (word, tag) pair of the parent to one of the children.
        int total = trie.words_.size();
        for (int index = 0; index < total; index++) {
            boolean fired = feature.feature(trie.words_.get(index));
            Trie target = children_.get(fired ? 0 : 1);
            target.words_.add(trie.words_.get(index));
            target.tags_.add(trie.tags_.get(index));
        }

        double[] entropy = trie.getEntropy();

        // The size check and the child entropies do not depend on the entropy
        // component, so they are evaluated exactly once per child. Doing the
        // check here (instead of inside the component loop) also guarantees
        // that an undersized child invalidates the split even when the
        // parent's entropy vector is empty.
        int child_count = children_.size();
        double[] proportions = new double[child_count];
        double[][] child_entropies = new double[child_count][];
        int usable = 0; // number of leading children that passed the size check
        valid_ = true;
        for (Trie child : children_) {
            int size = child.words_.size();
            double p = size / (double) total;
            if (p < MIN_CHILD_FRACTION || size < MIN_CHILD_WORDS) {
                valid_ = false;
                break;
            }
            proportions[usable] = p;
            child_entropies[usable] = child.getEntropy();
            usable++;
        }

        // For an invalid split only the children preceding the first
        // undersized one contribute; this keeps score_ identical to what
        // earlier versions computed. Callers should check valid_ first.
        score_ = 0.;
        for (int k = 0; k < entropy.length; k++) {
            double current_score = entropy[k];
            for (int c = 0; c < usable; c++) {
                current_score -= proportions[c] * child_entropies[c][k];
            }
            score_ += current_score;
        }
        score_ *= total;
    }

    /**
     * Orders splits by descending score, so that sorting a collection of
     * splits puts the highest-scoring one first.
     *
     * @param o the split to compare against
     * @return a negative value if this split has the higher score, a positive
     *         value if {@code o} has the higher score, zero if both are equal
     */
    @Override
    public int compareTo(Split o) {
        // Arguments are swapped on purpose: higher score sorts first.
        return Double.compare(o.score_, score_);
    }

    /**
     * Limits a list to its first five elements.
     *
     * @param list the list to shorten
     * @return {@code list} itself if it has fewer than five elements,
     *         otherwise a {@link List#subList(int, int) sub-list view} of its
     *         first five elements
     */
    public static List<String> shorten(List<String> list) {
        if (list.size() < SHORTEN_LIMIT) {
            return list;
        }
        return list.subList(0, SHORTEN_LIMIT);
    }
}