package quickml.supervised.tree.decisionTree.scorers;

import com.google.common.collect.Sets;
import quickml.supervised.tree.decisionTree.valueCounters.ClassificationCounter;
import quickml.supervised.tree.reducers.AttributeStats;
import quickml.supervised.tree.scorers.GRImbalancedScorer;

import java.io.Serializable;

public final class PenalizedSplitDiffScorer extends GRImbalancedScorer<ClassificationCounter> {

    /*
     * The general idea here is that a good split is one where the proportions
     * of classifications on each side of the split are as different as
     * possible. E.g. if 50% of the classifications in set A are "dog", then the
     * further the proportion of "dog" classifications in set B is from 50%,
     * the better.
     *
     * We therefore add up the differences between the proportions. However, we
     * have another goal, which is that it is preferable for the two sets to be
     * of roughly equal size. Without this requirement, a split with 0 instances
     * on one side would get a high score because all of the proportions on that
     * side would be 0.
     *
     * So we multiply the score by the size of the smaller side, which
     * experimentally seems to provide an adequate bias against one-sided
     * splits.
     */

    public PenalizedSplitDiffScorer(double degreeOfGainRatioPenalty, double imbalancePenaltyPower,
                                    AttributeStats<ClassificationCounter> attributeStats) {
        super(degreeOfGainRatioPenalty, imbalancePenaltyPower, attributeStats);
    }

    @Override
    public double getUnSplitScore(ClassificationCounter a) {
        return 0;
    }

    @Override
    public double scoreSplit(final ClassificationCounter a, final ClassificationCounter b) {
        double score = 0;
        // Sum the absolute differences in class proportions across both sides,
        // weighting each difference by the size of the smaller side.
        for (final Serializable value : Sets.union(a.allClassifications(), b.allClassifications())) {
            final double aProp = (double) a.getCount(value) / a.getTotal();
            final double bProp = (double) b.getCount(value) / b.getTotal();
            score += Math.abs(aProp - bProp) * Math.min(a.getTotal(), b.getTotal());
        }
        return correctForGainRatio(score);
    }

    @Override
    public String toString() {
        return "PenalizedSplitDiffScorer";
    }
}
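
/*
 * A minimal, self-contained sketch (not part of the original quickml source)
 * of the raw proportion-difference score computed by scoreSplit above, using
 * plain HashMaps of class counts instead of ClassificationCounter. The class
 * name PenalizedSplitDiffScorerExample and the toy counts are illustrative
 * assumptions only; the real scorer additionally applies correctForGainRatio
 * and the imbalance penalty inherited from GRImbalancedScorer.
 */
class PenalizedSplitDiffScorerExample {
    public static void main(String[] args) {
        // Side A: 5 "dog" and 5 "cat"; side B: 2 "dog" and 8 "cat".
        java.util.Map<String, Integer> a = new java.util.HashMap<>();
        a.put("dog", 5);
        a.put("cat", 5);
        java.util.Map<String, Integer> b = new java.util.HashMap<>();
        b.put("dog", 2);
        b.put("cat", 8);

        double aTotal = 0;
        for (int count : a.values()) aTotal += count;
        double bTotal = 0;
        for (int count : b.values()) bTotal += count;

        java.util.Set<String> classes = new java.util.HashSet<>(a.keySet());
        classes.addAll(b.keySet());

        double score = 0;
        for (String classification : classes) {
            double aProp = a.getOrDefault(classification, 0) / aTotal;
            double bProp = b.getOrDefault(classification, 0) / bTotal;
            // Weight each proportion difference by the size of the smaller side.
            score += Math.abs(aProp - bProp) * Math.min(aTotal, bTotal);
        }

        // dog: |0.5 - 0.2| * 10 = 3.0; cat: |0.5 - 0.8| * 10 = 3.0; raw score = 6.0
        System.out.println("raw split-diff score = " + score);
    }
}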