package quickml.supervised.PredictiveModelsFromPreviousVersionsToBenchMarkAgainst.oldScorers;
import com.google.common.collect.Sets;
import quickml.supervised.PredictiveModelsFromPreviousVersionsToBenchMarkAgainst.OldScorer;
import quickml.supervised.PredictiveModelsFromPreviousVersionsToBenchMarkAgainst.oldTree.OldClassificationCounter;
import java.io.Serializable;
public final class SplitDiffOldScorer implements OldScorer {
/*
* The general idea here is that a good split is one where the proportions
* of classifications on each side of the split are as different as
* possible. eg. if 50% of the classifications in set A are "dog", then the
* further away from 50% the proportion of "dog" classifications in set B
* are, the better.
*
* We therefore add up the differences between the proportions, however we
* have another goal, which is that its preferable for the sets to be of
* close to equal size. Without this requirement a split with 0 on one size
* would get a high score because all of the proportions on that side would
* be 0.
*
* So, we multiply the score by the size of the smallest side, which
* experimentally seems to provide an adequate bias against one-sided
* splits.
*/
@Override
public double scoreSplit(final OldClassificationCounter a, final OldClassificationCounter b) {
double score = 0;
for (final Serializable value : Sets.union(a.allClassifications(), b.allClassifications())) {
final double aProp = (double) a.getCount(value) / a.getTotal();
final double bProp = (double) b.getCount(value) / b.getTotal();
score += Math.abs(aProp - bProp) * Math.min(a.getTotal(), b.getTotal());
}
return score;
}
public String toString() {
return "SplitDiffScorer";
}
}