package quickml.supervised.tree.scorers; import quickml.supervised.tree.reducers.AttributeStats; import quickml.supervised.tree.summaryStatistics.ValueCounter; import java.io.Serializable; /** * The scorerFactory is responsible for assessing the quality of a "split" of data. */ public abstract class GRScorer<VC extends ValueCounter<VC>> implements Scorer<VC>, Serializable { protected final double degreeOfGainRatioPenalty; protected final double intrinsicValue; protected final double unSplitScore; public GRScorer(double degreeOfGainRatioPenalty, AttributeStats<VC> attributeStats) { this.degreeOfGainRatioPenalty = degreeOfGainRatioPenalty; this.intrinsicValue = getIntrinsicValue(attributeStats); this.unSplitScore = getUnSplitScore(attributeStats.getAggregateStats()); } private double getIntrinsicValue(AttributeStats<VC> attributeStats) { double intrinsicValue = 0; double attributeValProb = 0; if (attributeStats.getStatsOnEachValue() != null && !attributeStats.getStatsOnEachValue().isEmpty()) { for (VC valueCounter : attributeStats.getStatsOnEachValue()) { if (!valueCounter.isEmpty()) { // if it is empty, it should not be considered. attributeValProb = valueCounter.getTotal() / attributeStats.getAggregateStats().getTotal(); intrinsicValue -= attributeValProb * Math.log(attributeValProb) / Math.log(2); } } } else { intrinsicValue = 1.0; } return intrinsicValue==0.0 ? 1.0 : intrinsicValue; } /** * @return A score, where a higher value indicates a better split. A value * of 0 being the lowest, and indicating no value. */ protected abstract double getUnSplitScore(VC a); protected double correctForGainRatio(double uncorrectedScore) { /** call this method from score split only degreeOfGainRatioPenalty is non zero*/ return uncorrectedScore * (1 - degreeOfGainRatioPenalty) + degreeOfGainRatioPenalty * (uncorrectedScore / intrinsicValue); } }