package weka.classifiers.trees.j48; import weka.core.RevisionUtils; import weka.core.Utils; /** * Class for computing the information gain for a given distribution. * * @author Ryan Lichtenwalter (rlichten@cse.nd.edu) * @version $Revision: 1.00 $ */ public final class HDIGSplitCrit extends EntropyBasedSplitCrit { /** * This method is a straightforward implementation of the HD-IG product distance * criterion for the given distribution. * * @param bags the distribution */ public final double splitCritValue( Distribution bags ) { double n = bags.actualNumClasses(); double hellingerDistance = 0; for( int classIndex1 = 0; classIndex1 < n; classIndex1++ ) { for( int classIndex2 = 0; classIndex2 < n; classIndex2++ ) { if( classIndex1 == classIndex2 ) { continue; } double classCount1 = bags.perClass( classIndex1 ); double classCount2 = bags.perClass( classIndex2 ); double tempDistance = 0; for( int bagIndex = 0; bagIndex < 2; bagIndex++ ) { double classByBagCount1 = bags.perClassPerBag( bagIndex, classIndex1 ); double classByBagCount2 = bags.perClassPerBag( bagIndex, classIndex2 ); double percInBag1 = classByBagCount1 / classCount1; double percInBag2 = classByBagCount2 / classCount2; double rootDifference = Math.sqrt( percInBag1 ) - Math.sqrt( percInBag2 ); tempDistance += Math.pow( rootDifference, 2 ); } hellingerDistance += Math.sqrt( tempDistance ); } } // for multi-class support hellingerDistance = hellingerDistance / ( n * ( n - 1 ) / 2 ); double informationGain = oldEnt(bags)-newEnt(bags); double value = hellingerDistance * informationGain; // Splits with no gain are useless. if( Utils.eq( value, 0 ) ) { return Double.NEGATIVE_INFINITY; } return value; } /** * This method is a straightforward implementation of the HD-IG product distance * criterion for the given distribution. * * @param bags the distribution * @param totalNoInst weight of ALL instances (including the ones with missing values). */ public final double splitCritValue( Distribution bags, double totalNoInst ) { double noUnknown = totalNoInst - bags.total(); double unknownRate = noUnknown / totalNoInst; double distance = splitCritValue( bags ); return (1-unknownRate) * distance; } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract( "$Revision: 1.00 $" ); } }