/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. S�nchez (luciano@uniovi.es)
J. Alcal�-Fdez (jalcala@decsai.ugr.es)
S. Garc�a (sglopez@ujaen.es)
A. Fern�ndez (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
package keel.Algorithms.Genetic_Rule_Learning.PART;
import java.util.Enumeration;
import java.util.Vector;
/**
* <p>
* Class to implement the calculus of the cut point
* </p>
*
* <p>
* @author Written by Crist�bal Romero Morales (University of Oviedo) 01/07/2008
* @author Modified by Xavi Sol� (La Salle, Ram�n Llull University - Barcelona) 03/12/2008
* @version 1.1
* @since JDK1.2
* </p>
*/
public class Cut{
/**Classification of class values. */
protected Classification classification;
/** Number of subsets. */
protected int numSubsets;
/** Number of branches. */
private int nBranches;
/** Attribute to cut on. */
private int attributeIndex;
/** Minimum number of itemsets per leaf. */
private int minItemsets;
/** Cut point. */
private double cutPoint;
/** Information gain of cut. */
private double infoGain;
/** Gain ratio of cut. */
private double gainRatio;
/** The sum of the weights of the itemsets. */
private double sumOfWeights;
/** Number of cut points. */
private int nCuts;
/** Function to initialize the cut model.
*
* @param index The attribute index.
* @param nObj Minimum number of itemsets.
* @param weights The weight of all the itemsets.
*/
public Cut( int index,int nObj, double weights )
{
// Get index of attribute to cut on.
attributeIndex = index;
// Set minimum number of objects.
minItemsets = nObj;
// Set the sum of the weights
sumOfWeights = weights;
}
/** Function to use when no cut is necessary.
*
* @param dist Distribution of values per class.
*/
public Cut( Classification dist )
{
classification = new Classification( dist );
numSubsets = 1;
}
/** Function to create the cut point.
*
* @param trainItemsets The dataset to classify.
*
* @throws Exception If the classification cannot be made.
*/
public void classify( MyDataset trainItemsets ) throws Exception
{
if ( numSubsets == 1 )
classification = new Classification( trainItemsets );
else
{
// Initialize the remaining itemset variables.
numSubsets = 0;
cutPoint = Double.MAX_VALUE;
infoGain = 0;
gainRatio = 0;
// Different treatment for enumerated and numeric attributes.
if ( trainItemsets.getAttribute( attributeIndex ).isDiscret() )
{
if ( nBranches != 2 )
{
nBranches = trainItemsets.getAttribute( attributeIndex ).numValues();
nCuts = nBranches;
}
else
nCuts = 0;
cutDiscret( trainItemsets );
}
else
{
nCuts = 0;
trainItemsets.sort( attributeIndex );
cutContinuous( trainItemsets );
}
}
}
/** Function to compute the probability for itemset.
*
* @param classIndex The index of the class.
* @param itemset The itemset.
* @param subset The index of the subset.
*
* @return The probability computed.
*/
public final double classProbability( int classIndex, Itemset itemset, int subset )
{
if( numSubsets == 1 )
{
if ( subset > -1 )
return classification.probability( classIndex, subset );
else
{
double [] weights = weights( itemset );
if ( weights == null )
return classification.probability( classIndex );
else
{
double prob = 0;
for ( int i = 0; i < weights.length; i++ )
prob += weights[i] * classification.probability( classIndex, i );
return prob;
}
}
}
else
{
if ( subset <= -1 )
{
double [] weights = weights( itemset );
if ( weights == null )
return classification.probability( classIndex );
else
{
double prob = 0;
for ( int i = 0; i < weights.length; i++ )
prob += weights[i] * classification.probability( classIndex, i );
return prob;
}
}
else
{
if ( classification.perValue( subset ) > 0 )
return classification.probability( classIndex, subset );
else
{
if ( classification.maxClass() == classIndex )
return 1;
else
return 0;
}
}
}
}
/** Function to create the cut on continuous attributes.
*
* @param trainItemsets The dataset used to compute the cut.
*/
private void cutContinuous( MyDataset trainItemsets )
{
int firstMiss, next = 1, last = 0, cutIndex = -1, i;
double currentInfoGain, defaultEnt, minCut;
Itemset itemset;
// Current attribute is a numeric attribute.
classification = new Classification( 2, trainItemsets.numClasses() );
// Only Dataset with known values are relevant.
Enumeration enum2 = trainItemsets.enumerateItemsets();
i = 0;
while ( enum2.hasMoreElements() )
{
itemset = (Itemset) enum2.nextElement();
if ( itemset.isMissing( attributeIndex ) )
break;
classification.add( 1, itemset );
i++;
}
firstMiss = i;
// Compute minimum number of Dataset required in each subset.
minCut = 0.1*( classification.getTotal() ) / ( (double)trainItemsets.numClasses() );
if ( minCut <= minItemsets )
minCut = minItemsets;
else if ( minCut > 5 )
minCut = 25;
// Enough Dataset with known values?
if ( (double)firstMiss < 2 * minCut )
return;
// Compute values of criteria for all possible cut indices.
defaultEnt = oldEntropy( classification );
while ( next < firstMiss )
{
if ( trainItemsets.itemset( next - 1 ).getValue( attributeIndex )
+ 1e-5 < trainItemsets.itemset( next ).getValue( attributeIndex ) )
{
// Move class values for all Dataset up to next
// possible cut point.
classification.shiftRange( 1, 0, trainItemsets, last, next );
// Check if enough Dataset in each subset and compute
// values for criteria.
if ( classification.perValue( 0 ) >= minCut &&
classification.perValue( 1 ) >= minCut )
{
currentInfoGain = infoGainCutCrit(classification, sumOfWeights, defaultEnt );
if ( currentInfoGain > infoGain )
{
infoGain = currentInfoGain;
cutIndex = next-1;
}
nCuts++;
}
last = next;
}
next++;
}
// Was there any useful cut?
if ( nCuts == 0 )
return;
// Compute modified information gain for best cut.
infoGain = infoGain - ( ( Math.log( nCuts ) / Math.log( 2 ) ) / sumOfWeights );
if ( infoGain <= 0 )
return;
// Set itemset variables' values to values for best cut.
numSubsets = 2;
cutPoint = ( trainItemsets.itemset( cutIndex + 1 ).getValue( attributeIndex )+
trainItemsets.itemset( cutIndex ).getValue( attributeIndex ) ) / 2;
// Restore classification for best cut.
classification = new Classification( 2, trainItemsets.numClasses() );
classification.addRange( 0, trainItemsets, 0, cutIndex + 1 );
classification.addRange( 1, trainItemsets, cutIndex + 1, firstMiss );
// Compute modified gain ratio for best cut.
gainRatio = gainRatioCutCrit( classification, sumOfWeights, infoGain );
}
/** Function to create the cut on discret attributes.
*
* @param trainItemsets The dataset used to compute the cut.
*/
private void cutDiscret( MyDataset trainItemsets )
{
Itemset itemset;
classification = new Classification( nBranches, trainItemsets.numClasses() );
// Only Dataset with known values are relevant.
Enumeration enum2 = trainItemsets.enumerateItemsets();
while ( enum2.hasMoreElements() )
{
itemset = (Itemset) enum2.nextElement();
if ( !itemset.isMissing( attributeIndex ) )
classification.add( (int)itemset.getValue( attributeIndex ), itemset );
}
// Check if minimum number of Dataset in at least two subsets.
if ( classification.check( minItemsets ) )
{
numSubsets = nBranches;
infoGain = infoGainCutCrit( classification, sumOfWeights, oldEntropy( classification ) );
gainRatio = gainRatioCutCrit( classification, sumOfWeights, infoGain );
}
}
/** Function to set the cut point.
*
* @param allItemsets The dataset used for the cut.
*/
public final void setCutPoint( MyDataset allItemsets )
{
double newCutPoint = -Double.MAX_VALUE;
double tempValue;
Itemset itemset;
if ( ( allItemsets.getAttribute( attributeIndex ).isContinuous() ) && ( numSubsets > 1 ) )
{
Enumeration enum2 = allItemsets.enumerateItemsets();
while ( enum2.hasMoreElements() )
{
itemset = (Itemset) enum2.nextElement();
if ( !itemset.isMissing( attributeIndex ) )
{
tempValue = itemset.getValue( attributeIndex );
if ( tempValue > newCutPoint && tempValue <= cutPoint )
newCutPoint = tempValue;
}
}
cutPoint = newCutPoint;
}
}
/** Function to cut the dataset in subsets.
*
* @param data The dataset to cut.
*
* @return All the datasets created.
*
* @throws Exception If the dataset cannot be cut.
*/
public final MyDataset [] cutDataset( MyDataset data ) throws Exception
{
MyDataset [] itemsets = new MyDataset[numSubsets];
double [] weights;
double newWeight;
Itemset itemset;
int subset, i, j;
for ( j = 0; j < numSubsets; j++ )
itemsets[j] = new MyDataset( (MyDataset)data, data.numItemsets() );
for ( i = 0; i < data.numItemsets(); i++ )
{
itemset = ( (MyDataset) data ).itemset( i );
weights = weights( itemset );
subset = whichSubset( itemset );
if ( subset > -1 )
itemsets[subset].addItemset( itemset );
else
for ( j = 0; j < numSubsets; j++ )
if ( weights[j] > 0 )
{
newWeight = weights[j] * itemset.getWeight();
itemsets[j].addItemset( itemset );
itemsets[j].lastItemset().setWeight( newWeight );
}
}
for ( j = 0; j < numSubsets; j++ )
( (Vector)itemsets[j].itemsets ).trimToSize();
return itemsets;
}
/** Function to reset the classification of the model.
*
* @param data The new dataset used.
*
* @throws Exception If the classification cannot be reset.
*/
public void resetClassification( MyDataset data ) throws Exception
{
if ( numSubsets == 1 )
classification = new Classification( data, this );
else
{
MyDataset insts = new MyDataset( data, data.numItemsets() );
for ( int i = 0; i < data.numItemsets(); i++ )
if ( whichSubset( data.itemset( i ) ) > -1 )
insts.addItemset( data.itemset( i ) );
Classification newD = new Classification( insts, this );
newD.addWithUnknownValue( data, attributeIndex );
classification = newD;
}
}
/** Returns weights if itemset is assigned to more than one subset, null otherwise.
*
* @param itemset The itemset.
* @return weights if itemset is assigned to more than one subset, null otherwise.
*/
public final double [] weights( Itemset itemset )
{
if ( numSubsets == 1 )
return null;
else
{
double [] weights;
int i;
if ( itemset.isMissing( attributeIndex ) )
{
weights = new double [numSubsets];
for ( i = 0; i < numSubsets; i++ )
weights [i] = classification.perValue( i ) / classification.getTotal();
return weights;
}
else
return null;
}
}
/** Returns index of subset itemset is assigned to.
*
* @param itemset The itemset.
* @return index of subset itemset is assigned to.
*/
public final int whichSubset( Itemset itemset )
{
if ( numSubsets == 1 )
return 0;
else
{
if ( itemset.isMissing( attributeIndex ) )
return -1;
else
{
if ( itemset.getAttribute( attributeIndex ).isDiscret() )
return (int)itemset.getValue( attributeIndex );
else if ( itemset.getValue( attributeIndex ) <= cutPoint )
return 0;
else
return 1;
}
}
}
/** Function to check if generated model is valid.
*
* @return True if the model is valid. False otherwise.
*/
public final boolean checkModel()
{
if ( numSubsets > 0 )
return true;
else
return false;
}
/** Returns the classification created by the model.
* @return the classification created by the model.
*/
public final Classification classification()
{
return classification;
}
/** Returns the number of created subsets for the cut.
* @return the number of created subsets for the cut.
*/
public final int numSubsets()
{
return numSubsets;
}
/** Function to compute the gain ratio.
*
* @param values The classification used to compute the gain ratio.
* @param totalnoInst Number of itemsets.
* @param numerator The information gain.
*
* @return The gain ratio for the classification.
*/
public final double gainRatioCutCrit( Classification values, double totalnoInst, double numerator )
{
double denumerator, noUnknown, unknownRate;
int i;
// Compute cut info.
denumerator = cutEntropy( values, totalnoInst );
// Test if cut is trivial.
if ( denumerator == 0 )
return 0;
denumerator = denumerator / totalnoInst;
return numerator / denumerator;
}
/** Function to compute the information gain.
*
* @param values The classification used to compute the information gain.
* @param totalNoInst Number of itemsets.
* @param oldEnt The value for the entropy before cutting.
*
* @return The information gain.
*/
public final double infoGainCutCrit( Classification values, double totalNoInst, double oldEnt )
{
double numerator, noUnknown, unknownRate;
int i;
noUnknown = totalNoInst - values.getTotal();
unknownRate = noUnknown / totalNoInst;
numerator = ( oldEnt - newEntropy( values ) );
numerator = ( 1 - unknownRate ) * numerator;
// Cuts with no gain are useless.
if ( numerator == 0 )
return 0;
return numerator / values.getTotal();
}
/** Function to compute the cut entropy.
*
* @param values The classification used to compute the entropy.
* @param totalnoInst Number of itemsets.
*
* @return The entropy of the cut.
*/
private final double cutEntropy( Classification values, double totalnoInst )
{
double returnValue = 0, noUnknown;
int i;
noUnknown = totalnoInst - values.getTotal();
if ( values.getTotal() > 0 )
{
for ( i = 0; i < values.numValues(); i++ )
returnValue = returnValue - logFunc( values.perValue( i ) );
returnValue = returnValue - logFunc( noUnknown );
returnValue = returnValue + logFunc( totalnoInst );
}
return returnValue;
}
/** Function to compute entropy of classification before cutting.
*
* @param values The classification used to compute the entropy before cutting.
*
* @return The entropy for the classification before cutting.
*/
public final double oldEntropy( Classification values )
{
double returnValue = 0;
int j;
for ( j = 0; j < values.numClasses(); j++ )
returnValue = returnValue + logFunc( values.perClass( j ) );
return logFunc( values.getTotal() ) - returnValue;
}
/** Function to compute entropy of classification after cutting.
*
* @param values The classification used to compute the entropy after cutting.
*
* @return The entropy for the classification after cutting.
*/
public final double newEntropy( Classification values )
{
double returnValue = 0;
int i, j;
for ( i = 0; i < values.numValues(); i++ )
{
for ( j = 0; j < values.numClasses(); j++ )
returnValue = returnValue + logFunc( values.perClassPerValue( i, j ) );
returnValue = returnValue - logFunc( values.perValue( i ) );
}
return -returnValue;
}
/** Returns the log2
*
* @param num The number to compute the log2.
* @return the log2
*/
protected final double logFunc( double num )
{
// Constant hard coded for efficiency reasons
if ( num < 1e-6 )
return 0;
else
return num * Math.log( num ) / Math.log( 2 );
}
/** Returns information gain for the generated cut.
* @return information gain for the generated cut.
*/
public final double getInfoGain()
{
return infoGain;
}
/** Returns the gain ratio for the cut.
* @return the gain ratio for the cut.
*/
public final double getGainRatio()
{
return gainRatio;
}
/** Function to print left side of condition.
*
* @param data The dataset.
*
* @return The name of the attribute used in the cut.
*/
public final String leftSide( MyDataset data )
{
if ( numSubsets == 1 )
return "";
else
return data.getAttribute( attributeIndex ).name();
}
/** Function to print the condition satisfied by itemsets in a subset.
*
* @param index The index of the value.
* @param data The dataset.
*
* @return The value for the attribute of the cut.
*/
public final String rightSide( int index, MyDataset data )
{
if ( numSubsets == 1 )
return "";
else
{
StringBuffer text;
text = new StringBuffer();
if ( data.getAttribute( attributeIndex ).isDiscret() )
text.append( " = " + data.getAttribute( attributeIndex ).value( index ) );
else if (index == 0)
text.append( " <= " + doubleToString( cutPoint, 6 ) );
else
text.append( " > " + doubleToString( cutPoint, 6 ) );
return text.toString();
}
}
/** Function to print label for subset index of itemsets.
*
* @param index The index of the subset.
* @param data The dataset.
*
* @return The label created.
*/
public final String label( int index, MyDataset data )
{
StringBuffer text;
text = new StringBuffer();
text.append( ( (MyDataset)data ).getClassAttribute().value( classification.maxClass( index ) ) );
return text.toString();
}
/** Returns the index of the attribute to cut on.
* @return the index of the attribute to cut on.
*/
public final int attributeIndex()
{
return attributeIndex;
}
/** Function to round a double and converts it into String.
*
* @param value The value to print.
* @param afterDecimalPoint Number of decimals positions.
*
* @return The value with the given number of decimals.
*/
public static String doubleToString( double value, int afterDecimalPoint )
{
StringBuffer stringBuffer;
double temp;
int i,dotPosition;
long precisionValue;
temp = value * Math.pow( 10.0, afterDecimalPoint );
if ( Math.abs( temp ) < Long.MAX_VALUE )
{
precisionValue = ( temp > 0 ) ? (long)( temp + 0.5 ) : -(long)( Math.abs( temp ) + 0.5 );
if ( precisionValue == 0 )
stringBuffer = new StringBuffer( String.valueOf( 0 ) );
else
stringBuffer = new StringBuffer( String.valueOf( precisionValue ) );
if ( afterDecimalPoint == 0 )
return stringBuffer.toString();
dotPosition = stringBuffer.length() - afterDecimalPoint;
while ( ( (precisionValue < 0 ) && ( dotPosition < 1 ) ) || ( dotPosition < 0 ) )
{
if ( precisionValue < 0 )
stringBuffer.insert( 1, 0 );
else
stringBuffer.insert( 0, 0 );
dotPosition++;
}
stringBuffer.insert( dotPosition, '.' );
if ( ( precisionValue < 0 ) && ( stringBuffer.charAt(1) == '.' ) )
stringBuffer.insert( 1, 0 );
else if ( stringBuffer.charAt( 0 ) == '.' )
stringBuffer.insert( 0, 0 );
int currentPos = stringBuffer.length() - 1;
if ( stringBuffer.charAt( currentPos ) == '.' )
stringBuffer.setCharAt( currentPos, ' ' );
return stringBuffer.toString().trim();
}
return new String("" + value);
}
/** Function to round a double and converts it into String.
*
* @param value The value to print.
* @param width The width that must have the string generated.
* @param afterDecimalPoint Number of decimals positions.
*
* @return The value with the given number of decimals.
*/
public static String doubleToString( double value, int width, int afterDecimalPoint)
{
String tempString = doubleToString( value, afterDecimalPoint );
char[] result;
int dotPosition;
// Protects sci notation
if ( ( afterDecimalPoint >= width ) || ( tempString.indexOf( 'E' ) != -1 ) )
return tempString;
// Initialize result
result = new char[width];
for ( int i = 0; i < result.length; i++ )
result[i] = ' ';
if ( afterDecimalPoint > 0 )
{
// Get position of decimal point and insert decimal point
dotPosition = tempString.indexOf( '.' );
if ( dotPosition == -1 )
dotPosition = tempString.length();
else
result[width - afterDecimalPoint - 1] = '.';
}
else
dotPosition = tempString.length();
int offset = width - afterDecimalPoint - dotPosition;
if ( afterDecimalPoint > 0 )
offset--;
// Not enough room to decimal align within the supplied width
if ( offset < 0 )
return tempString;
// Copy characters before decimal point
for ( int i = 0; i < dotPosition; i++ )
result[offset + i] = tempString.charAt( i );
// Copy characters after decimal point
for ( int i = dotPosition + 1; i < tempString.length(); i++ )
result[offset + i] = tempString.charAt( i );
return new String( result );
}
/**
* It returns the cutpoint
*
* @return the cutpoint
*/
public double getCutPoint(){
return cutPoint;
}
}