/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. Sánchez (luciano@uniovi.es)
J. Alcalá-Fdez (jalcala@decsai.ugr.es)
S. García (sglopez@ujaen.es)
A. Fernández (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
package keel.Algorithms.Decision_Trees.PUBLIC;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Random;
import java.util.Set;
/**
*
* File: Node.java
*
* Data structure that is used in the construction of the decision tree.
* It stores the information about the attributes as one list of Registers per
* attribute. The node also keeps histograms that summarize this information.
*
* @author Written by Victoria Lopez Morales (University of Granada) 15/03/2009
* @version 1.0
* @since JDK1.5
*/
public class Node {
/**
* Identifier of the node
*/
int identifier;
/**
* Descriptor of each attribute of the node (queried for its name, whether it is nominal
* and its list of values)
*/
ArrayList <myAttribute> the_attributes;
/**
* Separate list of registers for each attribute; position i holds the registers of
* attribute i
*/
ArrayList <ArrayList <Register>> attributes;
// Histograms for the attributes: every list below has one entry per attribute
/**
* Number of rows of the histogram of each attribute: 1 for numeric attributes and the
* number of values of the attribute for nominal ones
*/
int [] rows;
/**
* Number of columns of the histogram of each attribute, i.e. the number of values of
* the output attribute
*/
int [] columns;
/**
* Histogram for categorical attributes, indexed as [attribute value][output class].
* Numeric attributes store a 1x1 placeholder at their position
*/
ArrayList <int [][]> count_matrix;
/**
* Histogram for continuous attributes c_below, indexed as [0][output class].
* Nominal attributes store a 1x1 placeholder at their position
*/
ArrayList <int [][]> c_below;
/**
* Histogram for continuous attributes c_above, indexed as [0][output class].
* Nominal attributes store a 1x1 placeholder at their position
*/
ArrayList <int [][]> c_above;
/**
* Generator for pseudorandom numbers, used to break ties between majority classes.
* The same instance is shared with copies and descendants of the node
*/
Random generator;
/**
* Creates a node with a base constructor that doesn't initialize the node structures.
* Every field keeps its default value (null or 0) until the caller assigns it, as
* split() does for the descendants it builds.
*/
public Node () {
}
/**
 * Creates a node from a complete dataset and its corresponding id. It is the most common
 * way of constructing a node.
 *
 * For every attribute of the dataset the constructor builds one list of registers (instance
 * index, attribute value, output class), sorts that list when the attribute is numeric and
 * records the dimensions of the attribute histogram. Afterwards the histograms are computed
 * and the pseudorandom generator is created with a fixed seed.
 *
 * @param data Dataset that has the data that is going to be stored in the node
 * @param id Number identifying the node that is being created
 */
public Node (myDataset data, int id) {
    int numAttributes = data.getNumAtr();
    int numInstances = data.getNumIns();

    identifier = id;
    the_attributes = new ArrayList <myAttribute> ();
    attributes = new ArrayList <ArrayList <Register>> ();
    rows = new int[numAttributes];
    columns = new int[numAttributes];

    for (int j = 0; j < numAttributes; j++) {
        myAttribute att = data.getAttributeI(j);
        the_attributes.add(new myAttribute(att));

        // (1) Attribute list: one register per instance
        ArrayList <Register> reglist = new ArrayList <Register> ();
        for (int i = 0; i < numInstances; i++) {
            reglist.add(new Register(i, data.getDataI(i, j), data.getOutputI(i)));
        }

        // Histogram columns always correspond to the values of the output attribute
        columns[j] = data.getOutputAttribute().getValues().size();

        if (!att.isNominal()) {
            // (2) Numeric attributes keep their list sorted. A single sort is enough:
            // sorting an already sorted list again cannot change it.
            Collections.sort(reglist);
            // (3) Numeric histograms have a single row
            rows[j] = 1;
        }
        else {
            // (3) Categorical histograms have one row per attribute value
            rows[j] = att.getValues().size();
        }
        attributes.add(reglist);
    }

    // Then, we calculate all the histograms
    calculateHistograms();

    // We initialize the generator of random numbers with a fixed seed
    generator = new Random(12345678);
}
/**
 * Creates a node as a copy of another existing node. Attribute descriptors, registers and
 * histograms are duplicated; the pseudorandom generator is shared with the original node.
 *
 * @param nod Original node from which we are going to create a copy
 */
public Node (Node nod) {
    int total = nod.attributes.size();

    this.identifier = nod.identifier;

    // Attribute descriptors
    this.the_attributes = new ArrayList <myAttribute>();
    for (int a = 0; a < total; a++) {
        this.the_attributes.add(new myAttribute(nod.the_attributes.get(a)));
    }

    // Register lists, one per attribute
    this.attributes = new ArrayList <ArrayList<Register>>();
    for (int a = 0; a < total; a++) {
        ArrayList <Register> copied = new ArrayList <Register> ();
        for (Register source : nod.attributes.get(a)) {
            copied.add(new Register(source));
        }
        this.attributes.add(copied);
    }

    // Histogram dimensions
    this.rows = new int [total];
    System.arraycopy(nod.rows, 0, this.rows, 0, nod.rows.length);
    this.columns = new int [total];
    System.arraycopy(nod.columns, 0, this.columns, 0, nod.columns.length);

    // Histogram contents
    this.count_matrix = new ArrayList <int [][]> ();
    this.c_below = new ArrayList <int [][]> ();
    this.c_above = new ArrayList <int [][]> ();
    for (int a = 0; a < total; a++) {
        int [][] counts;
        int [][] below;
        int [][] above;
        if (the_attributes.get(a).isNominal()) {
            // Categorical: copy the count matrix, the continuous histograms are placeholders
            counts = new int[rows[a]][columns[a]];
            below = new int[1][1];
            above = new int[1][1];
            for (int r = 0; r < rows[a]; r++) {
                System.arraycopy(nod.count_matrix.get(a)[r], 0, counts[r], 0, columns[a]);
            }
        }
        else {
            // Numeric: copy c_below and c_above, the count matrix is a placeholder
            counts = new int[1][1];
            below = new int[1][columns[a]];
            above = new int[1][columns[a]];
            System.arraycopy(nod.c_below.get(a)[0], 0, below[0], 0, columns[a]);
            System.arraycopy(nod.c_above.get(a)[0], 0, above[0], 0, columns[a]);
        }
        count_matrix.add(counts);
        c_below.add(below);
        c_above.add(above);
    }

    // The generator is shared, not duplicated
    this.generator = nod.generator;
}
/**
 * Checks if a node is the same node as another object. Two nodes are equal when they have
 * the same identifier, equal attribute descriptors, equal register lists and equal
 * histograms for every attribute.
 *
 * @param obj Object that is checked to see if it is the same node
 * @return true if the nodes are the same, false otherwise
 * @see java.lang.Object#equals(java.lang.Object)
 */
@Override
public boolean equals (Object obj) {
    // First we check if the reference is the same
    if (this == obj) {
        return true;
    }
    // Then we check if the object exists and is from the class Node
    if ((obj == null) || (obj.getClass() != this.getClass())) {
        return false;
    }
    Node test = (Node) obj;

    if (identifier != test.identifier) {
        return false;
    }
    if (the_attributes == null ? test.the_attributes != null : !the_attributes.equals(test.the_attributes)) {
        return false;
    }
    if (attributes == null ? test.attributes != null : !attributes.equals(test.attributes)) {
        return false;
    }
    // Both nodes are uninitialized: there are no histograms to compare
    if (attributes == null) {
        return true;
    }

    for (int i = 0; i < attributes.size(); i++) {
        if ((rows[i] != test.rows[i]) || (columns[i] != test.columns[i])) {
            return false;
        }
        // The attribute type, not the number of rows, decides which histogram is meaningful:
        // a nominal attribute with a single value also has one row.
        if (the_attributes.get(i).isNominal()) {
            // Categorical attribute
            int [][] mine = count_matrix.get(i);
            int [][] theirs = test.count_matrix.get(i);
            for (int j = 0; j < rows[i]; j++) {
                for (int k = 0; k < columns[i]; k++) {
                    if (mine[j][k] != theirs[j][k]) {
                        return false;
                    }
                }
            }
        }
        else {
            // Numerical attribute: the bound is the number of columns of attribute i
            for (int j = 0; j < columns[i]; j++) {
                if (c_above.get(i)[0][j] != test.c_above.get(i)[0][j]) {
                    return false;
                }
                if (c_below.get(i)[0][j] != test.c_below.get(i)[0][j]) {
                    return false;
                }
            }
        }
    }
    return true;
}
/**
 * Hash code of the node, combining the identifier, the attribute descriptors and the
 * register lists. It is used when the node is stored in hash-based structures.
 *
 * @return the hash code obtained
 * @see java.lang.Object#hashCode()
 */
public int hashCode() {
    int descriptorsHash = 0;
    if (the_attributes != null) {
        descriptorsHash = the_attributes.hashCode();
    }
    int registersHash = 0;
    if (attributes != null) {
        registersHash = attributes.hashCode();
    }
    // Same 31-based polynomial as before, seeded with 7
    int seeded = 31 * 7 + identifier;
    return 31 * (31 * seeded + descriptorsHash) + registersHash;
}
/**
 * Converts the node to a string: the identifier followed, for every attribute, by its name,
 * its list of registers and its histogram (c_below and c_above for numeric attributes, the
 * count matrix for categorical ones).
 *
 * @return the string representation of the class
 * @see java.lang.Object#toString()
 */
@Override
public String toString() {
    // A StringBuilder avoids re-copying the whole text on every append
    StringBuilder text = new StringBuilder();
    text.append("Node ").append(identifier).append("\n");

    for (int i = 0; i < attributes.size(); i++) {
        text.append(the_attributes.get(i).getName()).append("\nAttribute list:\n");

        // Print attribute lists
        for (Register reg : attributes.get(i)) {
            text.append(reg).append("\n");
        }

        // Print histograms
        text.append("Histogram\n");
        if (!the_attributes.get(i).isNominal()) {
            // Numeric attributes
            text.append("C_below:");
            for (int j = 0; j < columns[i]; j++) {
                text.append(" ").append(c_below.get(i)[0][j]);
            }
            text.append("\nC_above:");
            for (int j = 0; j < columns[i]; j++) {
                text.append(" ").append(c_above.get(i)[0][j]);
            }
            text.append("\n");
        }
        else {
            // Categorical attributes
            text.append("Count matrix:\n");
            for (int j = 0; j < rows[i]; j++) {
                for (int k = 0; k < columns[i]; k++) {
                    text.append(count_matrix.get(i)[j][k]).append(" ");
                }
                text.append("\n");
            }
        }
    }
    return text.toString();
}
/**
 * Rebuilds the three histogram lists from the register lists. A numeric attribute gets all
 * its registers counted per output class in c_above (c_below stays at zero); a categorical
 * attribute gets them counted per (attribute value, output class) in count_matrix. The
 * histograms that do not apply to an attribute are filled with 1x1 placeholders.
 */
private void calculateHistograms () {
    count_matrix = new ArrayList <int [][]> (); // Histogram for categorical
    c_below = new ArrayList <int [][]> (); // Histogram for continuous
    c_above = new ArrayList <int [][]> (); // Histogram for continuous

    for (int a = 0; a < attributes.size(); a++) {
        boolean nominal = the_attributes.get(a).isNominal();

        // Freshly allocated int arrays are already filled with zeros
        int [][] counts = nominal ? new int[rows[a]][columns[a]] : new int[1][1];
        int [][] below = nominal ? new int[1][1] : new int[1][columns[a]];
        int [][] above = nominal ? new int[1][1] : new int[1][columns[a]];

        // Count every instance of the register list
        for (Register entry : attributes.get(a)) {
            if (nominal) {
                counts[(int)entry.getAttributeValue()][entry.getOutputClass()]++;
            }
            else {
                above[0][entry.getOutputClass()]++;
            }
        }

        // Add information to the histogram lists
        count_matrix.add(counts);
        c_below.add(below);
        c_above.add(above);
    }
}
/**
 * Checks whether a node is pure, that is, whether all its data belongs to one class.
 *
 * @return true, if all the data that is in the node is from the same class or the node holds
 * no instances at all; false, otherwise
 */
public boolean isPure() {
    // A node without attribute lists, or whose lists hold no instances, is pure.
    // Without the second check the lookup of the first instance below would fail.
    if (attributes.isEmpty() || attributes.get(0).isEmpty()) {
        return true;
    }

    // Get the class of the first instance
    int oclass = attributes.get(0).get(0).getOutputClass();

    // Compare the class to all the other instances
    for (ArrayList <Register> list : attributes) {
        for (Register reg : list) {
            // If one of the instances has a different class, the node is not pure
            if (reg.getOutputClass() != oclass) {
                return false;
            }
        }
    }
    return true;
}
/**
* Evaluates all splits and returns the best split found. For each attribute, it evaluates all possible
* splits and selects the split with the minimum weighted entropy; the attribute whose best split has the
* lowest entropy is then chosen.
*
* For a categorical attribute a candidate split separates one value from all the others. For a numeric
* attribute a candidate split is placed at every position of the sorted register list where the value changes.
*
* Side effects visible in the code: progress messages are printed to standard output, the count matrix of
* every categorical attribute is replaced by a matrix with an extra totals column, and c_below/c_above of
* every numeric attribute are updated in place (at the end all registers are counted in c_below).
* The process is terminated with System.exit(-1) if a numeric attribute list is found to be empty while
* computing an entropy.
*
* @return the best split for this node after evaluating all splits
*/
public Split evaluateAllSplits () {
Split aux;
int cursor, class_acc, total_acc, attribute_split;
double entropy, entropyS1, entropyS2, pj, value_split;
double attribute_best_entropy[];
int attribute_position_entropy[];
int matrix[][];
Register reg;
// Allocate memory for the best splits in each attribute
// (best entropy found and the position/value index where it was found)
attribute_best_entropy = new double [attributes.size()];
attribute_position_entropy = new int [attributes.size()];
// For each attribute
for (int i=0; i<attributes.size(); i++) {
System.out.println("Evaluating split points for: " + the_attributes.get(i).getName());
if (the_attributes.get(i).isNominal()) {
// Attribute is categorical
// Construct the histogram: one row per attribute value, one column per class
// plus a last column (index columns[i]) with the total of the row
matrix = new int [rows[i]][columns[i]+1];
for (int j=0; j<rows[i]; j++) {
for (int k=0; k<columns[i]+1; k++) {
matrix[j][k] = 0;
}
}
for (int j=0; j<attributes.get(i).size(); j++) {
reg = (Register)attributes.get(i).get(j);
matrix[(int)reg.getAttributeValue()][reg.getOutputClass()]++;
matrix[(int)reg.getAttributeValue()][columns[i]]++;
}
// -1 marks that no valid split has been stored yet for this attribute
attribute_position_entropy[i] = -1;
total_acc = 1;
// Start at the first possible value
for (cursor = 0; cursor < rows[i]; cursor++) {
// S1 = registers whose value is cursor, S2 = all the other registers
entropyS1 = 0;
entropyS2 = 0;
// Calculate the entropy for this possible split
for (int j=0; j<columns[i]; j++) {
// Proportion of class j inside S1
if (matrix[cursor][columns[i]] != 0) {
pj = (double)matrix[cursor][j]/(double)matrix[cursor][columns[i]];
}
else {
pj = 0;
}
if (pj != 0)
entropyS1 += pj * (Math.log(pj)/Math.log(2.0));
// Proportion of class j inside S2, accumulated over every row but cursor
class_acc = 0;
total_acc = 0;
for (int k=0; k<rows[i]; k++) {
if (k != cursor) {
class_acc += matrix[k][j];
total_acc += matrix[k][columns[i]];
}
}
if (total_acc != 0)
pj = (double)class_acc/(double)total_acc;
else
pj = 0;
if (pj != 0)
entropyS2 += pj * Math.log(pj)/Math.log(2.0);
}
entropyS1 = -entropyS1;
entropyS2 = -entropyS2;
// Weighted entropy of the split; total_acc still holds the size of S2 computed in the
// last iteration of the class loop (it does not depend on the class)
entropy = (double)matrix[cursor][columns[i]]/(double)attributes.get(i).size() * entropyS1 + total_acc/(double)attributes.get(i).size() * entropyS2;
// Check if this split generates descendants with items
int elem_exits = 0;
for (int w=0; w<columns[i]; w++) {
elem_exits += matrix[cursor][w];
}
// Store as the best split if the value has instances and either it is the first valid
// value found or its entropy is lower than the current best split
if (((entropy < attribute_best_entropy[i]) || (attribute_position_entropy[i] == -1)) && (elem_exits != 0)) {
attribute_best_entropy[i] = entropy;
attribute_position_entropy[i] = cursor;
}
}
// The best split for this attribute is calculated
// The node histogram is replaced by the matrix that includes the totals column
count_matrix.set(i, matrix);
// NOTE(review): if no value of the attribute has instances, the position is still -1 here
// and get(-1) would throw - confirm this cannot happen for the nodes that reach this method
System.out.println("Best split for " + the_attributes.get(i).getName() + " found: " + i + " " + the_attributes.get(i).getValues().get(attribute_position_entropy[i]) + " Entropy = " + attribute_best_entropy[i]);
}
else {
// Attribute is numeric
// Best, a priori is with cursor 0; the number of registers is used as the initial
// entropy that every evaluated split is compared against
entropy = 0;
attribute_best_entropy[i] = attributes.get(i).size();
attribute_position_entropy[i] = 0;
// Update histograms: move the first register from c_above to c_below
// NOTE(review): the histograms are modified in place and not restored afterwards, so a
// second call on the same node would not start from the initial state - confirm callers
// evaluate each node only once
reg = (Register)attributes.get(i).get(0);
matrix = c_below.get(i);
matrix[0][reg.getOutputClass()]++;
c_below.set(i, matrix);
matrix = c_above.get(i);
matrix[0][reg.getOutputClass()]--;
c_above.set(i, matrix);
// Compute the entropy in the middle values
for (cursor = 1; cursor < attributes.get(i).size(); cursor++) {
// As the numeric values are ordered, we only compute possible splits when a new value is found
if (attributes.get(i).get(cursor).getAttributeValue() != attributes.get(i).get(cursor-1).getAttributeValue()) {
// Compute the entropy for the considered split: the first cursor registers are below
// (counted in c_below), the remaining ones are above (counted in c_above)
entropyS1 = 0;
entropyS2 = 0;
for (int j=0; j<columns[i]; j++) {
pj = (double)c_below.get(i)[0][j]/(double)cursor;
if (pj != 0)
entropyS1 += pj * Math.log(pj)/Math.log(2.0);
pj = (double)c_above.get(i)[0][j]/(double)(attributes.get(i).size() - cursor);
if (pj != 0)
entropyS2 += pj * Math.log(pj)/Math.log(2.0);
}
entropyS1 = -entropyS1;
entropyS2 = -entropyS2;
if (attributes.get(i).size() != 0) {
entropy = ((double)cursor/(double)attributes.get(i).size()) * entropyS1 + ((double)(attributes.get(i).size() - cursor)/(double)attributes.get(i).size()) * entropyS2;
}
else {
System.err.println("There aren't any registers in the attribute list");
System.exit(-1);
}
// If the entropy is lower than the best entropy found, we store it
if (entropy < attribute_best_entropy[i]) {
attribute_best_entropy[i] = entropy;
attribute_position_entropy[i] = cursor;
}
}
// We update the histograms: the register at cursor moves from c_above to c_below
reg = (Register)attributes.get(i).get(cursor);
matrix = c_below.get(i);
matrix[0][reg.getOutputClass()]++;
c_below.set(i, matrix);
matrix = c_above.get(i);
matrix[0][reg.getOutputClass()]--;
c_above.set(i, matrix);
}
// The best split for this attribute is calculated
System.out.println("Best split for " + the_attributes.get(i).getName() + " found: " + i + " " + attributes.get(i).get(attribute_position_entropy[i]).getAttributeValue() + " Entropy = " + attribute_best_entropy[i]);
}
}
// Calculate the best split between the attributes (lowest entropy, first attribute wins ties)
value_split = attribute_best_entropy[0];
attribute_split = 0;
for (int i=1; i<attributes.size(); i++) {
if (attribute_best_entropy[i] < value_split) {
value_split = attribute_best_entropy[i];
attribute_split = i;
}
}
// Get the value of the split depending on whether the attribute is nominal or numerical
if (the_attributes.get(attribute_split).isNominal())
// The attribute is nominal, we get the value directly (index of the selected value)
value_split = attribute_position_entropy[attribute_split];
else {
// The attribute is numerical, we have to search for the value
// NOTE(review): the value returned is the first value strictly greater than the value at
// the best position, not the value at that position itself - confirm this is intended
boolean found = false;
value_split = attributes.get(attribute_split).get(attribute_position_entropy[attribute_split]).getAttributeValue();
for (int i=0; i<attributes.get(attribute_split).size() && !found; i++) {
if (attributes.get(attribute_split).get(i).getAttributeValue() > value_split) {
value_split = attributes.get(attribute_split).get(i).getAttributeValue();
found = true;
}
}
}
aux = new Split(attribute_split, value_split);
System.out.println("\nBEST SPLIT FOUND: " + aux);
return aux;
}
/**
 * Divides this node into a left and a right descendant according to a split. For a
 * categorical split attribute the left node receives the registers whose value equals the
 * split value; for a numeric one it receives the registers whose value is lower than the
 * split value. Every other register goes to the right node. The descendants get the
 * identifiers last_new_node + 1 and last_new_node + 2 and share this node's generator.
 *
 * @param best_split Split used in this node to divide the node into two new nodes left and right
 * @param last_new_node Identifier of the last node created in the algorithm
 * @return an arraylist with two nodes, node left and node right obtained from the original node
 * with the split, or null when one of the descendants would not receive any register
 */
public ArrayList <Node> split (Split best_split, int last_new_node) {
    int total = attributes.size();
    int splitAttribute = best_split.getAttribute();

    Node left = new Node ();
    Node right = new Node ();

    // Attribute descriptors are duplicated for both descendants
    left.the_attributes = new ArrayList<myAttribute>();
    right.the_attributes = new ArrayList<myAttribute>();
    for (myAttribute descriptor : the_attributes) {
        left.the_attributes.add(new myAttribute(descriptor));
        right.the_attributes.add(new myAttribute(descriptor));
    }

    // New identifiers
    left.identifier = last_new_node + 1;
    right.identifier = last_new_node + 2;

    // One (initially empty) register list per attribute in each descendant
    left.attributes = new ArrayList <ArrayList<Register>>(total);
    right.attributes = new ArrayList <ArrayList<Register>>(total);
    for (int a = 0; a < total; a++) {
        left.attributes.add(new ArrayList <Register> ());
        right.attributes.add(new ArrayList <Register> ());
    }

    // Distribute the registers of the split attribute and remember, per register identifier,
    // the side it went to (0 = left, 1 = right)
    Hashtable <Integer, Integer> side = new Hashtable <Integer, Integer> ();
    ArrayList <Register> toLeft = new ArrayList <Register> ();
    ArrayList <Register> toRight = new ArrayList <Register> ();
    boolean categorical = the_attributes.get(splitAttribute).isNominal();
    for (Register current : attributes.get(splitAttribute)) {
        boolean goesLeft;
        if (categorical) {
            goesLeft = (current.getAttributeValue() == best_split.getValue());
        }
        else {
            goesLeft = (current.getAttributeValue() < best_split.getValue());
        }
        if (goesLeft) {
            toLeft.add(new Register (current));
            side.put(current.getIdentifier(), Integer.valueOf(0));
        }
        else {
            toRight.add(new Register (current));
            side.put(current.getIdentifier(), Integer.valueOf(1));
        }
    }

    // Both descendants need at least one instance
    if (toLeft.isEmpty() || toRight.isEmpty()) {
        return null;
    }
    left.attributes.set(splitAttribute, toLeft);
    right.attributes.set(splitAttribute, toRight);

    // The remaining attribute lists follow the side recorded for each register
    for (int a = 0; a < total; a++) {
        if (a == splitAttribute) {
            continue;
        }
        toLeft = new ArrayList <Register> ();
        toRight = new ArrayList <Register> ();
        for (Register current : attributes.get(a)) {
            Integer leaf = side.get(current.getIdentifier());
            if (leaf == null) {
                System.out.println(current);
                System.err.println("Register not found in hash table");
                System.exit(-1);
            }
            else if (leaf.intValue() == 0) {
                toLeft.add(new Register(current));
            }
            else {
                toRight.add(new Register(current));
            }
        }
        left.attributes.set(a, toLeft);
        right.attributes.set(a, toRight);
    }

    // Histogram dimensions are inherited unchanged, even if a class or a categorical value
    // disappears from one of the descendants
    left.rows = new int [total];
    right.rows = new int [total];
    System.arraycopy(rows, 0, left.rows, 0, rows.length);
    System.arraycopy(rows, 0, right.rows, 0, rows.length);
    left.columns = new int [total];
    right.columns = new int [total];
    System.arraycopy(columns, 0, left.columns, 0, columns.length);
    System.arraycopy(columns, 0, right.columns, 0, columns.length);

    // Histograms of the descendants
    left.calculateHistograms();
    right.calculateHistograms();

    // The random generator is shared with the descendants
    left.generator = generator;
    right.generator = generator;

    ArrayList <Node> result = new ArrayList <Node> ();
    result.add(left);
    result.add(right);
    return result;
}
/**
 * Returns the number that identifies this node.
 *
 * @return the identifier of the node
 */
public int getIdentifier() {
    return this.identifier;
}
/**
* Replaces the identifier of the node with a new one
*
* @param identifier New identifier for the node
*/
public void setIdentifier(int identifier) {
this.identifier = identifier;
}
/**
 * Returns the output class of the first register stored for the first attribute. When the
 * node is pure this is the class of every register in the node.
 *
 * @return the output class of the node
 */
public int getOutputClass () {
    Register first = attributes.get(0).get(0);
    return first.getOutputClass();
}
/**
 * Returns how many registers of data the node holds, measured as the length of the register
 * list of the first attribute.
 *
 * @return the number of different registers in the node
 */
public int getNumRegisters () {
    ArrayList <Register> firstList = attributes.get(0);
    return firstList.size();
}
/**
 * Counts the items of the node that belong to class i, reading the histogram of the first
 * attribute.
 *
 * @param i Class to which the items should belong to be considered
 * @return the number of items that belongs to class i
 */
public int getNumItemsClassI (int i) {
    myAttribute first = the_attributes.get(0);

    if (!first.isNominal()) {
        // Numeric attribute: the items are spread between c_above and c_below
        return c_above.get(0)[0][i] + c_below.get(0)[0][i];
    }

    // Nominal attribute: add up the column of class i in the count matrix
    int total = 0;
    for (int row = 0; row < rows[0]; row++) {
        total += count_matrix.get(0)[row][i];
    }
    return total;
}
/**
 * Gets the number of different values that the registers of the ith attribute take.
 *
 * @param i Position of the attribute whose different values we are counting
 * @return the number of different values for the ith attribute
 */
public int getDifferentValuesAttributeI (int i) {
    // A hash set keeps the distinct values without scanning a list for every register.
    // Values are compared with Double.equals, exactly as the list lookup did.
    Set <Double> distinct = new HashSet <Double> ();
    for (Register reg : attributes.get(i)) {
        distinct.add(Double.valueOf(reg.getAttributeValue()));
    }
    return distinct.size();
}
/**
 * Returns the output class that most registers of the node belong to. Whenever a class ties
 * with the best class found so far, one of the two is kept according to a draw of the
 * pseudorandom generator, and the decision is reported on standard output.
 *
 * @return the majority class of the node
 */
public int getMajorOutputClass () {
    int numClasses = columns[0];
    // New int arrays start at zero, so the counters need no explicit reset
    int [] votes = new int [numClasses];

    // Count the frequency of each output class
    for (Register reg : attributes.get(0)) {
        votes[reg.getOutputClass()]++;
    }

    int winner = 0;
    int best = votes[0];
    for (int candidate = 1; candidate < numClasses; candidate++) {
        if (votes[candidate] > best) {
            best = votes[candidate];
            winner = candidate;
        }
        else if (votes[candidate] == best) {
            // Tie: let the generator decide which class is kept
            System.out.println("Can't decide better outputClass between " + winner + " y " + candidate);
            if (generator.nextInt(2) == 1) {
                winner = candidate;
            }
            System.out.println("Finally selected " + winner);
        }
    }
    return winner;
}
/**
 * Gets the number of different output classes that appear among the registers of the node.
 *
 * @return the number of different classes in the node
 */
public int getNumClasses() {
    // Collect the distinct output classes found in the first attribute list
    Set <Integer> distinct = new HashSet <Integer> ();
    for (Register reg : attributes.get(0)) {
        distinct.add(Integer.valueOf(reg.getOutputClass()));
    }
    return distinct.size();
}
/**
 * Obtains the classes of the node together with their absolute frequencies n1,...,nk, ordered
 * by decreasing frequency. Classes without any item are left out.
 *
 * @return a list with two lists: the first one holds the classes in decreasing order of
 * appearance and the second one holds the frequency of each of those classes
 */
public ArrayList<ArrayList<Integer>> getDecreasedNI() {
    int numClasses = columns[0];
    int [] label = new int [numClasses];
    int [] count = new int [numClasses];
    boolean nominal = the_attributes.get(0).isNominal();

    // Get the ni for all classes from the histogram of the first attribute
    for (int c = 0; c < numClasses; c++) {
        label[c] = c;
        if (nominal) {
            for (int row = 0; row < rows[0]; row++) {
                count[c] += count_matrix.get(0)[row][c];
            }
        }
        else {
            count[c] = c_above.get(0)[0][c] + c_below.get(0)[0][c];
        }
    }

    // Selection sort in decreasing order of ni: each position receives the first largest
    // remaining frequency, which is swapped with the entry that was there
    for (int i = 0; i < numClasses - 1; i++) {
        int best = i;
        for (int j = i + 1; j < numClasses; j++) {
            if (count[best] < count[j]) {
                best = j;
            }
        }
        int swappedCount = count[i];
        count[i] = count[best];
        count[best] = swappedCount;
        int swappedLabel = label[i];
        label[i] = label[best];
        label[best] = swappedLabel;
    }

    // Keep only the classes with ni different from 0
    ArrayList <Integer> classes = new ArrayList <Integer> ();
    ArrayList <Integer> frecuencies = new ArrayList <Integer> ();
    for (int c = 0; c < numClasses; c++) {
        if (count[c] != 0) {
            classes.add(label[c]);
            frecuencies.add(count[c]);
        }
    }

    ArrayList <ArrayList <Integer>> result = new ArrayList <ArrayList <Integer>> ();
    result.add(classes);
    result.add(frecuencies);
    return result;
}
/**
 * Obtains the classes of the node together with their absolute frequencies, ordered by
 * decreasing value of ni - V(Si), where V is the measure computed by V(int). Classes without
 * any item are left out.
 *
 * @return a list with two lists: the first one holds the classes in decreasing order of
 * ni - V(Si) and the second one holds the frequency ni of each of those classes
 */
public ArrayList<ArrayList<Integer>> getDecreasedNIV() {
    int numClasses = columns[0];
    int [] label = new int [numClasses];
    int [] count = new int [numClasses];
    double [] score = new double [numClasses];
    boolean nominal = the_attributes.get(0).isNominal();

    // Get ni and ni - V for all classes
    for (int c = 0; c < numClasses; c++) {
        label[c] = c;
        if (nominal) {
            for (int row = 0; row < rows[0]; row++) {
                count[c] += count_matrix.get(0)[row][c];
            }
        }
        else {
            count[c] = c_above.get(0)[0][c] + c_below.get(0)[0][c];
        }
        score[c] = count[c] - V(c);
    }

    // Selection sort in decreasing order of ni - V: each position receives the first largest
    // remaining score, which is swapped with the entry that was there
    for (int i = 0; i < numClasses - 1; i++) {
        int best = i;
        for (int j = i + 1; j < numClasses; j++) {
            if (score[best] < score[j]) {
                best = j;
            }
        }
        double swappedScore = score[i];
        score[i] = score[best];
        score[best] = swappedScore;
        int swappedCount = count[i];
        count[i] = count[best];
        count[best] = swappedCount;
        int swappedLabel = label[i];
        label[i] = label[best];
        label[best] = swappedLabel;
    }

    // Keep only the classes with ni different from 0, preserving the order obtained
    ArrayList <Integer> classes = new ArrayList <Integer> ();
    ArrayList <Integer> frecuencies = new ArrayList <Integer> ();
    for (int c = 0; c < numClasses; c++) {
        if (count[c] != 0) {
            classes.add(label[c]);
            frecuencies.add(count[c]);
        }
    }

    ArrayList <ArrayList <Integer>> result = new ArrayList <ArrayList <Integer>> ();
    result.add(classes);
    result.add(frecuencies);
    return result;
}
/**
 * Calculates V for the data in this node with k class, in other words, this calculates the
 * minimum cost of specifying the split value at the parent of a node containing the data in
 * this node.
 *
 * For every attribute the number v of different values taken by the registers of class k is
 * counted. An attribute with at least one such value costs v when it is nominal and log2(v)
 * when it is numeric; the result is the lowest cost among the attributes.
 *
 * @param k class for calculating the V measure
 * @return the V measure for this node with k class, or 0 when no register of the node
 * belongs to class k
 */
public double V (int k) {
    // One more than the number of registers: no attribute cost can reach this value
    double unset = attributes.get(0).size() + 1;
    double min = unset;

    for (int i = 0; i < attributes.size(); i++) {
        // Distinct values of attribute i among the registers of class k. A hash set avoids
        // scanning a list for every register and compares values with Double.equals.
        Set <Double> distinct = new HashSet <Double> ();
        for (Register reg : attributes.get(i)) {
            if (reg.getOutputClass() == k) {
                distinct.add(Double.valueOf(reg.getAttributeValue()));
            }
        }
        if (distinct.isEmpty()) {
            continue;
        }

        double possible_min;
        if (the_attributes.get(i).isNominal()) {
            possible_min = distinct.size();
        }
        else {
            possible_min = Math.log((double)distinct.size())/Math.log(2.0);
        }
        if (possible_min < min) {
            min = possible_min;
        }
    }

    // No attribute had values for class k
    if (min == unset) {
        return 0;
    }
    return min;
}
}