/*********************************************************************** This file is part of KEEL-software, the Data Mining tool for regression, classification, clustering, pattern mining and so on. Copyright (C) 2004-2010 F. Herrera (herrera@decsai.ugr.es) L. Sánchez (luciano@uniovi.es) J. Alcalá-Fdez (jalcala@decsai.ugr.es) S. García (sglopez@ujaen.es) A. Fernández (alberto.fernandez@ujaen.es) J. Luengo (julianlm@decsai.ugr.es) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/ **********************************************************************/ package keel.Algorithms.Decision_Trees.FunctionalTrees; import java.util.ArrayList; import java.util.Arrays; import java.util.Random; /** * Data structure that is used in the construction of the decision tree. * It stores the information about the relationship between nodes in a tree, and * the type of node that it is, along with some other necessary information such as the * attribute values, the attributes themselves or the output class for the item. * * @author Written by Victoria Lopez Morales (University of Granada) 29/05/2009 * @version 1.0 * @since JDK1.5 */ public class TreeNode { /** * Identifier of the tree node */ int identifier; /** * Left descendant of this node, if it is not a leaf node */ TreeNode left; /** * Right descendant of this node, if it is not a leaf node */ TreeNode right; /** * Whether this node is a leaf node or not.
A node that isn't a leaf node can be * changed to a leaf node anytime, but a leaf node can't become a non-leaf node */ boolean isLeaf; /** * The class assigned to this node if it is a leaf node; otherwise this class has * no meaning, but it is usually -1 */ int outputClass; /** * Relationship between this node and its descendants */ Split condition; /** * Values of the attributes for each instance in the node */ ArrayList <ArrayList <Double>> values; /** * Class of each instance in the node */ ArrayList <Double> oclass; /** * All the information about the attributes that the node uses */ ArrayList <myAttribute> the_attributes; /** * All the information about the output attribute of the node (class of the node) */ myAttribute output_attribute; /** * Generator for pseudorandom numbers */ Random generator; /** * Classifier used at the pruned leaves, which are the leaf nodes */ int prunedClassifier; /** * Parameter for some of the classifiers at leaves: the number of nearest neighbours considered */ int K; /** * Auxiliary structure for computing the Naive Bayes classifier */ ArrayList <int [][]> count_matrix; /** * Auxiliary structure for computing the K-methods */ double [][] normalized_values; /** * Auxiliary structure for computing KSNN */ double further[]; /** * Auxiliary structure for computing KSNN */ boolean selected[]; /** * Auxiliary structure for computing KNNAdaptive */ double radius[]; /** * Auxiliary structure for computing Nearest Means */ double means[][]; /** * Auxiliary structure for computing Nearest Means */ private int[] meanClass; /** * Creates a tree node with empty values that we can identify */ TreeNode () { identifier = 0; left = null; right = null; isLeaf = false; outputClass = -1; condition = null; values = null; oclass = null; the_attributes = null; output_attribute = null; generator = new Random(12345678); prunedClassifier = -1; K = -1; } /** * Creates a node with the identifier, the descendants and its condition as a leaf node.
It also * includes the output class selected for it and the relationship between the nodes, plus some * other information like a whole dataset, the kind of classifier used at leaves or the K parameter * for those classifiers at leaves * * @param id Number identifying the node that is being created * @param newleft Left descendant of the node that is being created * @param newright Right descendant of the node that is being created * @param leaf Whether the new node is a leaf node or not * @param oclass Output class for the node that is being created * @param cond Way to split this node into its descendants * @param data Dataset that has the data that is going to be stored in the node * @param prunedLeavesClassifier Kind of classifier used at the leaves * @param K_classifier Number of nearest neighbors used in some of the classifiers at the leaves */ TreeNode (int id, TreeNode newleft, TreeNode newright, boolean leaf, int oclass, Split cond, myDataset data, int prunedLeavesClassifier, int K_classifier) { ArrayList <Double> att_values; identifier = id; left = newleft; right = newright; isLeaf = leaf; outputClass = oclass; condition = cond; the_attributes = data.getAttributes(); output_attribute = data.getOutputAttribute(); // Initialize values from the original dataset values = new ArrayList <ArrayList <Double>>(); for (int i=0; i<the_attributes.size(); i++) { att_values = new ArrayList <Double> (); for (int j=0; j<data.getNumIns(); j++) { att_values.add(data.getDataI(j, i)); } values.add(att_values); } // Initialize the output class associated with each instance from the original dataset this.oclass = new ArrayList <Double> (); for (int i=0; i<data.getNumIns(); i++) { this.oclass.add(new Double(data.getOutputI(i))); } generator = new Random(12345678); prunedClassifier = prunedLeavesClassifier; K = K_classifier; } /** * Creates a tree node from another existing tree node * * @param tree Original tree node from which we are going to create a copy */ TreeNode (TreeNode tree) { ArrayList <Double> att_values; this.identifier = tree.identifier; // Guard against null descendants and condition so that copying a leaf node does not fail this.left = (tree.left == null) ? null : new TreeNode(tree.left); this.right = (tree.right == null) ? null : new TreeNode(tree.right); this.isLeaf = tree.isLeaf; this.outputClass = tree.outputClass; this.condition = (tree.condition == null) ? null : new Split (tree.condition); // Copy values that are in the node (iterate over the source's attributes: this node's the_attributes field has not been assigned yet at this point) this.values = new ArrayList <ArrayList <Double>>(); for (int i=0; i<tree.the_attributes.size(); i++) { att_values = new ArrayList <Double> (); for (int j=0; j<tree.values.get(i).size(); j++) { att_values.add(tree.values.get(i).get(j)); } this.values.add(att_values); } // Copy the output class associated with each instance this.oclass = new ArrayList <Double> (); for (int i=0; i<tree.oclass.size(); i++) { this.oclass.add(tree.oclass.get(i)); } // Copy the attributes this.the_attributes = new ArrayList <myAttribute> (); for (int i=0; i<tree.the_attributes.size(); i++) { this.the_attributes.add(new myAttribute(tree.the_attributes.get(i))); } this.output_attribute = new myAttribute(tree.output_attribute); this.generator = tree.generator; this.prunedClassifier = tree.prunedClassifier; this.K = tree.K; } /** * Checks if a tree node is the same tree node as another object * * @param obj Object that is checked to see if it is the same tree node * @return true if the tree nodes are the same, false otherwise * @see java.lang.Object#equals(java.lang.Object) */ public boolean equals (Object obj) { boolean result; // First we check if the reference is the same if (this == obj) return true; // Then we check if the object exists and is from the class TreeNode if((obj
== null) || (obj.getClass() != this.getClass())) return false; // object must be TreeNode at this point TreeNode test = (TreeNode)obj; result = ((identifier == test.identifier) && (K == test.K) && (prunedClassifier == test.prunedClassifier) && (isLeaf == test.isLeaf) && (outputClass == test.outputClass) && (left == test.left || (left != null && left.equals(test.left))) && (right == test.right || (right != null && right.equals(test.right))) && (condition == test.condition || (condition != null && condition.equals(test.condition))) && (output_attribute == test.output_attribute || (output_attribute != null && output_attribute.equals(test.output_attribute)))); // We check the class attributes of the TreeNode class if (result) { if ((values.size() == test.values.size()) && (oclass.size() == test.oclass.size()) && (the_attributes.size() == test.the_attributes.size())) { // Check if values size is the same for (int i=0; i<values.size() && result; i++) { if (values.get(i).size() != test.values.get(i).size()) result = false; } // Check the_attributes elements for (int i=0; i<the_attributes.size() && result; i++) { if (!((the_attributes.get(i) == test.the_attributes.get(i)) || (the_attributes.get(i) != null && the_attributes.get(i).equals(test.the_attributes.get(i))))) result = false; } // Check oclass elements for (int i=0; i<oclass.size() && result; i++) { if (!((oclass.get(i) == test.oclass.get(i)) || (oclass.get(i) != null && oclass.get(i).equals(test.oclass.get(i))))) result = false; } // Check values elements for (int i=0; i<values.size() && result; i++) { for (int j=0; j<values.get(i).size() && result; j++) { if (!((values.get(i).get(j) == test.values.get(i).get(j)) || (values.get(i).get(j) != null && values.get(i).get(j).equals(test.values.get(i).get(j))))) result = false; } } return result; } else { return false; } } else { return false; } } /** * Hash-code function for the class, used when the object is inserted in a structure like a hash table * * @return the hash code obtained * @see java.lang.Object#hashCode() */ public int hashCode() { int hash = 7; hash = 31 * hash + identifier; hash = 31 * hash + (null == left ? 0 : left.hashCode()); hash = 31 * hash + (null == right ? 0 : right.hashCode()); hash = 31 * hash + (isLeaf ? 1 : 0); hash = 31 * hash + outputClass; hash = 31 * hash + prunedClassifier; hash = 31 * hash + K; hash = 31 * hash + (null == condition ? 0 : condition.hashCode()); hash = 31 * hash + (null == oclass ?
0 : oclass.hashCode()); return hash; } /** * Overridden function that converts the class to a string * * @return the string representation of the class * @see java.lang.Object#toString() */ public String toString() { String aux = ""; aux = aux + "Tree Node " + identifier + "\n"; if (isLeaf) { if (prunedClassifier == -1) { // This leaf wasn't pruned aux = aux + "Leaf Node, class " + outputClass + "\n"; } else { int [] distribution = getOutputClassDistribution(); switch (prunedClassifier) { // Print the classifier at leaves used case 0: aux = aux + "Leaf Node, Naive Bayes: "; break; case 1: aux = aux + "Leaf Node, " + K + "NN: "; break; case 2: aux = aux + "Leaf Node, NM: "; break; case 3: aux = aux + "Leaf Node, " + K + "SNN: "; break; case 4: aux = aux + "Leaf Node, " + K + "NNAdaptive: "; break; default: System.err.println("The classifier at leaves is not valid"); System.exit(-1); break; } ArrayList <Double> classes = getClasses(); // Express the class distribution as percentages for (int i=0; i<distribution.length; i++) { double classvalue = classes.get(i); aux = aux + output_attribute.getValues().get((int)classvalue) + " " + 100.0*(double)distribution[i]/(double)oclass.size() + "% "; } aux += "\n"; } } else { aux += "Internal Node\n"; aux = aux + "Split: " + condition + "\n"; } for (int i=0; i<the_attributes.size(); i++) { aux = aux + the_attributes.get(i).getName() + "\n"; } aux = aux + output_attribute.getName() + "\n"; for (int i=0; i<values.get(0).size(); i++) { for (int j=0; j<values.size(); j++) { aux = aux + values.get(j).get(i) + " "; } aux = aux + "Output class: " + oclass.get(i) + "\n"; } return aux; } /** * Prints the tree in a String with all the information that makes it human readable * * @return a String with the tree in it */ public String printTree () { try { StringBuffer text = new StringBuffer(); if (!isLeaf) { printTree(0, text); } return text.toString(); } catch (Exception e) { return "Cannot print the tree"; } } /** * Prints the tree in a StringBuffer with a depth given according to the relationship * of the nodes in the whole tree with all the information that makes it human readable * * @param depth Depth of the node in the tree, reflected as extra indentation in the string * @param text Buffer where the tree is written * @throws Exception If the tree cannot be printed.
*/ private void printTree (int depth, StringBuffer text) throws Exception { String aux = ""; String aux2 = ""; for (int k = 0; k < depth; k++) { aux += "\t"; } for (int k = 1; k < depth; k++) { aux2 += "\t"; } text.append(aux); if (isLeaf) { if ((isPure()) || (outputClass != -1)) { // This node wasn't pruned or is pure if (output_attribute.isNominal()) { text.append(output_attribute.getValues().get(outputClass) + " \n"); } else { text.append(outputClass + " \n"); } } else { // This node was pruned int [] distribution = getOutputClassDistribution(); switch (prunedClassifier) { // Print the classifier at leaves used case 0: text.append("Naive Bayes: "); break; case 1: text.append(K + "NN: "); break; case 2: text.append("NM: "); break; case 3: text.append(K + "SNN: "); break; case 4: text.append(K + "NNAdaptive: "); break; default: System.err.println("The classifier at leaves is not valid"); System.exit(-1); break; } ArrayList <Double> classes = getClasses(); // Express the class distribution as percentages if (output_attribute.isNominal()) { for (int i=0; i<distribution.length; i++) { double classvalue = classes.get(i); text.append(output_attribute.getValues().get((int)classvalue) + " " + 100.0*(double)distribution[i]/(double)oclass.size() + "% "); } } else { for (int i=0; i<distribution.length; i++) { double classvalue = classes.get(i); text.append(classvalue + " " + 100.0*(double)distribution[i]/(double)oclass.size() + "% "); } } text.append("\n"); } } else { if (the_attributes.get(condition.getAttribute()).isNominal()) { text.append("if (" + the_attributes.get(condition.getAttribute()).getName() + " in " + the_attributes.get(condition.getAttribute()).getValues().get((int)condition.getValue()) + ") then {\n"); } else { text.append("if (" + the_attributes.get(condition.getAttribute()).getName() + " < " + condition.getValue() + ") then {\n"); } left.printTree(depth + 1, text); text.append(aux + "else { \n"); right.printTree(depth + 1, text); } text.append(aux2 + "}\n"); } /** * Gets the identifier of the node * * @return the identifier of the node */ public int getIdentifier() { return identifier; } /** * Replaces the identifier of the node with a new identifier * * @param identifier New identifier for the node */ public void setIdentifier(int identifier) { this.identifier = identifier; } /** * Gets the left descendant of the node, if it is not a leaf node * * @return the left descendant of the node */ public TreeNode getLeft() { return left; } /** * Replaces the left descendant of the node with another new left descendant * * @param left New node that is going to be kept as left descendant of this node */ public void setLeft(TreeNode left) { this.left = left; } /** * Gets the right descendant of the node, if it is not a leaf node * * @return the right descendant of the node */ public TreeNode getRight() { return right; } /** * Replaces the right descendant of the node with another new right descendant * * @param right New node that is going to be kept as right descendant of this node */ public void setRight(TreeNode right) { this.right = right; } /** * Answers if the node is a leaf node or not * * @return true if the node is a leaf node, false otherwise */ public boolean isLeaf() { return isLeaf; } /** * Changes the logical attribute stating if a node is leaf or not * * @param isLeaf Logical value stating if a node is leaf or not */ public void setLeaf(boolean isLeaf) { this.isLeaf = isLeaf; } /** * Gets the output class stored for the node.
It should be considered only when the node is a leaf node * * @return the output class of the node */ public int getOutputClass() { return outputClass; } /** * Replaces the output class of the node with another new output class * * @param outputClass New output class for the node */ public void setOutputClass(int outputClass) { this.outputClass = outputClass; } /** * Gets the relationship between this node and its descendants, that is, the way this node is split in two * * @return the split for this node */ public Split getCondition() { return condition; } /** * Replaces the relationship between this node and its descendants; this means changing the condition by which * the two descendants are created. When using this method, it is highly recommended to use the setLeft() and * setRight() methods too * * @param condition New relationship between this node and its descendants */ public void setCondition(Split condition) { this.condition = condition; } /** * Searches a descendant node from this node with a specific id * * @param id Identifier of the node we are searching for * @return the TreeNode with the specified id */ public TreeNode getNode (int id) { TreeNode aux = null; if (identifier == id) { return this; } if (left != null) aux = left.getNode(id); if ((aux == null) && (right != null)) { aux = right.getNode(id); } return aux; } /** * Gets the number of internal nodes from this node and its descendants. If this is a leaf node the number * of internal nodes is 0. * * @return the number of internal nodes from this node and its descendants */ public int getNumNodes () { int nodes = 0; if (isLeaf) { nodes = 0; } else { nodes++; if (left != null) nodes += left.getNumNodes(); if (right != null) nodes += right.getNumNodes(); } return nodes; } /** * Gets the number of leaf nodes from this node and its descendants * * @return the number of leaf nodes from this node and its descendants */ public int getLeafs () { int leafs = 0; if (isLeaf) { leafs = 1; } else { if (left != null) leafs += left.getLeafs(); if (right != null) leafs += right.getLeafs(); } return leafs; } /** * Classifies a given item with the information stored in the node and its descendants, making a call * to the specific classifiers at the leaves * * @param ind Data attribute values for the item we are classifying * @return the class assigned to the item given */ public int evaluate (double [] ind) { // If we are at a leaf node we can obtain the class directly, otherwise we have to descend // to a leaf node if (isLeaf) { // If it is a pure node or it is a pruned node with a specified class, we've got the result if (isPure() || (outputClass != -1)) { return outputClass; } else { // Here, we call the specified classifier switch (prunedClassifier) { case 0: return evaluateNaiveBayes (ind); case 1: return evaluateKNN (ind); case 2: return evaluateNM (ind); case 3: return evaluateKSNN (ind); case 4: return evaluateKNNAdaptive (ind); default: System.err.println("The classifier used at the leaves isn't valid"); System.exit(-1); break; } return -1; } } else { if (condition != null) { // Descend to the corresponding leaf node if (the_attributes.get(condition.getAttribute()).isNominal()) { if (ind[condition.getAttribute()] == condition.getValue()) { return left.evaluate(ind); } else { return right.evaluate(ind); } } else { if (ind[condition.getAttribute()] < condition.getValue()) { return left.evaluate(ind); } else { return right.evaluate(ind); } } } else { System.err.println("Tree not fully built");
System.exit(-1); return -1; } } } /** * Removes all the descendants of this node */ private void deleteDescendants () { // Delete descendants of descendants if (left != null) { left.deleteDescendants(); } if (right != null) { right.deleteDescendants(); } left = null; right = null; condition = null; } /** * Decides if a node is partitionable or not. A node is partitionable when it contains at least * num_min_instances instances and it is not pure * * @param num_min_instances The minimum number of data instances that the tree node must contain to * be considered a partitionable node * @return true, if it is a partitionable node, false otherwise */ public boolean isPartitionable (int num_min_instances) { // First check that it has at least num_min_instances instances in it if (values.get(0).size() < num_min_instances) { return false; } // Then check if the node is pure double unique_class = oclass.get(0); // Check that the node is not pure for (int i=0; i<oclass.size(); i++) { if (oclass.get(i) != unique_class) return true; } return false; } /** * Sets this node as a leaf node. To set a node as leaf, we have to delete its descendants, mark the * node as leaf, and assign a class for classification */ public void setAsLeaf () { isLeaf = true; deleteDescendants(); // The node doesn't have any descendants assignOutputClass(); // The outputclass of the node is decided from the data stored in the node } /** * Sets this node as a classifier leaf node. To set a node as classifier leaf, we have to delete its * descendants, mark the node as leaf, and assign an invalid class for classification */ public void setAsClassifierLeaf () { isLeaf = true; deleteDescendants(); // The node doesn't have any descendants if (!isPure()) { outputClass = -1; // If the node isn't pure, the other classifier is used } else { assignOutputClass(); } condition = null; // When we make the leaf a classifier, we pre-process the data in it for the later classification switch (prunedClassifier) { case 0: // Naive Bayes needs categorical data and the count matrix for (int i=0; i<the_attributes.size(); i++) { if (!the_attributes.get(i).isNominal()) { System.err.println("The attribute " + the_attributes.get(i).getName() + " is not a nominal attribute"); System.exit(-1); } } calculateCountMatrix(); break; case 1: // KNN needs normalized values normalizeValues(); break; case 2: // NM needs normalized values and the centroids of the classes normalizeValues(); calculateMeans(); break; case 3: // KSNN needs normalized values and the farthest neighbour distances normalizeValues(); getFurtherNeighbor(); break; case 4: // KNNAdaptive needs normalized values and the radius matrix normalizeValues(); calculateRadius(); break; default: System.err.println("The classifier used at the leaves isn't valid"); System.exit(-1); break; } } /** * Initializes the output class stored in the node with a valid output class, in a standard way: * with the class stored in the node if it is pure, or with the class of the majority of the instances; * if there are two majority classes, one of them is selected randomly */ public void assignOutputClass() { // If the node is pure, the output class is the class of one of its instances; if it is not, // the output class is the major class in the node if (isPure()) { double aux = oclass.get(0); outputClass = (int) aux; } else { outputClass = getMajorOutputClass(); } } /** * Checks whether a node is pure or not * * @return true, if all the data that is in the node is from the same class; false, otherwise
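* For example (illustrative values): a node whose oclass list is [1.0, 1.0, 1.0] is pure, while a node with [1.0, 2.0] is not */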
private boolean isPure() { // If there aren't any instances in the node, then the node is pure if (oclass.isEmpty()) return true; // Get the class of the first instance double unique_value = oclass.get(0); // Compare the class to all the other instances for (int i=0; i<oclass.size(); i++) { // If one of the instances has a different class, return false if (oclass.get(i) != unique_value) { return false; } } return true; } /** * Gets the output class of the majority of the instances. If there are two majority classes, it * selects randomly one of them * * @return the majority class of the node */ private int getMajorOutputClass () { int num_classes = getNumClasses(); int [] repetitions = new int [num_classes]; int max, posmax; ArrayList <Double> which_classes = getClasses(); for (int i=0; i<num_classes; i++) { repetitions[i] = 0; } // Count the frequency of each output class for (int j=0; j<oclass.size(); j++) { int position = which_classes.indexOf (oclass.get(j)); repetitions[position]++; } max = repetitions[0]; posmax = 0; // Find the maximum output class for (int i=1; i<num_classes; i++) { if (repetitions[i] > max) { max = repetitions[i]; posmax = i; } else if (repetitions[i] == max) { // If the maximum is equal, then decide a maximum randomly System.out.println("Can't decide better outputClass between " + posmax + " and " + i); int selection = generator.nextInt(2); if (selection == 1) { max = repetitions[i]; posmax = i; } System.out.println("Finally selected " + posmax); } } return posmax; } /** * Gets the number of different classes in the node * * @return the number of different classes in the node */ public int getNumClasses() { ArrayList <Integer> diff_values; diff_values = new ArrayList <Integer> (); // Create a list with all the different possible values for the output class for (int j=0; j<oclass.size(); j++) { double aux = oclass.get(j); if (!diff_values.contains(new Integer((int)aux))) { diff_values.add(new Integer((int)aux)); } } return diff_values.size(); } /** * Gets an array list with the different classes in the node * * @return an array list with the different classes in the node */ public ArrayList <Double> getClasses () { ArrayList <Double> diff_values; diff_values = new ArrayList <Double> (); // Create a list with all the different possible values for the output class for (int j=0; j<oclass.size(); j++) { double aux = oclass.get(j); if (!diff_values.contains(new Double(aux))) { diff_values.add(new Double(aux)); } } // Return all the values of the list return diff_values; } /** * Evaluates all splits and returns the best split found. For each attribute, it evaluates all * possible splits and selects the split with the best split criteria * * @param splitCriteria Information measure used to compute the best split.
0 is the value for * Entropy, 1 is for InformationGain, 2 is for GiniIndex and 3 is for GainRatio * @return the best split for this node after evaluating all splits */ public Split evaluateAllSplits (int splitCriteria) { Split best_split = null; double [] best_split_criteria; double split_criteria_value; ArrayList <Split> best_split_attribute; Split a_split; double best_split_value; best_split_criteria = new double [values.size()]; best_split_attribute = new ArrayList <Split> (values.size()); // For each split // 0 is Entropy, the best split is the minimum Entropy // 1 is InformationGain, the best split is the maximum InformationGain // 2 is GiniIndex, the best split is the minimum GiniIndex // 3 is GainRatio, the best split is the maximum GainRatio for (int i=0; i<values.size(); i++) { System.out.println("Evaluating split points for: " + the_attributes.get(i).getName()); if (the_attributes.get(i).isNominal()) { // The attribute is nominal a_split = new Split (-1, 0.0); best_split_attribute.add(a_split); if ((splitCriteria == 0) || (splitCriteria == 2)) { best_split_criteria[i] = Double.MAX_VALUE; } else { best_split_criteria[i] = -Double.MAX_VALUE; } for (int j=0; j<the_attributes.get(i).getValues().size(); j++) { a_split = new Split (i, 1.0*j); if (valid_split(a_split)) { split_criteria_value = evaluateSplit (a_split, splitCriteria); if (split_criteria_value < best_split_criteria[i]) { // The new split has a smaller value of the criteria, we change the split if the criteria is Entropy or GiniIndex if ((splitCriteria == 0) || (splitCriteria == 2)) { best_split_attribute.set(i, a_split); best_split_criteria[i] = split_criteria_value; } } else if (split_criteria_value > best_split_criteria[i]) { // The new split has a larger value of the criteria, we change the split if the criteria is InformationGain or GainRatio if ((splitCriteria == 1) || (splitCriteria == 3)) { best_split_attribute.set(i, a_split); best_split_criteria[i] = split_criteria_value; } } } } System.out.println("Best split for " + the_attributes.get(i).getName() + " found: " + i + " " + the_attributes.get(i).getValues().get((int)best_split_attribute.get(i).getValue()) + " Split criteria = " + best_split_criteria[i]); } else { // The attribute is numerical a_split = new Split (-1, 0); best_split_attribute.add(a_split); if ((splitCriteria == 0) || (splitCriteria == 2)) { best_split_criteria[i] = Double.MAX_VALUE; } else { best_split_criteria[i] = -Double.MAX_VALUE; } for (int j=0; j<values.get(i).size(); j++) { a_split = new Split (i, values.get(i).get(j)); if (valid_split(a_split)) { split_criteria_value = evaluateSplit (a_split, splitCriteria); if (split_criteria_value < best_split_criteria[i]) { // The new split has a smaller value of the criteria, we change the split if the criteria is Entropy or GiniIndex if ((splitCriteria == 0) || (splitCriteria == 2)) { best_split_attribute.set(i, a_split); best_split_criteria[i] = split_criteria_value; } } else if (split_criteria_value > best_split_criteria[i]) { // The new split has a larger value of the criteria, we change the split if the criteria is InformationGain or GainRatio if ((splitCriteria == 1) || (splitCriteria == 3)) { best_split_attribute.set(i, a_split); best_split_criteria[i] = split_criteria_value; } } } } System.out.println("Best split for " + the_attributes.get(i).getName() + " found: " + i + " " + best_split_attribute.get(i).getValue() + " Split criteria = " + best_split_criteria[i]); } } best_split = best_split_attribute.get(0); best_split_value = best_split_criteria[0];
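// Among the per-attribute best splits, keep the one that optimizes the chosen criterion: a maximum for InformationGain and GainRatio (1 and 3), a minimum for Entropy and GiniIndex (0 and 2)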
for (int i=0; i<values.size(); i++) { if ((splitCriteria == 1) || (splitCriteria == 3)) { if (best_split_criteria[i] > best_split_value) { best_split = best_split_attribute.get(i); best_split_value = best_split_criteria[i]; } } else { //splitCriteria = 0,2 if (best_split_criteria[i] < best_split_value) { best_split = best_split_attribute.get(i); best_split_value = best_split_criteria[i]; } } } System.out.println("\nBEST SPLIT FOUND: " + best_split); return best_split; } /** * Evaluates a split and returns the value of the information measure used * * @param sp Split that is going to be evaluated * @param splitCriteria Information measure used to compute the split. 0 is the value for * Entropy, 1 is for InformationGain, 2 is for GiniIndex and 3 is for GainRatio * @return value of the information measure for the split */ private double evaluateSplit (Split sp, int splitCriteria) { double result, result1, result2, pj, pj1, pj2; ArrayList <ArrayList <Integer>> split_oclass_distribution; int [] oclass_distribution; int num_classes = getNumClasses(); int total; split_oclass_distribution = getOutputClassDistribution (sp); oclass_distribution = getOutputClassDistribution (); result1 = 0; result2 = 0; result = 0; total = split_oclass_distribution.get(0).get(num_classes) + split_oclass_distribution.get(1).get(num_classes); if (splitCriteria == 0) { // The splitCriteria is entropy for (int i=0; i<num_classes; i++) { if (split_oclass_distribution.get(0).get(num_classes) != 0) { pj1 = (double)split_oclass_distribution.get(0).get(i)/(double)split_oclass_distribution.get(0).get(num_classes); if (pj1 != 0) result1 = result1 + pj1 * Math.log(pj1)/Math.log(2.0); } if (split_oclass_distribution.get(1).get(num_classes) != 0) { pj2 = (double)split_oclass_distribution.get(1).get(i)/(double)split_oclass_distribution.get(1).get(num_classes); if (pj2 != 0) result2 = result2 + pj2 * Math.log(pj2)/Math.log(2.0); } } result1 = -result1; result2 = -result2; if (total != 0) { result = (double)split_oclass_distribution.get(0).get(num_classes)/(double)total * result1 + (double)split_oclass_distribution.get(1).get(num_classes)/(double)total * result2; } } else if (splitCriteria == 1) { // The splitCriteria is InformationGain // First, we calculate the entropy of the whole subset for (int i=0; i<num_classes; i++) { if (oclass.size() != 0) { pj = (double)oclass_distribution[i]/(double)oclass.size(); if (pj != 0) result = result + pj * Math.log(pj)/Math.log(2.0); } } result = -result; // Then, we calculate the entropy for the subsets for (int i=0; i<num_classes; i++) { if (split_oclass_distribution.get(0).get(num_classes) != 0) { pj1 = (double)split_oclass_distribution.get(0).get(i)/(double)split_oclass_distribution.get(0).get(num_classes); if (pj1 != 0) result1 = result1 + pj1 * Math.log(pj1)/Math.log(2.0); } if (split_oclass_distribution.get(1).get(num_classes) != 0) { pj2 = (double)split_oclass_distribution.get(1).get(i)/(double)split_oclass_distribution.get(1).get(num_classes); if (pj2 != 0) result2 = result2 + pj2 * Math.log(pj2)/Math.log(2.0); } } result1 = -result1; result2 = -result2; // We calculate the informationGain result = result - (double)split_oclass_distribution.get(0).get(num_classes)/(double)oclass.size() * result1 - (double)split_oclass_distribution.get(1).get(num_classes)/(double)oclass.size() * result2; } else if (splitCriteria == 2) { // The splitCriteria is GiniIndex // We calculate the gini index for the subsets for (int i=0; i<num_classes; i++) { if (split_oclass_distribution.get(0).get(num_classes) 
!= 0) { pj1 = (double)split_oclass_distribution.get(0).get(i)/(double)split_oclass_distribution.get(0).get(num_classes); if (pj1 != 0) result1 = result1 + Math.pow (pj1,2.0); } if (split_oclass_distribution.get(1).get(num_classes) != 0) { pj2 = (double)split_oclass_distribution.get(1).get(i)/(double)split_oclass_distribution.get(1).get(num_classes); if (pj2 != 0) result2 = result2 + Math.pow (pj2,2.0); } } result1 = 1 - result1; result2 = 1 - result2; result = (double)split_oclass_distribution.get(0).get(num_classes)/(double)oclass.size() * result1 + (double)split_oclass_distribution.get(1).get(num_classes)/(double)oclass.size() * result2; } else if (splitCriteria == 3) { // The splitCriteria is GainRatio double informationGain; // First, we calculate the entropy of the whole subset for (int i=0; i<num_classes; i++) { if (oclass.size() != 0) { pj = (double)oclass_distribution[i]/(double)oclass.size(); if (pj != 0) result = result + pj * Math.log(pj)/Math.log(2.0); } } result = -result; // Then, we calculate the entropy for the subsets for (int i=0; i<num_classes; i++) { if (split_oclass_distribution.get(0).get(num_classes) != 0) { pj1 = (double)split_oclass_distribution.get(0).get(i)/(double)split_oclass_distribution.get(0).get(num_classes); if (pj1 != 0) result1 = result1 + pj1 * Math.log(pj1)/Math.log(2.0); } if (split_oclass_distribution.get(1).get(num_classes) != 0) { pj2 = (double)split_oclass_distribution.get(1).get(i)/(double)split_oclass_distribution.get(1).get(num_classes); if (pj2 != 0) result2 = result2 + pj2 * Math.log(pj2)/Math.log(2.0); } } result1 = -result1; result2 = -result2; // We calculate the informationGain informationGain = result - (double)split_oclass_distribution.get(0).get(num_classes)/(double)oclass.size() * result1 - (double)split_oclass_distribution.get(1).get(num_classes)/(double)oclass.size() * result2; // Note that the ratio implemented here divides the information gain by the entropy of the whole node result = informationGain/result; } return result; } /** * Gets an array with the frequencies of the classes in the node * * @return an array with the frequencies of the classes in the node */ private int [] getOutputClassDistribution () { int [] output; int num_classes = getNumClasses(); ArrayList <Double> which_classes = getClasses(); output = new int [num_classes]; // Before starting, the number of elements in each class is 0 for (int i=0; i<num_classes; i++) { output[i] = 0; } // We count every instance in the node for the class distribution for (int i=0; i<oclass.size(); i++) { int position = which_classes.indexOf (oclass.get(i)); output[position]++; } return output; } /** * Gets a list of lists with the frequencies of the classes in the descendant nodes given a split * * @param sp Split that is going to be used to compute the frequencies of the classes for the * supposed descendants * @return a list of lists with the frequencies of classes depending on a split */ private ArrayList <ArrayList <Integer>> getOutputClassDistribution (Split sp) { int [] output1; int [] output2; int num_classes = getNumClasses(); ArrayList <Double> which_classes = getClasses(); int total1, total2; ArrayList <Integer> o1; ArrayList <Integer> o2; ArrayList <ArrayList <Integer>> o12; output1 = new int [num_classes]; output2 = new int [num_classes]; // Before starting, the number of elements in each class is 0 for (int i=0; i<num_classes; i++) { output1[i] = 0; output2[i] = 0; } if (the_attributes.get(sp.getAttribute()).isNominal()) { // The attribute for the split is nominal // We count every instance in the node for the class distribution for (int i=0; i<oclass.size(); i++) {
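// Route instance i to the left distribution (output1) when its value matches the split value, and to the right distribution (output2) otherwise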
int position = which_classes.indexOf (oclass.get(i)); if (sp.getValue() == values.get(sp.getAttribute()).get(i)) { output1[position]++; } else { output2[position]++; } } } else { // The attribute for the split is numerical // We count every instance in the node for the class distribution for (int i=0; i<oclass.size(); i++) { int position = which_classes.indexOf (oclass.get(i)); if (values.get(sp.getAttribute()).get(i) < sp.getValue()) { output1[position]++; } else { output2[position]++; } } } // Add the results to the arraylist output o1 = new ArrayList <Integer> (num_classes); o2 = new ArrayList <Integer> (num_classes); total1 = 0; total2 = 0; for (int i=0; i<num_classes; i++) { total1 += output1[i]; total2 += output2[i]; o1.add(output1[i]); o2.add(output2[i]); } o1.add(total1); o2.add(total2); o12 = new ArrayList <ArrayList <Integer>> (2); o12.add(o1); o12.add(o2); return o12; } /** * Checks if a split is valid for a TreeNode; this means that it will generate valid child nodes * * @param sp Split that is going to be checked if it generates valid child nodes * @return true, if the split generates valid child nodes, false otherwise */ public boolean valid_split (Split sp) { boolean found = false; if (the_attributes.get(sp.getAttribute()).isNominal()) { // The split is meant for a nominal attribute // A split for a nominal attribute is valid when there are two different values for the attribute and one of them is the split_value for (int i=0; i<values.get(sp.getAttribute()).size() && !found; i++) { if (values.get(sp.getAttribute()).get(i) == sp.getValue()) found = true; } if (found) { found = false; for (int i=0; i<values.get(sp.getAttribute()).size() && !found; i++) { if (values.get(sp.getAttribute()).get(i) != sp.getValue()) found = true; } return found; } else { return false; } } else { // The split is meant for a numerical attribute // A split for a numerical attribute is valid when there is a value below it and a value equal or above it for (int i=0; i<values.get(sp.getAttribute()).size() && !found; i++) { if (values.get(sp.getAttribute()).get(i) < sp.getValue()) found = true; } if (found) { found = false; for (int i=0; i<values.get(sp.getAttribute()).size() && !found; i++) { if (values.get(sp.getAttribute()).get(i) >= sp.getValue()) found = true; } return found; } else { return false; } } } /** * Splits a node into two nodes according to a given split, numbering the new nodes from a given identifier, and returning them in an ArrayList * of nodes.
* * @param sp Split used in this node to divide the node into two new nodes left and right * @param newidentifier Identifier of the last node created in the algorithm * @return an arraylist with two nodes, node left and node right obtained from the original node with the split */ public ArrayList <TreeNode> split (Split sp, int newidentifier) { ArrayList <TreeNode> result; Double value; result = new ArrayList <TreeNode> (2); left = new TreeNode (); right = new TreeNode (); // Before doing anything, we have to check if the split is valid if (sp.getAttribute() == -1) { // We weren't able to find a valid split, so we mark this node as leaf setAsLeaf(); return null; } else if (!valid_split(sp)) { System.err.println("This split isn't valid, and cannot be used to split"); System.exit(-1); } // First we have to copy the attributes into the new nodes left.the_attributes = new ArrayList<myAttribute>(); right.the_attributes = new ArrayList<myAttribute>(); for (int i=0; i<the_attributes.size(); i++) { left.the_attributes.add(new myAttribute(the_attributes.get(i))); right.the_attributes.add(new myAttribute(the_attributes.get(i))); } left.identifier = newidentifier + 1; right.identifier = newidentifier + 2; // Then, we split the separate list for each attribute left.values = new ArrayList <ArrayList <Double>> (values.size()); right.values = new ArrayList <ArrayList<Double>> (values.size()); // Initialize attributes list to contain empty lists for (int i=0; i<values.size(); i++) { left.values.add(new ArrayList <Double> ()); right.values.add(new ArrayList <Double> ()); } left.oclass = new ArrayList <Double> (); right.oclass = new ArrayList <Double> (); if (the_attributes.get(sp.getAttribute()).isNominal()) { // Attribute is categorical for (int j=0; j<values.get(sp.getAttribute()).size(); j++) { value = values.get(sp.getAttribute()).get(j); if (value == sp.getValue()) { // This instance will belong to the left child for (int k=0; k<values.size(); k++) { left.values.get(k).add(values.get(k).get(j)); } left.oclass.add(oclass.get(j)); } else { // This instance will belong to the right child for (int k=0; k<values.size(); k++) { right.values.get(k).add(values.get(k).get(j)); } right.oclass.add(oclass.get(j)); } } } else { // Attribute is numerical for (int j=0; j<values.get(sp.getAttribute()).size(); j++) { value = values.get(sp.getAttribute()).get(j); if (value < sp.getValue()) { // This instance will belong to the left child for (int k=0; k<values.size(); k++) { left.values.get(k).add(values.get(k).get(j)); } left.oclass.add(oclass.get(j)); } else { // This instance will belong to the right child for (int k=0; k<values.size(); k++) { right.values.get(k).add(values.get(k).get(j)); } right.oclass.add(oclass.get(j)); } } } if ((left.values.get(0).size() == 0) || (right.values.get(0).size() == 0)) { return null; } left.generator = generator; right.generator = generator; left.isLeaf = false; right.isLeaf = false; left.outputClass = -1; right.outputClass = -1; left.condition = null; right.condition = null; left.output_attribute = new myAttribute (output_attribute); right.output_attribute = new myAttribute (output_attribute); left.prunedClassifier = prunedClassifier; right.prunedClassifier = prunedClassifier; left.K = K; right.K = K; isLeaf = false; outputClass = -1; condition = sp; result.add(left); result.add(right); return result; } /** * Prunes all the leaves in the tree; this means that all the leaves disappear and the parents * of those leaves become the new leaves of the tree, with the classifier at leaves
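* For example, if node 5 has two leaf children 6 and 7, both children are removed and node 5 becomes a classifier leaf (illustrative identifiers) */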
public void pruneAllLeaves () { if ((!isLeaf) && (left != null) && (right != null)) { if (left.isLeaf && right.isLeaf) { // This node will be a leaf node and its children disappear System.out.println("Nodes " + left.identifier + " and " + right.identifier + " are pruned, and node " + identifier + " is set as a leaf node"); setAsClassifierLeaf (); } else { left.pruneAllLeaves(); right.pruneAllLeaves(); } } } /** * Prunes the leaves in the tree that have greater error than the general error of the tree, making * them leaves with classifiers on them */ public void pruneWithError () { int [] total_error; double general_error; total_error = getTreeError(); general_error = (double)total_error[0]/(double)total_error[1]; if (isLeaf) { prune(general_error); } else { left.prune(general_error); right.prune(general_error); } } /** * Prunes this leaf or its descendants according to the error given, making it a leaf with a * classifier on it * * @param error General error of the tree that has to be surpassed in order to prune the leaf */ public void prune (double error) { if (isLeaf) { int [] local_error = getNodeError(); double node_error = (double)local_error[0]/(double)local_error[1]; if (node_error > error) { System.out.println("Node " + identifier + " is set as a leaf node with a classifier"); setAsClassifierLeaf (); } } else { left.prune(error); right.prune(error); } } /** * Obtains the general error of the tree from all its leaves in a two number array * * @return an array with two numbers that summarizes the general error of the tree */ public int [] getTreeError () { int [] error, auxerror; error = new int [2]; // Both counters start at zero: they are either taken from this leaf or accumulated from the descendants error[0] = 0; error[1] = 0; if (isLeaf) { error = getNodeError(); } else { if (left != null) { auxerror = left.getTreeError(); error[0] += auxerror[0]; error[1] += auxerror[1]; } if (right != null) { auxerror = right.getTreeError(); error[0] += auxerror[0]; error[1] += auxerror[1]; } } return error; } /** * Obtains the error of a leaf node in the tree from all the instances in the leaf in * a two number array * * @return an array with two numbers that summarizes the leaf error in the tree */ public int [] getNodeError () { int [] error = new int [2]; if (isLeaf) { error[0] = 0; error[1] = oclass.size(); for (int i=0; i<oclass.size(); i++) { if (oclass.get(i).intValue() != outputClass) { error[0]++; } } } else { System.err.println("This node isn't a leaf, so the error cannot be computed"); System.exit(-1); } return error; } /** * This function builds the normalized values matrix for the node's data, which means that it * normalizes the values and stores them in the matrix */ private void normalizeValues() { double minimum []; double range []; normalized_values = new double [values.get(0).size()][values.size()]; minimum = new double[values.size()]; range = new double[values.size()]; for (int i=0; i<values.size(); i++) { if (!the_attributes.get(i).isNominal()) { minimum[i] = the_attributes.get(i).getMin(); range[i] = the_attributes.get(i).getMax() - minimum[i]; } } // Both real and nominal data are normalized in [0,1] for (int i=0; i<values.get(0).size(); i++) { for (int j = 0; j < values.size(); j++) { if (the_attributes.get(j).isNominal()) { if (the_attributes.get(j).getValues().size() > 1) { // Divide by (number of values - 1) to map the nominal index into [0,1] normalized_values[i][j] = values.get(j).get(i)/(the_attributes.get(j).getValues().size()-1); } } else { normalized_values[i][j] = values.get(j).get(i) - minimum[j]; normalized_values[i][j] = normalized_values[i][j] / range[j]; } } } } /** * This function normalizes the values for a given example
looking at the values of the attributes * in this node * * @param example An array with the values that have to be normalized * @return an array with the values normalized according to the values of the whole node */ private double [] normalize (double example[]) { double minimum []; double range []; double result [] = new double [values.size()]; minimum = new double[values.size()]; range = new double[values.size()]; for (int i=0; i<values.size(); i++) { if (!the_attributes.get(i).isNominal()) { minimum[i] = the_attributes.get(i).getMin(); range[i] = the_attributes.get(i).getMax() - minimum[i]; } } // Both real and nominal data are normalized in [0,1] for (int j = 0; j < values.size(); j++) { if (the_attributes.get(j).isNominal()) { if (the_attributes.get(j).getValues().size() > 1) { // Divide by (number of values - 1) to map the nominal index into [0,1] result[j] = example[j]/(the_attributes.get(j).getValues().size()-1); } } else { result[j] = example[j] - minimum[j]; result[j] = result[j] / range[j]; } } return result; } /** * Classifies a given item with the information stored in the node with the KNN classifier * * @param item Data attribute values for the item we are classifying * @return the class assigned to the item given */ private int evaluateKNN (double item[]) { double minDist[]; double element[]; int nearestN[]; int selectedClasses[]; double dist; int prediction; int predictionValue; boolean stop; int num_classes = output_attribute.getValues().size(); element = normalize(item); nearestN = new int[K]; minDist = new double[K]; for (int i=0; i<K; i++) { nearestN[i] = -1; minDist[i] = Double.MAX_VALUE; } //KNN Method starts here for (int i=0; i<normalized_values.length; i++) { dist = distance(normalized_values[i], element); // See if it's nearer than our previous selected neighbors stop = false; for(int j=0; j<K && !stop;j++){ if (dist < minDist[j]) { for (int l = K - 1; l >= j+1; l--) { minDist[l] = minDist[l - 1]; nearestN[l] = nearestN[l - 1]; } minDist[j] = dist; nearestN[j] = i; stop=true; } } } // We have checked all the instances; now see which class is the most present
selectedClasses = new int[num_classes]; for (int i=0; i<num_classes; i++) { selectedClasses[i] = 0; } for (int i=0; i<K; i++) { if (nearestN[i] != -1) { selectedClasses[oclass.get(nearestN[i]).intValue()] += 1; } } prediction=0; predictionValue=selectedClasses[0]; for (int i=1; i<num_classes; i++) { if (predictionValue < selectedClasses[i]) { predictionValue = selectedClasses[i]; prediction = i; } } if (predictionValue == 0) { prediction = getMajorOutputClass(); } return prediction; } /** * Calculates the Euclidean distance between two instances * * @param instance1 First instance to calculate the distance * @param instance2 Second instance to calculate the distance * @return the Euclidean distance between them * */ private double distance (double instance1[], double instance2[]) { double length = 0.0; for (int i=0; i<instance1.length; i++) { length += (instance1[i]-instance2[i])*(instance1[i]-instance2[i]); } length = Math.sqrt(length); return length; } /** * Precalculates the radius of each train instance, used in the KNNAdaptive classifier */ private void calculateRadius(){ int ownClass; double minDist; double dist; radius = new double[normalized_values.length]; for(int i=0;i<normalized_values.length;i++){ ownClass = oclass.get(i).intValue(); minDist=Double.MAX_VALUE; //Search the nearest enemy (instance from another class) for(int j=0; j<normalized_values.length;j++){ if(ownClass != oclass.get(j).intValue()){ dist = distance(normalized_values[i], normalized_values[j]); if (dist < minDist){ minDist=dist; } } } radius[i] = minDist; } } //end-method /** * Classifies a given item with the information stored in the node with the KNNAdaptive classifier * * @param item Data attribute values for the item we are classifying * @return the class assigned to the item given */ private int evaluateKNNAdaptive (double item[]) { double example[]; double minDist[]; int nearestN[]; int selectedClasses[]; double dist; int prediction; int predictionValue; boolean stop; int num_classes = output_attribute.getValues().size(); example = normalize(item); nearestN = new int[K]; minDist = new double[K]; for (int i=0; i<K; i++) { nearestN[i] = -1; minDist[i] = Double.MAX_VALUE; } //KNN Method starts here for (int i=0; i<normalized_values.length; i++) { dist = adaptiveDistance(normalized_values[i], example, i); // See if it's nearer than our previous selected neighbors stop=false; for(int j=0;j<K && !stop;j++){ if (dist < minDist[j]) { for (int l = K - 1; l >= j+1; l--) { minDist[l] = minDist[l - 1]; nearestN[l] = nearestN[l - 1]; } minDist[j] = dist; nearestN[j] = i; stop=true; } } } // We have checked all the instances; now see which class is the most present
selectedClasses= new int[num_classes]; for (int i=0; i<num_classes; i++) { selectedClasses[i] = 0; } for (int i=0; i<K; i++) { if (nearestN[i] != -1) { selectedClasses[oclass.get(nearestN[i]).intValue()]+=1; } } prediction=0; predictionValue=selectedClasses[0]; for (int i=1; i<num_classes; i++) { if (predictionValue < selectedClasses[i]) { predictionValue = selectedClasses[i]; prediction = i; } } if (predictionValue == 0) { prediction = getMajorOutputClass(); } return prediction; } /** * Calculates the adaptive distance between two instances * * @param instance1 First instance to calculate the distance * @param instance2 Second instance to calculate the distance * @param index Index of train instance in radius structure * @return the adaptive distance between them */ private double adaptiveDistance (double instance1[], double instance2[], int index) { double dist; dist = distance(instance1, instance2); // Apply the radius conversion dist = dist/radius[index]; return dist; } //end-method /** * Calculates, for each train instance, the distance to its Kth nearest neighbour (the farthest of its K nearest neighbours) */ private void getFurtherNeighbor(){ double minDist[]; int nearestN[]; double dist; boolean stop; further = new double [normalized_values.length]; selected = new boolean [normalized_values.length]; nearestN = new int[K]; minDist = new double[K]; for(int instance=0;instance<normalized_values.length;instance++){ Arrays.fill(nearestN,-1); Arrays.fill(minDist,Double.MAX_VALUE); //find its K nearest neighbors for (int i=0; i<normalized_values.length; i++) { dist = distance(normalized_values[instance], normalized_values[i]); // see if it's nearer than our previous selected neighbors stop=false; for(int j=0;j<K && !stop;j++){ if (dist < minDist[j]) { for (int l = K - 1; l >= j+1; l--) { minDist[l] = minDist[l - 1]; nearestN[l] = nearestN[l - 1]; } minDist[j] = dist; nearestN[j] = i; stop=true; } } } // Get the maximum distance further[instance]=minDist[K-1]; } } /** * Classifies a given item with the information stored in the node with the KSNN classifier * * @param item Data attribute values for the item we are classifying * @return the class assigned to the item given */ private int evaluateKSNN (double item[]) { int output; int votes[]; double minDist[]; double example[]; int nearestN[]; double dist; boolean stop; int maxVotes; int num_classes = output_attribute.getValues().size(); example = normalize(item); votes = new int[num_classes]; nearestN = new int[K]; minDist = new double[K]; for (int i=0; i<normalized_values.length; i++) { selected[i]=false; } //find its K nearest neighbors for (int i=0; i<K; i++) { nearestN[i] = -1; minDist[i] = Double.POSITIVE_INFINITY; } for (int i=0; i<normalized_values.length; i++) { dist = distance(example,normalized_values[i]); //see if it's nearer than our previous selected neighbors stop=false; for (int j=0;j<K && !stop;j++){ if (dist < minDist[j]) { for (int l = K - 1; l >= j+1; l--) { minDist[l] = minDist[l - 1]; nearestN[l] = nearestN[l - 1]; } minDist[j] = dist; nearestN[j] = i; stop=true; } } //Select if the example would be a nearest neighbor if (dist < further[i]) { selected[i]=true; } } //Select the neighbors for (int i=0; i<K; i++) { if (nearestN[i] != -1) { selected[nearestN[i]]=true; } } // Voting process for (int i=0; i<num_classes; i++) { votes[i]=0; } for (int i=0; i<normalized_values.length; i++) { if(selected[i]==true){ votes[oclass.get(i).intValue()]++; } } //Select the final output output=-1; maxVotes=0; for(int i=0;i<num_classes;i++){
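// Keep the class with the most votes as the final output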
if(maxVotes<votes[i]){ maxVotes=votes[i]; output=i; } } if (maxVotes == 0) { output = getMajorOutputClass(); } return output; } /** * Calculates the mean (centroid) of each class */ private void calculateMeans () { int num_classes = output_attribute.getValues().size(); int isClass; int nInstances [] = new int [num_classes]; for(int i=0; i<num_classes; i++){ nInstances[i]=0; } for(int i=0;i<oclass.size();i++){ nInstances[oclass.get(i).intValue()]++; } means = new double[num_classes][values.size()]; meanClass = new int[num_classes]; //Initialize the means structure for(int i=0;i<num_classes;i++){ for(int j=0;j<values.size();j++){ means[i][j]=0.0; } meanClass[i]=i; } // Calculate the sum of every instance for each class for (int i=0;i<normalized_values.length;i++){ isClass=oclass.get(i).intValue(); for(int j=0;j<values.size();j++){ means[isClass][j]+=normalized_values[i][j]; } } // Get the means for(int i=0;i<num_classes;i++){ for(int j=0;j<values.size();j++){ if(nInstances[i]>0){ means[i][j]/=(double)nInstances[i]; } } } } /** * Classifies a given item with the information stored in the node with the NM classifier * * @param item Data attribute values for the item we are classifying * @return the class assigned to the item given */ private int evaluateNM (double item[]) { int output; double aux; double min; double [] example; example = normalize(item); min=Double.MAX_VALUE; output=-1; //get the nearest mean for(int i=0;i<means.length;i++){ aux=distance(example,means[i]); if(aux<min){ min=aux; output=i; } } //use its class output=meanClass[output]; return output; } /** * Calculates the count matrix of the data in this node, which is used in the evaluation process * in the Naive Bayes classifier. The count matrix is a matrix that stores the number of items in * each class depending on the values of certain attributes. */ private void calculateCountMatrix () { int [][] aux_count_matrix; count_matrix = new ArrayList <int [][]> (); // Histogram for Naive-Bayes for (int i=0; i<the_attributes.size(); i++) { aux_count_matrix = new int[the_attributes.get(i).getValues().size()][output_attribute.getValues().size()+1]; for (int j=0; j<the_attributes.get(i).getValues().size(); j++) { for (int k=0; k<output_attribute.getValues().size(); k++) { aux_count_matrix[j][k] = 0; } aux_count_matrix[j][output_attribute.getValues().size()] = 0; } for (int j=0; j<values.get(i).size(); j++) { aux_count_matrix[values.get(i).get(j).intValue()][oclass.get(j).intValue()]++; aux_count_matrix[values.get(i).get(j).intValue()][output_attribute.getValues().size()]++; } count_matrix.add(aux_count_matrix); } } /** * Classifies a given item with the information stored in the node with the Naive Bayes classifier * * @param item Data attribute values for the item we are classifying * @return the class assigned to the item given */ private int evaluateNaiveBayes (double item[]) { int selected; double max; double [] points; int num_classes = output_attribute.getValues().size(); points = new double[num_classes]; // Initialize points for(int j=0;j<num_classes;j++){ points[j]=1.0; } // Accumulate points for(int j=0;j<item.length;j++){ for(int k=0;k<num_classes;k++){ // Here we do the Laplace correction points[k] = points[k] * ((double)(count_matrix.get(j)[(int)item[j]][k]+1)/(double)(count_matrix.get(j)[(int)item[j]][num_classes]+1)); } } // Find the maximum selected=-1; max=0; for (int j=0;j<num_classes;j++){ if (max<=points[j]) { max=points[j]; selected=j; } } return selected; } }
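/* * Illustrative usage sketch for this class, assuming a myDataset instance `train` has already * been loaded elsewhere; the variable names and parameter values below are hypothetical. * * // Root node over the whole dataset, using 3-NN (classifier code 1) at pruned leaves: * TreeNode root = new TreeNode(0, null, null, false, -1, null, train, 1, 3); * // Grow one level: choose the best split by InformationGain (criterion 1) and apply it: * Split best = root.evaluateAllSplits(1); * ArrayList<TreeNode> children = root.split(best, 0); // the children get identifiers 1 and 2 * // Turn pairs of sibling leaves into classifier leaves, then classify an example: * root.pruneAllLeaves(); * int predicted = root.evaluate(new double[]{0.3, 1.0, 2.5}); * System.out.println(root.printTree()); */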