/*********************************************************************** This file is part of KEEL-software, the Data Mining tool for regression, classification, clustering, pattern mining and so on. Copyright (C) 2004-2010 F. Herrera (herrera@decsai.ugr.es) L. S�nchez (luciano@uniovi.es) J. Alcal�-Fdez (jalcala@decsai.ugr.es) S. Garc�a (sglopez@ujaen.es) A. Fern�ndez (alberto.fernandez@ujaen.es) J. Luengo (julianlm@decsai.ugr.es) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/ **********************************************************************/ package keel.Algorithms.Decision_Trees.FunctionalTrees; import java.util.ArrayList; import java.util.StringTokenizer; import keel.Algorithms.Preprocess.Basic.CheckException; import keel.Dataset.Attribute; import keel.Dataset.Attributes; import keel.Dataset.Instance; import keel.Dataset.InstanceSet; /** * This class contains the most useful information about a dataset, and provides a set of functions to * manage this information easily. * * @author Written by Victoria Lopez Morales (University of Granada) 15/05/2009 * @version 0.1 * @since JDK1.6 */ public class myDataset { /** * Representative name of the dataset */ private String name; /** * The number of attributes that the dataset has */ private int numAtr; /** * The number of instances that the dataset has */ private int numIns; /** * All the information about the attributes that the dataset uses */ private ArrayList<myAttribute> attributes; /** * All the information about the output attribute of the dataset */ private myAttribute outputAttribute; /** * The number of different classes that there are in the dataset */ private int numClasses; /** * The number of instances of each class that there are in the dataset */ private int nInstances[]; /** * Values of the attributes for each instance in the dataset */ private double data[][]; /** * Class of each instance in the dataset */ private int output[]; /** * Kind of the dataset we are considering. kind = 1 if we are considering a train dataset; kind = 2 if we are considering a reference dataset; and kind = 3 if we are considering a test dataset. */ private int kind; /** * Creates a dataset by reading the .dat file that contains the information of it, * and gives values to every field of the class * * @param nameFile The name of the file that is going to be read * @param newkind The kind of dataset to determine if the file is for training, reference or test * @exception keel.Algorithms.Preprocess.Basic.CheckException Thrown from the CheckException class */ public myDataset(String nameFile, int newkind) throws CheckException { Attribute at; String nameat; double min, max; ArrayList<String> nomValues; StringTokenizer tokens; Instance temp; boolean[] nulls; InstanceSet IS; kind = newkind; IS = new InstanceSet(); // Read of data file try { if (newkind == 3) { IS.readSet(nameFile, false); } else { IS.readSet(nameFile, true); } } catch (Exception e) { System.err.println(e); System.exit(1); } // Check if dataset corresponding with a classification problem if (Attributes.getOutputNumAttributes() < 1) { throw new CheckException ("This dataset doesn't have any outputs, so it doesn't belong to a classification problem"); } else if (Attributes.getOutputNumAttributes() > 1) { throw new CheckException("This dataset has more than one output"); } if (Attributes.getOutputAttribute(0).getType() == Attribute.REAL) { throw new CheckException("This dataset has an output attribute with float values, so it doesn't belong to a classification problem"); } // Get the name, number of attributes and number of instances of the dataset name = new String (Attributes.getRelationName()); numAtr = Attributes.getInputNumAttributes(); numIns = IS.getNumInstances(); // Create vectors to hold information attributes = new ArrayList<myAttribute>(numAtr); // Store attribute inputs for (int j = 0; j < numAtr; j++) { at = Attributes.getInputAttribute(j); nameat = new String (at.getName()); // Check if it is real or integer if ((at.getType() == 1) || (at.getType() == 2)) { // Create continuous attribute min = (double) at.getMinAttribute(); max = (double) at.getMaxAttribute(); attributes.add(new myAttribute(nameat, at.getType(), min, max, true)); } else { // Create nominal attribute myAttribute aux; int numNominal = at.getNumNominalValues(); nomValues = new ArrayList<String>(numNominal); for (int k = 0; k < numNominal; k++) { nomValues.add(at.getNominalValue(k)); } aux = new myAttribute(nameat, 3, true); aux.setValues(nomValues); attributes.add(aux); } } // for // Copy the data tokens = new StringTokenizer(IS.getHeader(), " \n\r"); tokens.nextToken(); tokens.nextToken(); // Get space for the instances data = new double[IS.getNumInstances()][numAtr]; output = new int[IS.getNumInstances()]; for (int i = 0; i < IS.getNumInstances(); i++) { // Store the values of the instances in the corresponding data structures temp = IS.getInstance(i); data[i] = temp.getAllInputValues(); output[i] = (int) temp.getOutputRealValues(0); nulls = temp.getInputMissingValues(); // Clean missing values for (int j = 0; j < nulls.length; j++) { if (nulls[j]) { data[i][j] = 0.0; } } } // Store output attributes at = Attributes.getOutputAttribute(0); nameat = new String (at.getName()); // Check if it is real if ((at.getType() == 1) || (at.getType() == 2)) { // Create continuous attribute min = (double) at.getMinAttribute(); max = (double) at.getMaxAttribute(); outputAttribute = new myAttribute(nameat, at.getType(), min, max, false); } else { // Create nominal attribute myAttribute aux; int numNominal = at.getNumNominalValues(); nomValues = new ArrayList<String>(numNominal); for (int k = 0; k < numNominal; k++) { nomValues.add(at.getNominalValue(k)); } aux = new myAttribute(nameat, 3, false); aux.setValues(nomValues); outputAttribute = aux; } // Get the number of classes numClasses = Attributes.getOutputAttribute(0).getNumNominalValues(); // And the number of instances on each class nInstances = new int[numClasses]; for (int i = 0; i < numClasses; i++) { nInstances[i] = 0; } for (int i = 0; i < output.length; i++) { nInstances[output[i]]++; } IS.setAttributesAsNonStatic(); if (kind == 3) { Attributes.clearAll(); } } /** * Creates a dataset from another existing dataset * * @param dataset Original dataset from which we are going to create a copy */ public myDataset(myDataset dataset) { myAttribute aux; // Copy each data field to the new dataset kind = dataset.kind; name = dataset.name; numAtr = dataset.numAtr; numIns = dataset.numIns; numClasses = dataset.numClasses; outputAttribute = new myAttribute(dataset.outputAttribute); attributes = new ArrayList<myAttribute>(); for (int i = 0; i < dataset.attributes.size(); i++) { aux = new myAttribute((myAttribute) dataset.attributes.get(i)); attributes.add((myAttribute) aux); } nInstances = new int[numClasses]; System.arraycopy(dataset.nInstances, 0, nInstances, 0, dataset.nInstances.length); output = new int[numIns]; System.arraycopy(dataset.output, 0, output, 0, dataset.output.length); data = new double[numIns][numAtr]; for (int i=0; i<numIns; i++) { for (int j=0; j<numAtr; j++) { data[i][j] = dataset.data[i][j]; } } } /** * Check if a dataset is the same dataset as another object * * @param obj Object that is checked to see if it is the same dataset * @return true if the datasets are the same, false otherwise * @see java.lang.Object#equals(java.lang.Object) */ public boolean equals (Object obj) { // First we check if the reference is the same if (this == obj) return true; // Then we check if the object exists and is from the class myDataset if((obj == null) || (obj.getClass() != this.getClass())) return false; // object must be myDataset at this point myDataset test = (myDataset)obj; // We check if the values for nInstances are the same for (int i=0; i<numClasses; i++) { if (nInstances[i] != test.nInstances[i]) return false; } // We check if the values for output are the same for (int i=0; i<numIns; i++) { if (output[i] != test.output[i]) return false; } // We check if the values for data are the same for (int i=0; i<numIns; i++) { for (int j=0; j<numAtr; j++) { if (data[i][j] != test.data[i][j]) { return false; } } } // We check the other class attributes of the dataset return ((numAtr == test.numAtr) && (numIns == test.numIns) && (numClasses == test.numClasses) && (kind == test.kind) && (name == test.name || (name != null && name.equals(test.name))) && (attributes == test.attributes || (attributes != null && attributes.equals(test.attributes))) && (outputAttribute == test.outputAttribute || (outputAttribute != null && outputAttribute.equals(test.outputAttribute)))); } /** * Hash-code function for the class that is used when object is inserted in a structure like a hashtable * * @return the hash code obtained * @see java.lang.Object#hashCode() */ public int hashCode() { int hash = 7; hash = 31 * hash + (null == name ? 0 : name.hashCode()); hash = 31 * hash + numAtr; hash = 31 * hash + numIns; hash = 31 * hash + (null == outputAttribute ? 0 : outputAttribute.hashCode()); hash = 31 * hash + numClasses; hash = 31 * hash + kind; return hash; } /** * Overriden function that converts the class to a string * * @return the string representation of the class * @see java.lang.Object#toString() */ public String toString() { String aux; // First we get the dataset name aux = new String(name); aux += "\n"; // Then, we get the attributes in the dataset for (int i=0; i<attributes.size(); i++) { aux = aux + (myAttribute)attributes.get(i) + "\n" ; } // and the output attribute aux = aux + outputAttribute + "\n"; // Then, we print the values of the attributes and the output class for that data for (int i=0; i<numIns; i++) { for (int j=0; j<numAtr; j++) { aux = aux + data[i][j] + " "; } aux = aux + output[i] + "\n"; } // Finally, we see the kind of dataset we're dealing with switch (kind) { case 1: aux = aux + "Training dataset\n"; break; case 2: aux = aux + "Reference dataset\n"; break; case 3: aux = aux + "Test dataset\n"; break; default: System.err.println("Error: This dataset isn't correctly specified\n"); System.exit(1); break; } return aux; } /** * Gets the name of the dataset * * @return the name of the dataset */ public String getName() { return name; } /** * Replaces the name of the dataset with another new name * * @param name New name for the dataset */ public void setName(String name) { this.name = name; } /** * Gets the number of attributes that the dataset has * * @return the number of attributes that the dataset has */ public int getNumAtr() { return numAtr; } /** * Replaces the number of attributes in this dataset with a new number of attributes * * @param numAtr New number of attributes for this dataset */ public void setNumAtr(int numAtr) { this.numAtr = numAtr; } /** * Gets the number of instances that the dataset has * * @return the number of instances that the dataset has */ public int getNumIns() { return numIns; } /** * Replaces the number of instances in this dataset with a new number of instances * * @param numIns New number of instances for this dataset */ public void setNumIns(int numIns) { this.numIns = numIns; } /** * Gets all the information about the attributes that the dataset uses * * @return all the information about the attributes that the dataset uses */ public ArrayList<myAttribute> getAttributes() { return attributes; } /** * Replaces all the information about the attributes that the dataset uses with new information about the attributes * * @param attributes New information about the attributes that the dataset uses */ public void setAttributes(ArrayList<myAttribute> attributes) { this.attributes = attributes; } /** * Gets all the information about ith the attribute that the dataset uses * * @return all the information about the ith attribute that the dataset uses */ public myAttribute getAttributeI (int i) { myAttribute aux; aux = (myAttribute)attributes.get(i); return aux; } /** * Replaces the information about the ith attribute that the dataset uses with new information about that attribute * * @param i Position of the attribute that is going to be replaced * @param att New information about the attribute that the dataset uses */ public void setAttributeI (int i, myAttribute att) { attributes.set(i, att); } /** * Gets all the information about the output attribute of the dataset * * @return all the information about the output attribute of the dataset */ public myAttribute getOutputAttribute() { return outputAttribute; } /** * Replaces the information about the output attribute with new information about that attribute * * @param outputAttribute Attribute to be stored like the output attribute */ public void setOutputAttribute(myAttribute outputAttribute) { this.outputAttribute = outputAttribute; } /** * Gets the number of different classes that there are in the dataset * * @return the number of different classes that there are in the dataset */ public int getNumClasses() { return numClasses; } /** * Replaces the number of classes in this dataset with a new number of classes * * @param numClasses New number of classes for this dataset */ public void setNumClasses(int numClasses) { this.numClasses = numClasses; } /** * Gets the number of instances of each class that there are in the dataset * * @return the number of instances of each class that there are in the dataset */ public int getNInstancesI (int i) { return nInstances[i]; } /** * Replaces the number of instances of a class that there are in the dataset with a another number of instances for that class * * @param i Class which number of instances is going to be modified * @param instances New number of instances for the ith class */ public void setNInstancesI(int i, int instances) { nInstances[i] = instances; } /** * Gets the value of the jth attribute for the ith instance in the dataset * * @return the value of the jth attribute for the ith instance in the dataset */ public double getDataI (int i, int j) { return data[i][j]; } /** * Gets the value of the ith instance in the dataset * * @return the value of the ith instance in the dataset */ public double [] getDataItem (int i) { return data[i]; } /** * Replaces the value of the ith instance at the jth attribute in this dataset with the specified value * * @param i Position of the instance which value is going to be replaced * @param j Attribute which value is going to be replaced * @param data Value to be stored at the specified instance and attribute */ public void setDataI (int i, int j, double data) { this.data[i][j] = data; } /** * Gets the class of each instance in the dataset * * @return the class of each instance in the dataset */ public int getOutputI (int i) { return output[i]; } /** * Replaces the output attribute value at the specified instance in this dataset with the specified value * * @param i Index of the instance output value to replace * @param output Value to be stored at the specified instance */ public void setOutputI (int i, int output) { this.output[i] = output; } /** * Gets the kind of the dataset we are considering (training, reference, test) * * @return the name of the dataset */ public int getKind() { return kind; } /** * Changes the kind of dataset to a new kind * * @param kind New kind for the dataset */ public void setKind(int kind) { this.kind = kind; } }