/*********************************************************************** This file is part of KEEL-software, the Data Mining tool for regression, classification, clustering, pattern mining and so on. Copyright (C) 2004-2010 F. Herrera (herrera@decsai.ugr.es) L. S�nchez (luciano@uniovi.es) J. Alcal�-Fdez (jalcala@decsai.ugr.es) S. Garc�a (sglopez@ujaen.es) A. Fern�ndez (alberto.fernandez@ujaen.es) J. Luengo (julianlm@decsai.ugr.es) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/ **********************************************************************/ /** * <p> * @author Written by Rosa Venzala (University of Granada) 02/06/2008 * @author Modified by Xavi Sol� (La Salle, Ram�n Llull University - Barcelona) 16/12/2008 * @version 1.1 * @since JDK1.2 * </p> */ package keel.Algorithms.Hyperrectangles.EACH; /** * <p>Descripci�: Contiene los metodos de lectura del fichero de train y test</p> * <p>Copyright: Copyright Rosa (c) 2007</p> * <p>Empresa: Mi Casa</p> * @author Rosa Venzala * @version 1.0 */ import java.io.*; import keel.Dataset.*; import java.util.*; //import java.util.Arrays; import org.core.*; public class Dataset { /** * <p> * Class to manage data sets * </p> */ private double[][] X = null; // for the nominal values private String [][]X2=null; private boolean[][] missing = null; private int[] C = null; // for nominal classes private String[] C2=null; private double[] eMaximum; private double[] eMinimum; // number of examples private int nData; // number of variables private int nVariables; // number of in-puts private int nInputs; // number of classes private int nClasses; final static boolean debug = false; private InstanceSet IS; private int[] common; private int []numValues; private double[][]sortedValuesList; /** * <p> * Return the values of the in-put attributes * </p> * @return double[][] An array with the in-put attributes */ public double[][] getX() { return X; } public String[][]getX2(){ return X2; } /** * <p> * Return the values of the in-put attributes for an instance * </p> * @param pos The position of the instance in the set of values * @return double[] An array with the in-put attributes for the instance */ public double []getX(int pos){ return X[pos]; } public double []getXNor(int pos){ normalize(); return X[pos]; } public InstanceSet getInstanceSet(){ return IS; } /** * <p> * Returns the nominal value of the double value of the attribute * </p> */ public String findNominalValue(int atr,double valor){ String dev=""; boolean parar=false; for (int i=0;i<nData&&!parar;i++){ if(valor==X[i][atr]){dev=X2[i][atr];parar=true;} } return dev; } /** * <p> * Checks if in the instances set left instances of a determined class * </p> * @param whichClass The class * @return true if there're instances, false otherwise */ public boolean thereInstancesOfClass(int whichClass)throws ArrayIndexOutOfBoundsException{ boolean resul=false; int cadena; Instance[] instanceSet; if (whichClass <0 || whichClass >= nClasses) {throw new ArrayIndexOutOfBoundsException("You are trying to access to "+whichClass+" class and there are only "+nClasses+".");} instanceSet=IS.getInstances(); //este bucle va a sustituir a la funcion antes definida en instance set for(int i=0;i<IS.getNumInstances();i++){ cadena=instanceSet[i].getOutputNominalValuesInt(0); if(cadena==whichClass){resul=true; //System.out.println(i); } } //resul=IS.hayInstanciasDeClaseC(whichClass); return resul; } /** * <p> * Gets for each attribute the sorted list of its possible values * </p> */ public double [][]getListValues(){ double [][]valuesList=new double[nInputs] [nData]; for (int i=0;i<nInputs;i++){ double []vector=new double [nData]; for(int j=0;j<nData;j++){ vector[j]=X[j][i]; } vector=removeDuplicated(vector,i); Arrays.sort(vector,0,numValues[i]); valuesList[i]=vector; } sortedValuesList=valuesList; return valuesList; } private double[] removeDuplicated(double[]v,int atributo){ int contador=0; boolean encontrado; double[]aux=new double[v.length]; for(int i=0;i<v.length;i++){ encontrado=false; for(int j=0;j<contador&&(!encontrado);j++){ if(aux[j]==v[i])encontrado=true;//el valor ya esta } if(!encontrado){aux[contador]=v[i];contador++;} } numValues[atributo]=contador; return aux;//v=aux; } /** * <p> * Creates a matrix training set, stored for each class, each attribute, and each value * the number of examples of class C that have value V for the attribute A COUNT[C,V,A] * </p> * @return int [][][] A matrix with the number of examples */ public int [][][] createCount(){ int [][][] count=new int [nClasses][nInputs] [nData];; //para definir la matriz Count, nvalores podria ser en el peor caso // el numero de instancias, es decir, no se repite ningun valor para algun atributo //ndatos es IS.getNumInstances(); //obtener un array ordenado de los valores de un atributo sin repetirse //esta seria la nueva funcion getNumValues, la de antes no sirve //buscar el valor actual en este vector y devolver el indice double valor_actual;String nominal_actual;int indice; int []num_valores=new int [nInputs]; int []tipos=new int [nInputs]; num_valores=getNumValues(); for (int i=0; i<nClasses;i++) {for (int j=0; j<nInputs;j++){ for (int k=0; k<nData;k++)count[i][j][k]=0; }} for (int i=0; i<nClasses;i++) { for (int j=0; j<nInputs;j++){ tipos=typesVariable(); for (int k=0; k<nData;k++){ //System.out.println("clase "+C[k]); if(C[k]==i){//esta instancia es de la clase actual i valor_actual=X[k][j];nominal_actual=X2[k][j]; //System.out.println("valor actual es "+X[k][j]); // System.out.println("valor actual NOMINAL es "+X2[k][j]); //El atributo es nominal if(tipos[j]==0)count[i][j][(int)valor_actual]++; //para los numericos es cuando tenemos que ordenar else{ indice=search(sortedValuesList[j],valor_actual); if(indice==-1)System.err.println("Error: el valor no se encontro en el vector"); count[i][j][indice]++; }// System.out.println("la clase atributo y valor "+i+" "+j+" "+(int)valor_actual+" lleva "+Count[i][j][(int)valor_actual]); } }}}//de los 3for return count; } /** * <p> * Look for an element in a sorted vector. * Returns the index where found it, returns -1 otherwise. * </p> */ private int search(double[]v,double valor){ boolean found=false; int index=-1; for(int i=0;i<v.length&&(!found);i++){ if(v[i]==valor){found=true;index=i;} } return index; } /** * <p> * Returns a vector with the optimum class for each pair attribute-value * </p> * @return int [][] vector with the optimum classes */ public int [][]getOptimumClass(int [][][]Count,long seed){ int [][]optimum=new int[nInputs] [nData]; int []vector=new int[nClasses]; for (int i=0; i<nInputs;i++){ for (int j=0; j<numValues[i];j++){ for(int k=0;k<nClasses;k++)vector[k]=Count[k][i][j]; optimum[i][j]=getMaximum(vector,seed); } } return optimum; } /** * <p> * Returns the index where is the maximum in an array of integers * </p> * @return int index where is the aximum value */ private int getMaximum(int []num, long seed){ Randomize.setSeed(seed); int max=num[0];int indice=0; int []options=new int[nClasses]; int counter=0; options[counter]=0; counter++; for(int i=1;i<num.length;i++){ if(num[i]>max){max=num[i];indice=i;counter=0;options[counter]=i;counter++;} else{if(num[i]==max){options[counter]=i;counter++;}} } if((counter-1)>0){//es que hay mas de una clase que es optima, la elegimos aleatoriamente indice=Randomize.RandintClosed(0, counter); indice=options[indice]; } return indice; } /** * <p> * Returns the index where is the maximum in an array of doubles * </p> * @return int index where is the aximum value */ public int getMax(double []num, long seed){ Randomize.setSeed(seed); double max=num[0]; int index=0; int []options=new int[nInputs]; int counter=0; options[counter]=0;counter++; for(int i=1;i<num.length;i++){ if(num[i]>max){max=num[i];index=i;counter=0;options[counter]=i;counter++;} else{if(num[i]==max){options[counter]=i;counter++;}} } if((counter-1) > 0){//es que hay mas de una clase que es optima, la elegimos aleatoriamente index=Randomize.RandintClosed(0, counter); System.out.println("Elegimos "+index); index=options[index]; } return index; } /** * <p> * Returns for each attribute the number of different values * </p> * @return int [] an array with the number of different values */ public int []getNumValues(){ int []num=new int [nInputs]; for(int i=0;i<nInputs;i++){ num[i]=0; for (int j=1;j<nData;j++){ if((int)X[j][i]> num[i])num[i]=(int)X[j][i]; } num[i]++; } return num; } /** * <p> * Returns for each attribute the number of different values * </p> * @return int [] an array with the number of different values */ public int []getNumValues2(){ return numValues; } /** * <p> * Returns the values for the out-put(class) * </p> * @return int[] An array with the values of the class */ public int[] getC() { int[] retorno = new int[C.length]; for (int i = 0; i < C.length; i++) { retorno[i] = C[i]; } return retorno; } public String[] getC2() { String[] retorno = new String[C2.length]; for (int i = 0; i < C2.length; i++) { retorno[i] = C2[i]; } return retorno; } /** * Devuelve el valor de los atributos de salida para una instancia determinada *@param pos La posicion de la instancia en el conjunto de datos * @return int el valor de la clase para esa instancia */ public int getC(int pos){ return C[pos]; } /** * <p> * Returns an array with the maximum values of the in-put attributes * </p> * @return double[] idem */ public double[] getMaximum() { return eMaximum; } /** * <p> * Returns an array with the minimum values of the in-put values * </p> * @return double[] idem */ public double[] getMinimum() { return eMinimum; } /** * <p> * Return the number of examples * </p> * @return int the number of examples */ public int getNData() { return nData; } /** * <p> * Returns the number of variables * </p> * @return int The number of variables(including in-put and out-put) */ public int getNVariables() { return nVariables; } /** * <p> * Return the number of in-put variables * </p> * @return int Total of the in-put variables */ public int getNInPuts() { return nInputs; } /** * <p> * Returns the total number of classes * </p> * @return int the number of classes */ public int getNClasses() { return nClasses; } /** * <p> * Checks if one attribute is lost or not * </p> * @param i int Number of example * @param j int Number of attribue * @return boolean True if lost */ public boolean isMissing(int i, int j) { // True is the value is missing (0 in the table) return missing[i][j]; } /** * <p> * Constructor, creates a new set of instances * </p> */ public Dataset() { IS = new InstanceSet(); // Init a new set of instances } /** * <p> * Reads the file of examples(Train&Test) * </p> * @param nfejemplos String Nom of the examples file * @param train boolean True if Train set. False is test set. * @throws IOException A possible I/O error */ public void readSet(String nfejemplos, boolean train) throws IOException { try { // Load in memory a dataset that contains a classification problem IS.readSet(nfejemplos, train); nData = IS.getNumInstances(); nInputs = Attributes.getInputNumAttributes(); nVariables = nInputs + Attributes.getOutputNumAttributes(); /* System.out.println(ndatos); System.out.println(nentradas); System.out.println(nvariables);*/ // Check that there is only one output variable if (Attributes.getOutputNumAttributes() > 1) { System.out.println( "This algorithm can not process MIMO datasets"); System.out.println( "All outputs but the first one will be removed"); System.exit(1); //TERMINAR } boolean noOutputs = false; if (Attributes.getOutputNumAttributes() < 1) { System.out.println( "This algorithm can not process datasets without outputs"); System.out.println("Zero-valued output generated"); noOutputs = true; System.exit(1); //TERMINAR } // Initialice and fill our own tables X = new double[nData][nInputs]; X2 = new String[nData][nInputs]; missing = new boolean[nData][nInputs]; C = new int[nData]; C2=new String[nData]; numValues=new int[nInputs]; sortedValuesList=new double[nInputs] [nData]; // Maximum and minimum of inputs eMaximum = new double[nInputs]; eMinimum = new double[nInputs]; // All values are casted into double/integer nClasses = 0; for (int i = 0; i < nData; i++) { Instance inst = IS.getInstance(i); for (int j = 0; j < nInputs; j++) { X2[i][j] = IS.getInputNominalValue(i, j); //inst.getInputRealValues(j); X[i][j] = IS.getInputNumericValue(i, j); // System.out.println(X[i][j]); missing[i][j] = inst.getInputMissingValues(j); if (X[i][j] > eMaximum[j] || i == 0) { eMaximum[j] = X[i][j]; } if (X[i][j] < eMinimum[j] || i == 0) { eMinimum[j] = X[i][j]; } } if (noOutputs) { C[i] = 0; } else { C[i] = (int)IS.getOutputNumericValue(i, 0); C2[i] = IS.getOutputNominalValue(i, 0); //(int)inst.getOutputRealValues(i); } if (C[i] > nClasses) { nClasses = C[i]; } } nClasses++; System.out.println("Number of classes=" + nClasses); //IMPRIME TODOS LOS ATRIBUTOS Y TODAS LAS INSTANCIAS // IS.print(); } catch (Exception e) { System.out.println("DBG: Exception in readSet"); e.printStackTrace(); } } /** * <p> * Returns a string with the header of the file * </p> * @return String The data of the header of the file */ public String copyHeaderTest() { // Header of the output file String p = new String(""); p = "@relation " + Attributes.getRelationName() + "\n"; p += Attributes.getInputAttributesHeader(); p += Attributes.getOutputAttributesHeader(); p += Attributes.getInputHeader() + "\n"; p += Attributes.getOutputHeader() + "\n"; p += "@data\n"; return p; } /** * <p> * Convert all the values of the set of values in the inetrval[0,1] * </p> */ public void normalize() { int atts = this.getNInPuts(); double moximus[] = new double[atts]; for (int j = 0; j < atts; j++) { moximus[j] = 1.0 / (eMaximum[j] - eMinimum[j]); } for (int i = 0; i < this.getNData(); i++) { for (int j = 0; j < atts; j++) { if (isMissing(i, j)) { ; //no escojo este ejemplo } else { X[i][j] = (X[i][j] - eMinimum[j]) * moximus[j]; } } } } /** * <p> * Return the types of each in-put(NOMINAL[0] o NUMERIC[1]) * </p> * @return int[] A vector with (NOMINAL[0] o NUMERIC[1]) */ public int[] typesVariable() { int[] types = new int[this.nInputs]; for (int i = 0; i < this.nInputs; i++) { types[i] = 1; if (Attributes.getInputAttribute(i).getType() == Attribute.NOMINAL) { types[i] = 0; } } return types; } /** * <p> * Calculate the values most commons for each column or attribute * </p> */ public void computeMostComon() { common = new int[nInputs]; int[] aux = new int[nData]; for (int i = 0; i < nInputs; i++) { for (int j = 0; j < nData; j++) { if (this.isMissing(j, i)) { aux[j] = -1; } else { aux[j] = (int) X[j][i]; } } Arrays.sort(aux); int mostC1 = aux[0]; int counter1 = 1, j; for (j = 1; (aux[j] == mostC1) && (j < nData - 1); j++, counter1++) { ; } int counter2 = 1; int mostC2 = aux[j]; if (j + 1 < nData) { for (j = j + 1; j < nData; j++) { if (aux[j] == mostC2) { counter2++; } else { mostC2 = aux[j]; if (counter2 > counter1) { counter1 = counter2; mostC1 = mostC2; counter2 = 1; } } } } common[i] = mostC1; } } /** * <p> * Return the value most common of the attribute 'i' * </p> * @param i int Number of the attribute * @return int Most common value for this variable */ public int mostCommon(int i) { return common[i]; } /** * <p> * Returns the name of the problem's variables * </p> * @return String[] An array with the name of the problem's variables */ public String[] getNames() { String[] out = new String[nVariables]; for (int i = 0; i < nInputs; i++) { out[i] = Attributes.getInputAttribute(i).getName(); } out[nInputs] = Attributes.getOutputAttribute(0).getName(); return out; } /** * <p> * Returns teh value of the classes * </p> * @return String[] An aray with the name of the out-puts(classes) */ public String[] giveClasses(){ String [] out = new String[nClasses]; Attribute at = Attributes.getOutputAttribute(0); if (at.getType() == at.NOMINAL){ for (int i = 0; i < nClasses; i++) { out[i] = at.getNominalValue(i); } } else{ out = null; //luego guardar�el valor de las clases num�icas } return out; } /** * <p> * Checks if in the data base there is a in-put type real or continous * </p> * @return boolean True if exists, False otherwise */ public boolean hayAtributosContinuos(){ return Attributes.hasRealAttributes(); } }