/***********************************************************************

	This file is part of KEEL-software, the Data Mining tool for regression,
	classification, clustering, pattern mining and so on.

	Copyright (C) 2004-2010

	F. Herrera (herrera@decsai.ugr.es)
	L. Sánchez (luciano@uniovi.es)
	J. Alcalá-Fdez (jalcala@decsai.ugr.es)
	S. García (sglopez@ujaen.es)
	A. Fernández (alberto.fernandez@ujaen.es)
	J. Luengo (julianlm@decsai.ugr.es)

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program.  If not, see http://www.gnu.org/licenses/

**********************************************************************/

/**
 * <p>
 * @author Written by Jose A. Saez Munoz, research group SCI2S (Soft Computing and Intelligent Information Systems).
 * DECSAI (DEpartment of Computer Science and Artificial Intelligence), University of Granada - Spain.
* Date: 06/01/10
 * @version 1.0
 * @since JDK1.6
 * </p>
 */

package keel.GraphInterKeel.datacf.partitionData.PartitioningSchemes;

import java.io.File;
import java.util.Arrays;
import java.util.Vector;
import org.core.Files;
import org.core.Randomize;
import keel.Dataset.Attribute;
import keel.Dataset.Attributes;
import keel.Dataset.Instance;
import keel.Dataset.InstanceSet;


/**
 * <p>
 * This class implements a distance-based scheme to partition a dataset: for every class,
 * a not-yet-assigned example is placed in the first partition and its nearest
 * not-yet-assigned neighbours of the same class (HVDM distance) are placed one by one in
 * the remaining partitions. The files it writes are tagged "dobscv".
 * </p>
 */
public class DistanceBased_best{

    private Instance[] instances;
    private Vector<Integer>[] partitions;
    private Instance[][] trainPartition;
    private Instance[][] testPartition;
    int nclasses, ninstances, nattributes, numPartitions;
    boolean used[];
    int[] numExClass;
    int[] sortedIndex;
    int[] numUsedPerClass;
    double[] stdDev;
    double[][][] nominalDistance;

//*******************************************************************************************************************************

    /**
     * <p>
     * It reads the training set and creates the partitions
     * </p>
     * @param source_file path of the dataset file to read
     * @param np number of partitions to create (must be at least 1)
     * @throws IllegalArgumentException if np is lower than 1
     */
    public DistanceBased_best(String source_file, int np){

        // np <= 0 used to fail later with an incidental NegativeArraySizeException,
        // after the global attribute definitions had already been cleared.
        if (np < 1)
            throw new IllegalArgumentException("The number of partitions must be at least 1, but was " + np);

        InstanceSet is = new InstanceSet();
        Attributes.clearAll();

        try {
            is.readSet(source_file, true);
        } catch (Exception e) {
            // The process has always been terminated here; at least say why before leaving.
            System.err.println("Unable to read the dataset '" + source_file + "': " + e);
            e.printStackTrace();
            System.exit(1);
        }

        instances = is.getInstances();
        nclasses = Attributes.getOutputAttribute(0).getNumNominalValues();
        nattributes = Attributes.getInputAttributes().length;
        ninstances = instances.length;
        numPartitions = np;

        calculo_previo_hvdm();
        createPartitions();
    }

//*******************************************************************************************************************************

    /**
     * <p>
     * It creates the partitions from the original training set
     * </p>
     */
    private void createPartitions(){

        // 1) count the number of examples of each class
        numExClass = new int[nclasses];
        for (int i = 0; i < ninstances; i++)
            numExClass[instances[i].getOutputNominalValuesInt(0)]++;

        // 2) group the indexes of the examples by class
        sortedIndex = new int[ninstances];
        int k = 0;
        for (int i = 0; i < nclasses; i++)
            for (int j = 0; j < ninstances; j++)
                if (instances[j].getOutputNominalValuesInt(0) == i)
                    sortedIndex[k++] = j;

        // 3) shuffle the examples of each class inside its own segment of sortedIndex
        // NOTE(review): assumes Randomize.Randint(low, high) never returns high -- TODO confirm
        k = 0;
        for (int i = 0; i < nclasses; i++) {
            for (int j = 0; j < numExClass[i]; j++) {
                int randPos = Randomize.Randint(j, numExClass[i]);
                int tmp = sortedIndex[j + k];
                sortedIndex[j + k] = sortedIndex[randPos + k];
                sortedIndex[randPos + k] = tmp;
            }
            k += numExClass[i];
        }

        // 4) create the partitions
        partitions = newPartitionArray(numPartitions);

        used = new boolean[ninstances];
        numUsedPerClass = new int[nclasses];

        int num_vecinos = numPartitions - 1;
        int neighbors[] = new int[num_vecinos];
        boolean distance = false; // false = HVDM distance

        // put the instances into the partitions, class by class
        for (int cl = 0; cl < nclasses; ++cl) {
            while (quedanEjemplos(cl)) {
                int instancia = instanciaNoUsada(cl);

                evaluationKNNClass(num_vecinos, instancia, nclasses, distance, neighbors, cl);

                // the unused instance goes to the first fold, its neighbours to the other folds
                partitions[0].add(Integer.valueOf(instancia));
                used[instancia] = true;
                numUsedPerClass[cl]++;

                // neighbours are packed at the front; -1 marks the first empty slot
                for (int i = 0; i < num_vecinos && neighbors[i] != -1; i++) {
                    partitions[i + 1].add(Integer.valueOf(neighbors[i]));
                    used[neighbors[i]] = true;
                    numUsedPerClass[cl]++;
                }
            }
        }

        // 5) create the training and test partitions
        getTrainTest();
    }

    /**
     * It allocates the array of partitions, each one initially empty.
     *
     * @param size number of partitions
     * @return an array with size empty vectors
     */
    @SuppressWarnings("unchecked") // generic array creation; only Vector<Integer> elements are ever stored
    private static Vector<Integer>[] newPartitionArray(int size){
        Vector<Integer>[] result = new Vector[size];
        for (int i = 0; i < size; i++)
            result[i] = new Vector<Integer>();
        return result;
    }

//*******************************************************************************************************************************

    /**
     * <p>
     * It builds, for each partition, its test set (the instances of the partition) and
     * its training set (the instances of all the other partitions)
     * </p>
     */
    private void getTrainTest(){

        trainPartition = new Instance[numPartitions][];
        testPartition = new Instance[numPartitions][];

        for (int par = 0; par < numPartitions; ++par) {

            // count the number of instances in train number par
            int tam = 0;
            for (int i = 0; i < numPartitions; ++i)
                if (i != par)
                    tam += partitions[i].size();

            trainPartition[par] = new Instance[tam];
            testPartition[par] = new Instance[partitions[par].size()];

            // create the training partition
            int size = 0;
            for (int i = 0; i < numPartitions; ++i) {
                if (i == par)
                    continue;
                for (Integer index : partitions[i])
                    trainPartition[par][size++] = instances[index];
            }

            // create the test partition
            for (int j = 0; j < partitions[par].size(); ++j)
                testPartition[par][j] = instances[partitions[par].get(j)];
        }
    }

//*******************************************************************************************************************************

    /**
     * <p>
     * It returns the training partition specified
     * </p>
     * @param num number of the partition
     * @return the training partition (each element is a new Instance built from the stored one)
     */
    public Instance[] getTrainPartition(int num){
        return copyOf(trainPartition[num]);
    }

//*******************************************************************************************************************************

    /**
     * <p>
     * It returns the test partition specified
     * </p>
     * @param num number of the partition
     * @return the test partition (each element is a new Instance built from the stored one)
     */
    public Instance[] getTestPartition(int num){
        return copyOf(testPartition[num]);
    }

    /**
     * It builds a new array whose elements are created with the Instance(Instance) constructor.
     *
     * @param source instances to copy
     * @return the new array
     */
    private static Instance[] copyOf(Instance[] source){
        Instance[] res = new Instance[source.length];
        for (int i = 0; i < res.length; ++i)
            res[i] = new Instance(source[i]);
        return res;
    }

//*******************************************************************************************************************************

    /**
     * <p>
     * It returns all the original instances
     * </p>
     * @return the instances (the internal array, not a copy)
     */
    public Instance[] getInstances(){
        return instances;
    }

//*******************************************************************************************************************************

    /**
     * <p>
     * It returns the indexes of the original instances in all partitions
     * </p>
     * @return the indexes of the instances in each partition (the internal array, not a copy)
     */
    public Vector[] getPartitions(){
        return partitions;
    }

//*******************************************************************************************************************************

    /**
     * <p>
     * It creates the files of each training and test partition
     * </p>
     * @param _carpeta folder where the files are written
     * @param _ds name of the dataset, used as prefix of the file names
     */
    public void createPartitionFiles(String _carpeta, String _ds){

        String sep = System.getProperty("file.separator");
        Attribute[] att = Attributes.getInputAttributes();

        String header = "@relation " + Attributes.getRelationName() + "\n";
        header += Attributes.getInputAttributesHeader();
        header += Attributes.getOutputAttributesHeader();
        header += Attributes.getInputHeader() + "\n";
        header += Attributes.getOutputHeader() + "\n";
        header += "@data\n";

        for (int i = 0; i < numPartitions; i++) {
            String outputTest = header + formatInstances(testPartition[i], att);
            String outputTrain = header + formatInstances(trainPartition[i], att);

            String prefix = _carpeta + sep + _ds + "-" + numPartitions + "dobscv-" + String.valueOf(i + 1);
            // NOTE(review): Files.addToFile presumably appends to an existing file -- confirm
            Files.addToFile(prefix + "tra.dat", outputTrain);
            Files.addToFile(prefix + "tst.dat", outputTest);
        }
    }

    /**
     * It writes one text line per instance: the input values separated by ", " ("?" for
     * missing values) followed by the nominal value of the first output.
     *
     * @param data instances to write
     * @param att input attributes, used to decide how each value is printed
     * @return the data section for the given instances
     */
    private String formatInstances(Instance[] data, Attribute[] att){

        // a builder keeps this linear in the size of the output
        StringBuilder out = new StringBuilder();

        for (Instance inst : data) {
            boolean[] missing = inst.getInputMissingValues();

            for (int ak = 0; ak < nattributes; ak++) {
                if (missing[ak]) {
                    out.append("?");
                } else {
                    if (att[ak].getType() == Attribute.REAL)
                        out.append(inst.getInputRealValues(ak));
                    if (att[ak].getType() == Attribute.INTEGER)
                        out.append((int) inst.getInputRealValues(ak));
                    if (att[ak].getType() == Attribute.NOMINAL)
                        out.append(inst.getInputNominalValues(ak));
                }
                out.append(", ");
            }

            out.append(inst.getOutputNominalValues(0)).append("\n");
        }

        return out.toString();
    }

//*******************************************************************************************************************************

    /**
     * <p>
     * It deletes the files "train<i>.dat" and "test<i>.dat" (i = 1..number of partitions)
     * of the current working directory
     * </p>
     * NOTE(review): these names do not match the ones written by createPartitionFiles
     * ("<folder>/<dataset>-<n>dobscv-<i>tra.dat" / "...tst.dat"), so the files created by
     * that method are not removed here -- confirm whether this is intended.
     */
    public void deletePartitionFiles(){

        for (int i = 0; i < numPartitions; ++i) {
            new File("train" + (i + 1) + ".dat").delete();
            new File("test" + (i + 1) + ".dat").delete();
        }
    }

//*******************************************************************************************************************************

    /**
     * <p>
     * Computes the nearest not-yet-used neighbors of a given instance among the instances
     * of a fixed class. With those neighbors a suggested class for the instance is returned.
     * </p>
     *
     * @param nvec Number of nearest neighbors that are going to be searched
     * (it is limited to the number of instances)
     * @param instancia Index of the instance used as a reference in the nearest neighbor search
     * @param nClases Number of classes (size of the voting table)
     * @param distance Kind of distance requested. It is not read by this method: the
     * distance computed by distancia(int, int) is always used
     * @param vecinos Array that will receive the indexes of the nearest neighbours, closest
     * first; the positions without a neighbour are set to -1
     * @param clase Class of the neighbours searched for the instance
     * @return the majority class among the neighbors found (0 if none was found)
     */
    public int evaluationKNNClass (int nvec, int instancia, int nClases, boolean distance, int vecinos[], int clase) {

        if (nvec > ninstances)
            nvec = ninstances;

        int[] vecinosCercanos = new int[nvec];
        double[] minDistancias = new double[nvec];
        Arrays.fill(vecinosCercanos, -1);
        Arrays.fill(minDistancias, Double.POSITIVE_INFINITY);

        for (int i = 0; i < ninstances; i++) {

            // The reference instance is excluded by index, not by "distance > 0", so that
            // exact duplicates of it can still be selected as neighbours.
            if (used[i] || i == instancia || instances[i].getOutputNominalValuesInt(0) != clase)
                continue;

            double dist = distancia(i, instancia);

            // first position whose stored distance is greater than the new one
            int pos = 0;
            while (pos < nvec && !(dist < minDistancias[pos]))
                pos++;

            if (pos < nvec) {
                // shift the tail one place to the right and insert the new neighbour
                for (int l = nvec - 1; l > pos; l--) {
                    minDistancias[l] = minDistancias[l - 1];
                    vecinosCercanos[l] = vecinosCercanos[l - 1];
                }
                minDistancias[pos] = dist;
                vecinosCercanos[pos] = i;
            }
        }

        // votes of the neighbours found
        int[] votos = new int[nClases];
        for (int j = 0; j < nvec; j++)
            if (vecinosCercanos[j] >= 0)
                votos[instances[vecinosCercanos[j]].getOutputNominalValuesInt(0)]++;

        // most voted class; the lowest index wins the ties
        int votada = 0;
        int votaciones = votos[0];
        for (int j = 1; j < nClases; j++) {
            if (votaciones < votos[j]) {
                votaciones = votos[j];
                votada = j;
            }
        }

        for (int i = 0; i < nvec; i++)
            vecinos[i] = vecinosCercanos[i];

        // when nvec was limited above, do not leave values of a previous call behind
        for (int i = nvec; i < vecinos.length; i++)
            vecinos[i] = -1;

        return votada;
    }

    /**
     * It checks whether there are examples of a class that have not been assigned yet.
     *
     * @param clase index of the class
     * @return true if fewer examples of the class have been used than the class has
     */
    boolean quedanEjemplos (int clase){
        return (numUsedPerClass[clase] < numExClass[clase]);
    }

    /**
     * It returns the first not-yet-used instance of a class, following the (shuffled)
     * order stored in sortedIndex.
     *
     * @param clase index of the class
     * @return the index of the instance, or -1 if every instance of the class has been used
     */
    int instanciaNoUsada(int clase){

        for (int i = 0; i < ninstances; ++i) {
            int candidate = sortedIndex[i];
            if (!used[candidate] && instances[candidate].getOutputNominalValuesInt(0) == clase)
                return candidate;
        }

        return -1;
    }

    /**
     * Calculates the HVDM distance between two instances: the square root of the sum of the
     * squared per-attribute distances, where the per-attribute distance is 1 if any of the
     * two values is missing, the precomputed value difference for nominal attributes and
     * |x - y| / (4 * standard deviation) for the rest.
     * It needs the values computed by calculo_previo_hvdm().
     *
     * @param ej1 Index of the first instance
     * @param ej2 Index of the second instance
     *
     * @return The HVDM distance
     */
    public double distancia (int ej1, int ej2) {

        Instance first = instances[ej1];
        Instance second = instances[ej2];
        double suma = 0;

        for (int i = 0; i < nattributes; i++) {
            double d;

            if (first.getInputMissingValues(i) || second.getInputMissingValues(i)) {
                d = 1;
            } else if (Attributes.getInputAttribute(i).getType() == Attribute.NOMINAL) {
                d = nominalDistance[i][first.getInputNominalValuesInt(i)][second.getInputNominalValuesInt(i)];
            } else if (stdDev[i] > 0) {
                // the parentheses matter: "x / 4*s" is evaluated as (x / 4) * s
                d = Math.abs(first.getInputRealValues(i) - second.getInputRealValues(i)) / (4 * stdDev[i]);
            } else {
                // attribute without spread (or undefined deviation): it cannot separate instances
                d = 0;
            }

            suma += d * d;
        }

        return Math.sqrt(suma);
    }

    /**
     * It computes the values needed by distancia(int, int): for each nominal input
     * attribute, the distance between every pair of its values; for each of the other
     * input attributes, its standard deviation over all the instances.
     */
    public void calculo_previo_hvdm(){

        int numAtts = Attributes.getInputNumAttributes();
        int nClases = Attributes.getOutputAttribute(0).getNumNominalValues();

        stdDev = new double[numAtts];
        nominalDistance = new double[numAtts][][];

        for (int i = 0; i < numAtts; i++) {
            if (Attributes.getInputAttribute(i).getType() == Attribute.NOMINAL)
                nominalDistance[i] = nominalValueDistances(i, Attributes.getInputAttribute(i).getNumNominalValues(), nClases);
            else
                stdDev[i] = standardDeviation(i);
        }
    }

    /**
     * It computes, for a nominal attribute, the distance between each pair of values x, y:
     * sqrt( sum over the classes c of (N_x,c / N_x - N_y,c / N_y)^2 ), where N_x is the
     * number of instances with value x and N_x,c those that also belong to class c.
     *
     * @param att index of the input attribute
     * @param numValues number of nominal values of the attribute
     * @param nClases number of classes
     * @return symmetric matrix of distances with zeros in the diagonal
     */
    private double[][] nominalValueDistances(int att, int numValues, int nClases){

        // one pass over the data instead of one pass per pair of values and class
        int[] total = new int[numValues];
        int[][] perClass = new int[numValues][nClases];

        for (int m = 0; m < ninstances; m++) {
            int value = instances[m].getInputNominalValuesInt(att);
            if (value < 0 || value >= numValues)
                continue; // not one of the declared values: it never matched any of them

            total[value]++;

            int cls = instances[m].getOutputNominalValuesInt(0);
            if (cls >= 0 && cls < nClases)
                perClass[value][cls]++;
        }

        double[][] dist = new double[numValues][numValues];

        for (int j = 0; j < numValues; j++) {
            for (int l = j + 1; l < numValues; l++) {
                double vdm = 0.0;
                for (int c = 0; c < nClases; c++) {
                    double diff = ratio(perClass[j][c], total[j]) - ratio(perClass[l][c], total[l]);
                    vdm += diff * diff;
                }
                dist[j][l] = Math.sqrt(vdm);
                dist[l][j] = dist[j][l];
            }
        }

        return dist;
    }

    /**
     * It divides two counters, returning 0 instead of NaN when the divisor is 0
     * (a nominal value that does not appear in the data).
     *
     * @param part dividend
     * @param whole divisor
     * @return part / whole, or 0 if whole is 0
     */
    private static double ratio(int part, int whole){
        return (whole == 0) ? 0.0 : (double) part / (double) whole;
    }

    /**
     * It computes the (population) standard deviation of an input attribute over all the
     * instances, as sqrt(E[x^2] - E[x]^2).
     *
     * @param att index of the input attribute
     * @return the standard deviation
     */
    private double standardDeviation(int att){

        double sum = 0;
        double sumSq = 0;

        for (int j = 0; j < ninstances; j++) {
            double value = instances[j].getInputRealValues(att);
            sum += value;
            sumSq += value * value;
        }

        double media = sum / (double) ninstances;
        double variance = (sumSq / (double) ninstances) - (media * media);

        // rounding can leave a tiny negative variance, whose square root would be NaN
        return Math.sqrt(Math.max(0.0, variance));
    }
}