/*********************************************************************** This file is part of KEEL-software, the Data Mining tool for regression, classification, clustering, pattern mining and so on. Copyright (C) 2004-2010 F. Herrera (herrera@decsai.ugr.es) L. S�nchez (luciano@uniovi.es) J. Alcal�-Fdez (jalcala@decsai.ugr.es) S. Garc�a (sglopez@ujaen.es) A. Fern�ndez (alberto.fernandez@ujaen.es) J. Luengo (julianlm@decsai.ugr.es) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/ **********************************************************************/ /* Depur.java Isaac Triguero Vel�zquez. Created by Isaac Triguer o Vel�zquez 11-8-2008 Copyright (c) 2008 __MyCompanyName__. All rights reserved. */ package keel.Algorithms.Instance_Generation.Depur; import keel.Algorithms.Preprocess.Basic.*; import org.core.*; import java.util.StringTokenizer; /** * @param k * @param k' * @author Isaac Triguero * @version 1.0 */ public class Depur extends Metodo { /*Own parameters of the algorithm*/ // We need the variable K to use with k-NN rule private int k; // In addition, we use a second variable k' to establish the numbers of neighbours // that must have the same class. private int k2; /** * Constructor. * * @param ficheroScript * */ public Depur (String ficheroScript) { super (ficheroScript); } public void ejecutar () { double conjS[][]; double conjR[][]; int conjN[][]; boolean conjM[][]; int clasesS[]; int S[]; /* Binary Vector, to decide if the instance will be included*/ int i, j, l, cont; int nClases; int tamS; int transformations; int claseObt[]; int clasePredominante; long tiempo = System.currentTimeMillis(); transformations=0; /*Getting the number of different classes*/ nClases = 0; for (i=0; i<clasesTrain.length; i++) if (clasesTrain[i] > nClases) nClases = clasesTrain[i]; nClases++; if (nClases < 2) { System.err.println("Input dataset is empty"); nClases = 0; } /*Algorithm body. First, S=TS. Then, for each instance of TS, the first step is to repeat the aplication of the k-nn, and then we decide if we need to change the label of the instance or we don't need it. */ /*Inicialization of the candidates set, S=X, where X is the original Training Set*/ S = new int[datosTrain.length]; for (i=0; i<S.length; i++) S[i] = 1; /* All included*/ tamS = datosTrain.length; System.out.print("K= "+k+"\n"); System.out.print("K'= "+k2+"\n"); for(i=0; i<datosTrain.length;i++){ /* I need find the k-nn of i in X - {i}, so I make conjS without i*/ conjS = new double[datosTrain.length-1][datosTrain[0].length]; conjR = new double[datosTrain.length-1][datosTrain[0].length]; conjN = new int[datosTrain.length-1][datosTrain[0].length]; conjM = new boolean[datosTrain.length-1][datosTrain[0].length]; clasesS = new int[datosTrain.length-1]; cont=0; for (j = 0; j < datosTrain.length; j++) { if(i!=j){ for (l = 0; l < datosTrain[0].length; l++) { conjS[cont][l] = datosTrain[j][l]; conjR[cont][l] = realTrain[j][l]; conjN[cont][l] = nominalTrain[j][l]; conjM[cont][l] = nulosTrain[j][l]; } clasesS[cont] = clasesTrain[j]; cont++; } } /*Do KNN to the instance*/ claseObt = KNN.evaluacionKNN3(k, conjS, conjR, conjN, conjM, clasesS, datosTrain[i], realTrain[i], nominalTrain[i], nulosTrain[i], nClases, distanceEu); /* System.out.print("Las clases de los k vecinos m�s cercanos son\n"); for(int m=0;m<k;m++){ System.out.print(claseObt[m]+ " "); } System.out.print("\n-----------------------------------------------\n"); */ /*Now, we must check that we have at least k2 neighboors with the same class. */ int max =0; clasePredominante = 0; for(int m=0;m<claseObt.length;m++){ int claseDeInstancia= claseObt[m]; // Select one class. int iguales=0; for(j=0; j< claseObt.length;j++){ // Check numbers of instances with this class if(j!=m){ // I can't count the same. if(claseObt[j]==claseDeInstancia){ iguales++; } } } // I must check if there is another class with more instances. if(iguales >max){ max = iguales; clasePredominante = claseObt[m]; } } //System.out.print("max " + max +"\n"); //System.out.print("Clase Predominante: "+clasePredominante+"\n"); /* Max+1 = number of neighbours with the same class*/ if( (max) >= k2 ){ /* if there are at least k2 neighbour, we change the class in S, */ if(clasePredominante!= clasesTrain[i]) transformations++; clasesTrain[i]=clasePredominante; S[i]=1; }else{ /* Discard.*/ tamS--; S[i] =0; } } System.out.print("S size resultante= " + tamS +"\n"); System.out.print("Transformations = " + transformations +"\n"); /*Construction of the S set from the previous vector S*/ conjS = new double[tamS][datosTrain[0].length]; conjR = new double[tamS][datosTrain[0].length]; conjN = new int[tamS][datosTrain[0].length]; conjM = new boolean[tamS][datosTrain[0].length]; clasesS = new int[tamS]; cont =0; /* To establish the sets' sizes */ for (j = 0; j < datosTrain.length; j++) { if(S[j]==1){ /* Checking the instance is included*/ for (l = 0; l < datosTrain[0].length; l++) { conjS[cont][l] = datosTrain[j][l]; conjR[cont][l] = realTrain[j][l]; conjN[cont][l] = nominalTrain[j][l]; conjM[cont][l] = nulosTrain[j][l]; } clasesS[cont] = clasesTrain[j]; cont++; } } System.out.println("Time elapse: "+ (double)(System.currentTimeMillis()-tiempo)/1000.0 + "s"); OutputIS.escribeSalida(ficheroSalida[0], conjR, conjN, conjM, clasesS, entradas, salida, nEntradas, relation); OutputIS.escribeSalida(ficheroSalida[1], test, entradas, salida, nEntradas, relation); } public void leerConfiguracion (String ficheroScript) { String fichero, linea, token; StringTokenizer lineasFichero, tokens; byte line[]; int i, j; ficheroSalida = new String[2]; fichero = Fichero.leeFichero (ficheroScript); lineasFichero = new StringTokenizer (fichero,"\n\r"); lineasFichero.nextToken(); linea = lineasFichero.nextToken(); tokens = new StringTokenizer (linea, "="); tokens.nextToken(); token = tokens.nextToken(); /*Getting the names of the training and test files*/ line = token.getBytes(); for (i=0; line[i]!='\"'; i++); i++; for (j=i; line[j]!='\"'; j++); ficheroTraining = new String (line,i,j-i); for (i=j+1; line[i]!='\"'; i++); i++; for (j=i; line[j]!='\"'; j++); ficheroTest = new String (line,i,j-i); /*Getting the path and base name of the results files*/ linea = lineasFichero.nextToken(); tokens = new StringTokenizer (linea, "="); tokens.nextToken(); token = tokens.nextToken(); /*Getting the names of output files*/ line = token.getBytes(); for (i=0; line[i]!='\"'; i++); i++; for (j=i; line[j]!='\"'; j++); ficheroSalida[0] = new String (line,i,j-i); for (i=j+1; line[i]!='\"'; i++); i++; for (j=i; line[j]!='\"'; j++); ficheroSalida[1] = new String (line,i,j-i); /*Getting the number of neighbours*/ linea = lineasFichero.nextToken(); tokens = new StringTokenizer (linea, "="); tokens.nextToken(); k = Integer.parseInt(tokens.nextToken().substring(1)); /*Getting the k' */ linea = lineasFichero.nextToken(); tokens = new StringTokenizer (linea, "="); tokens.nextToken(); k2 = Integer.parseInt(tokens.nextToken().substring(1)); /*Getting the type of distance function*/ linea = lineasFichero.nextToken(); tokens = new StringTokenizer (linea, "="); tokens.nextToken(); distanceEu = tokens.nextToken().substring(1).equalsIgnoreCase("Euclidean")?true:false; } }