/*********************************************************************** This file is part of KEEL-software, the Data Mining tool for regression, classification, clustering, pattern mining and so on. Copyright (C) 2004-2010 F. Herrera (herrera@decsai.ugr.es) L. Sánchez (luciano@uniovi.es) J. Alcalá-Fdez (jalcala@decsai.ugr.es) S. García (sglopez@ujaen.es) A. Fernández (alberto.fernandez@ujaen.es) J. Luengo (julianlm@decsai.ugr.es) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/ **********************************************************************/ // // SMOTE.java // // Salvador García López // // Created by Salvador García López 30-3-2006. // Copyright (c) 2004 __MyCompanyName__. All rights reserved. 
//

package keel.Algorithms.ImbalancedClassification.Ensembles.SMOTE;

import java.util.Arrays;
import keel.Algorithms.ImbalancedClassification.Ensembles.Basic.*;
import keel.Dataset.Attribute;
import org.core.*;
import java.util.StringTokenizer;
import keel.Algorithms.ImbalancedClassification.Ensembles.multi_C45;
import keel.Dataset.*;

/**
 * MSMOTE over-sampling step for two-class imbalanced data.
 *
 * <p>{@link #ejecutar()} works in four phases: (1) majority-class instances
 * that a k-NN vote (k = {@code kClean}) misclassifies are discarded;
 * (2) the k = {@code kSMOTE} nearest neighbours of every minority instance are
 * collected; (3) each minority instance is labelled as latent noise, border
 * or safe according to how many of those neighbours belong to the minority
 * class; (4) synthetic minority instances are interpolated from the
 * non-noise ones and the enlarged set is written to
 * {@code ficheroSalida[0]}.
 *
 * <p>The data matrices ({@code datosTrain}, {@code realTrain},
 * {@code nominalTrain}, {@code nulosTrain}, {@code clasesTrain}) and the
 * attribute descriptors ({@code entradas}, {@code salida}, ...) are inherited
 * fields; they are presumably filled by {@code normalizar()} in the parent
 * class - confirm against {@code Metodo}.
 */
public class MSMOTE extends Metodo {

  /*Own parameters of the algorithm*/
  private long semilla;       // random seed (stored; ejecutar() does not re-seed the generator)
  private int kSMOTE, kClean; // neighbours used for interpolation / for the cleaning vote
  private int ASMO;           // neighbour pool: 0 = both classes, 1 = minority only, 2 = majority only
  private boolean balance;    // true: generate exactly (nNeg - nPos) synthetic instances
  private double smoting;     // when !balance: synthetic instances per non-noise positive

  /**
   * Builds the method from a configuration script.
   *
   * @param ficheroScript path of the configuration file, forwarded to the
   *                      parent constructor
   */
  public MSMOTE (String ficheroScript) {
    super (ficheroScript);
  }

  /**
   * Builds the method directly from an in-memory instance set.
   *
   * @param IS       instances used both as training and as test set
   * @param seed     random seed
   * @param kClean   number of neighbours of the cleaning k-NN vote
   * @param k        number of neighbours used for interpolation
   * @param ASMO     neighbour pool (0 = both classes, 1 = minority, 2 = majority)
   * @param bal      whether to balance both classes exactly
   * @param smoting  amount of over-sampling when {@code bal} is false
   * @param distance "Euclidean" (case-insensitive) selects the Euclidean
   *                 distance; any other value selects HVDM
   */
  public MSMOTE (InstanceSet IS, long seed, int kClean, int k, int ASMO, boolean bal, double smoting, String distance) {

    int nClases, i, j, l, m, n;
    double VDM;
    int Naxc, Nax, Nayc, Nay;
    double media, SD;

    this.kClean = kClean;
    this.semilla = seed;
    this.training = new InstanceSet(IS);
    this.test = new InstanceSet(IS);
    this.kSMOTE = k;
    // Fix: the ASMO argument used to be dropped, so the field kept its
    // default (0 = both classes) whatever the caller requested.
    this.ASMO = ASMO;
    this.balance = bal;
    this.smoting = smoting;
    distanceEu = distance.equalsIgnoreCase("Euclidean");

    // Output names: outputTr without its last four characters plus a fixed suffix.
    ficheroSalida = new String[2];
    ficheroSalida[0] = multi_C45.outputTr.substring(0,multi_C45.outputTr.length()-4) + "train.tra";
    ficheroSalida[1] = multi_C45.outputTr.substring(0,multi_C45.outputTr.length()-4) + "train.tst";

    try {
      /*Normalize and check the data*/
      normalizar();
    } catch (Exception e) {
      System.err.println(e);
      System.exit(1);
    }

    /*Previous computation for HVDM distance*/
    if (!distanceEu) {
      stdDev = new double[Attributes.getInputNumAttributes()];
      nominalDistance = new double[Attributes.getInputNumAttributes()][][];
      nClases = Attributes.getOutputAttribute(0).getNumNominalValues();
      for (i = 0; i < nominalDistance.length; i++) {
        if (Attributes.getInputAttribute(i).getType() == Attribute.NOMINAL) {
          int nValores = Attributes.getInputAttribute(i).getNumNominalValues();
          // The diagonal keeps the array default 0.0 (distance of a value to itself).
          nominalDistance[i] = new double[nValores][nValores];
          for (j = 0; j < nValores; j++) {
            for (l = j + 1; l < nValores; l++) {
              VDM = 0.0;
              /*Occurrences of value j (Nax) and value l (Nay) of attribute i*/
              Nax = Nay = 0;
              for (m = 0; m < training.getNumInstances(); m++) {
                if (nominalTrain[m][i] == j) {
                  Nax++;
                }
                if (nominalTrain[m][i] == l) {
                  Nay++;
                }
              }
              /*Squared difference of the per-class conditional frequencies*/
              for (m = 0; m < nClases; m++) {
                Naxc = Nayc = 0;
                for (n = 0; n < training.getNumInstances(); n++) {
                  if (nominalTrain[n][i] == j && clasesTrain[n] == m) {
                    Naxc++;
                  }
                  if (nominalTrain[n][i] == l && clasesTrain[n] == m) {
                    Nayc++;
                  }
                }
                // NOTE(review): a nominal value that never occurs gives 0/0 = NaN here.
                double delta = ((double) Naxc / (double) Nax) - ((double) Nayc / (double) Nay);
                VDM += delta * delta;
              }
              nominalDistance[i][j][l] = Math.sqrt(VDM);
              nominalDistance[i][l][j] = Math.sqrt(VDM);
            }
          }
        } else {
          /*Standard deviation of a numeric attribute: sqrt(|E[x^2] - E[x]^2|)*/
          media = 0;
          SD = 0;
          for (j = 0; j < training.getNumInstances(); j++) {
            media += realTrain[j][i];
            SD += realTrain[j][i] * realTrain[j][i];
          }
          media /= (double) realTrain.length;
          stdDev[i] = Math.sqrt( Math.abs((SD / ( (double) realTrain.length)) - (media * media)));
        }
      }
    }
  }

  /**
   * Runs MSMOTE on the loaded training data and writes the resulting set to
   * {@code ficheroSalida[0]}.
   *
   * <p>If, after the cleaning phase, the "negative" class is smaller than the
   * "positive" one, or no positive instance exists at all, the cleaned set is
   * written unchanged and no synthetic instance is generated.
   */
  public void ejecutar () {

    int nPos = 0;
    int nNeg = 0;
    int i, j, l, m;
    int tmp;
    int posID, negID;
    int positives[];
    double conjS[][];
    double conjR[][];
    int conjN[][];
    boolean conjM[][];
    int clasesS[];
    double genS[][];
    double genR[][];
    int genN[][];
    boolean genM[][];
    int clasesGen[];
    int tamS;
    int pos;
    int neighbors[][];
    int nn;
    int type[];
    int claseObt;
    boolean marcas[];
    int nSel = 0;
    int nAtt;
    int nGen;

    long tiempo = System.currentTimeMillis();

    /*Flags of the instances that survive the cleaning phase (all false initially)*/
    marcas = new boolean[datosTrain.length];

    /*Count class 0 against the rest; the smaller group becomes the positive class*/
    for (i=0; i<clasesTrain.length; i++) {
      if (clasesTrain[i] == 0)
        nPos++;
      else
        nNeg++;
    }
    if (nPos > nNeg) {
      tmp = nPos;
      nPos = nNeg;
      nNeg = tmp;
      posID = 1;
      negID = 0;
    } else {
      posID = 0;
      negID = 1;
    }

    /*Cleaning: every non-negative instance is kept; a negative instance is
      kept only when the k-NN vote agrees with its own class.*/
    for (i=0; i<datosTrain.length; i++) {
      claseObt = KNN.evaluacionKNN2 (kClean, datosTrain, realTrain, nominalTrain, nulosTrain, clasesTrain, datosTrain[i], realTrain[i], nominalTrain[i], nulosTrain[i], 2, distanceEu);
      if (claseObt == clasesTrain[i] || clasesTrain[i] != negID) {
        marcas[i] = true;
        nSel++;
      }
    }

    /*Building of the S set from the flags*/
    nAtt = datosTrain[0].length;
    conjS = new double[nSel][nAtt];
    conjR = new double[nSel][nAtt];
    conjN = new int[nSel][nAtt];
    conjM = new boolean[nSel][nAtt];
    clasesS = new int[nSel];
    for (i=0, l=0; i<datosTrain.length; i++) {
      if (marcas[i]) {
        System.arraycopy(datosTrain[i], 0, conjS[l], 0, nAtt);
        System.arraycopy(realTrain[i], 0, conjR[l], 0, nAtt);
        System.arraycopy(nominalTrain[i], 0, conjN[l], 0, nAtt);
        System.arraycopy(nulosTrain[i], 0, conjM[l], 0, nAtt);
        clasesS[l] = clasesTrain[i];
        l++;
      }
    }

    /*From here on the cleaned set replaces the training data*/
    datosTrain = conjS;
    realTrain = conjR;
    nominalTrain = conjN;
    nulosTrain = conjM;
    clasesTrain = clasesS;

    nNeg = 0;
    nPos = 0;
    for (i=0; i<clasesTrain.length; i++) {
      if (clasesTrain[i] == posID)
        nPos++;
      else
        nNeg++;
    }

    /*Nothing to over-sample: the negatives became the smaller class, or there
      is no positive instance to interpolate from (the latter used to end in a
      division by zero further down).*/
    if (nNeg < nPos || nPos == 0) {
      System.out.println("MSMOTE "+ relation + " " + (double)(System.currentTimeMillis()-tiempo)/1000.0 + "s");
      OutputIS.escribeSalida(ficheroSalida[0], conjR, conjN, conjM, clasesS, entradas, salida, nEntradas, relation);
      return;
    }

    /*Localize the positive instances*/
    positives = new int[nPos];
    for (i=0, j=0; i<clasesTrain.length; i++) {
      if (clasesTrain[i] == posID) {
        positives[j] = i;
        j++;
      }
    }

    /*Randomize the instance presentation. The generator is not re-seeded
      here: the call below has always been disabled.*/
    //Randomize.setSeed (semilla);
    // NOTE(review): if Randint excludes its upper bound the last position is
    // never chosen as swap target - confirm against org.core.Randomize.
    for (i=0; i<positives.length; i++) {
      tmp = positives[i];
      pos = Randomize.Randint(0,positives.length-1);
      positives[i] = positives[pos];
      positives[pos] = tmp;
    }

    /*Obtain k-nearest neighbors of each positive instance*/
    neighbors = new int[positives.length][kSMOTE];
    for (i=0; i<positives.length; i++) {
      switch (ASMO) {
        case 0: // neighbours from both classes
          KNN.evaluacionKNN2 (kSMOTE, datosTrain, realTrain, nominalTrain, nulosTrain, clasesTrain, datosTrain[positives[i]], realTrain[positives[i]], nominalTrain[positives[i]], nulosTrain[positives[i]], 2, distanceEu, neighbors[i]);
          break;
        case 1: // neighbours from the minority class only
          evaluacionKNNClass (kSMOTE, datosTrain, realTrain, nominalTrain, nulosTrain, clasesTrain, datosTrain[positives[i]], realTrain[positives[i]], nominalTrain[positives[i]], nulosTrain[positives[i]], 2, distanceEu, neighbors[i],posID);
          break;
        case 2: // neighbours from the majority class only
          evaluacionKNNClass (kSMOTE, datosTrain, realTrain, nominalTrain, nulosTrain, clasesTrain, datosTrain[positives[i]], realTrain[positives[i]], nominalTrain[positives[i]], nulosTrain[positives[i]], 2, distanceEu, neighbors[i],negID);
          break;
      }
    }

    /* Verify sample type,
     * security = all neighbors from minority class        2
     * Border = Neither all from minority nor from majority 1
     * Latent noise = all neighbors from majority class     0
     */
    type = new int[neighbors.length];
    int noiseCount = 0;
    for (i = 0; i < neighbors.length; i++) {
      int count = 0;
      for (j = 0; j < kSMOTE; j++) {
        int vecino = neighbors[i][j];
        // evaluacionKNNClass leaves -1 in the slots it could not fill; such a
        // slot is "no neighbour" and must not be used as an index.
        if (vecino >= 0 && clasesTrain[vecino] == posID)
          count++;
      }
      if (count == 0) {
        type[i] = 0;
        noiseCount++;
      } else if (count < kSMOTE) {
        type[i] = 1;
      } else {
        type[i] = 2;
      }
    }

    /*Interpolation of the minority instances*/
    if (balance) {
      nGen = nNeg-nPos;
    } else {
      nGen = (int)((nPos - noiseCount)*smoting);
    }
    genS = new double[nGen][nAtt];
    genR = new double[nGen][nAtt];
    genN = new int[nGen][nAtt];
    genM = new boolean[nGen][nAtt];
    clasesGen = new int[nGen];

    /*If every positive is latent noise, treat all of them as border samples so
      the generation loop below can still make progress.*/
    if (noiseCount == positives.length)
      Arrays.fill(type, 1);

    /*Cycle over the (shuffled) positives, skipping noise, until nGen synthetic
      instances exist.*/
    int cursor = 0;
    for (int count = 0; count < nGen; ) {
      int idx = cursor % positives.length;
      if (type[idx] != 0) {
        clasesGen[count] = posID;
        /*Safe samples pick a random neighbour, border samples the nearest one*/
        if (type[idx] == 2)
          nn = Randomize.Randint(0,kSMOTE-1);
        else
          nn = 0;
        int base = positives[idx];
        int vecino = neighbors[idx][nn];
        // No usable neighbour in that slot: interpolate the sample with itself
        // (i.e. replicate it) instead of indexing with -1.
        if (vecino < 0)
          vecino = base;
        interpola (realTrain[base], realTrain[vecino], nominalTrain[base], nominalTrain[vecino], nulosTrain[base], nulosTrain[vecino], genS[count],genR[count],genN[count],genM[count]);
        count++;
      }
      cursor++;
    }

    /*Final size: cleaned instances plus synthetic ones (2*nNeg when balancing)*/
    tamS = nNeg + nPos + nGen;

    /*Construction of the S set from the previous vector S*/
    conjS = new double[tamS][nAtt];
    conjR = new double[tamS][nAtt];
    conjN = new int[tamS][nAtt];
    conjM = new boolean[tamS][nAtt];
    clasesS = new int[tamS];
    for (j=0; j<datosTrain.length; j++) {
      System.arraycopy(datosTrain[j], 0, conjS[j], 0, nAtt);
      System.arraycopy(realTrain[j], 0, conjR[j], 0, nAtt);
      System.arraycopy(nominalTrain[j], 0, conjN[j], 0, nAtt);
      System.arraycopy(nulosTrain[j], 0, conjM[j], 0, nAtt);
      clasesS[j] = clasesTrain[j];
    }
    for (m=0; j<tamS; j++, m++) {
      System.arraycopy(genS[m], 0, conjS[j], 0, nAtt);
      System.arraycopy(genR[m], 0, conjR[j], 0, nAtt);
      System.arraycopy(genN[m], 0, conjN[j], 0, nAtt);
      System.arraycopy(genM[m], 0, conjM[j], 0, nAtt);
      clasesS[j] = clasesGen[m];
    }

    System.out.println("MSMOTE "+ relation + " " + (double)(System.currentTimeMillis()-tiempo)/1000.0 + "s");

    /*Only the training output is written; ficheroSalida[1] is not produced here.*/
    OutputIS.escribeSalida(ficheroSalida[0], conjR, conjN, conjM, clasesS, entradas, salida, nEntradas, relation);
  }

  /**
   * k-NN search restricted to the instances of one class.
   *
   * <p>Only instances whose class equals {@code clase} and whose distance to
   * the example is strictly positive are considered, so the example itself
   * (distance 0) and exact duplicates are never returned.
   *
   * @param nvec      number of neighbours requested (capped at {@code conj.length})
   * @param conj      normalised data of the reference set
   * @param real      real-valued data of the reference set
   * @param nominal   nominal data of the reference set
   * @param nulos     missing-value flags of the reference set
   * @param clases    class of every reference instance
   * @param ejemplo   normalised data of the query example
   * @param ejReal    real-valued data of the query example
   * @param ejNominal nominal data of the query example
   * @param ejNulos   missing-value flags of the query example
   * @param nClases   number of classes (size of the vote table)
   * @param distance  flag forwarded to {@code KNN.distancia}; callers in this
   *                  class pass {@code distanceEu}
   * @param vecinos   output: indices of the neighbours found, nearest first;
   *                  slots for which no neighbour exists are set to -1. Only
   *                  the first {@code min(nvec, conj.length)} slots are written
   * @param clase     class the neighbours must belong to
   * @return the most voted class among the neighbours found (0 when none was
   *         found; ties go to the lowest class index)
   */
  public static int evaluacionKNNClass (int nvec, double conj[][], double real[][], int nominal[][], boolean nulos[][], int clases[], double ejemplo[], double ejReal[], int ejNominal[], boolean ejNulos[], int nClases, boolean distance, int vecinos[], int clase) {

    int i, j, l;
    boolean parar = false;
    int vecinosCercanos[];
    double minDistancias[];
    int votos[];
    double dist;
    int votada, votaciones;

    if (nvec > conj.length)
      nvec = conj.length;

    votos = new int[nClases]; // starts at 0 for every class
    vecinosCercanos = new int[nvec];
    minDistancias = new double[nvec];
    for (i=0; i<nvec; i++) {
      vecinosCercanos[i] = -1;
      minDistancias[i] = Double.POSITIVE_INFINITY;
    }

    /*Insertion into a list of the nvec best candidates, sorted by distance*/
    for (i=0; i<conj.length; i++) {
      dist = KNN.distancia(conj[i], real[i], nominal[i], nulos[i], ejemplo, ejReal, ejNominal, ejNulos, distance);
      if (dist > 0 && clases[i] == clase) {
        parar = false;
        for (j = 0; j < nvec && !parar; j++) {
          if (dist < minDistancias[j]) {
            parar = true;
            /*Shift the tail one slot to the right to make room at j*/
            for (l = nvec - 1; l >= j+1; l--) {
              minDistancias[l] = minDistancias[l - 1];
              vecinosCercanos[l] = vecinosCercanos[l - 1];
            }
            minDistancias[j] = dist;
            vecinosCercanos[j] = i;
          }
        }
      }
    }

    /*Vote of the neighbours actually found*/
    for (j=0; j<nvec; j++) {
      if (vecinosCercanos[j] >= 0)
        votos[clases[vecinosCercanos[j]]] ++;
    }

    votada = 0;
    votaciones = votos[0];
    for (j=1; j<nClases; j++) {
      if (votaciones < votos[j]) {
        votaciones = votos[j];
        votada = j;
      }
    }

    System.arraycopy(vecinosCercanos, 0, vecinos, 0, vecinosCercanos.length);

    return votada;
  }

  /**
   * Creates one synthetic instance between instance {@code a} and instance
   * {@code b}, attribute by attribute.
   *
   * <ul>
   *   <li>missing in both: the result is flagged missing;</li>
   *   <li>missing in exactly one: the value of the other instance is copied;</li>
   *   <li>present in both: real attributes get {@code a + gap * (b - a)} with a
   *       random {@code gap}, integer attributes the same value rounded, and
   *       any other attribute type takes the value of {@code a} or {@code b}
   *       at random.</li>
   * </ul>
   *
   * @param ra   real values of a
   * @param rb   real values of b
   * @param na   nominal values of a
   * @param nb   nominal values of b
   * @param ma   missing-value flags of a
   * @param mb   missing-value flags of b
   * @param resS output: scaled values of the synthetic instance
   * @param resR output: real values of the synthetic instance
   * @param resN output: nominal values of the synthetic instance
   * @param resM output: missing-value flags of the synthetic instance (only
   *             written when both inputs are missing or both are present)
   */
  void interpola (double ra[], double rb[], int na[], int nb[], boolean ma[], boolean mb[], double resS[], double resR[], int resN[], boolean resM[]) {

    int i;
    double diff;
    double gap;
    int suerte;

    for (i=0; i<ra.length; i++) {
      int tipo = entradas[i].getType();
      boolean numerico = (tipo == Attribute.REAL || tipo == Attribute.INTEGER);

      if (ma[i] && mb[i]) {
        resM[i] = true;
        resS[i] = 0;
      } else if (ma[i] || mb[i]) {
        /*Exactly one side is missing: copy the value from the other side*/
        if (numerico) {
          resR[i] = ma[i] ? rb[i] : ra[i];
          resS[i] = escalaNumerica (i, resR[i]);
        } else {
          resN[i] = ma[i] ? nb[i] : na[i];
          resS[i] = escalaNominal (i, resN[i]);
        }
      } else {
        resM[i] = false;
        if (tipo == Attribute.REAL) {
          diff = rb[i] - ra[i];
          gap = Randomize.Rand();
          resR[i] = ra[i] + gap*diff;
          resS[i] = escalaNumerica (i, resR[i]);
        } else if (tipo == Attribute.INTEGER) {
          diff = rb[i] - ra[i];
          gap = Randomize.Rand();
          resR[i] = Math.round(ra[i] + gap*diff);
          resS[i] = escalaNumerica (i, resR[i]);
        } else {
          /*Value 0 picks a, anything else picks b*/
          suerte = Randomize.Randint(0, 2);
          if (suerte == 0) {
            resN[i] = na[i];
          } else {
            resN[i] = nb[i];
          }
          resS[i] = escalaNominal (i, resN[i]);
        }
      }
    }
  }

  /**
   * Scales a numeric value of input attribute {@code i} with the attribute's range.
   */
  private double escalaNumerica (int i, double valor) {
    // NOTE(review): a min-max normalisation would subtract the minimum; the
    // "+" is kept because it is the long-standing behaviour - confirm intent.
    return (valor + entradas[i].getMinAttribute()) / (entradas[i].getMaxAttribute() - entradas[i].getMinAttribute());
  }

  /**
   * Scales a nominal index of input attribute {@code i} by (number of values - 1).
   */
  private double escalaNominal (int i, int valor) {
    return (double)valor / (double)(entradas[i].getNominalValuesList().size() - 1);
  }

  /**
   * Reads the parameters of the method from a configuration script.
   *
   * <p>The first line is skipped; the following lines are read in this fixed
   * order, each as {@code name = value}: input files (two quoted names),
   * output files (two quoted names), seed, number of neighbours, neighbour
   * pool ("both", "minority", anything else = majority), balancing ("YES" or
   * anything else), amount of over-sampling, distance function ("Euclidean"
   * or anything else).
   *
   * <p>NOTE(review): {@code kClean} is not read here, so it keeps its default
   * value when the object is built from a script - confirm that is intended.
   *
   * @param ficheroScript path of the configuration file
   */
  public void leerConfiguracion (String ficheroScript) {

    String fichero, token;
    StringTokenizer lineasFichero;
    String par[];

    ficheroSalida = new String[2];

    fichero = Fichero.leeFichero (ficheroScript);
    lineasFichero = new StringTokenizer (fichero,"\n\r");

    /*The first line is not used*/
    lineasFichero.nextToken();

    /*Getting the names of the training and test files*/
    par = nombresEntreComillas (valorDeLinea (lineasFichero.nextToken()));
    ficheroTraining = par[0];
    ficheroTest = par[1];

    /*Getting the names of output files*/
    par = nombresEntreComillas (valorDeLinea (lineasFichero.nextToken()));
    ficheroSalida[0] = par[0];
    ficheroSalida[1] = par[1];

    /*For the scalar parameters substring(1) drops the character that follows '='*/

    /*Getting the seed*/
    semilla = Long.parseLong(valorDeLinea (lineasFichero.nextToken()).substring(1));

    /*Getting the number of neighbors*/
    kSMOTE = Integer.parseInt(valorDeLinea (lineasFichero.nextToken()).substring(1));

    /*Getting the type of SMOTE algorithm*/
    token = valorDeLinea (lineasFichero.nextToken()).substring(1);
    if (token.equalsIgnoreCase("both"))
      ASMO = 0;
    else if (token.equalsIgnoreCase("minority"))
      ASMO = 1;
    else
      ASMO = 2;

    /*Getting the type of balancing in SMOTE*/
    token = valorDeLinea (lineasFichero.nextToken()).substring(1);
    balance = token.equalsIgnoreCase("YES");

    /*Getting the quantity of smoting*/
    smoting = Double.parseDouble(valorDeLinea (lineasFichero.nextToken()).substring(1));

    /*Getting the type of distance function*/
    distanceEu = valorDeLinea (lineasFichero.nextToken()).substring(1).equalsIgnoreCase("Euclidean");
  }

  /**
   * Returns the second '='-separated token of a configuration line.
   */
  private static String valorDeLinea (String linea) {
    StringTokenizer tokens = new StringTokenizer (linea, "=");
    tokens.nextToken();
    return tokens.nextToken();
  }

  /**
   * Extracts the first two double-quoted names found in {@code token}.
   */
  private static String[] nombresEntreComillas (String token) {
    byte line[] = token.getBytes();
    String par[] = new String[2];
    int i, j;

    for (i=0; line[i]!='\"'; i++);
    i++;
    for (j=i; line[j]!='\"'; j++);
    par[0] = new String (line,i,j-i);
    for (i=j+1; line[i]!='\"'; i++);
    i++;
    for (j=i; line[j]!='\"'; j++);
    par[1] = new String (line,i,j-i);

    return par;
  }
}