/**
* <p>
* File: SMOTE_RSB.java
* </p>
*
* The SMOTE_RSB algorithm is an oversampling method used to deal with the imbalanced
* problem.
*
* @author Written by Enislay Ramentol (University of Camagüey) 07/09/2011
* @version 0.1
* @since JDK1.5
*
*/
package keel.Algorithms.ImbalancedClassification.Resampling.SMOTE_RSB;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import org.core.Fichero;
import org.core.Randomize;
import keel.Algorithms.Preprocess.Basic.KNN;
import keel.Algorithms.Preprocess.Basic.Metodo;
import keel.Algorithms.Preprocess.Basic.OutputIS;
import keel.Dataset.Attribute;
import keel.Dataset.Attributes;
import keel.Dataset.InstanceSet;
import keel.Algorithms.ImbalancedClassification.Resampling.SMOTE_RSB.Rough_Sets.FastVector;
import keel.Algorithms.ImbalancedClassification.Resampling.SMOTE_RSB.Rough_Sets.Instance;
import keel.Algorithms.ImbalancedClassification.Resampling.SMOTE_RSB.Rough_Sets.Instances;
import keel.Algorithms.ImbalancedClassification.Resampling.SMOTE_RSB.Rough_Sets.RoughSetsCuttoff;
import keel.Algorithms.ImbalancedClassification.Resampling.SMOTE_RSB.Rough_Sets.RoughSetsOriginal;
public class SMOTE_RSB extends Metodo {
// NOTE(review): the configuration fields below have no initializers. leerConfiguracion
// assigns them; if Metodo's constructor is what invokes leerConfiguracion (presumably --
// confirm), an initializer added here would run afterwards and overwrite the parsed value.

// Seed handed to Randomize.setSeed before the minority instances are shuffled.
private long semilla;
// Number of nearest neighbours computed for each minority instance.
private int kSMOTE;
// Neighbour search mode: 0 = "both" (all instances), 1 = "minority",
// 2 = any other configured value (neighbours taken from the majority class).
private int ASMO;
// When true, as many synthetic instances are generated as needed to equal the majority count.
private boolean balance;
// Multiplier of the minority count used to size the synthetic set when balance is false.
private double smoting;
// Parsed from the configuration file; not read anywhere else in this file.
private int extention;
// Comparison type forwarded to the RoughSetsOriginal constructor.
private int tipoComparacion;
// First cutoff value tried by ejecutar.
private double cutOffInitial;
// ejecutar stops once the working cutoff exceeds this value.
private double cutOffFinal;
// Loop flag: read from the configuration ("YES" = true) and cleared by ejecutar once
// every instance of the resampled set has been retained.
private boolean debeContinuar;
// Path of the training output file (first entry of the output-file array).
private String ficheroSaTra;
// Path of the test output file (second entry of the output-file array).
private String ficheroSaTest;
// Free-form log text appended to by the constructor and writeResult; nothing in this file reads it.
private String cadena = "";
/**
 * Builds the method from a configuration script.
 *
 * After the superclass constructor has run, records the training file name in the
 * internal log text and caches the two output paths (index 0: training output,
 * index 1: test output).
 *
 * @param config path of the configuration script, passed unchanged to the superclass
 */
public SMOTE_RSB(String config) {
    super(config);
    this.cadena = this.cadena + "fichero primario de lectura: " + ficheroTraining + "\n";
    this.ficheroSaTra = getFicheroSalida()[0];
    this.ficheroSaTest = getFicheroSalida()[1];
}
/**
 * Runs the SMOTE + rough-set filtering loop.
 *
 * While the working cutoff is within [cutOffInitial, cutOffFinal] and debeContinuar is
 * set, the method oversamples with ejecutarSMOTE, reloads the written training file,
 * computes the rough-set lower approximation for the current cutoff, and keeps every
 * original instance plus the instances of the lower approximation whose index lies
 * beyond the original ones. The kept set is written back with writeResult. If not every
 * instance was kept, the data is re-read with reconfigure, SMOTE is run again and the
 * cutoff grows by 0.05. An empty lower approximation only advances the cutoff.
 *
 * @throws Exception if the resampled training file cannot be read or parsed
 */
public void ejecutar() throws Exception {
    double tempCutOff = cutOffInitial;
    while (tempCutOff <= cutOffFinal && debeContinuar) {
        boolean termino = false;
        boolean isProgress = true;
        ficheroSaTra = getFicheroSalida()[0];
        ejecutarSMOTE();
        while (!termino && (tempCutOff <= cutOffFinal)) {
            BufferedReader r = new BufferedReader(new FileReader(ficheroSaTra));
            // The reader used to leak on every iteration (including the 'continue'
            // path below); the finally block now guarantees it is released.
            try {
                Instances newInst = new Instances(r);

                // The first originalElementsIndex rows are always kept.
                FastVector finalInstances = new FastVector();
                for (int i = 0; i < originalElementsIndex; i++) {
                    finalInstances.addElement(newInst.instance(i));
                }

                RoughSetsOriginal rs = new RoughSetsOriginal(newInst, tipoComparacion, tempCutOff);
                rs.lower_aproximation();
                int[][] mylower = rs.get_lower_aproximation();

                if (mylower[0].length == 0 && mylower[1].length == 0) {
                    // Nothing in either lower approximation: retry with a larger cutoff.
                    tempCutOff += 0.05;
                    continue;
                }

                for (int i = 0; i < newInst.numClasses(); i++) {
                    for (int j = 0; j < mylower[i].length; j++) {
                        int indice = mylower[i][j];
                        // NOTE(review): the strict '>' is kept from the original; if the
                        // indices are 0-based it skips the row at originalElementsIndex.
                        // Confirm against RoughSetsOriginal.
                        if (indice > originalElementsIndex) {
                            // Fix: add the instance the index refers to, not the one at
                            // position j of the lower-approximation array.
                            finalInstances.addElement(newInst.instance(indice));
                        }
                    }
                }

                writeResult(ficheroSaTra, ficheroSaTest, finalInstances, newInst);
                termino = finalInstances.size() == newInst.numInstances();
                debeContinuar = !termino;
                if (!termino) {
                    reconfigure(ficheroSaTra, ficheroSaTest);
                    ejecutarSMOTE();
                    tempCutOff += 0.05;
                }
                System.out.println(termino + ", is progres: " + isProgress);
            } finally {
                r.close();
            }
        }
    }
}
/**
 * Writes the retained instances as the new training file, rewrites the test file and
 * appends a summary to the internal log text.
 *
 * Every column except the last one of {@code newInst} is exported; no value is marked
 * as missing. The per-class counter has two slots, so class values other than 0 and 1
 * are not supported.
 *
 * @param ficheroSaTra   path of the training output file
 * @param ficheroSaTest  path of the test output file
 * @param finalInstances instances to write (elements are cast to Instance)
 * @param newInst        data set the instances came from; supplies the column count
 */
private void writeResult(String ficheroSaTra, String ficheroSaTest,
        FastVector finalInstances, Instances newInst) {
    final int filas = finalInstances.size();
    final int columnas = newInst.numAttributes() - 1;

    double[][] valores = new double[filas][columnas];
    int[][] tipos = new int[filas][columnas];
    // Left at the default 'false': nothing is flagged as missing.
    boolean[][] ausentes = new boolean[filas][columnas];
    int[] etiquetas = new int[filas];
    int[] porClase = new int[2];

    for (int fila = 0; fila < filas; fila++) {
        Instance actual = (Instance) finalInstances.elementAt(fila);
        for (int col = 0; col < columnas; col++) {
            keel.Algorithms.ImbalancedClassification.Resampling.SMOTE_RSB.Rough_Sets.Attribute atributo =
                    actual.attribute(col);
            valores[fila][col] = actual.value(col);
            tipos[fila][col] = atributo.type();
        }
        int etiqueta = (int) actual.classValue();
        etiquetas[fila] = etiqueta;
        porClase[etiqueta] += 1;
    }

    OutputIS.escribeSalida(ficheroSaTra, valores, tipos, ausentes, etiquetas,
            entradas, salida, nEntradas, relation);
    OutputIS.escribeSalida(ficheroSaTest, test, entradas, salida,
            nEntradas, relation);

    StringBuilder resumen = new StringBuilder();
    resumen.append("-------------------------nueva escritura----------------------------------\n");
    resumen.append("fichero de escritura Tra: ").append(ficheroSaTra).append("\n");
    resumen.append("fichero de escritura Test: ").append(ficheroSaTest).append("\n");
    resumen.append("cantidad de elementos hasta ahora: ").append(finalInstances.size()).append("\n");
    resumen.append("cantidad de elementos por clase; clase 0: ").append(porClase[0])
            .append(", clase 1: ").append(porClase[1]).append("\n");
    cadena = cadena + resumen.toString();
}
// //////////////////////////////////////////SMOTE//////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Class labels (0 or 1) assigned by ejecutarSMOTE to the smaller and the larger class.
// Only mayorityID receives the -1 initializer; minorityID starts at the default 0.
private int minorityID, mayorityID = -1;
// Number of training instances counted by the latest ejecutarSMOTE call, i.e. the index
// at which synthetic instances start in the array that call writes out.
private int originalElementsIndex;
/**
 * Oversamples the minority class with SMOTE and writes the enlarged training set and
 * the test set to the two configured output files.
 *
 * Steps: count class 0 against all other labels and treat the smaller group as the
 * minority; shuffle the minority instances with the configured seed; compute kSMOTE
 * neighbours per minority instance according to ASMO; generate synthetic instances by
 * interpolating each seed with a randomly chosen neighbour; append them after the
 * original instances and write the result.
 *
 * If one of the two groups is empty, only originalElementsIndex is updated and nothing
 * is written.
 *
 * Changes with respect to the previous version: neighbour slots are pre-marked as empty
 * and an empty slot falls back to the seed instance itself. Before, an empty slot held
 * -1 (array index error) or the default 0 (silently used row 0 as neighbour).
 */
public void ejecutarSMOTE() {
    final long tiempo = System.currentTimeMillis();

    /* Count class 0 against every other label */
    int nPos = 0;
    int nNeg = 0;
    for (int i = 0; i < clasesTrain.length; i++) {
        if (clasesTrain[i] == 0) {
            nPos++;
        } else {
            nNeg++;
        }
    }
    originalElementsIndex = nPos + nNeg;
    if (nPos == 0 || nNeg == 0) {
        return;
    }

    /* After this block nPos is the minority count and nNeg the majority count */
    if (nPos > nNeg) {
        int tmp = nPos;
        nPos = nNeg;
        nNeg = tmp;
        minorityID = 1;
        mayorityID = 0;
    } else {
        minorityID = 0;
        mayorityID = 1;
    }

    /* Locate the minority instances */
    int[] positives = new int[nPos];
    for (int i = 0, j = 0; i < clasesTrain.length; i++) {
        if (clasesTrain[i] == minorityID) {
            positives[j] = i;
            j++;
        }
    }

    /* Randomize the order in which they are used as seeds */
    Randomize.setSeed(semilla);
    for (int i = 0; i < positives.length; i++) {
        int tmp = positives[i];
        int pos = Randomize.Randint(0, positives.length - 1);
        positives[i] = positives[pos];
        positives[pos] = tmp;
    }

    /* Nearest neighbours of every minority instance; -1 marks an empty slot */
    int[][] neighbors = new int[positives.length][kSMOTE];
    for (int i = 0; i < positives.length; i++) {
        for (int k = 0; k < neighbors[i].length; k++) {
            neighbors[i][k] = -1;
        }
        int semillaFila = positives[i];
        switch (ASMO) {
        case 0:
            KNN.evaluacionKNN2(kSMOTE, datosTrain, realTrain,
                    nominalTrain, nulosTrain, clasesTrain,
                    datosTrain[semillaFila], realTrain[semillaFila],
                    nominalTrain[semillaFila],
                    nulosTrain[semillaFila], 2, distanceEu,
                    neighbors[i]);
            break;
        case 1:
            evaluacionKNNClass(kSMOTE, datosTrain, realTrain,
                    nominalTrain, nulosTrain, clasesTrain,
                    datosTrain[semillaFila], realTrain[semillaFila],
                    nominalTrain[semillaFila],
                    nulosTrain[semillaFila], 2, distanceEu,
                    neighbors[i], minorityID);
            break;
        case 2:
            evaluacionKNNClass(kSMOTE, datosTrain, realTrain,
                    nominalTrain, nulosTrain, clasesTrain,
                    datosTrain[semillaFila], realTrain[semillaFila],
                    nominalTrain[semillaFila],
                    nulosTrain[semillaFila], 2, distanceEu,
                    neighbors[i], mayorityID);
            break;
        }
    }

    /* Interpolation of the minority instances */
    final int nAtt = datosTrain[0].length;
    final int nGen = balance ? (nNeg - nPos) : (int) (nPos * smoting);
    double[][] genS = new double[nGen][nAtt];
    double[][] genR = new double[nGen][nAtt];
    int[][] genN = new int[nGen][nAtt];
    boolean[][] genM = new boolean[nGen][nAtt];
    int[] clasesGen = new int[nGen];

    for (int i = 0; i < nGen; i++) {
        clasesGen[i] = minorityID;
        int turno = i % positives.length;
        int nn = Randomize.Randint(0, kSMOTE - 1);
        int base = positives[turno];
        int vecino = neighbors[turno][nn];
        if (vecino < 0) {
            // Fewer candidates than kSMOTE: interpolate the seed with itself.
            vecino = base;
        }
        interpola(realTrain[base], realTrain[vecino],
                nominalTrain[base], nominalTrain[vecino],
                nulosTrain[base], nulosTrain[vecino],
                genS[i], genR[i], genN[i], genM[i]);
    }

    /* Original instances first, synthetic ones appended after them */
    final int tamS = balance ? (2 * nNeg) : (nNeg + nPos + nGen);
    double[][] conjR = new double[tamS][nAtt];
    int[][] conjN = new int[tamS][nAtt];
    boolean[][] conjM = new boolean[tamS][nAtt];
    int[] clasesS = new int[tamS];

    int destino = 0;
    for (; destino < datosTrain.length; destino++) {
        System.arraycopy(realTrain[destino], 0, conjR[destino], 0, nAtt);
        System.arraycopy(nominalTrain[destino], 0, conjN[destino], 0, nAtt);
        System.arraycopy(nulosTrain[destino], 0, conjM[destino], 0, nAtt);
        clasesS[destino] = clasesTrain[destino];
    }
    for (int m = 0; destino < tamS; destino++, m++) {
        System.arraycopy(genR[m], 0, conjR[destino], 0, nAtt);
        System.arraycopy(genN[m], 0, conjN[destino], 0, nAtt);
        System.arraycopy(genM[m], 0, conjM[destino], 0, nAtt);
        clasesS[destino] = clasesGen[m];
    }

    System.out.println("SMOTE_RSB " + relation + " "
            + (double) (System.currentTimeMillis() - tiempo) / 1000.0
            + "s");
    OutputIS.escribeSalida(ficheroSalida[0], conjR, conjN, conjM,
            clasesS, entradas, salida, nEntradas, relation);
    OutputIS.escribeSalida(ficheroSalida[1], test, entradas, salida,
            nEntradas, relation);
}
/**
 * Finds the nearest neighbours of an example among the instances of one class.
 *
 * Instances at distance 0 (or with a non-positive/NaN distance) and instances whose
 * label differs from {@code clase} are ignored. The indices of the neighbours found
 * are copied into {@code vecinos} in increasing distance order; slots for which no
 * neighbour exists receive -1. At most min(nvec, conj.length) slots are written.
 *
 * @param nvec      number of neighbours requested
 * @param conj      instance set; one distance is computed per row
 * @param real      real-valued view of the instances, passed to KNN.distancia
 * @param nominal   nominal view of the instances, passed to KNN.distancia
 * @param nulos     missing-value flags of the instances, passed to KNN.distancia
 * @param clases    class label of every instance
 * @param ejemplo   query example
 * @param ejReal    real-valued view of the query example
 * @param ejNominal nominal view of the query example
 * @param ejNulos   missing-value flags of the query example
 * @param nClases   number of classes (size of the vote table)
 * @param distance  distance selector forwarded to KNN.distancia
 * @param vecinos   output array receiving the neighbour indices
 * @param clase     only instances with this label are accepted as neighbours
 * @return the class with most votes among the neighbours found (lowest index on ties)
 */
public static int evaluacionKNNClass(int nvec, double conj[][],
        double real[][], int nominal[][], boolean nulos[][], int clases[],
        double ejemplo[], double ejReal[], int ejNominal[],
        boolean ejNulos[], int nClases, boolean distance, int vecinos[],
        int clase) {
    final int k = (nvec > conj.length) ? conj.length : nvec;

    int[] votos = new int[nClases];
    int[] cercanos = new int[k];
    double[] distancias = new double[k];
    for (int s = 0; s < k; s++) {
        cercanos[s] = -1;
        distancias[s] = Double.POSITIVE_INFINITY;
    }

    for (int fila = 0; fila < conj.length; fila++) {
        double d = KNN.distancia(conj[fila], real[fila], nominal[fila], nulos[fila],
                ejemplo, ejReal, ejNominal, ejNulos, distance);
        if (!(d > 0) || clases[fila] != clase) {
            continue;
        }
        // First slot whose stored distance is strictly larger than d.
        int hueco = 0;
        while (hueco < k && d >= distancias[hueco]) {
            hueco++;
        }
        if (hueco < k) {
            // Shift the tail one position to the right, dropping the last entry.
            int cola = k - 1 - hueco;
            System.arraycopy(distancias, hueco, distancias, hueco + 1, cola);
            System.arraycopy(cercanos, hueco, cercanos, hueco + 1, cola);
            distancias[hueco] = d;
            cercanos[hueco] = fila;
        }
    }

    for (int s = 0; s < k; s++) {
        int vecino = cercanos[s];
        if (vecino >= 0) {
            votos[clases[vecino]] += 1;
        }
    }

    int ganadora = 0;
    int maximo = votos[0];
    for (int c = 1; c < nClases; c++) {
        if (votos[c] > maximo) {
            maximo = votos[c];
            ganadora = c;
        }
    }

    for (int s = 0; s < cercanos.length; s++) {
        vecinos[s] = cercanos[s];
    }
    return ganadora;
}
/**
 * Builds one synthetic instance from two instances, attribute by attribute.
 *
 * Per attribute: if the value is missing in both inputs, the result is flagged missing;
 * if it is missing in exactly one, the other input's value is copied; otherwise REAL
 * attributes get a random point between both values, INTEGER attributes the same
 * rounded to the nearest integer, and any other attribute type takes the nominal value
 * of one of the two inputs at random.
 *
 * NOTE(review): the scaled value written to resS adds the attribute minimum instead of
 * subtracting it; this is reproduced unchanged from the previous version -- confirm the
 * intended normalisation.
 *
 * @param ra   real values of the first instance
 * @param rb   real values of the second instance
 * @param na   nominal values of the first instance
 * @param nb   nominal values of the second instance
 * @param ma   missing-value flags of the first instance
 * @param mb   missing-value flags of the second instance
 * @param resS output: scaled values of the synthetic instance
 * @param resR output: real values of the synthetic instance
 * @param resN output: nominal values of the synthetic instance
 * @param resM output: missing-value flags of the synthetic instance
 */
void interpola(double ra[], double rb[], int na[], int nb[], boolean ma[],
        boolean mb[], double resS[], double resR[], int resN[],
        boolean resM[]) {
    for (int k = 0; k < ra.length; k++) {
        if (ma[k] && mb[k]) {
            resM[k] = true;
            resS[k] = 0;
            continue;
        }

        final boolean esReal = entradas[k].getType() == Attribute.REAL;
        final boolean esEntero = !esReal && entradas[k].getType() == Attribute.INTEGER;
        final boolean esNumerico = esReal || esEntero;

        if (ma[k] || mb[k]) {
            // Exactly one side is missing: copy the value from the other side.
            if (esNumerico) {
                resR[k] = ma[k] ? rb[k] : ra[k];
            } else {
                resN[k] = ma[k] ? nb[k] : na[k];
            }
        } else {
            resM[k] = false;
            if (esReal) {
                double salto = rb[k] - ra[k];
                double fraccion = Randomize.Rand();
                resR[k] = ra[k] + fraccion * salto;
            } else if (esEntero) {
                double salto = rb[k] - ra[k];
                double fraccion = Randomize.Rand();
                resR[k] = Math.round(ra[k] + fraccion * salto);
            } else {
                int moneda = Randomize.Randint(0, 2);
                resN[k] = (moneda == 0) ? na[k] : nb[k];
            }
        }

        if (esNumerico) {
            resS[k] = (resR[k] + entradas[k].getMinAttribute())
                    / (entradas[k].getMaxAttribute() - entradas[k].getMinAttribute());
        } else {
            resS[k] = (double) resN[k]
                    / (double) (entradas[k].getNominalValuesList().size() - 1);
        }
    }
}
/**
 * Parses the configuration script and stores its values in the fields of this object.
 *
 * The script is read line by line ("key = value"); the first line is skipped and the
 * following ones are consumed in this fixed order: input files (two quoted paths),
 * output files (two quoted paths), seed, number of neighbours, type of SMOTE
 * ("both", "minority", anything else), balancing ("YES"/other), quantity of smoting,
 * distance function ("Euclidean"/other), type of extension, type of comparison,
 * initial cutoff, final cutoff and the continue flag ("YES"/other).
 *
 * Fix: the comment introducing the distance-function section was not terminated, so
 * the code reading that line was swallowed by the comment; distanceEu was never set
 * and every later parameter was read from the wrong line.
 *
 * @param ficheroScript path of the configuration script
 */
public void leerConfiguracion(String ficheroScript) {
    ficheroSalida = new String[2];
    StringTokenizer lineasFichero =
            new StringTokenizer(Fichero.leeFichero(ficheroScript), "\n\r");
    lineasFichero.nextToken(); // first line is not used here

    /* Getting the names of the training and test files */
    String[] ficherosEntrada = extraerParEntreComillas(leerValorCrudo(lineasFichero));
    ficheroTraining = ficherosEntrada[0];
    ficheroTest = ficherosEntrada[1];

    /* Getting the names of output files */
    String[] ficherosResultado = extraerParEntreComillas(leerValorCrudo(lineasFichero));
    ficheroSalida[0] = ficherosResultado[0];
    ficheroSalida[1] = ficherosResultado[1];

    /* Getting the seed */
    semilla = Long.parseLong(leerParametro(lineasFichero));

    /* Getting the number of neighbors */
    kSMOTE = Integer.parseInt(leerParametro(lineasFichero));

    /* Getting the type of SMOTE algorithm */
    String tipoSmote = leerParametro(lineasFichero);
    if (tipoSmote.equalsIgnoreCase("both")) {
        ASMO = 0;
    } else if (tipoSmote.equalsIgnoreCase("minority")) {
        ASMO = 1;
    } else {
        ASMO = 2;
    }

    /* Getting the type of balancing in SMOTE */
    balance = leerParametro(lineasFichero).equalsIgnoreCase("YES");

    /* Getting the quantity of smoting */
    smoting = Double.parseDouble(leerParametro(lineasFichero));

    /* Getting the type of distance function */
    distanceEu = leerParametro(lineasFichero).equalsIgnoreCase("Euclidean");

    /* Getting the type of extention */
    extention = Integer.parseInt(leerParametro(lineasFichero));

    /* Getting the type of comparation */
    tipoComparacion = Integer.parseInt(leerParametro(lineasFichero));

    /* Getting the initial cutoff's value */
    cutOffInitial = Double.parseDouble(leerParametro(lineasFichero));

    /* Getting the final cutoff's value */
    cutOffFinal = Double.parseDouble(leerParametro(lineasFichero));

    /* Getting the continue flag */
    debeContinuar = leerParametro(lineasFichero).equalsIgnoreCase("YES");
}

/** Consumes the next line and returns the raw text that follows its first '=' run. */
private static String leerValorCrudo(StringTokenizer lineasFichero) {
    StringTokenizer campos = new StringTokenizer(lineasFichero.nextToken(), "=");
    campos.nextToken(); // key
    return campos.nextToken();
}

/** Consumes the next line and returns its value without the character right after '='. */
private static String leerParametro(StringTokenizer lineasFichero) {
    return leerValorCrudo(lineasFichero).substring(1);
}

/** Returns the contents of the first two double-quoted strings found in the text. */
private static String[] extraerParEntreComillas(String texto) {
    byte[] line = texto.getBytes();
    int i = 0;
    while (line[i] != '\"') {
        i++;
    }
    i++;
    int j = i;
    while (line[j] != '\"') {
        j++;
    }
    String primero = new String(line, i, j - i);

    i = j + 1;
    while (line[i] != '\"') {
        i++;
    }
    i++;
    j = i;
    while (line[j] != '\"') {
        j++;
    }
    String segundo = new String(line, i, j - i);
    return new String[] { primero, segundo };
}
/**
 * Reloads the training and test sets from the given files and, when the Euclidean
 * distance is not selected, recomputes the tables used by the alternative distance.
 *
 * The attribute registry is cleared first. Any exception while reading or normalising
 * the data is printed to the error stream and terminates the JVM with status 1.
 * For nominal inputs a symmetric value-to-value distance matrix is built from class
 * frequencies; for every other input the standard deviation over the training data
 * is stored.
 *
 * NOTE(review): a nominal value that never occurs in the training data produces a
 * 0/0 division and therefore NaN entries; kept as in the previous version.
 *
 * @param ficheroTra  path of the training file to load
 * @param ficheroTest path of the test file to load
 */
public void reconfigure(String ficheroTra, String ficheroTest) {
    Attributes.clearAll();

    /* Read of data files; training is normalized and checked right after loading */
    try {
        training = new InstanceSet();
        training.readSet(ficheroTra, true);
        normalizar();
        test = new InstanceSet();
        test.readSet(ficheroTest, false);
    } catch (Exception e) {
        System.err.println(e);
        System.exit(1);
    }

    if (distanceEu) {
        return;
    }

    /* Previous computation for HVDM distance */
    stdDev = new double[Attributes.getInputNumAttributes()];
    nominalDistance = new double[Attributes.getInputNumAttributes()][][];
    final int nClases = Attributes.getOutputAttribute(0).getNumNominalValues();

    for (int att = 0; att < nominalDistance.length; att++) {
        if (Attributes.getInputAttribute(att).getType() == Attribute.NOMINAL) {
            final int nValores = Attributes.getInputAttribute(att).getNumNominalValues();
            // A freshly allocated matrix already has a zero diagonal.
            nominalDistance[att] = new double[nValores][nValores];

            for (int x = 0; x < nValores; x++) {
                for (int y = x + 1; y < nValores; y++) {
                    /* Occurrences of each of the two values */
                    int totalX = 0;
                    int totalY = 0;
                    for (int fila = 0; fila < training.getNumInstances(); fila++) {
                        if (nominalTrain[fila][att] == x) {
                            totalX++;
                        }
                        if (nominalTrain[fila][att] == y) {
                            totalY++;
                        }
                    }

                    /* Squared differences of the per-class relative frequencies */
                    double suma = 0.0;
                    for (int c = 0; c < nClases; c++) {
                        int enClaseX = 0;
                        int enClaseY = 0;
                        for (int fila = 0; fila < training.getNumInstances(); fila++) {
                            if (nominalTrain[fila][att] == x && clasesTrain[fila] == c) {
                                enClaseX++;
                            }
                            if (nominalTrain[fila][att] == y && clasesTrain[fila] == c) {
                                enClaseY++;
                            }
                        }
                        double delta = ((double) enClaseX / (double) totalX)
                                - ((double) enClaseY / (double) totalY);
                        suma += delta * delta;
                    }

                    double raiz = Math.sqrt(suma);
                    nominalDistance[att][x][y] = raiz;
                    nominalDistance[att][y][x] = raiz;
                }
            }
        } else {
            double acumulado = 0;
            double cuadrados = 0;
            for (int fila = 0; fila < training.getNumInstances(); fila++) {
                double v = realTrain[fila][att];
                acumulado += v;
                cuadrados += v * v;
            }
            double promedio = acumulado / (double) realTrain.length;
            stdDev[att] = Math.sqrt((cuadrados / ((double) realTrain.length))
                    - (promedio * promedio));
        }
    }
}
/**
 * Returns the output-path array itself (not a copy). This class uses index 0 as the
 * training output file and index 1 as the test output file.
 */
public String[] getFicheroSalida() {
return ficheroSalida;
}
/** Returns the path of the training input file. */
public String getFicheroTraining() {
return ficheroTraining;
}
/** Replaces the stored training input path; no data is reloaded by this call. */
public void setFicheroTraining(String ficheroTraining) {
this.ficheroTraining = ficheroTraining;
}
/** Returns the path of the test input file. */
public String getFicheroTest() {
return ficheroTest;
}
/** Returns the currently loaded test instance set. */
public InstanceSet getTest() {
return test;
}
/** Replaces the stored test input path; no data is reloaded by this call. */
public void setFicheroTest(String ficheroTest) {
this.ficheroTest = ficheroTest;
}
/** Returns the input attribute descriptors (the internal array, not a copy). */
public Attribute[] getEntradas() {
return entradas;
}
/** Returns the output attribute descriptor. */
public Attribute getSalida() {
return salida;
}
/** Returns the stored number of input attributes. */
public int getNEntradas() {
return nEntradas;
}
/** Returns the relation name written to the output files. */
public String getRelation() {
return relation;
}
/** Returns the number of training instances counted by the latest ejecutarSMOTE call. */
public int getOriginalElementsIndex() {
return originalElementsIndex;
}
}