/*********************************************************************** This file is part of KEEL-software, the Data Mining tool for regression, classification, clustering, pattern mining and so on. Copyright (C) 2004-2010 F. Herrera (herrera@decsai.ugr.es) L. Sanchez (luciano@uniovi.es) J. Alcala-Fdez (jalcala@decsai.ugr.es) S. Garcia (sglopez@ujaen.es) A. Fernandez (alberto.fernandez@ujaen.es) J. Luengo (julianlm@decsai.ugr.es) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/ **********************************************************************/ /** * <p> * File: svmClassifierCost.java * * This class is a wrapper to the LibSVM C-SVM classifier, in order to operate with KEEL * data sets and parameters. The implementation has been adapted to deal with imbalanced * classification problems. * * @author Written by Julian Luengo Martin 09/10/2007 * @author Modified by Victoria Lopez Morales 01/05/2010 * @author Modified by Victoria Lopez Morales 05/10/2010 * @author Modified by Sarah Vluymans 28/01/2014 * @version 0.3 * @since JDK 1.5 * </p> */ package keel.Algorithms.ImbalancedClassification.CSMethods.C_SVMCost; import java.io.*; import java.util.*; import keel.Dataset.*; import keel.Algorithms.Preprocess.Basic.*; import keel.Algorithms.ImbalancedClassification.Auxiliar.AUC.CalculateAUC; import keel.Algorithms.ImbalancedClassification.Auxiliar.AUC.PosProb; /** * <p> * This class is a wrapper to the LibSVM C-SVM classifier, in order to operate with KEEL data sets and parameters. * </p> */ public class svmClassifierCost { double[] mean = null; double[] std_dev = null; double tempData = 0; String[][] X = null; // matrix of transformed data // values String[] mostCommon; int ndatos = 0; int nentradas = 0; int tipo = 0; int direccion = 0; int nvariables = 0; int nsalidas = 0; int nneigh = 1; // number of neighbours InstanceSet IS; InstanceSet ISval; String input_train_name = new String(); String input_validation_name; String input_test_name = new String(); String output_train_name = new String(); String output_test_name = new String(); String output_AUC_name = new String(); String temp = new String(); String data_out = new String(""); String svmType; String kernelType; double C; double eps; int degree; double gamma; double coef0; double nu; double p; int shrinking; int probability = 0; long seed; int nr_weight = 0; boolean computeAUC; /* Values for AUC computation */ private PosProb[] valsForAUCTrain ; private PosProb[] valsForAUCTest ; /** Creates a new instance of svmClassifier * * @param fileParam The path to the configuration file with all the parameters in KEEL format */ public svmClassifierCost(String fileParam) { config_read(fileParam); IS = new InstanceSet(); ISval = new InstanceSet(); } /** Writes data matrix X to disk, in KEEL format * * @param output The text of the data matrix X in KEEL format * @param positive_class Integer identifier of the instances associated to the positive class */ private void write_results (String output, int positive_class) { // File OutputFile = new File(output_train_name.substring(1, // output_train_name.length()-1)); /*int tp = 0; int tn = 0; int fp = 0; int fn = 0; double tp_rate, fp_rate, auc;*/ try { FileWriter file_write = new FileWriter(output); file_write.write(IS.getHeader()); // now, print the normalized data file_write.write("@data\n"); for (int i = 0; i < ndatos; i++) { file_write.write(X[i][0]); for (int j = 1; j < 2; j++) { file_write.write(" " + X[i][j]); } file_write.write("\n"); /*int aux; if (Character.isDigit(X[i][0].charAt(0))) { aux = Integer.parseInt(X[i][0]); } else { if (X[i][0].contains("positive")) { aux = positive_class; } else { aux = positive_class+1; } } if (X[i][0].equals(X[i][1])) { if (aux == positive_class) { tp++; } else { tn++; } } else { if (aux == positive_class) { fn++; } else { fp++; } }*/ } /*tp_rate = (double)tp/(double)(tp+fn); fp_rate = (double)fp/(double)(fp+tn); auc = (1+tp_rate-fp_rate)/2; System.out.println("TP: " + tp + " TN: " + tn + " FP: " + fp + " FN: " + fn + " Area Under the ROC Curve is "+auc);*/ file_write.close(); } catch (IOException e) { System.out.println("IO exception = " + e); System.exit( -1); } } /** Reads the associated data to launch a SVM classifier * * @param fileParam KEEL configuration file that contains all the associated data for the experiment */ private void config_read (String fileParam) { parseParameters parameters; parameters = new parseParameters(); parameters.parseConfigurationFile(fileParam); input_train_name = parameters.getTrainingInputFile(); input_validation_name = parameters.getValidationInputFile(); input_test_name = parameters.getTestInputFile(); output_train_name = parameters.getTrainingOutputFile(); output_test_name = parameters.getTestOutputFile(); output_AUC_name = parameters.getOutputFile(0); seed = Long.parseLong(parameters.getParameter(0)); kernelType = parameters.getParameter(1); C = Double.parseDouble(parameters.getParameter(2)); eps = Double.parseDouble(parameters.getParameter(3)); degree = Integer.parseInt(parameters.getParameter(4)); gamma = Double.parseDouble(parameters.getParameter(5)); coef0 = Double.parseDouble(parameters.getParameter(6)); nu = Double.parseDouble(parameters.getParameter(7)); p = Double.parseDouble(parameters.getParameter(8)); shrinking = Integer.parseInt(parameters.getParameter(9)); String aux = parameters.getParameter(10); // Computation of the AUC integral computeAUC = false; if (aux.compareToIgnoreCase("TRUE") == 0) { computeAUC = true; } } /** * <p> * Process the training and test files provided in the parameters file to the constructor. * </p> */ public void process () { double[] outputs; double[] outputs2; Instance neighbor; double dist, mean; int actual; int[] N = new int[nneigh]; double[] Ndist = new double[nneigh]; boolean allNull; svm_problem SVMp = null; svm_parameter SVMparam = new svm_parameter(); svm_model svr = null; svm_node SVMn[]; double[] outputsCandidate = null; boolean same = true; Vector instancesSelected = new Vector(); Vector instancesSelected2 = new Vector(); int n_pos = 0; int n_neg = 0; int positive_class = -1; int posIndex = -1; int posIndexSVM = -1; double positive_cost, negative_cost; //SVM PARAMETERS SVMparam.C = C; SVMparam.cache_size = 10; //10MB of cache SVMparam.degree = degree; SVMparam.eps = eps; SVMparam.gamma = gamma; SVMparam.nr_weight = 0; SVMparam.nu = nu; SVMparam.p = p; SVMparam.shrinking = shrinking; if (computeAUC) { SVMparam.probability = 1; // Needed to allow for AUC calculations } else { SVMparam.probability = 0; } if (kernelType.compareTo("LINEAR") == 0) { SVMparam.kernel_type = svm_parameter.LINEAR; } else if (kernelType.compareTo("POLY") == 0) { SVMparam.kernel_type = svm_parameter.POLY; } else if (kernelType.compareTo("RBF") == 0) { SVMparam.kernel_type = svm_parameter.RBF; } else if (kernelType.compareTo("SIGMOID") == 0) { SVMparam.kernel_type = svm_parameter.SIGMOID; } //if(svmType.compareTo("C_SVC")==0){ SVMparam.svm_type = svm_parameter.C_SVC; /*}else if(svmType.compareTo("NU_SVC")==0){ SVMparam.svm_type = svm_parameter.NU_SVC; }*/ try { // Load in memory a dataset that contains a classification problem IS.readSet(input_train_name, true); int in = 0; int out = 0; ndatos = IS.getNumInstances(); nvariables = Attributes.getNumAttributes(); nentradas = Attributes.getInputNumAttributes(); nsalidas = Attributes.getOutputNumAttributes(); X = new String[ndatos][2]; // matrix with transformed data mostCommon = new String[nvariables]; SVMp = new svm_problem(); SVMp.l = ndatos; SVMp.y = new double[SVMp.l]; SVMp.x = new svm_node[SVMp.l][nentradas + 1]; for (int l = 0; l < SVMp.l; l++) { for (int n = 0; n < Attributes.getInputNumAttributes() + 1; n++) { SVMp.x[l][n] = new svm_node(); } } positive_class = 0; for (int i = 0; i < ndatos; i++) { Instance inst = IS.getInstance(i); SVMp.y[i] = inst.getAllOutputValues()[0]; if (SVMp.y[i] == 0.0) { n_pos++; } else { n_neg++; } for (int n = 0; n < Attributes.getInputNumAttributes(); n++) { SVMp.x[i][n].index = n; SVMp.x[i][n].value = inst.getAllInputValues()[n]; SVMp.y[i] = inst.getAllOutputValues()[0]; } //end of instance SVMp.x[i][nentradas].index = -1; } // Class 0 was not the minority class if (n_pos > n_neg) { int tmp = n_pos; n_pos = n_neg; n_neg = tmp; positive_class = 1; } /* * Remark: the order of the classes in SVM will be determined * based on the order in which they appear in the dataset, i.e. the * class of the first instance gets number 0 and so on. * Since we will be using different weights for each class, we need * to take this into account. In the binary classification problem, * there are 4 possible scenarios: * - positive_class=0 and the first instance belongs to this class: * nothing to do * - positive_class=1 and the first instance does not belong to * this class: * nothing to do * - positive_class=0 and the first instance does not belong to * this class: * in the SVM, the positive class will be labeled by 1, * --> we will set positive_class to 1 * - positive_class=1 and the first instance belongs to this class: * in the SVM, the positive class will be labeled by 0, * --> we will set positive_class to 0 */ if(positive_class == 0 && (int) IS.getOutputNumericValue(0, 0) != positive_class){ positive_class = 1; } else if(positive_class == 1 && (int) IS.getOutputNumericValue(0, 0) == positive_class){ positive_class = 0; } // Add the costs to the SVM mechanism positive_cost = ((double)n_neg/(double)n_pos); negative_cost = 1; SVMparam.nr_weight = 2; SVMparam.weight = new double[SVMparam.nr_weight]; for (int a=0; a<SVMparam.nr_weight; a++) { if (a == positive_class) { SVMparam.weight[a] = positive_cost; } else { SVMparam.weight[a] = negative_cost; } } if (svm.svm_check_parameter(SVMp, SVMparam) != null) { System.err.print("SVM parameter error in training: "); System.err.println(svm.svm_check_parameter(SVMp, SVMparam)); System.exit( -1); } //train the SVM if (ndatos > 0) { svr = svm.svm_train(SVMp, SVMparam); } ISval.readSet(input_validation_name, false); ndatos = ISval.getNumInstances(); nvariables = Attributes.getNumAttributes(); nentradas = Attributes.getInputNumAttributes(); nsalidas = Attributes.getOutputNumAttributes(); /* * We allocate again the matrix with the data to allocate the * validation set (it can be larger than the original training set) */ X = new String[ndatos][2]; // matrix with transformed data if (computeAUC) { valsForAUCTrain = new PosProb[ndatos]; } // Index of the positive (minority) class in the dataset int[] classFreq = new int[svm.svm_get_nr_class(svr)]; for(int i = 0; i < ISval.getNumInstances(); i++){ classFreq[(int) ISval.getOutputNumericValue(i, 0)]++; } posIndex = 0; for(int i = 0; i < classFreq.length; i++){ if(classFreq[i] < classFreq[posIndex]){ posIndex = i; } } // Index of the positive class in the svm int [] labels = new int[svm.svm_get_nr_class(svr)]; svm.svm_get_labels(svr, labels); posIndexSVM = 0; if(labels[1] == posIndex){ posIndexSVM = 1 ; } for (int i = 0; i < ISval.getNumInstances(); i++) { Instance inst = ISval.getInstance(i); Attribute a = Attributes.getOutputAttribute(0); direccion = a.getDirectionAttribute(); tipo = a.getType(); if (tipo != Attribute.NOMINAL) { X[i][0] = new String(""+(int) ISval.getOutputNumericValue(i, 0)); //new String(String.valueOf((int) inst.getAllOutputValues()[0])); } else { X[i][0] = ISval.getOutputNominalValue(i, 0); //new String(inst.getOutputNominalValues(0)); } // the values used for regression SVMn = new svm_node[Attributes.getInputNumAttributes() + 1]; for (int n = 0; n < Attributes.getInputNumAttributes(); n++) { SVMn[n] = new svm_node(); SVMn[n].index = n; SVMn[n].value = inst.getAllInputValues()[n]; } SVMn[nentradas] = new svm_node(); SVMn[nentradas].index = -1; // Is this a positive instance? boolean isPositive = (int) ISval.getOutputNumericValue(i, 0) == posIndex; /* * Predict the class */ if (tipo != Attribute.NOMINAL) { if (computeAUC) { double[] prob_estimates= new double[svm.svm_get_nr_class(svr)]; svm.svm_predict_probability(svr, SVMn, prob_estimates); valsForAUCTrain[i] = new PosProb(isPositive, prob_estimates[posIndexSVM]); } X[i][1] = new String(String.valueOf((int) Math.round(svm. svm_predict(svr, SVMn)))); } else { if (computeAUC) { double[] prob_estimates= new double[svm.svm_get_nr_class(svr)]; svm.svm_predict_probability(svr, SVMn, prob_estimates); valsForAUCTrain[i] = new PosProb(isPositive, prob_estimates[posIndexSVM]); } X[i][1] = new String(a.getNominalValue((int) Math.round(svm. svm_predict(svr, SVMn)))); } } } catch (Exception e) { System.out.println("Dataset exception = " + e); e.printStackTrace(); System.exit( -1); } write_results(output_train_name, positive_class); /** ************************************************************************************ */ try { // Load in memory a dataset that contains a classification // problem IS.readSet(input_test_name, false); int in = 0; int out = 0; ndatos = IS.getNumInstances(); nvariables = Attributes.getNumAttributes(); nentradas = Attributes.getInputNumAttributes(); nsalidas = Attributes.getOutputNumAttributes(); X = new String[ndatos][2]; // matrix with transformed data // data mostCommon = new String[nvariables]; if (computeAUC) { valsForAUCTest = new PosProb[ndatos]; } for (int i = 0; i < ndatos; i++) { Instance inst = IS.getInstance(i); Attribute a = Attributes.getOutputAttribute(0); direccion = a.getDirectionAttribute(); tipo = a.getType(); if (tipo != Attribute.NOMINAL) { X[i][0] = new String(""+(int) IS.getOutputNumericValue(i, 0)); //new String(String.valueOf((int) inst.getAllOutputValues()[0])); } else { X[i][0] = IS.getOutputNominalValue(i, 0); //new String(inst.getOutputNominalValues(0)); } SVMn = new svm_node[Attributes.getInputNumAttributes() + 1]; for (int n = 0; n < Attributes.getInputNumAttributes(); n++) { SVMn[n] = new svm_node(); SVMn[n].index = n; SVMn[n].value = inst.getAllInputValues()[n]; } SVMn[nentradas] = new svm_node(); SVMn[nentradas].index = -1; // Is this a positive instance? boolean isPositive = (int) IS.getOutputNumericValue(i, 0) == posIndex; /* * Predict the class */ if (tipo != Attribute.NOMINAL) { if (computeAUC) { double[] prob_estimates= new double[svm.svm_get_nr_class(svr)]; svm.svm_predict_probability(svr, SVMn, prob_estimates); valsForAUCTest[i] = new PosProb(isPositive, prob_estimates[posIndexSVM]); } X[i][1] = new String(String.valueOf((int) Math.round(svm. svm_predict(svr, SVMn)))); } else { if (computeAUC) { double[] prob_estimates= new double[svm.svm_get_nr_class(svr)]; svm.svm_predict_probability(svr, SVMn, prob_estimates); valsForAUCTest[i] = new PosProb(isPositive, prob_estimates[posIndexSVM]); } X[i][1] = new String(a.getNominalValue((int) Math.round(svm. svm_predict(svr, SVMn)))); } } } catch (Exception e) { System.out.println("Dataset exception = " + e); e.printStackTrace(); System.exit( -1); } write_results(output_test_name, positive_class); writeAUCresults(output_AUC_name); } /** * Writes the AUC results in an aditional output file if the integral approximation of the AUC needs to be computed * * @param file_name Name of the file where the AUC results will be placed */ public void writeAUCresults (String file_name) { // Write in the AUC file try { FileWriter file_write = new FileWriter(file_name); if (computeAUC) { // AUC approximation based on the integral double auc; auc = getTrainAUC(); file_write.write("@AUC in training set: " + auc); file_write.write("\n"); auc = getTestAUC(); file_write.write("@AUC in test set: " + auc); file_write.write("\n"); } else { file_write.write("AUC computation not requested\n"); } file_write.close(); } catch (IOException e) { System.out.println("IO exception = " + e); System.exit(-1); } } /* * Calculates the AUC for the training set * * @return The AUC value associated to the training set */ public double getTrainAUC(){ return CalculateAUC.calculate(valsForAUCTrain); } /* * Calculates the AUC for the test set * * @return The AUC value associated to the test set */ public double getTestAUC(){ return CalculateAUC.calculate(valsForAUCTest); } }