/***********************************************************************

    This file is part of KEEL-software, the Data Mining tool for regression,
    classification, clustering, pattern mining and so on.

    Copyright (C) 2004-2010

    J. Alcalá-Fdez (jalcala@decsai.ugr.es)
    A. Fernández (alberto.fernandez@ujaen.es)
    S. García (sglopez@ujaen.es)
    F. Herrera (herrera@decsai.ugr.es)
    L. Sánchez (luciano@uniovi.es)
    J. Luengo (julianlm@decsai.ugr.es)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see http://www.gnu.org/licenses/

**********************************************************************/

/**
 *
 * File: RSTAlgorithm.java
 *
 * Main class for RST methods. Provides a basic framework for reading
 * and preprocessing data, conducting the experiment and reporting results.
 *
 * @author Written by Joaquín Derrac (University of Granada) 20/04/2010
 * @version 1.0
 * @since JDK1.5
 *
 */
package keel.Algorithms.RST_Learning;

import java.util.Arrays;
import java.util.StringTokenizer;

import keel.Dataset.*;

import org.core.Files;

public abstract class RSTAlgorithm {

    //Files
    protected String outFile[];
    protected String testFile;
    protected String trainFile;
    protected String referenceFile;

    //Instance Sets
    protected InstanceSet train;
    protected InstanceSet test;
    protected InstanceSet reference;

    protected Instance temp;

    //Data
    protected int inputAtt;
    protected int trainSize;
    protected Attribute[] inputs;
    protected Attribute output;
    protected boolean[] nulls;
    protected boolean[] nominal;

    protected double trainData[][];
    protected int trainOutput[];
    protected double testData[][];
    protected int testOutput[];
    protected double referenceData[][];
    protected int referenceOutput[];

    protected String relation;

    protected int nClasses;
    protected int nInstances[];

    //Naming
    protected String name;

    //Random seed
    protected long seed;

    //Classification
    protected int trainPrediction[];
    protected int testPrediction[];
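
    /*
     * Overview: readDataFiles() fills trainData, testData and referenceData with the
     * normalized input values of every instance, and trainOutput, testOutput and
     * referenceOutput with the corresponding class indices. trainPrediction and
     * testPrediction are initialized to -1 (unclassified) and are meant to be filled
     * by the concrete RST method before calling writeOutput().
     */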

    /**
     * Reads the configuration and data files, and processes them.
     *
     * @param script Name of the configuration script
     *
     */
    protected void readDataFiles(String script){

        //Read of the script file
        readConfiguracion(script);
        readParameters(script);

        //Read of training data files
        try {
            train = new InstanceSet();
            train.readSet(trainFile, true);

            inputAtt = Attributes.getInputNumAttributes();
            trainSize = train.getNumInstances();

            inputs = Attributes.getInputAttributes();
            output = Attributes.getOutputAttribute(0);

            //Normalize the data
            normalizeTrain();

        } catch (Exception e) {
            System.err.println(e);
            System.exit(1);
        }

        //Read of test data files
        try {
            test = new InstanceSet();
            test.readSet(testFile, false);

            //Normalize the data
            normalizeTest();

        } catch (Exception e) {
            System.err.println(e);
            System.exit(1);
        }

        //Read of reference data files
        try {
            reference = new InstanceSet();
            reference.readSet(referenceFile, false);

            //Normalize the data
            normalizeReference();

        } catch (Exception e) {
            System.err.println(e);
            System.exit(1);
        }

        //Now, the data is loaded and preprocessed

        //Get the number of classes
        nClasses = Attributes.getOutputAttribute(0).getNumNominalValues();

        //And the number of instances of each class
        nInstances = new int[nClasses];
        Arrays.fill(nInstances, 0);

        for(int i=0; i<trainOutput.length; i++){
            nInstances[trainOutput[i]]++;
        }

        //Initialize classification structures
        trainPrediction = new int[trainSize];
        testPrediction = new int[test.getNumInstances()];

        Arrays.fill(trainPrediction, -1);
        Arrays.fill(testPrediction, -1);

    }//end-method

    /**
     * Reads the configuration script and extracts its contents.
     *
     * @param script Name of the configuration script
     *
     */
    protected void readConfiguracion (String script) {

        String file, cad, token;
        StringTokenizer fileLines, tokens;
        byte line[];
        int i, j;

        outFile = new String[3];

        file = Files.readFile(script);
        fileLines = new StringTokenizer (file, "\n\r");

        fileLines.nextToken();
        cad = fileLines.nextToken();

        tokens = new StringTokenizer (cad, "=");
        tokens.nextToken();
        token = tokens.nextToken();

        //Getting the names of the training and test files;
        //the reference file will be used for comparison

        line = token.getBytes();
        for (i=0; line[i]!='\"'; i++);
        i++;
        for (j=i; line[j]!='\"'; j++);
        trainFile = new String (line, i, j-i);

        for (i=j+1; line[i]!='\"'; i++);
        i++;
        for (j=i; line[j]!='\"'; j++);
        referenceFile = new String (line, i, j-i);

        for (i=j+1; line[i]!='\"'; i++);
        i++;
        for (j=i; line[j]!='\"'; j++);
        testFile = new String (line, i, j-i);

        //Getting the path and base name of the results files

        cad = fileLines.nextToken();
        tokens = new StringTokenizer (cad, "=");
        tokens.nextToken();
        token = tokens.nextToken();

        //Getting the names of output files

        line = token.getBytes();
        for (i=0; line[i]!='\"'; i++);
        i++;
        for (j=i; line[j]!='\"'; j++);
        outFile[0] = new String (line, i, j-i);

        for (i=j+1; line[i]!='\"'; i++);
        i++;
        for (j=i; line[j]!='\"'; j++);
        outFile[1] = new String (line, i, j-i);

        for (i=j+1; line[i]!='\"'; i++);
        i++;
        for (j=i; line[j]!='\"'; j++);
        outFile[2] = new String (line, i, j-i);

    } //end-method
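
    /*
     * For reference, readConfiguracion() skips the first line of the script, takes
     * the three quoted file names of the second line as training, reference and
     * test data, and the three quoted names of the third line as output files.
     * A hypothetical fragment (keywords and file names below are placeholders,
     * not taken from an actual script):
     *
     *     algorithm = SomeRSTMethod
     *     inputData = "data-10-1tra.dat" "data-10-1tra.dat" "data-10-1tst.dat"
     *     outputData = "result0.tra" "result0.tst" "result0e.txt"
     *
     * Any remaining lines hold algorithm-specific parameters, which are parsed by
     * readParameters() in each subclass.
     */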

    /**
     * Reads the parameters of the algorithm.
     * Must be implemented in the subclass.
     *
     * @param script Configuration script
     *
     */
    protected abstract void readParameters(String script);

    /**
     * This function builds the data matrix for training data and normalizes input values
     */
    protected void normalizeTrain() throws DataException {

        StringTokenizer tokens;
        double minimum[];
        double range[];

        //Check if the dataset corresponds to a classification problem
        if (Attributes.getOutputNumAttributes() < 1) {
            throw new DataException ("This dataset has no output attribute, so it does not correspond to a classification problem.");
        } else if (Attributes.getOutputNumAttributes() > 1) {
            throw new DataException ("This dataset has more than one output attribute.");
        }

        if (Attributes.getOutputAttribute(0).getType() == Attribute.REAL) {
            throw new DataException ("This dataset has a real-valued output attribute, so it does not correspond to a classification problem.");
        }

        //Copy the data
        tokens = new StringTokenizer (train.getHeader(), " \n\r");
        tokens.nextToken();
        relation = tokens.nextToken();

        trainData = new double[train.getNumInstances()][inputAtt];
        trainOutput = new int[train.getNumInstances()];

        for (int i=0; i<train.getNumInstances(); i++) {

            temp = train.getInstance(i);
            trainData[i] = temp.getAllInputValues();
            trainOutput[i] = (int) temp.getOutputRealValues(0);
            nulls = temp.getInputMissingValues();

            //Clean missing values
            for (int j=0; j<nulls.length; j++){
                if (nulls[j]) {
                    trainData[i][j] = 0.0;
                }
            }
        }

        //Normalize the data
        minimum = new double[inputAtt];
        range = new double[inputAtt];

        for (int i=0; i<inputAtt; i++) {
            if (Attributes.getInputAttribute(i).getType() != Attribute.NOMINAL) {
                minimum[i] = Attributes.getInputAttribute(i).getMinAttribute();
                range[i] = Attributes.getInputAttribute(i).getMaxAttribute() - minimum[i];
            }
        }

        //Both real and nominal data are normalized in [0,1]
        for (int i=0; i<train.getNumInstances(); i++) {
            for (int j=0; j<inputAtt; j++) {
                if (Attributes.getInputAttribute(j).getType() == Attribute.NOMINAL) {
                    if(Attributes.getInputAttribute(j).getNominalValuesList().size() > 1){
                        trainData[i][j] /= Attributes.getInputAttribute(j).getNominalValuesList().size() - 1;
                    }
                }else{
                    trainData[i][j] -= minimum[j];
                    trainData[i][j] /= range[j];
                }
            }
        }

    } //end-method
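
    /*
     * The normalization used above (and repeated for the test and reference sets
     * below) maps every input to [0,1]: a real or integer value x with range
     * [min, max] becomes (x - min) / (max - min), and a nominal value stored as
     * an index v in {0, ..., k-1} becomes v / (k - 1) whenever the attribute has
     * k > 1 nominal values.
     */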

    /**
     * This function builds the data matrix for test data and normalizes input values
     */
    protected void normalizeTest() throws DataException {

        StringTokenizer tokens;
        double minimum[];
        double range[];

        //Check if the dataset corresponds to a classification problem
        if (Attributes.getOutputNumAttributes() < 1) {
            throw new DataException ("This dataset has no output attribute, so it does not correspond to a classification problem.");
        } else if (Attributes.getOutputNumAttributes() > 1) {
            throw new DataException ("This dataset has more than one output attribute.");
        }

        if (Attributes.getOutputAttribute(0).getType() == Attribute.REAL) {
            throw new DataException ("This dataset has a real-valued output attribute, so it does not correspond to a classification problem.");
        }

        //Copy the data
        tokens = new StringTokenizer (test.getHeader(), " \n\r");
        tokens.nextToken();
        tokens.nextToken();

        testData = new double[test.getNumInstances()][inputAtt];
        testOutput = new int[test.getNumInstances()];

        for (int i=0; i<test.getNumInstances(); i++) {

            temp = test.getInstance(i);
            testData[i] = temp.getAllInputValues();
            testOutput[i] = (int) temp.getOutputRealValues(0);
            nulls = temp.getInputMissingValues();

            //Clean missing values
            for (int j=0; j<nulls.length; j++){
                if (nulls[j]) {
                    testData[i][j] = 0.0;
                }
            }
        }

        //Normalize the data
        minimum = new double[inputAtt];
        range = new double[inputAtt];

        for (int i=0; i<inputAtt; i++) {
            if (Attributes.getInputAttribute(i).getType() != Attribute.NOMINAL) {
                minimum[i] = Attributes.getInputAttribute(i).getMinAttribute();
                range[i] = Attributes.getInputAttribute(i).getMaxAttribute() - minimum[i];
            }
        }

        //Both real and nominal data are normalized in [0,1]
        for (int i=0; i<test.getNumInstances(); i++) {
            for (int j=0; j<inputAtt; j++) {
                if (Attributes.getInputAttribute(j).getType() == Attribute.NOMINAL) {
                    if(Attributes.getInputAttribute(j).getNominalValuesList().size() > 1){
                        testData[i][j] /= Attributes.getInputAttribute(j).getNominalValuesList().size() - 1;
                    }
                }else{
                    testData[i][j] -= minimum[j];
                    testData[i][j] /= range[j];
                }
            }
        }

    } //end-method

    /**
     * This function builds the data matrix for reference data and normalizes input values
     */
    protected void normalizeReference() throws DataException {

        StringTokenizer tokens;
        double minimum[];
        double range[];

        //Check if the dataset corresponds to a classification problem
        if (Attributes.getOutputNumAttributes() < 1) {
            throw new DataException ("This dataset has no output attribute, so it does not correspond to a classification problem.");
        } else if (Attributes.getOutputNumAttributes() > 1) {
            throw new DataException ("This dataset has more than one output attribute.");
        }

        if (Attributes.getOutputAttribute(0).getType() == Attribute.REAL) {
            throw new DataException ("This dataset has a real-valued output attribute, so it does not correspond to a classification problem.");
        }

        //Copy the data
        tokens = new StringTokenizer (reference.getHeader(), " \n\r");
        tokens.nextToken();
        tokens.nextToken();

        referenceData = new double[reference.getNumInstances()][inputAtt];
        referenceOutput = new int[reference.getNumInstances()];

        for (int i=0; i<reference.getNumInstances(); i++) {

            temp = reference.getInstance(i);
            referenceData[i] = temp.getAllInputValues();
            referenceOutput[i] = (int) temp.getOutputRealValues(0);
            nulls = temp.getInputMissingValues();

            //Clean missing values
            for (int j=0; j<nulls.length; j++){
                if (nulls[j]) {
                    referenceData[i][j] = 0.0;
                }
            }
        }

        //Normalize the data
        minimum = new double[inputAtt];
        range = new double[inputAtt];

        for (int i=0; i<inputAtt; i++) {
            if (Attributes.getInputAttribute(i).getType() != Attribute.NOMINAL) {
                minimum[i] = Attributes.getInputAttribute(i).getMinAttribute();
                range[i] = Attributes.getInputAttribute(i).getMaxAttribute() - minimum[i];
            }
        }

        //Both real and nominal data are normalized in [0,1]
        for (int i=0; i<reference.getNumInstances(); i++) {
            for (int j=0; j<inputAtt; j++) {
                if (Attributes.getInputAttribute(j).getType() == Attribute.NOMINAL) {
                    if(Attributes.getInputAttribute(j).getNominalValuesList().size() > 1){
                        referenceData[i][j] /= Attributes.getInputAttribute(j).getNominalValuesList().size() - 1;
                    }
                }else{
                    referenceData[i][j] -= minimum[j];
                    referenceData[i][j] /= range[j];
                }
            }
        }

    }//end-method
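
    /*
     * writeOutput() below produces the standard KEEL result-file layout: a header
     * mirroring the dataset definition, followed by one "real predicted" pair per
     * instance. A hypothetical excerpt (attribute and class names are placeholders):
     *
     *     @relation iris
     *     @attribute sepalLength real [4.3, 7.9]
     *     @attribute class {setosa, versicolor, virginica}
     *     @data
     *     setosa setosa
     *     versicolor Unclassified
     *
     * Instances whose prediction is still -1 are reported as "Unclassified".
     */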

    /**
     * Prints KEEL standard output files.
     *
     * @param filename Name of output file
     * @param realClass Real output of instances
     * @param prediction Predicted output for instances
     */
    protected void writeOutput(String filename, int[] realClass, int[] prediction) {

        String text = "";

        /*Printing input attributes*/
        text += "@relation " + relation + "\n";

        for (int i=0; i<inputs.length; i++) {

            text += "@attribute " + inputs[i].getName() + " ";

            if (inputs[i].getType() == Attribute.NOMINAL) {
                text += "{";
                for (int j=0; j<inputs[i].getNominalValuesList().size(); j++) {
                    text += (String) inputs[i].getNominalValuesList().elementAt(j);
                    if (j < inputs[i].getNominalValuesList().size() - 1) {
                        text += ", ";
                    }
                }
                text += "}\n";
            } else {
                if (inputs[i].getType() == Attribute.INTEGER) {
                    text += "integer";
                } else {
                    text += "real";
                }
                text += " [" + String.valueOf(inputs[i].getMinAttribute()) + ", " + String.valueOf(inputs[i].getMaxAttribute()) + "]\n";
            }
        }

        /*Printing output attribute*/
        text += "@attribute " + output.getName() + " ";

        if (output.getType() == Attribute.NOMINAL) {
            text += "{";
            for (int j=0; j<output.getNominalValuesList().size(); j++) {
                text += (String) output.getNominalValuesList().elementAt(j);
                if (j < output.getNominalValuesList().size() - 1) {
                    text += ", ";
                }
            }
            text += "}\n";
        } else {
            text += "integer [" + String.valueOf(output.getMinAttribute()) + ", " + String.valueOf(output.getMaxAttribute()) + "]\n";
        }

        /*Printing data*/
        text += "@data\n";

        Files.writeFile(filename, text);

        if (output.getType() == Attribute.INTEGER) {

            text = "";

            for (int i=0; i<realClass.length; i++) {

                text += "" + realClass[i] + " ";
                text += "" + prediction[i] + " ";
                text += "\n";

                if((i % 10) == 9){
                    Files.addToFile(filename, text);
                    text = "";
                }
            }

            if((realClass.length % 10) != 0){
                Files.addToFile(filename, text);
            }

        } else{

            text = "";

            for (int i=0; i<realClass.length; i++) {

                text += "" + (String) output.getNominalValuesList().elementAt(realClass[i]) + " ";

                if(prediction[i] > -1){
                    text += "" + (String) output.getNominalValuesList().elementAt(prediction[i]) + " ";
                } else{
                    text += "" + "Unclassified" + " ";
                }

                text += "\n";

                if((i % 10) == 9){
                    Files.addToFile(filename, text);
                    text = "";
                }
            }

            if((realClass.length % 10) != 0){
                Files.addToFile(filename, text);
            }
        }

    }//end-method

}//end-class
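
/*
 * Illustrative, hypothetical sketch of how a concrete RST method would extend this
 * framework (the class and method names below are placeholders, not part of KEEL):
 *
 *     public class MyRSTMethod extends RSTAlgorithm {
 *
 *         public MyRSTMethod(String script) {
 *             readDataFiles(script);   // loads and normalizes train/test/reference data
 *         }
 *
 *         @Override
 *         protected void readParameters(String script) {
 *             // parse algorithm-specific parameters (e.g. the random seed) from the script
 *         }
 *
 *         public void run() {
 *             // ...fill trainPrediction[] and testPrediction[]...
 *             writeOutput(outFile[0], trainOutput, trainPrediction);
 *             writeOutput(outFile[1], testOutput, testPrediction);
 *         }
 *     }
 */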