/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. S�nchez (luciano@uniovi.es)
J. Alcal�-Fdez (jalcala@decsai.ugr.es)
S. Garc�a (sglopez@ujaen.es)
A. Fern�ndez (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
/**
* <p>
* @author Written by Luciano S�nchez (University of Oviedo) 15/02/2004
* @author Modified by Enrique A. de la Cal (University of Oviedo) 13/12/2008
* @version 1.0
* @since JDK1.4
* </p>
*/
// Wrapper for KEEL's Dataset class
package keel.Algorithms.Shared.Parsing;
import java.io.*;
import org.core.*;
import java.util.StringTokenizer;
import java.util.Vector;
import keel.Dataset.*;
public class ProcessDataset {
/**
* <p>
* Wrapper for KEEL's Dataset class.
* </p>
*/
//Input examples
private double[][] X = null;
//Missing examples
private boolean[][] missing = null;
//Output results
private double[] Y = null;
//Classes
private int [] C = null;
//Maximum input value for each variable
private double[] iMaximum;
//Minimum input value for each variable
private double[] iMinimum;
//Maximum output value
private double oMaximum;
//Maximum output value
private double oMinimum;
private int nData; // Number of examples
private int nVariables; // Number of variables
private int nInputs; // Number of inputs
private int nClasses; // Number of classes
final static boolean debug = false;
/**
* <p>
* Returns input examples.
*
* </p>
* @return vector with input examples.
*/
public double[][] getX() {
return X;
}
/**
* <p>
* Returns input examples.
*
* </p>
* @return vector with input examples.
*/
public double[] getY() {
return Y;
}
/**
* <p>
* Returns classes for classification problems.
*
* </p>
* @return vector with classes.
*/
public int[] getC() {
return C;
}
/**
* <p>
* Returns maximum value for each variable.
*
* </p>
* @return vector maximum value for each variable.
*/
public double[] getImaximum() {
return iMaximum;
}
/**
* <p>
* Returns maximum value for each variable.
*
* </p>
* @return vector maximum value for each variable.
*/
public double[] getIminimum() {
return iMinimum;
}
/**
* <p>
* Returns maximum value for output.
*
* </p>
* @return maximum value for output.
*/
public double getOmaximum() {
return oMaximum;
}
/**
* <p>
* Returns maximum value for output.
*
* </p>
* @return maximum value for output.
*/
public double getOminimum() {
return oMinimum;
}
/**
* <p>
* Returns the size of input data.
*
* </p>
* @return the size of input data.
*/
public int getNdata() {
return nData;
}
/**
* <p>
* Returns the number of input variables plus output variables.
*
* </p>
* @return the number of variables.
*/
public int getNvariables() {
return nVariables;
}
/**
* <p>
* Returns the number of input variables.
*
* </p>
* @return the number of input variables.
*/
public int getNinputs() {
return nInputs;
}
/**
* <p>
* Returns the number of classes for classification problems.
*
* </p>
* @return the number of classes.
*/
public int getNclasses() {
return nClasses;
}
/**
* <p>
* Returns if an example is missing.
*
* </p>
* @return true is the value is missing (0 in the table); 0 otherwise;
*/
public boolean isMissing(int i, int j) {
// True is the value is missing (0 in the table)
return missing[i][j];
}
//Data read for Keel Format file.
private InstanceSet IS;
/**
* <p>
* A constructor that inits a new set of instances
*
* </p>
*/
public ProcessDataset() {
// Init a new set of instances
IS = new InstanceSet();
}
/**
* <p>
* Process a dataset file for a classification problem.
*
* </p>
* @param nfejemplos Name of the dataset file
* @param train The dataset file is for training or for test
*
*/
public void processClassifierDataset(String nfejemplos, boolean train) throws
IOException {
try {
// Load in memory a dataset that contains a classification problem
IS.readSet(nfejemplos, train);
nData = IS.getNumInstances();
nInputs = Attributes.getInputNumAttributes();
nVariables = nInputs + Attributes.getOutputNumAttributes();
// Check that there is only one output variable and
// it is nominal
if (Attributes.getOutputNumAttributes() > 1) {
System.out.println(
"This algorithm can not process MIMO datasets");
System.out.println(
"All outputs but the first one will be removed");
}
boolean noOutputs = false;
if (Attributes.getOutputNumAttributes() < 1) {
System.out.println(
"This algorithm can not process datasets without outputs");
System.out.println("Zero-valued output generated");
noOutputs = true;
}
// Initialize and fill our own tables
X = new double[nData][nInputs];
missing = new boolean[nData][nInputs];
C = new int[nData];
// Maximum and minimum of inputs
iMaximum = new double[nInputs];
iMinimum = new double[nInputs];
// Maximum and minimum for output data
oMaximum = 0;
oMinimum = 0;
// All values are casted into double/integer
nClasses = 0;
for (int i = 0; i < X.length; i++) {
Instance inst = IS.getInstance(i);
for (int j = 0; j < nInputs; j++) {
X[i][j] = IS.getInputNumericValue(i, j);
missing[i][j] = inst.getInputMissingValues(j);
if (X[i][j] > iMaximum[j] || i == 0) {
iMaximum[j] = X[i][j];
}
if (X[i][j] < iMinimum[j] || i == 0) {
iMinimum[j] = X[i][j];
}
}
if (noOutputs) {
C[i] = 0;
} else {
C[i] = (int) IS.getOutputNumericValue(i, 0);
}
if (C[i] > nClasses) {
nClasses = C[i];
}
}
nClasses++;
System.out.println("Number of classes=" + nClasses);
} catch (Exception e) {
System.out.println("DBG: Exception in readSet");
e.printStackTrace();
}
}
/**
* <p>
* Process a dataset file for a modelling problem.
*
* </p>
* @param nfexamples Name of the dataset file
* @param train The dataset file is for training or for test
*
*/
public void processModelDataset(String nfexamples, boolean train) throws
IOException {
try {
// Load in memory a dataset that contains a classification problem
IS.readSet(nfexamples, train);
nData = IS.getNumInstances();
nInputs = Attributes.getInputNumAttributes();
nVariables = nInputs + Attributes.getOutputNumAttributes();
if (Attributes.getOutputNumAttributes() > 1) {
System.out.println(
"This algorithm can not process MIMO datasets");
System.out.println(
"All outputs but the first one will be removed");
}
boolean noOutputs = false;
if (Attributes.getOutputNumAttributes() < 1) {
System.out.println(
"This algorithm can not process datasets without outputs");
System.out.println("Zero-valued output generated");
noOutputs = true;
}
// Initialize and fill our own tables
X = new double[nData][nInputs];
missing = new boolean[nData][nInputs];
Y = new double[nData];
// Maximum and minimum of inputs
iMaximum = new double[nInputs];
iMinimum = new double[nInputs];
// Maximum and minimum for output data
oMaximum = 0;
oMinimum = 0;
// All values are casted into double/integer
nClasses = 0;
for (int i = 0; i < X.length; i++) {
Instance inst = IS.getInstance(i);
for (int j = 0; j < nInputs; j++) {
X[i][j] = IS.getInputNumericValue(i, j);
missing[i][j] = inst.getInputMissingValues(j);
if (X[i][j] > iMaximum[j] || i == 0) {
iMaximum[j] = X[i][j];
}
if (X[i][j] < iMinimum[j] || i == 0) {
iMinimum[j] = X[i][j];
}
}
if (noOutputs) {
Y[i] = 0;
} else {
Y[i] = IS.getOutputNumericValue(i, 0);
}
if (Y[i] > oMaximum || i == 0) {
oMaximum = Y[i];
}
if (Y[i] < oMinimum || i == 0) {
oMinimum = Y[i];
}
}
} catch (Exception e) {
System.out.println("DBG: Exception in readSet");
e.printStackTrace();
}
}
/**
* <p>
* Process a dataset file for a clustering problem.
*
* </p>
* @param nfexamples Name of the dataset file
* @param train The dataset file is for training or for test
*
*/
public void processClusterDataset(String nfexamples, boolean train) throws
IOException {
try {
// Load in memory a dataset that contains a classification problem
IS.readSet(nfexamples, train);
nData = IS.getNumInstances();
nInputs = Attributes.getInputNumAttributes();
nVariables = nInputs + Attributes.getOutputNumAttributes();
if (Attributes.getOutputNumAttributes() != 0) {
System.out.println(
"This algorithm can not process datasets with outputs");
System.out.println("All outputs will be removed");
}
// Initialize and fill our own tables
X = new double[nData][nInputs];
missing = new boolean[nData][nInputs];
// Maximum and minimum of inputs
iMaximum = new double[nInputs];
iMinimum = new double[nInputs];
// Maximum and minimum for output data
oMaximum = 0;
oMinimum = 0;
// All values are casted into double/integer
nClasses = 0;
for (int i = 0; i < X.length; i++) {
Instance inst = IS.getInstance(i);
for (int j = 0; j < nInputs; j++) {
X[i][j] = IS.getInputNumericValue(i, j);
missing[i][j] = inst.getInputMissingValues(j);
if (X[i][j] > iMaximum[j] || i == 0) {
iMaximum[j] = X[i][j];
}
if (X[i][j] < iMinimum[j] || i == 0) {
iMinimum[j] = X[i][j];
}
}
}
} catch (Exception e) {
System.out.println("DBG: Exception in readSet");
e.printStackTrace();
}
}
/**
* <p>
* Process a old format dataset file for a modelling problem.
*
* </p>
* @param nfejemplos Name of the dataset file
*
*/
public void oldClassificationProcess(String nfejemplos) {
// Dataset reading for modelling problems
try {
String line;
BufferedReader in = new BufferedReader(new FileReader(nfejemplos));
line = in.readLine();
nData = Integer.parseInt(line);
line = in.readLine();
nVariables = Integer.parseInt(line);
nInputs = nVariables - 1;
X = new double[nData][nInputs];
Y = new double[nData];
iMaximum = new double[nInputs];
iMinimum = new double[nInputs];
oMaximum = 0; // Maximum and minimum for output data
oMinimum = 0;
for (int i = 0; i < nData; i++) {
line = in.readLine();
StringTokenizer tokens = new StringTokenizer(line, " ,\t");
for (int j = 0; j < nInputs; j++) {
String tmp = tokens.nextToken();
X[i][j] = Double.parseDouble(tmp);
if (X[i][j] > iMaximum[j] || i == 0) {
iMaximum[j] = X[i][j];
}
if (X[i][j] < iMinimum[j] || i == 0) {
iMinimum[j] = X[i][j];
}
}
Y[i] = Double.parseDouble(tokens.nextToken());
if (Y[i] > oMaximum || i == 0) {
oMaximum = Y[i];
}
if (Y[i] < oMinimum || i == 0) {
oMinimum = Y[i];
}
}
} catch (FileNotFoundException e) {
System.err.println(e + " Fichero de ejemplos no encontrado");
} catch (IOException e) {
System.err.println(e + " Error lectura");
}
}
/**
* <p>
* Process an old format dataset file for a classification problem.
*
* </p>
* @param nfejemplos Name of the dataset file.
*
*/
public void oldClusteringProcess(String nfejemplos) {
// Dataset reading for modelling problems
try {
String line;
BufferedReader in = new BufferedReader(new FileReader(nfejemplos));
line = in.readLine();
nData = Integer.parseInt(line);
line = in.readLine();
nVariables = Integer.parseInt(line);
nInputs = nVariables - 1;
X = new double[nData][nInputs];
C = new int[nData];
iMaximum = new double[nInputs];
iMinimum = new double[nInputs];
int cMaximum = 0; // Maximum and minimum for output data
int cMinimum = 0;
for (int i = 0; i < nData; i++) {
line = in.readLine();
StringTokenizer tokens = new StringTokenizer(line, " ,\t");
for (int j = 0; j < nInputs; j++) {
String tmp = tokens.nextToken();
X[i][j] = Double.parseDouble(tmp);
if (X[i][j] > iMaximum[j] || i == 0) {
iMaximum[j] = X[i][j];
}
if (X[i][j] < iMinimum[j] || i == 0) {
iMinimum[j] = X[i][j];
}
}
C[i] = Integer.parseInt(tokens.nextToken());
if (C[i] > cMaximum || i == 0) {
cMaximum = C[i];
}
if (C[i] < cMinimum || i == 0) {
cMinimum = C[i];
}
}
if (cMaximum == cMinimum) {
throw new IOException("0 clases");
}
nClasses = cMaximum - cMinimum + 1;
// It enumerates classes from 0
for (int i = 0; i < nData; i++) {
C[i] = (C[i] - cMinimum) / (cMaximum - cMinimum);
}
} catch (FileNotFoundException e) {
System.err.println(e + " Fichero de ejemplos no encontrado");
} catch (IOException e) {
System.err.println(e + " Error lectura");
}
}
/**
* <p>
* Process an old format dataset file for a clustering problem.
*
* </p>
* @param nfejemplos Name of the dataset file.
*
*/
public void procesa_clustering_old(String nfejemplos) {
// Dataset reading for clustering problems
try {
String line;
BufferedReader in = new BufferedReader(new FileReader(nfejemplos));
line = in.readLine();
nData = Integer.parseInt(line);
line = in.readLine();
nVariables = Integer.parseInt(line);
nInputs = nVariables;
X = new double[nData][nInputs];
iMaximum = new double[nInputs];
iMinimum = new double[nInputs];
for (int i = 0; i < nData; i++) {
line = in.readLine();
StringTokenizer tokens = new StringTokenizer(line, " ,\t");
for (int j = 0; j < nInputs; j++) {
String tmp = tokens.nextToken();
X[i][j] = Double.parseDouble(tmp);
if (X[i][j] > iMaximum[j] || i == 0) {
iMaximum[j] = X[i][j];
}
if (X[i][j] < iMinimum[j] || i == 0) {
iMinimum[j] = X[i][j];
}
}
}
} catch (FileNotFoundException e) {
System.err.println(e + " Fichero de ejemplos no encontrado");
} catch (IOException e) {
System.err.println(e + " Error lectura");
}
}
/**
* <p>
* prints to standard output statistics about the dataset.
*
* </p>
*
*/
public void showDatasetStatistics() {
double sumaX[] = new double[X[0].length];
double sumaY = 0;
for (int i = 0; i < X.length; i++) {
for (int j = 0; j < X[i].length; j++) {
sumaX[j] += X[i][j];
}
if (Y != null) {
sumaY += Y[i];
}
}
for (int j = 0; j < X[0].length; j++) {
sumaX[j] /= X.length;
}
if (Y != null) {
sumaY /= Y.length;
}
System.out.print("Mean of inputs: ");
for (int j = 0; j < X[0].length; j++) {
System.out.print(sumaX[j] + " ");
}
System.out.println();
if (Y != null) {
System.out.println("Mean of outputs: " + sumaY);
}
}
/**
* Returns the header of the data set with the attributes' information
* @return The header of the data set
*/
public String getHeader(){
return IS.getHeader();
}
}