/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. Sánchez (luciano@uniovi.es)
J. Alcalá-Fdez (jalcala@decsai.ugr.es)
S. García (sglopez@ujaen.es)
A. Fernández (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
/**
* <p>
* @author Written by Julián Luengo Martín 05/11/2006
* @version 0.2
* @since JDK 1.5
* </p>
*/
package keel.Algorithms.Preprocess.Missing_Values.knnImpute;
import java.io.*;
import java.util.*;
import keel.Dataset.*;
import keel.Algorithms.Preprocess.Basic.*;
/**
* <p>
* This class computes the mean (numerical) or mode (nominal) value of the attributes with missing values for the selected
* neighbours for a given instance with missing values
* </p>
*/
public class knnImpute {
double [] mean = null;
double [] std_dev = null;
double tempData = 0;
String[][] X = null; //matrix of transformed data
FreqList[] timesSeen = null; //matrix with frequences of attribute values
String[] mostCommon;
int ndatos = 0;
int nentradas = 0;
int tipo = 0;
int direccion = 0;
int nvariables = 0;
int nsalidas = 0;
int nneigh = 1; //number of neighbours
InstanceSet IS,IStest;
String input_train_name = new String();
String input_test_name = new String();
String output_train_name = new String();
String output_test_name = new String();
String temp = new String();
String data_out = new String("");
/** Creates a new instance of MostCommonValue
* @param fileParam The path to the configuration file with all the parameters in KEEL format
*/
public knnImpute(String fileParam) {
config_read(fileParam);
IS = new InstanceSet();
IStest = new InstanceSet();
}
//Write data matrix X to disk, in KEEL format
private void write_results(String output){
//File OutputFile = new File(output_train_name.substring(1, output_train_name.length()-1));
try {
FileWriter file_write = new FileWriter(output);
file_write.write(IS.getHeader());
//now, print the normalized data
file_write.write("@data\n");
for(int i=0;i<ndatos;i++){
file_write.write(X[i][0]);
for(int j=1;j<nvariables;j++){
file_write.write(","+X[i][j]);
}
file_write.write("\n");
}
file_write.close();
} catch (IOException e) {
System.out.println("IO exception = " + e );
System.exit(-1);
}
}
//Read the pattern file, and parse data into strings
private void config_read(String fileParam){
File inputFile = new File(fileParam);
if (inputFile == null || !inputFile.exists()) {
System.out.println("parameter "+fileParam+" file doesn't exists!");
System.exit(-1);
}
//begin the configuration read from file
try {
FileReader file_reader = new FileReader(inputFile);
BufferedReader buf_reader = new BufferedReader(file_reader);
//FileWriter file_write = new FileWriter(outputFile);
String line;
do{
line = buf_reader.readLine();
}while(line.length()==0); //avoid empty lines for processing -> produce exec failure
String out[]= line.split("algorithm = ");
//alg_name = new String(out[1]); //catch the algorithm name
//input & output filenames
do{
line = buf_reader.readLine();
}while(line.length()==0);
out= line.split("inputData = ");
out = out[1].split("\\s\"");
input_train_name = new String(out[0].substring(1, out[0].length()-1));
input_test_name = new String(out[1].substring(0, out[1].length()-1));
if(input_test_name.charAt(input_test_name.length()-1)=='"')
input_test_name = input_test_name.substring(0,input_test_name.length()-1);
do{
line = buf_reader.readLine();
}while(line.length()==0);
out = line.split("outputData = ");
out = out[1].split("\\s\"");
output_train_name = new String(out[0].substring(1, out[0].length()-1));
output_test_name = new String(out[1].substring(0, out[1].length()-1));
if(output_test_name.charAt(output_test_name.length()-1)=='"')
output_test_name = output_test_name.substring(0,output_test_name.length()-1);
//parameters
do{
line = buf_reader.readLine();
}while(line.length()==0);
out = line.split("k = ");
nneigh = (new Integer(out[1])).intValue(); //parse the string into a double
file_reader.close();
} catch (IOException e) {
System.out.println("IO exception = " + e );
e.printStackTrace();
System.exit(-1);
}
}
/**
* <p>
* Computes the distance between two instances (without previous normalization)
* </p>
* @param i First instance
* @param j Second instance
* @return The Euclidean distance between i and j
*/
private double distance(Instance i,Instance j){
double dist = 0;
int in = 0;
int out = 0;
for(int l = 0; l < nvariables;l++){
Attribute a = Attributes.getAttribute(l);
direccion = a.getDirectionAttribute();
tipo = a.getType();
if(direccion == Attribute.INPUT){
if(tipo != Attribute.NOMINAL && !i.getInputMissingValues(in)){
//real value, apply euclidean distance
dist += (i.getInputRealValues(in)-j.getInputRealValues(in))*(i.getInputRealValues(in)-j.getInputRealValues(in));
} else{
if(!i.getInputMissingValues(in) && i.getInputNominalValues(in)!=j.getInputNominalValues(in))
dist += 1;
}
in++;
}else{
if(direccion == Attribute.OUTPUT){
if(tipo != Attribute.NOMINAL && !i.getOutputMissingValues(out)){
dist += (i.getOutputRealValues(out)-j.getOutputRealValues(out))*(i.getOutputRealValues(out)-j.getOutputRealValues(out));
} else{
if(!i.getOutputMissingValues(out) && i.getOutputNominalValues(out)!=j.getOutputNominalValues(out))
dist += 1;
}
out++;
}
}
}
return Math.sqrt(dist);
}
/**
* <p>
* Checks if two instances present MVs for the same attributes
* </p>
* @param inst1 the first instance
* @param inst2 the second instance
* @return true if both instances have missing values for the same attributes, false otherwise
*/
protected boolean sameMissingInputAttributes(Instance inst1, Instance inst2){
boolean sameMVs = true;
for(int i = 0;i < Attributes.getInputNumAttributes() && sameMVs ;i++){
if(inst1.getInputMissingValues(i) != inst2.getInputMissingValues(i))
sameMVs = false;
}
return sameMVs;
}
/**
* Finds the nearest neighbor with a valid value in the specified attribute
* @param inst the instance to be taken as reference
* @param a the attribute which will be checked
* @return the nearest instance that has a valid value in the attribute 'a'
*/
protected Instance nearestValidNeighbor(Instance inst, int a){
double distance = Double.POSITIVE_INFINITY;
Instance inst2;
int nn = 0;
for(int i = 0;i<IS.getNumInstances();i++){
inst2 = IS.getInstance(i);
if(inst!= inst2 && !inst2.getInputMissingValues(a) && distance(inst,inst2)<distance){
distance = distance(inst,inst2);
nn = i;
}
}
return IS.getInstance(nn);
}
/**
* <p>
* Takes a value and checks if it belongs to the attribute interval. If not, it returns the nearest limit.
* IT DOES NOT CHECK IF THE ATTRIBUTE IS NOT NOMINAL
* </p>
* @param value the value to be checked
* @param a the attribute to which the value will be checked against
* @return the original value if it was in the interval limits of the attribute, or the nearest boundary limit otherwise.
*/
public double boundValueToAttributeLimits(double value, Attribute a){
if(value < a.getMinAttribute())
value = a.getMinAttribute();
else if(value > a.getMaxAttribute())
value = a.getMaxAttribute();
return value;
}
/**
* <p>
* Process the training and test files provided in the parameters file to the constructor.
* </p>
*/
public void process(){
double []outputs;
double []outputs2;
Instance neighbor;
double dist,mean,dist2;
int actual,totalN,nn_aux;
int [] N = new int[nneigh];
double []Ndist = new double [nneigh];
boolean allNull;
try {
// Load in memory a dataset that contains a classification problem
IS.readSet(input_train_name,true);
int in = 0;
int out = 0;
ndatos = IS.getNumInstances();
nvariables = Attributes.getNumAttributes();
nentradas = Attributes.getInputNumAttributes();
nsalidas = Attributes.getOutputNumAttributes();
X = new String[ndatos][nvariables];//matrix with transformed data
timesSeen = new FreqList[nvariables];
mostCommon = new String[nvariables];
for(int i = 0;i < ndatos;i++){
Instance inst = IS.getInstance(i);
in = 0;
out = 0;
if(inst.existsAnyMissingValue()){
//since exists MVs, first we must compute the nearest
//neighbours for our instance
for(int n = 0;n<nneigh;n++){
Ndist[n] = Double.MAX_VALUE;
N[n] = -1;
}
for(int k=0;k<ndatos;k++){
neighbor = IS.getInstance(k);
if(!sameMissingInputAttributes(inst, neighbor)){
dist = distance(inst, neighbor);
actual = -1;
for(int n = 0;n<nneigh;n++){
if(dist < Ndist[n]){
if(actual!=-1){
if(Ndist[n]>Ndist[actual]){
actual = n;
}
}
else
actual = n;
}
}
if(actual!=-1){
N[actual] = k;
Ndist[actual] = dist;
}
}
}
}
for(int j = 0; j < nvariables;j++){
Attribute a = Attributes.getAttribute(j);
direccion = a.getDirectionAttribute();
tipo = a.getType();
if(direccion == Attribute.INPUT){
if(tipo != Attribute.NOMINAL && !inst.getInputMissingValues(in)){
X[i][j] = new String(String.valueOf(inst.getInputRealValues(in)));
} else{
if(!inst.getInputMissingValues(in))
X[i][j] = inst.getInputNominalValues(in);
else{
allNull = true;
timesSeen[j] = new FreqList();
if(tipo != Attribute.NOMINAL){
mean = 0.0;
totalN = 0;
for(int m = 0;m < nneigh;m++){
if(N[m]!=-1){
Instance inst2 = IS.getInstance(N[m]);
if(!inst2.getInputMissingValues(in)){
mean += inst2.getInputRealValues(in);
totalN++;
allNull = false;
}
}
}
if(!allNull){
mean = mean / (double)totalN;
if(tipo == Attribute.INTEGER)
mean = new Double(mean+0.5).intValue();
X[i][j] = new String(String.valueOf(mean));
}
else
//if no option left, lets take the nearest neighbor with a valid attribute value
X[i][j] = String.valueOf(nearestValidNeighbor(inst, in).getInputRealValues(in));
}else{
for(int m = 0;m < nneigh;m++){
Instance inst2 = IS.getInstance(N[m]);
if(N[m]!=-1 && !inst2.getInputMissingValues(in)){
timesSeen[j].AddElement( inst2.getInputNominalValues(in));
}
}
if(timesSeen[j].totalElements!=0)
X[i][j] = new String(timesSeen[j].mostCommon().getValue()); //replace missing data
else
X[i][j] = nearestValidNeighbor(inst, in).getInputNominalValues(in);
}
}
}
in++;
} else{
if(direccion == Attribute.OUTPUT){
if(tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out)){
X[i][j] = new String(String.valueOf(inst.getOutputRealValues(out)));
} else{
if(!inst.getOutputMissingValues(out))
X[i][j] = inst.getOutputNominalValues(out);
else{
allNull = true;
timesSeen[j] = new FreqList();
if(tipo != Attribute.NOMINAL){
mean = 0.0;
totalN = 0;
for(int m = 0;m < nneigh;m++){
if(N[m]!=-1){
totalN++;
allNull = false;
Instance inst2 = IS.getInstance(N[m]);
mean += inst2.getOutputRealValues(out);
}
}
if(!allNull){
mean = mean / (double)totalN;
if(tipo == Attribute.INTEGER)
mean = new Double(mean+0.5).intValue();
mean = this.boundValueToAttributeLimits(mean, a);
X[i][j] = new String(String.valueOf(mean));
}
else{
//if no option left, lets take the nearest neighbor with a valid attribute value
X[i][j] = new String("<null>");
}
}else{
for(int m = 0;m < nneigh;m++){
Instance inst2 = IS.getInstance(N[m]);
if(N[m]!=-1){
timesSeen[j].AddElement( inst2.getOutputNominalValues(out));
}
}
if(timesSeen[j].totalElements!=0)
X[i][j] = new String(timesSeen[j].mostCommon().getValue()); //replace missing data
else
X[i][j] = new String("<null>");
}
}
}
out++;
}
}
}
}
}catch (Exception e){
System.out.println("Dataset exception = " + e );
e.printStackTrace();
System.exit(-1);
}
write_results(output_train_name);
/***************************************************************************************/
//does a test file associated exist?
if(input_train_name.compareTo(input_test_name)!=0){
try {
// Load in memory a dataset that contains a classification problem
IStest.readSet(input_test_name,false);
int in = 0;
int out = 0;
ndatos = IStest.getNumInstances();
nvariables = Attributes.getNumAttributes();
nentradas = Attributes.getInputNumAttributes();
nsalidas = Attributes.getOutputNumAttributes();
X = new String[ndatos][nvariables];//matrix with transformed data
timesSeen = new FreqList[nvariables];
mostCommon = new String[nvariables];
//now, search for missed data, and replace them with
//the most common value
for(int i = 0;i < ndatos;i++){
Instance inst = IStest.getInstance(i);
in = 0;
out = 0;
if(inst.existsAnyMissingValue()){
//since exists MVs, first we must compute the nearest
//neighbours for our instance
for(int n = 0;n<nneigh;n++){
Ndist[n] = Double.MAX_VALUE;
N[n] = -1;
}
for(int k=0;k<ndatos;k++){
neighbor = IS.getInstance(k);
if(!sameMissingInputAttributes(inst, neighbor)){
dist = distance(inst, neighbor);
actual = -1;
for(int n = 0;n<nneigh;n++){
if(dist < Ndist[n]){
if(actual!=-1){
if(Ndist[n]>Ndist[actual]){
actual = n;
}
}
else
actual = n;
}
}
if(actual!=-1){
N[actual] = k;
Ndist[actual] = dist;
}
}
}
}
for(int j = 0; j < nvariables;j++){
Attribute a = Attributes.getAttribute(j);
direccion = a.getDirectionAttribute();
tipo = a.getType();
if(direccion == Attribute.INPUT){
if(tipo != Attribute.NOMINAL && !inst.getInputMissingValues(in)){
X[i][j] = new String(String.valueOf(inst.getInputRealValues(in)));
} else{
if(!inst.getInputMissingValues(in))
X[i][j] = inst.getInputNominalValues(in);
else{
allNull = true;
timesSeen[j] = new FreqList();
if(tipo != Attribute.NOMINAL){
mean = 0.0;
totalN = 0;
for(int m = 0;m < nneigh;m++){
if(N[m]!=-1){
Instance inst2 = IS.getInstance(N[m]);
if(!inst2.getInputMissingValues(in)){
mean += inst2.getInputRealValues(in);
totalN++;
allNull = false;
}
}
}
if(!allNull){
mean = mean / (double)totalN;
if(tipo == Attribute.INTEGER)
mean = new Double(mean+0.5).intValue();
mean = this.boundValueToAttributeLimits(mean, a);
X[i][j] = new String(String.valueOf(mean));
}
else
//if no option left, lets take the nearest neighbor with a valid attribute value
X[i][j] = String.valueOf(nearestValidNeighbor(inst, in).getInputRealValues(in));
}else{
for(int m = 0;m < nneigh;m++){
Instance inst2 = IS.getInstance(N[m]);
if(N[m]!=-1 && !inst2.getInputMissingValues(in)){
timesSeen[j].AddElement( inst2.getInputNominalValues(in));
}
}
if(timesSeen[j].totalElements!=0)
X[i][j] = new String(timesSeen[j].mostCommon().getValue()); //replace missing data
else
X[i][j] = nearestValidNeighbor(inst, in).getInputNominalValues(in);
}
}
}
in++;
} else{
if(direccion == Attribute.OUTPUT){
if(tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out)){
X[i][j] = new String(String.valueOf(inst.getOutputRealValues(out)));
} else{
if(!inst.getOutputMissingValues(out))
X[i][j] = inst.getOutputNominalValues(out);
else{
allNull = true;
timesSeen[j] = new FreqList();
if(tipo != Attribute.NOMINAL){
mean = 0.0;
totalN = 0;
for(int m = 0;m < nneigh;m++){
if(N[m]!=-1){
totalN++;
allNull = false;
Instance inst2 = IS.getInstance(N[m]);
mean += inst2.getOutputRealValues(out);
}
}
if(!allNull){
mean = mean / (double)totalN;
if(tipo == Attribute.INTEGER)
mean = new Double(mean+0.5).intValue();
X[i][j] = new String(String.valueOf(mean));
}
else
X[i][j] = new String("<null>");
}else{
for(int m = 0;m < nneigh;m++){
Instance inst2 = IS.getInstance(N[m]);
if(N[m]!=-1){
timesSeen[j].AddElement( inst2.getOutputNominalValues(out));
}
}
if(timesSeen[j].totalElements!=0)
X[i][j] = new String(timesSeen[j].mostCommon().getValue()); //replace missing data
else
X[i][j] = new String("<null>");
}
}
}
out++;
}
}
}
}
}catch (Exception e){
System.out.println("Dataset exception = " + e );
e.printStackTrace();
System.exit(-1);
}
write_results(output_test_name);
}
}
}