/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. S�nchez (luciano@uniovi.es)
J. Alcal�-Fdez (jalcala@decsai.ugr.es)
S. Garc�a (sglopez@ujaen.es)
A. Fern�ndez (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
/**
*
* File: KNN.java
*
* The KNN algorithm tries to find the K nearest instances in the
* training data, selecting the most present class.
*
* Euclidean (L2), Manhattan (L1) and HVDM distances can be used as
* distance function by the classifier.
*
*
* @author Written by Joaqu�n Derrac (University of Granada) 13/11/2008
* @author Modified by Joaqu�n Derrac (University of Granada) 10/18/2008
* @version 1.1
* @since JDK1.5
*
*/
package keel.Algorithms.Lazy_Learning.KNN;
import keel.Algorithms.Lazy_Learning.LazyAlgorithm;
import keel.Dataset.Attribute;
import org.core.Files;
import java.util.StringTokenizer;
public class KNN extends LazyAlgorithm{
//Parameters
private int k;
private int distanceType;
//Constants
private static final int MANHATTAN = 1;
private static final int EUCLIDEAN = 2;
private static final int HVDM = 3 ;
//Additional structures
private double stdDev [];
private double nominalDistance [][][];
/**
* The main method of the class
*
* @param script Name of the configuration script
*
*/
public KNN (String script) {
readDataFiles(script);
//Naming the algorithm
name="KNN";
//Initialization of auxiliary structures
if(distanceType==HVDM){
stdDev= new double [inputAtt];
calculateHVDM();
}
//Initialization stuff ends here. So, we can start time-counting
setInitialTime();
} //end-method
/**
* Reads configuration script, to extract the parameter's values.
*
* @param script Name of the configuration script
*
*/
protected void readParameters (String script) {
String file;
String line;
String type;
StringTokenizer fileLines, tokens;
file = Files.readFile (script);
fileLines = new StringTokenizer (file,"\n\r");
//Discard in/out files definition
fileLines.nextToken();
fileLines.nextToken();
fileLines.nextToken();
//Getting the number of neighbors
line = fileLines.nextToken();
tokens = new StringTokenizer (line, "=");
tokens.nextToken();
k = Integer.parseInt(tokens.nextToken().substring(1));
//Getting the type of distance function
line = fileLines.nextToken();
tokens = new StringTokenizer (line, "=");
tokens.nextToken();
type = tokens.nextToken().substring(1);
distanceType=EUCLIDEAN;
if(type.equalsIgnoreCase("MANHATTAN")){
distanceType=MANHATTAN;
}
if(type.equalsIgnoreCase("HVDM")){
distanceType=HVDM;
}
}//end-method
/**
* Evaluates a instance to predict its class.
*
* @param example Instance evaluated
* @return Class predicted
*
*/
protected int evaluate (double example[]) {
double minDist[];
int nearestN[];
int selectedClasses[];
double dist;
int prediction;
int predictionValue;
boolean stop;
nearestN = new int[k];
minDist = new double[k];
for (int i=0; i<k; i++) {
nearestN[i] = 0;
minDist[i] = Double.MAX_VALUE;
}
//KNN Method starts here
for (int i=0; i<trainData.length; i++) {
dist = distance(trainData[i],example);
if (dist > 0.0){ //leave-one-out
//see if it's nearer than our previous selected neighbors
stop=false;
for(int j=0;j<k && !stop;j++){
if (dist < minDist[j]) {
for (int l = k - 1; l >= j+1; l--) {
minDist[l] = minDist[l - 1];
nearestN[l] = nearestN[l - 1];
}
minDist[j] = dist;
nearestN[j] = i;
stop=true;
}
}
}
}
//we have check all the instances... see what is the most present class
selectedClasses= new int[nClasses];
for (int i=0; i<nClasses; i++) {
selectedClasses[i] = 0;
}
for (int i=0; i<k; i++) {
selectedClasses[trainOutput[nearestN[i]]]+=1;
}
prediction=0;
predictionValue=selectedClasses[0];
for (int i=1; i<nClasses; i++) {
if (predictionValue < selectedClasses[i]) {
predictionValue = selectedClasses[i];
prediction = i;
}
}
return prediction;
} //end-method
/**
* Computes the distance between two instances
*
* @param instance1 First instance
* @param instance2 Second instance
* @return Distance calculated
*
*/
private double distance(double instance1[],double instance2[]){
double dist=0.0;
switch (distanceType){
case HVDM: dist=HVDMDistance(instance1,instance2);
break;
case MANHATTAN:
dist=manhattanDistance(instance1,instance2);
break;
case EUCLIDEAN:
default:
dist=euclideanDistance(instance1,instance2);
break;
};
return dist;
} //end-method
/**
* Computes the distance matrixes for HVDM distance
*
*/
private void calculateHVDM(){
double mean;
double quad;
double VDM;
int Nax,Nay,Naxc,Nayc;
nominalDistance = new double[train.getAttributeDefinitions().getInputNumAttributes()][][];
for (int i=0; i<inputAtt; i++) {
//HVDM for numerical attributes uses the std dev of data
if (train.getAttributeDefinitions().getInputAttribute(i).getType() != Attribute.NOMINAL) {
mean = 0.0;
quad = 0.0;
for (int j=0; j<trainData.length; j++) {
mean += trainData[j][i];
quad += trainData[j][i]*trainData[j][i];
}
mean /= (double)trainData.length;
stdDev[i] = Math.sqrt((quad/((double)trainData.length)) - (mean*mean));
}
else{
nominalDistance[i] = new double[train.getAttributeDefinitions().getInputAttribute(i).getNumNominalValues()][train.getAttributeDefinitions().getInputAttribute(i).getNumNominalValues()];
for (int j=0; j<train.getAttributeDefinitions().getInputAttribute(i).getNumNominalValues(); j++) {
nominalDistance[i][j][j] = 0.0;
}
for (int j=0; j<train.getAttributeDefinitions().getInputAttribute(i).getNumNominalValues(); j++) {
for (int l=j+1; l<train.getAttributeDefinitions().getInputAttribute(i).getNumNominalValues(); l++) {
VDM = 0.0;
Nax = Nay = 0;
for (int m=0; m<trainData.length; m++) {
if (real2Nom(trainData[m][i],i) == j) {
Nax++;
}
if (real2Nom(trainData[m][i],i) == l) {
Nay++;
}
}
for (int m=0; m<nClasses; m++) {
Naxc = Nayc = 0;
for (int n=0; n<trainData.length; n++) {
if (real2Nom(trainData[n][i],i) == j && trainOutput[n] == m) {
Naxc++;
}
if (real2Nom(trainData[n][i],i) == l && trainOutput[n] == m) {
Nayc++;
}
}
VDM += (((double)Naxc / (double)Nax) - ((double)Nayc / (double)Nay)) * (((double)Naxc / (double)Nax) - ((double)Nayc / (double)Nay));
}
nominalDistance[i][j][l] = Math.sqrt(VDM);
nominalDistance[i][l][j] = Math.sqrt(VDM);
}
}
}//end-IF (nominal)
}
}//end-method
/**
* Computes the HVDM distance between two instances
*
* @param instance1 First instance
* @param instance2 Second instance
* @return The HVDM distance
*
*/
private double HVDMDistance(double [] instance1,double [] instance2){
double result=0.0;
for (int i=0; i<instance1.length; i++) {
if (train.getAttributeDefinitions().getInputAttribute(i).getType() == Attribute.NOMINAL) {
result += nominalDistance[i][real2Nom(instance1[i],i)][real2Nom(instance2[i],i)];
} else {
result += Math.abs(instance1[i]-instance2[i]) / (4.0*stdDev[i]);
}
}
result = Math.sqrt(result);
return result;
}//end-method
/**
* Converts a real value to its representation as Nominal in the data set
*
* @param real Real value
* @param att Attribute in the data set
*
* @return The HVDM distance
*
*/
private int real2Nom(double real,int att){
int result;
result=(int)(real*((train.getAttributeDefinitions().getInputAttribute(att).getNominalValuesList().size())-1));
return result;
}//end-method
} //end-class