/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. Sánchez (luciano@uniovi.es)
J. Alcalá-Fdez (jalcala@decsai.ugr.es)
S. García (sglopez@ujaen.es)
A. Fernández (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
/**
* <p>
* @author Written by Luciano Sanchez (University of Oviedo) 21/07/2008
* @author Modified by J.R. Villar (University of Oviedo) 19/12/2008
* @version 1.0
* @since JDK1.4
* </p>
*/
package keel.Algorithms.Clustering_Algorithms.ClusterKMeans;
import java.io.*;
import java.util.Random;
import java.util.StringTokenizer;
import java.util.Vector;
import keel.Algorithms.Shared.Parsing.*;
import org.core.*;
/**
* KMeans is a package-private class that clusters a dataset using the k-means
* clustering algorithm. The initial centroids are chosen randomly among the examples
* in the dataset. Each centroid is then updated as the mean value of its nearest
* examples in the dataset.
*
* @version 1.0
* @since JDK1.5
*/
class KMeans {
    // The dataset to be clustered: one row per example, one column per variable
    double train[][];
    // The desired number of clusters
    int clusters;
    // The centroids of the clusters for all variables: cclusters[cluster][variable]
    double cclusters[][];
    // Randomize object used in this class (overwritten by every constructor call)
    static Randomize rand;

    /**
     * <p>
     * KMeans constructor: the cluster centroids are obtained for the given dataset.
     * Firstly, each centroid is initialised as a copy of a randomly chosen example.
     * Then every example is assigned to its nearest centroid and the centroids are
     * updated as the mean value of their assigned examples. Assignment and updating
     * are repeated until a pass produces no change in the assignments.
     * One progress line per iteration is written to standard output.
     * </p>
     * @param X The dataset to be clustered (stored by reference, not copied)
     * @param nclusters The desired number of clusters
     * @param vrand The Randomize object to be used
     * @throws IllegalArgumentException if X is null or empty, or nclusters is less than 1
     */
    public KMeans(double [][]X, int nclusters, Randomize vrand) {
        if (X == null || X.length == 0) {
            throw new IllegalArgumentException("The dataset must contain at least one example");
        }
        if (nclusters < 1) {
            throw new IllegalArgumentException("The number of clusters must be at least 1: " + nclusters);
        }
        rand = vrand;
        train = X;
        clusters = nclusters;
        cclusters = new double[nclusters][X[0].length];

        // Initial centroids: copies of randomly selected examples.
        for (int i = 0; i < nclusters; i++) {
            // The clamp protects against Rand() returning exactly 1.0
            // NOTE(review): the range of Randomize.Rand() is not visible here - confirm.
            int pos = Math.min((int) (rand.Rand() * X.length), X.length - 1);
            System.arraycopy(X[pos], 0, cclusters[i], 0, cclusters[i].length);
        }

        // Index of the nearest centroid for each example.
        int[] assignment = new int[X.length];
        for (int i = 0; i < X.length; i++) {
            assignment[i] = nearestCentroid(X[i]);
        }
        centroidsUpdating(assignment);

        int cambios = 0, iteracion = 0;
        do {
            iteracion++;
            // Reports the reassignments counted in the previous pass (0 on the first one).
            System.out.println("Iter="+iteracion+" changes="+cambios);
            cambios = 0;
            for (int i = 0; i < X.length; i++) {
                int nearest = nearestCentroid(X[i]);
                if (nearest != assignment[i]) {
                    cambios++;
                    assignment[i] = nearest;
                }
            }
            centroidsUpdating(assignment);
        } while (cambios > 0);
    }

    /**
     * <p>
     * This method updates the centroids of the clusters as the mean value of the
     * examples assigned to each centroid.
     * The list of the nearest centroid to each example is given as an argument.
     * A cluster with no assigned examples keeps its previous centroid; dividing by
     * its zero count would turn the centroid into NaN and break later comparisons.
     * This method modifies cclusters.
     * </p>
     * @param C The list of the nearest centroid to each example in the dataset
     */
    private void centroidsUpdating(int C[]) {
        double[][] sums = new double[clusters][];
        int[] nejemplos = new int[clusters];
        for (int c = 0; c < clusters; c++) {
            sums[c] = new double[cclusters[c].length];
        }

        // Accumulate the examples of every cluster.
        for (int i = 0; i < C.length; i++) {
            double[] acc = sums[C[i]];
            for (int j = 0; j < acc.length; j++) {
                acc[j] += train[i][j];
            }
            nejemplos[C[i]]++;
        }

        for (int c = 0; c < clusters; c++) {
            if (nejemplos[c] == 0) {
                continue; // empty cluster: keep the previous centroid
            }
            for (int j = 0; j < cclusters[c].length; j++) {
                cclusters[c][j] = sums[c][j] / nejemplos[c];
            }
        }
    }

    /**
     * <p>
     * This private method computes the distance between an example in the dataset
     * and a cluster centroid.
     * The distance is measured as the square root of the sum of the squares of the
     * differences between the example and the centroid over all the dimensions of
     * the example.
     * </p>
     * @param a The example in the dataset
     * @param b The cluster centroid
     * @return The distance between a and b as a double precision float value.
     */
    private static double distance(double a[], double b[]) {
        // Euclidean distance between two patterns
        double d = 0;
        for (int i = 0; i < a.length; i++) {
            double diff = a[i] - b[i];
            d += diff * diff;
        }
        return Math.sqrt(d);
    }

    /**
     * <p>
     * This method determines the nearest cluster centroid for a given example.
     * The distance is measured by means of the private method distance. When two
     * centroids are equally near, the one with the lowest index is returned.
     * </p>
     * @param x The example to be classified
     * @return The index of the nearest cluster centroid as an integer value.
     */
    public int nearestCentroid(double x[]) {
        // A pattern is classified with respect to the cluster centroids
        int cmin = 0;
        double dmin = distance(x, cclusters[cmin]);
        for (int i = 1; i < cclusters.length; i++) {
            double dx = distance(x, cclusters[i]);
            if (dx < dmin) {
                dmin = dx;
                cmin = i;
            }
        }
        return cmin;
    }

    /**
     * <p>
     * This method prints the cluster centroids to standard output.
     * The number of clusters is printed first; then, for each feature, one line
     * with the value of that feature in every centroid. Nothing else is printed
     * when there are no centroids, and only a warning is printed when the
     * centroids do not all have the same number of features.
     * </p>
     */
    public void print() {
        System.out.println("Number of clusters: " + cclusters.length);
        if (cclusters.length == 0) return;

        // All centroids must have the same number of features as the first one.
        int features = cclusters[0].length;
        for (int i = 1; i < cclusters.length; i++) {
            if (cclusters[i].length != features) {
                System.out.println("Distinto número de atributos por cluster...");
                return;
            }
        }

        for (int i = 0; i < features; i++) {
            System.out.println("Feature: "+i+", number of clusters: "+cclusters.length);
            for (int j = 0; j < cclusters.length; j++) {
                System.out.print(""+ cclusters[j][i]+" ");
            }
            System.out.println();
        }
    }
}
/**
* <p>
* ClusterKMeans is a class that clusters a dataset using the k-means clustering
* algorithm. The initial centroids are chosen randomly among the examples in
* the dataset. Each centroid is updated as the mean value of its nearest
* examples in the dataset.
*
* The k-means algorithm is carried out by the KMeans class, while ClusterKMeans
* acts as the interface with the KEEL environment.
* </p>
*/
public class ClusterKMeans {
    // The random numbers generator used in this process (created in main)
    static Randomize rand;

    /**
     * <p>
     * This private method extracts the datasets and the method's parameters from
     * the KEEL environment, calculates the centroids from the training set using
     * the KMeans class, assigns every test example to its nearest centroid, and
     * hands the test examples and their clusters to the ProcessConfig object.
     * Finally the centroids are printed to standard output.
     * I/O failures are reported on standard error and are not propagated.
     * </p>
     * @param tty unused boolean parameter, kept for compatibility
     * @param pc ProcessConfig object to obtain the train and test datasets
     * and the method's parameters.
     */
    private void clustering_kmeans(boolean tty, ProcessConfig pc) {
        try {
            // Training set
            ProcessDataset pd = new ProcessDataset();
            String linea = (String) pc.parInputData.get(ProcessConfig.IndexTrain);
            if (pc.parNewFormat) pd.processClusterDataset(linea, true);
            else pd.procesa_clustering_old(linea);

            int ndatos = pd.getNdata();       // Number of examples
            int nentradas = pd.getNinputs();  // Number of inputs
            pd.showDatasetStatistics();
            System.out.println("Number of examples="+ndatos);
            System.out.println("Number of inputs="+nentradas);

            double[][] X = pd.getX();         // Input data
            KMeans KM = new KMeans(X, pc.parNClusters, rand);

            // Test set
            ProcessDataset pdt = new ProcessDataset();
            linea = (String) pc.parInputData.get(ProcessConfig.IndexTestKMeans);
            if (pc.parNewFormat) pdt.processClusterDataset(linea, false);
            else pdt.procesa_clustering_old(linea);

            int npentradas = pdt.getNinputs();
            pdt.showDatasetStatistics();
            // The centroids can only be compared with examples of the same width.
            if (npentradas != nentradas) throw new IOException("Error in test file");

            double[][] Xp = pdt.getX();
            int[] Co = new int[Xp.length];

            // Test set is classified; a failure is reported but the results
            // gathered so far are still written out below.
            try {
                for (int i = 0; i < Xp.length; i++) {
                    Co[i] = KM.nearestCentroid(Xp[i]);
                }
            } catch (Exception e) {
                System.out.println(e.toString());
            }

            // Output format for clustering algorithms
            pc.results(Xp, Co);
            KM.print();
        } catch (FileNotFoundException e) {
            System.err.println(e+" Training data not found");
        } catch (IOException e) {
            System.err.println(e+" Read error");
        }
    }

    /**
     * <p>
     * This public static method runs the algorithm that this class concerns with.
     * It reads the configuration file, seeds the random number generator with the
     * configured seed and launches the clustering. It returns without running
     * anything when no configuration file is given or when the configuration
     * file cannot be processed.
     * </p>
     * @param args Array of strings to send parameters to the main program. The
     * path of the algorithm's parameters file must be given as the first element.
     */
    public static void main(String args[]) {
        if (args == null || args.length < 1) {
            System.err.println("Usage: ClusterKMeans <configuration file>");
            return;
        }
        boolean tty = false;
        ProcessConfig pc = new ProcessConfig();
        System.out.println("Reading configuration file: "+args[0]);
        if (pc.fileProcess(args[0]) < 0) return;

        rand = new Randomize();
        rand.setSeed(pc.parSeed);
        ClusterKMeans km = new ClusterKMeans();
        km.clustering_kmeans(tty, pc);
    }
}