package ids.clustering.utils;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import ids.clustering.model.Distance;
import ids.clustering.model.Domain;
import ids.utils.CommonUtils;
import ids.utils.SearchResult;
import ids.utils.UniqueResult;
@SuppressWarnings("serial")
public class ClusterValidation implements Serializable {
private CommonUtils utils;
private ClusterUtils clusterUtils;
private boolean verbose;
/**
 * Creates a validator with console output switched off.
 */
public ClusterValidation() {
    this(false);
}
/**
 * Creates a validator and hands the same verbosity flag to both helper objects.
 * @param vr - when true, the computed indices are also printed to standard output
 */
public ClusterValidation(boolean vr) {
    this.verbose = vr;
    this.utils = new CommonUtils(vr);
    this.clusterUtils = new ClusterUtils(vr);
}
/**
 * Computes the requested validation index for a clustered domain.
 * @param d - clustered domain; its data, membership vector, centroids, cluster count
 *            and distance measure are read
 * @param m - ground truth labels, only used by the NMI index
 * @param vi - validation index to compute
 * @return value of the index, or 0 for an index that is not implemented yet
 */
public double Validate(Domain d, int[] m, ValidationIndex vi) {
    switch (vi) {
    case DB:
        return findDBIndex(d.data, d.idx, d.centroids, d.k, d.distance);
    case NMI:
        return findNMI(d.idx, m);
    default:
        //TODO finish for other validation indices
        return 0;
    }
}
/* EXTERNAL */
/**
 * Finds the normalized mutual information (NMI) between every column of X and y.
 * <p>
 * A mode selects how the distribution of a variable is estimated: "unique" takes the
 * relative frequency of each distinct value (getDiscretePDF), "bin" takes a histogram
 * (getHistBin). Mode names are matched without regard to case. For each column the mutual
 * information is accumulated in bits and divided by sqrt(findH(px)*findH(py)). That
 * division is not guarded, so a zero product yields NaN or an infinite value.
 * @param X - input matrix, one row per sample and one column per feature
 * @param y - second variable, one value per row of X
 * @param modeX - "unique" or "bin", applied to every column of X
 * @param modeY - "unique" or "bin", applied to y
 * @return NMI for each column of X; null if X has no rows, y has a different length
 *         or a mode is not supported
 */
public double[] findNMI(double[][] X, double[] y, String modeX, String modeY) {
    int n = X.length;
    if (n == 0) return null;
    if (n != y.length) return null;
    int dim = X[0].length;
    // Parse the modes once. equalsIgnoreCase does not depend on the default locale
    // and simply reports "no match" for a null mode.
    boolean uniqueY = "unique".equalsIgnoreCase(modeY);
    boolean binY = "bin".equalsIgnoreCase(modeY);
    boolean uniqueX = "unique".equalsIgnoreCase(modeX);
    boolean binX = "bin".equalsIgnoreCase(modeX);
    if (!uniqueY && !binY) return null;
    // initialization
    double[] mu = new double[dim];
    double log2 = Math.log(2.0);
    // pdf of Y
    CoordinatesPair cpY = uniqueY ? getDiscretePDF(y) : getHistBin(y);
    double[] yi = cpY.x_double;
    double[] py = cpY.y_double;
    int npointsY = yi.length;
    double hy = findH(py);
    // The bin width is only used in "bin" mode. Reading yi[1] unconditionally would
    // fail in "unique" mode when y holds a single distinct value.
    double stepY = (binY && npointsY > 1) ? yi[1] - yi[0] : 0.0;
    // for each dimension
    for (int i = 0; i < dim; i++) {
        double[] x = utils.getColumn(X, i);
        // checked per column so that a matrix without columns still returns an empty result
        if (!uniqueX && !binX) return null;
        CoordinatesPair cpX = uniqueX ? getDiscretePDF(x) : getHistBin(x);
        double[] xi = cpX.x_double;
        double[] px = cpX.y_double;
        int npointsX = xi.length;
        double hx = findH(px);
        double stepX = (binX && npointsX > 1) ? xi[1] - xi[0] : 0.0;
        // joint probability of every (x point, y point) pair
        for (int k = 0; k < npointsX; k++) {
            for (int j = 0; j < npointsY; j++) {
                boolean[] a = uniqueX
                        ? utils.getIndexByValue(x, xi[k])
                        : utils.findValueBetweenAandB(x, xi[k], xi[k] + stepX);
                boolean[] b = uniqueY
                        ? utils.getIndexByValue(y, yi[j])
                        : utils.findValueBetweenAandB(y, yi[j], yi[j] + stepY);
                double pxy = 1.0*utils.findSum(utils.findAandB(a, b))/n;
                // empty cells contribute nothing and would otherwise produce log(0)
                if (pxy > 0 && px[k] != 0 && py[j] != 0) {
                    mu[i] += pxy*Math.log(pxy/px[k]/py[j])/log2;
                }
            }
        }
        // normalized
        mu[i] = mu[i]/Math.sqrt(hy*hx);
    }
    if (verbose) {
        System.out.println("Normalized Mutual Information per feature");
        utils.printVector(mu);
    }
    return mu;
}
/**
 * Finds the normalized mutual information (NMI) between two vectors.
 * <p>
 * A mode selects how the distribution of a variable is estimated: "unique" takes the
 * relative frequency of each distinct value (getDiscretePDF), "bin" takes a histogram
 * (getHistBin). Mode names are matched without regard to case. The mutual information is
 * accumulated in bits and divided by sqrt(findH(px)*findH(py)). That division is not
 * guarded, so a zero product yields NaN or an infinite value.
 * @param x - first input vector
 * @param y - second input vector, same length as x
 * @param modeX - "unique" or "bin", applied to x
 * @param modeY - "unique" or "bin", applied to y
 * @return NMI of x and y; -1.0 if x is empty, the lengths differ or a mode is not supported
 */
public double findNMI(double[] x, double[] y, String modeX, String modeY) {
    int n = x.length;
    if (n == 0) return -1.0;
    if (n != y.length) return -1.0;
    // Parse the modes once. equalsIgnoreCase does not depend on the default locale
    // and simply reports "no match" for a null mode.
    boolean uniqueY = "unique".equalsIgnoreCase(modeY);
    boolean binY = "bin".equalsIgnoreCase(modeY);
    boolean uniqueX = "unique".equalsIgnoreCase(modeX);
    boolean binX = "bin".equalsIgnoreCase(modeX);
    if (!uniqueY && !binY) return -1.0;
    if (!uniqueX && !binX) return -1.0;
    // initialization
    double mu = 0;
    double log2 = Math.log(2.0);
    // pdf of Y
    CoordinatesPair cpY = uniqueY ? getDiscretePDF(y) : getHistBin(y);
    double[] yi = cpY.x_double;
    double[] py = cpY.y_double;
    int npointsY = yi.length;
    double hy = findH(py);
    // pdf of X
    CoordinatesPair cpX = uniqueX ? getDiscretePDF(x) : getHistBin(x);
    double[] xi = cpX.x_double;
    double[] px = cpX.y_double;
    int npointsX = xi.length;
    double hx = findH(px);
    // The bin width is only used in "bin" mode. Reading element 1 unconditionally would
    // fail in "unique" mode when a vector holds a single distinct value.
    double stepX = (binX && npointsX > 1) ? xi[1] - xi[0] : 0.0;
    double stepY = (binY && npointsY > 1) ? yi[1] - yi[0] : 0.0;
    // joint probability of every (x point, y point) pair
    for (int k = 0; k < npointsX; k++) {
        for (int j = 0; j < npointsY; j++) {
            boolean[] a = uniqueX
                    ? utils.getIndexByValue(x, xi[k])
                    : utils.findValueBetweenAandB(x, xi[k], xi[k] + stepX);
            boolean[] b = uniqueY
                    ? utils.getIndexByValue(y, yi[j])
                    : utils.findValueBetweenAandB(y, yi[j], yi[j] + stepY);
            double pxy = 1.0*utils.findSum(utils.findAandB(a, b))/n;
            // empty cells contribute nothing and would otherwise produce log(0)
            if (pxy > 0 && px[k] != 0 && py[j] != 0) {
                mu += pxy*Math.log(pxy/px[k]/py[j])/log2;
            }
        }
    }
    // normalized
    mu = mu/Math.sqrt(hy*hx);
    if (verbose) System.out.printf("Normalized Mutual Information: %5.4f\n", mu);
    return mu;
}
/**
 * Finds the normalized mutual information (NMI) of two integer label vectors.
 * <p>
 * Both distributions are the relative frequencies of the distinct labels. The mutual
 * information is accumulated in bits and divided by sqrt(findH(px)*findH(py)); the
 * division is not guarded against a zero product.
 * @param x - first label vector
 * @param y - second label vector, same length as x
 * @return NMI of x and y; -1.0 if x is empty or the lengths differ
 */
public double findNMI(int[] x, int[] y) {
    final int total = x.length;
    if (total == 0 || total != y.length) return -1.0;
    // distribution and entropy of y
    CoordinatesPair pdfY = getDiscretePDF(y);
    int[] labelsY = pdfY.x_int;
    double[] probY = pdfY.y_double;
    double entropyY = findH(probY);
    // distribution and entropy of x
    CoordinatesPair pdfX = getDiscretePDF(x);
    int[] labelsX = pdfX.x_int;
    double[] probX = pdfX.y_double;
    double entropyX = findH(probX);
    // accumulate the mutual information over every pair of labels
    double info = 0;
    for (int r = 0; r < labelsX.length; r++) {
        for (int c = 0; c < labelsY.length; c++) {
            boolean[] inX = utils.getIndexByValue(x, labelsX[r]);
            boolean[] inY = utils.getIndexByValue(y, labelsY[c]);
            // share of samples carrying both labels
            double joint = 1.0*utils.findSum(utils.findAandB(inX, inY))/total;
            boolean contributes = joint > 0 && probX[r] != 0 && probY[c] != 0;
            if (contributes) {
                info += joint*Math.log(joint/probX[r]/probY[c])/Math.log(2.0);
            }
        }
    }
    // normalized
    info = info/Math.sqrt(entropyY*entropyX);
    // output
    if (verbose) System.out.printf("Normalized Mutual Information: %5.4f\n", info);
    return info;
}
/**
 * Finds the Shannon entropy, in bits, of a discrete distribution.
 * @param py - probabilities; entries that are not positive are skipped
 * @return -sum(p*log2(p)) over the positive entries, 0 for an empty input
 */
private double findH(double[] py) {
    int n = py.length;
    if (n == 0) return .0;
    double h = 0;
    // Each term is subtracted, so the result is the entropy itself rather than its negative,
    // in line with the int[] overload. Callers in this class only use the product of two
    // entropies, which is the same for either sign.
    for (int i = 0; i < n; i++) {
        if (py[i] > 0) h -= py[i]*Math.log(py[i])/Math.log(2.0);
    }
    return h;
}
/**
 * Integer variant of the entropy sum.
 * @param py - integer values; entries that are not positive are skipped
 * @return -sum(v*log2(v)) over the positive entries, 0 for an empty input
 */
private double findH(int[] py) {
    if (py.length == 0) return .0;
    double acc = 0;
    for (int value : py) {
        if (value > 0) {
            acc += value*Math.log(value)/Math.log(2.0);
        }
    }
    return -1.0*acc;
}
/**
 * Estimates the distribution of y with a histogram.
 * The bin positions are 100 points produced by utils.linspace between the smallest and the
 * largest value of y; the counts from utils.histc are divided by their sum.
 * @param y - input vector
 * @return bin positions (x_double) and their relative frequencies (y_double)
 */
private CoordinatesPair getHistBin(double[] y) {
    SearchResult<Double> lowest = utils.getMinValue(y);
    SearchResult<Double> highest = utils.getMaxValue(y);
    double[] edges = utils.linspace(lowest.getValue(), highest.getValue(), 100);
    int[] counts = utils.histc(y, edges);
    double total = utils.findSum(counts);
    double[] share = new double[edges.length];
    int b = 0;
    while (b < edges.length) {
        share[b] = counts[b]/total;
        b++;
    }
    return new CoordinatesPair(edges, share);
}
/**
 * Estimates the distribution of y from its distinct values.
 * @param y - input vector
 * @return distinct values (x_double) and their relative frequencies (y_double),
 *         or null for an empty input
 */
private CoordinatesPair getDiscretePDF(double[] y) {
    final int total = y.length;
    if (total == 0) return null;
    UniqueResult<Double> distinct = utils.findUnique(y);
    final int count = distinct.frequency.length;
    double[] values = new double[count];
    double[] share = new double[count];
    for (int v = 0; v < count; v++) {
        values[v] = (double)distinct.domain[v];
        // multiply by 1.0 first so the division is done in floating point
        share[v] = 1.0*distinct.frequency[v]/total;
    }
    return new CoordinatesPair(values, share);
}
/**
 * Estimates the distribution of an integer vector from its distinct values.
 * @param y - input vector
 * @return distinct values (x_int) and their relative frequencies (y_double),
 *         or null for an empty input
 */
private CoordinatesPair getDiscretePDF(int[] y) {
    final int total = y.length;
    if (total == 0) return null;
    UniqueResult<Integer> distinct = utils.findUnique(y);
    final int count = distinct.frequency.length;
    int[] values = new int[count];
    double[] share = new double[count];
    for (int v = 0; v < count; v++) {
        values[v] = (int)distinct.domain[v];
        // multiply by 1.0 first so the division is done in floating point
        share[v] = 1.0*distinct.frequency[v]/total;
    }
    return new CoordinatesPair(values, share);
}
private class CoordinatesPair {
public double[] x_double;
public double[] y_double;
public int[] x_int;
public CoordinatesPair(double[] x_, double[] y_) {
this.x_double = x_;
this.y_double = y_;
}
public CoordinatesPair(int[] x_, double[] y_) {
this.x_int = x_;
this.y_double = y_;
}
}
/* INTERNAL */
/**
 * Finds the Silhouette index of every point from pre-computed distances.
 * For a point, a is the average distance to the other members of its own cluster and b is
 * the smallest average distance to the members of any other cluster; the index is
 * (b - a) / max(a, b).
 * @param data - input data set nxd (only the number of rows is used here)
 * @param idx - input membership vector nx1, clusters are addressed as 0..k-1
 * @param k - number of clusters
 * @param pd - pre-computed distances, in the layout expected by utils.getDistance(j, pd, n)
 * @return Silhouette index for each point nx1, or null for an empty data set
 */
public double[] findSilhouette(double[][] data, int[] idx, int k, float[] pd) {
    int n = data.length;
    if (n == 0) return null;
    // store silhouette index here
    double[] s = new double[n];
    // average distance from the j-th point to the points of every other cluster,
    // by default should be Infinity
    double[][] avgd_between = new double[n][k];
    utils.fillMatrix(avgd_between, Double.POSITIVE_INFINITY);
    // average distance from the j-th point to the other points in its own cluster
    double[] avgd_within = new double[n];
    // cluster membership does not depend on the point, so look it up once per cluster
    List<Set<Integer>> clusters = new ArrayList<Set<Integer>>(k);
    for (int i = 0; i < k; i++) clusters.add(utils.getIndicesByValue(idx, i));
    // for every point in the data set
    for (int j = 0; j < n; j++) {
        double[] distj = utils.getDistance(j, pd, n);
        // compute average distance by cluster number
        for (int i = 0; i < k; i++) {
            Set<Integer> members = clusters.get(i);
            // an empty cluster has no average distance (0/0 would be NaN): keep Infinity
            if (members.isEmpty()) continue;
            double distj_sum = 0;
            for (Integer m : members) distj_sum += distj[m];
            if (i == idx[j]) {
                // the point itself is excluded from its own cluster's average
                avgd_within[j] = distj_sum / Math.max(members.size()-1,1);
            } else {
                avgd_between[j][i] = distj_sum / members.size();
            }
        }
    }
    // Calculate the silhouette values
    double[] minAvgDBetween = utils.getMinValueInRows(avgd_between);
    for (int j = 0; j < n; j++) {
        s[j] = (minAvgDBetween[j] - avgd_within[j]) / Math.max(avgd_within[j], minAvgDBetween[j]);
    }
    if (verbose) System.out.printf("Silhouette Index: %5.4f\n", utils.findMean(s));
    return s;
}
/**
 * Finds the Silhouette index of every point, measuring distances on the fly.
 * For a point, a is the average distance to the other members of its own cluster and b is
 * the smallest average distance to the members of any other cluster; the index is
 * (b - a) / max(a, b).
 * @param data - input data set nxd
 * @param idx - input membership vector nx1, clusters are addressed as 0..k-1
 * @param k - number of clusters
 * @param distance - distance measure
 * @return Silhouette index for each point nx1, or null for an empty data set
 */
public double[] findSilhouette(double[][] data, int[] idx, int k, Distance distance) {
    int n = data.length;
    if (n == 0) return null;
    // store silhouette index here
    double[] s = new double[n];
    // average distance from the j-th point to the points of every other cluster,
    // by default should be Infinity
    double[][] avgd_between = new double[n][k];
    utils.fillMatrix(avgd_between, Double.POSITIVE_INFINITY);
    // average distance from the j-th point to the other points in its own cluster
    double[] avgd_within = new double[n];
    // cluster membership does not depend on the point, so look it up once per cluster
    List<Set<Integer>> clusters = new ArrayList<Set<Integer>>(k);
    for (int i = 0; i < k; i++) clusters.add(utils.getIndicesByValue(idx, i));
    // for every point in the data set
    for (int j = 0; j < n; j++) {
        double[] distj = utils.getDistance(data, data[j], distance);
        // compute average distance by cluster number
        for (int i = 0; i < k; i++) {
            Set<Integer> members = clusters.get(i);
            // an empty cluster has no average distance (0/0 would be NaN): keep Infinity
            if (members.isEmpty()) continue;
            double distj_sum = 0;
            for (Integer m : members) distj_sum += distj[m];
            if (i == idx[j]) {
                // the point itself is excluded from its own cluster's average
                avgd_within[j] = distj_sum / Math.max(members.size()-1,1);
            } else {
                avgd_between[j][i] = distj_sum / members.size();
            }
        }
    }
    // Calculate the silhouette values
    double[] minAvgDBetween = utils.getMinValueInRows(avgd_between);
    for (int j = 0; j < n; j++) {
        s[j] = (minAvgDBetween[j] - avgd_within[j]) / Math.max(avgd_within[j], minAvgDBetween[j]);
    }
    if (verbose) System.out.printf("Silhouette Index: %5.4f\n", utils.findMean(s));
    return s;
}
/**
 * Finds Dunn`s index of the membership vector, measuring all distances first.
 * @param data - input data
 * @param idx - input membership vector
 * @param k - number of clusters
 * @param ds - distance measure
 * @return Dunn`s index as computed by the distance-vector overload, 0 for an empty data set
 */
public double findDunnIndex(double[][] data, int[] idx, int k, Distance ds) {
    if (data.length == 0) return 0;
    // distances between all points are computed once and reused by the overload
    return findDunnIndex(data, idx, k, utils.getDistance(data, ds));
}
/**
 * Finds Dunn`s index of the membership vector from pre-computed distances.
 * The index is the smallest distance between two clusters divided by the largest
 * cluster diameter.
 * @param data - input data (only the number of rows is used here)
 * @param idx - input membership vector
 * @param k - number of clusters
 * @param pd_dist - distance vector
 * @return Dunn`s index, 0 for an empty data set, -1 if the largest diameter is 0
 */
public double findDunnIndex(double[][] data, int[] idx, int k, float[] pd_dist) {
    if (data.length == 0) return 0;
    // diameter of every cluster
    double[] diameters = new double[k];
    for (int c = 0; c < k; c++) {
        diameters[c] = getClusterDiameter(pd_dist, idx, c);
        if (verbose) System.out.printf("Cluster diameter: diam[%d] = %5.4f\n", c, diameters[c]);
    }
    // largest diameter, used to scale the distances between clusters
    SearchResult<Double> widestResult = utils.getMaxValue(diameters);
    double widest = widestResult.getValue();
    if (widest == 0) {
        System.out.printf("Maximum cluster diameter is 0!\n");
        return -1;
    }
    if (verbose) System.out.printf("Maximum cluster diameter is %5.4f\n", widest);
    // scaled distance between every pair of clusters; the diagonal is set to the largest
    // double so that it never wins the minimum search
    double[][] gap = new double[k][k];
    for (int a = 0; a < k; a++) {
        gap[a][a] = Double.MAX_VALUE;
        for (int b = a+1; b < k; b++) {
            double scaled = getDistanceBetweenClusters(pd_dist, idx, a, b)/widest;
            gap[a][b] = scaled;
            gap[b][a] = scaled;
        }
    }
    if (verbose) {
        System.out.println("Distance between the clusters:");
        utils.printMatrix(gap);
    }
    // Dunn`s index is the smallest scaled distance
    return utils.getMinValue(gap);
}
/**
 * Finds the smallest distance between a point of one cluster and a point of another.
 * @param pd - distance vector
 * @param idx - membership vector
 * @param index_i - index of the first cluster
 * @param index_j - index of the second cluster
 * @return distance between the clusters; -1.0 for an empty membership vector and
 *         Double.MAX_VALUE if no pair of points was compared
 */
private double getDistanceBetweenClusters(float[] pd, int[] idx, int index_i, int index_j) {
    final int total = idx.length;
    if (total == 0) return -1.0;
    // points that belong to each of the two clusters
    Set<Integer> first = utils.getIndicesByValue(idx, index_i);
    Set<Integer> second = utils.getIndicesByValue(idx, index_j);
    double closest = Double.MAX_VALUE;
    for (Integer p : first) {
        for (Integer q : second) {
            double dist = utils.getDistanceFromDistanceVector(pd, p, q, total);
            closest = dist < closest ? dist : closest;
        }
    }
    return closest;
}
/**
 * Finds cluster diameter, which is the longest distance within cluster.
 * @param pd - distance vector
 * @param idx - membership vector
 * @param i - index of the cluster
 * @return cluster`s diameter, 0 if no distance within the cluster is larger than 0
 */
private double getClusterDiameter(float[] pd, int[] idx, int i) {
    // Start from 0 rather than Double.MIN_VALUE: that constant is the smallest positive
    // double, not the most negative one, so an empty or zero-diameter cluster would report
    // a tiny positive diameter and the "maximum diameter is 0" check in findDunnIndex
    // could never trigger.
    double cluster_d = 0.0;
    int n = idx.length;
    // get cluster data
    Set<Integer> cluster_data_index = utils.getIndicesByValue(idx, i);
    for (Integer s1 : cluster_data_index) {
        for (Integer s2 : cluster_data_index) {
            double d = utils.getDistanceFromDistanceVector(pd, s1, s2, n);
            if (d > cluster_d) cluster_d = d;
        }
    }
    return cluster_d;
}
/**
 * Finds Davies-Bouldin index of the membership vector.
 * For every pair of clusters the sum of their dispersions is divided by the distance
 * between their centroids; the result is built from the largest ratios.
 * @param data - input data
 * @param idx - input membership vector
 * @param centroid - cluster`s centroids, one row per cluster
 * @param k - number of clusters
 * @param ds - distance measure
 * @return Davies-Bouldin index, 0 for an empty data set
 */
public double findDBIndex(double[][] data, int[] idx, double[][] centroid, int k, Distance ds) {
    if (data.length == 0) return 0;
    // mean distance between the members of a cluster and its centroid
    double[] scatter = new double[k];
    // distance between the centroids of two clusters
    double[][] separation = new double[k][k];
    for (int a = 0; a < k; a++) {
        SearchResult<Double> members = clusterUtils.getClusterData(data, idx, a);
        scatter[a] = findDispersion(members.getData(), centroid[a], ds);
        for (int b = a+1; b < k; b++) {
            double between = utils.getDistance(centroid[a], centroid[b], ds);
            separation[a][b] = between;
            separation[b][a] = between;
        }
    }
    // similarity of every pair of clusters
    double[][] ratio = new double[k][k];
    for (int a = 0; a < k; a++) {
        for (int b = a+1; b < k; b++) {
            // Double.MIN_VALUE keeps the divisor above zero when two centroids coincide
            double value = (scatter[a] + scatter[b])/(separation[a][b] + Double.MIN_VALUE);
            ratio[a][b] = value;
            ratio[b][a] = value;
        }
    }
    // presumably getMaxValue(ratio, 2) is the maximum of each row - confirm in CommonUtils
    return utils.findMean(utils.getMaxValue(ratio, 2));
}
/**
 * Finds the mean distance between the rows of data_i and the given centroid.
 * @param data_i - points of one cluster
 * @param centoid - centroid of that cluster
 * @param ds - distance measure
 * @return mean of the distances returned by utils.getDistance
 */
private double findDispersion(double[][] data_i, double[] centoid, Distance ds) {
    return utils.findMean(utils.getDistance(data_i, centoid, ds));
}
}