package ids.clustering.utils;
import ids.clustering.model.Clusters;
import ids.clustering.model.Distance;
import ids.utils.CommonUtils;
import ids.utils.HungarianAlgorithm;
import ids.utils.SearchResult;
import ids.utils.UniqueResult;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
@SuppressWarnings("serial")
public class ClusterUtils implements Serializable {
private boolean verbose;
private Logger log;
private CommonUtils utils;
public ClusterUtils(boolean verbose) {
this.verbose = verbose;
if (verbose) log = Logger.getLogger(getClass().getName());
utils = new CommonUtils(verbose);
}
/**
 * Computes the value of the clustering objective function: the sum, over all points, of the
 * distance between the point and the centroid of the cluster it is assigned to.
 *
 * @param data input data matrix, one point per row
 * @param centroids cluster centroids, indexed by cluster label
 * @param idx cluster memberships; {@code idx[i]} is used as an index into {@code centroids}
 * @param distance type of distance measure handed to the distance helper
 * @return the accumulated distance; {@code 0} when {@code data} has no rows; {@code -1.0} when
 *         {@code data}, {@code centroids} or {@code idx} is {@code null}, or when {@code idx}
 *         and {@code data} differ in length
 */
public double getKMeansObjectiveFunction(double[][] data, double[][] centroids, int[] idx, Distance distance) {
    // error check
    if (data == null) {
        return -1.0;
    }
    int n = data.length;
    if (n == 0) {
        return 0;
    }
    // a missing argument is reported with the same sentinel as a missing data matrix
    if (centroids == null || idx == null) {
        System.out.println("The centroids and the membership vector must not be null.");
        return -1.0;
    }
    if (idx.length != n) {
        System.out.println("The length of the membership vector is different that number of points in the data set.");
        return -1.0;
    }
    // find objective function
    double res = 0;
    for (int i = 0; i < n; i++) {
        res += utils.getDistance(data[i], centroids[idx[i]], distance);
    }
    return res;
}
/**
 * Generates initial cluster centroids by selecting {@code k} rows of the input data.
 * The row indices come from {@code CommonUtils.getRandomPermutation(n, k)}
 * (presumably k distinct random indices in [0, n) - confirm against that helper).
 *
 * @param data input data, one point per row
 * @param k number of centroids to generate
 * @return a {@code k}-row matrix whose rows are copies of the selected data rows, or
 *         {@code null} when {@code data} is {@code null} or has no rows
 */
public double[][] generateRandomClusterCentroids(double[][] data, int k) {
    if (data == null) {
        return null;
    }
    int n = data.length;
    if (n == 0) {
        return null;
    }
    // only the outer array is allocated here; every row is filled in below
    double[][] centroids = new double[k][];
    int[] points = utils.getRandomPermutation(n, k);
    for (int i = 0; i < k; i++) {
        // copy the row so that later changes to a centroid cannot modify the caller's data
        centroids[i] = data[points[i]].clone();
    }
    return centroids;
}
/**
 * Finds, for every centroid, the index of the data point with the smallest distance to it.
 *
 * @param data input data, one point per row
 * @param centroids cluster centroids, one per row
 * @param distance distance measure handed to the distance helper
 * @return array of length {@code centroids.length}; element {@code i} is the index in
 *         {@code data} reported by the minimum search for centroid {@code i}
 */
public int[] getClusterCentroidsIndices(double[][] data, double[][] centroids, Distance distance) {
    int k = centroids.length;
    int[] indices = new int[k];
    for (int i = 0; i < k; i++) {
        // distances from every data point to centroid i
        double[] pd = utils.getDistance(data, centroids[i], distance);
        SearchResult<Double> sr = utils.getMinValue(pd);
        indices[i] = sr.getIndex();
        if (verbose) {
            System.out.printf("%d centroid:\n", i);
            // the formatted vectors have to be printed explicitly; Arrays.toString only builds the text
            System.out.println(Arrays.toString(centroids[i]));
            System.out.printf("Close point %d\n", indices[i]);
            System.out.println(Arrays.toString(data[indices[i]]));
        }
    }
    return indices;
}
/**
 * Looks up centroids by their row indices in the data matrix.
 * The selection itself is delegated to {@code CommonUtils.getRows}.
 *
 * @param data input data, one point per row
 * @param centroidsIndices row indices of the points acting as centroids
 * @return whatever {@code getRows} produces for the given indices
 */
public double[][] getClusterCentroidsByIndices(double[][] data, int[] centroidsIndices) {
    final double[][] selectedRows = utils.getRows(data, centroidsIndices);
    return selectedRows;
}
/**
 * Convenience overload that derives the number of points and the dimensionality from
 * {@code data} and forwards to the six-argument variant.
 *
 * @param data input data, one point per row
 * @param idx cluster membership of every point
 * @param k number of clusters
 * @param distance distance measure that determines how a centroid is computed
 * @return the computed clusters, or {@code null} when {@code data} has no rows
 */
public Clusters getClusterCentoids(double[][] data, int[] idx, int k, Distance distance) {
    final int rows = data.length;
    if (rows != 0) {
        // dimensionality is taken from the first row
        final int columns = data[0].length;
        return getClusterCentoids(data, rows, columns, idx, k, distance);
    }
    return null;
}
/**
 * Computes the centroid and the size of each of the {@code k} clusters described by
 * {@code idx}.
 * <p>
 * The centroid depends on the distance measure: the mean vector for SQEUCLIDEAN and
 * EUCLIDEAN, the mean vector passed through {@code CommonUtils.findNorm} for COSINE, and the
 * per-feature most frequent value for MATCH. For any other measure a message is printed and
 * the centroid is left as a zero vector.
 * <p>
 * If some cluster has no points, the work done so far is discarded: {@code k} data rows are
 * picked through {@code CommonUtils.getRandomPermutation}, used as centroids, and the
 * cluster sizes are taken from the memberships recomputed for those centroids.
 *
 * @param data input data, one point per row
 * @param n number of points; not read by this implementation, which uses {@code data.length}
 * @param dim number of features per point
 * @param idx cluster membership of every point, with labels in {@code [0, k)}
 * @param k number of clusters
 * @param distance distance measure that selects the centroid rule
 * @return the centroids together with the cluster sizes
 */
public Clusters getClusterCentoids(double[][] data, int n, int dim, int[] idx, int k, Distance distance) {
    double[][] centroids = new double[k][dim];
    int[] clusterSizes = new int[k];
    boolean generateNewClusters = false;
    // for each cluster
    for (int i = 0; i < k; i++) {
        // get the points which belong to cluster i
        Set<Integer> cp = utils.getIndicesByValue(idx, i);
        int clusterSize = cp.size();
        if (clusterSize == 0) { // sometimes this happens, especially with the k-modes algorithm
            generateNewClusters = true;
            break;
        }
        clusterSizes[i] = clusterSize;
        // rows are references into data; they are only read while the centroid is computed
        double[][] clusterData = new double[clusterSize][];
        int row = 0;
        for (Integer j : cp) {
            clusterData[row] = data[j];
            row++;
        }
        // find centroids
        if (distance == Distance.SQEUCLIDEAN || distance == Distance.EUCLIDEAN) {
            centroids[i] = utils.findMeanVector(clusterData);
        } else if (distance == Distance.COSINE) {
            // same as before but the centroid has to be normalized
            double[] mean = utils.findMeanVector(clusterData);
            centroids[i] = utils.findNorm(mean, dim);
        } else if (distance == Distance.MATCH) {
            centroids[i] = findColumnModes(clusterData, dim);
        } else {
            System.out.println("Unknow distance");
            if (verbose) log.severe("Unknown distance");
        }
    }
    // safety trigger: an empty cluster was found, start again from randomly chosen points
    if (generateNewClusters) {
        int[] indices = utils.getRandomPermutation(data.length, k);
        for (int i = 0; i < k; i++) {
            // copy the row so that the returned centroids do not share storage with data
            centroids[i] = data[indices[i]].clone();
        }
        // cluster sizes follow from the memberships induced by the new centroids
        int[] newIdx = getClusterMemberships(data, centroids, distance);
        for (int i = 0; i < k; i++) {
            clusterSizes[i] = utils.getIndicesByValue(newIdx, i).size();
        }
    }
    return new Clusters(centroids, clusterSizes);
}

/** Returns, for each of the first {@code dim} columns of {@code rows}, the most frequent value. */
private double[] findColumnModes(double[][] rows, int dim) {
    double[] modes = new double[dim];
    for (int j = 0; j < dim; j++) { // for every feature
        double[] column = new double[rows.length];
        for (int p = 0; p < rows.length; p++) {
            column[p] = rows[p][j];
        }
        // find all unique values and their frequencies, then take the most frequent one
        UniqueResult<Double> ur = utils.findUnique(column);
        SearchResult<Integer> sr = utils.getMaxValue(ur.frequency);
        modes[j] = ur.domain[sr.getIndex()];
    }
    return modes;
}
/**
 * Extracts the part of the data that belongs to cluster {@code i}.
 *
 * @param data input data, one point per row; must contain at least one row
 * @param idx cluster membership of every point
 * @param i label of the requested cluster
 * @return a {@code SearchResult} built from the rows of the cluster and the list of their
 *         indices in {@code data}; both are {@code null} when the cluster has no points
 */
public SearchResult<Double> getClusterData(double[][] data, int[] idx, int i) {
    final int width = data[0].length;
    // indices of the points assigned to cluster i
    final Set<Integer> members = utils.getIndicesByValue(idx, i);
    final int count = members.size();
    double[][] rows = null;
    List<Integer> memberList = null;
    if (count > 0) {
        memberList = new ArrayList<Integer>(members);
        rows = new double[count][width];
        int next = 0;
        for (Integer pointIndex : members) {
            rows[next++] = data[pointIndex];
        }
    }
    return new SearchResult<Double>(rows, memberList);
}
/**
 * Assigns every point to the centroid with the smallest distance to it.
 * When several centroids are equally close the one with the lowest index wins, because a
 * candidate only replaces the current best when it is strictly closer.
 * In verbose mode the sum of the winning distances is logged.
 *
 * @param data input data, one point per row
 * @param centroids cluster centroids, one per row
 * @param distance distance measure handed to the distance helper
 * @return array with one centroid index per point
 */
public int[] getClusterMemberships(double[][] data, double[][] centroids, Distance distance) {
    final int numPoints = data.length;
    final int numCentroids = centroids.length;
    final int[] memberships = new int[numPoints];
    double total = 0;
    for (int point = 0; point < numPoints; point++) {
        int nearest = 0;
        double nearestDistance = Double.MAX_VALUE;
        // scan all centroids, remembering the closest one seen so far
        for (int c = 0; c < numCentroids; c++) {
            final double candidate = utils.getDistance(data[point], centroids[c], distance);
            if (candidate < nearestDistance) {
                nearest = c;
                nearestDistance = candidate;
            }
        }
        memberships[point] = nearest;
        total += nearestDistance;
    }
    if (verbose) {
        log.info("The objective function is " + total);
    }
    return memberships;
}
/**
 * Returns the Jaccard coefficients between the clusters of membership vector {@code idx} and
 * the classes of class distribution {@code class_idx}: for cluster {@code i} and class
 * {@code j} the number of points in both, divided by the number of points in either.
 * Note: the function assumes that cluster and class labels start from 0, i.e. that they are
 * {@code 0..numClusters-1} and {@code 0..numClasses-1}.
 *
 * @param idx cluster membership vector
 * @param class_idx class distribution
 * @param numClusters number of clusters
 * @param numClasses number of classes
 * @return {@code numClusters x numClasses} matrix of Jaccard coefficients; an entry whose
 *         cluster and class are both empty is set to {@code Double.MIN_VALUE}
 */
public double[][] findJaccardIndex(int[] idx, int[] class_idx, int numClusters, int numClasses) {
    double[][] res = new double[numClusters][numClasses];
    if (numClusters == 0 || numClasses == 0) {
        return res;
    }
    // class masks and class sizes do not depend on the cluster, so they are computed once
    boolean[][] classMasks = new boolean[numClasses][];
    int[] classSizes = new int[numClasses];
    for (int j = 0; j < numClasses; j++) {
        classMasks[j] = utils.getIndexByValue(class_idx, j);
        classSizes[j] = utils.findSum(classMasks[j]);
    }
    for (int i = 0; i < numClusters; i++) {
        boolean[] a = utils.getIndexByValue(idx, i);
        int clusterSize = utils.findSum(a);
        for (int j = 0; j < numClasses; j++) {
            boolean[] ab = utils.findAandB(a, classMasks[j]);
            int sum_ab = utils.findSum(ab);
            // size of the union: |A| + |B| - |A and B|
            double div = clusterSize + classSizes[j] - sum_ab;
            if (div != 0) {
                res[i][j] = sum_ab / div;
            } else {
                System.out.println("ClusterUtils: findJaccardIndex: Divizion by Zero!");
                // NOTE(review): Double.MIN_VALUE is the smallest positive double, not a
                // negative number - confirm this is the intended sentinel
                res[i][j] = Double.MIN_VALUE;
            }
        }
    }
    return res;
}
/**
 * Returns the Jaccard coefficients between the clusters of membership vector {@code idx} and
 * the classes of class distribution {@code class_idx}.
 * The cluster and class labels are discovered automatically with
 * {@code CommonUtils.findUnique}; rows and columns of the result follow the order of the
 * returned domains. In verbose mode the matrix is printed to stdout.
 *
 * @param idx cluster membership vector
 * @param class_idx class distribution
 * @return matrix of Jaccard coefficients with one row per distinct cluster label and one
 *         column per distinct class label; an entry with an empty union is set to
 *         {@code Double.POSITIVE_INFINITY}
 */
public double[][] findJaccardIndex(int[] idx, int[] class_idx) {
    // find domain for membership vector
    UniqueResult<Integer> idxUR = utils.findUnique(idx);
    int numClusters = idxUR.domain.length;
    // find domain for class distribution
    UniqueResult<Integer> class_idxUR = utils.findUnique(class_idx);
    int numClasses = class_idxUR.domain.length;
    double[][] res = new double[numClusters][numClasses];
    // class masks and class sizes do not depend on the cluster, so they are computed once
    boolean[][] classMasks = new boolean[numClasses][];
    int[] classSizes = new int[numClasses];
    for (int col = 0; col < numClasses; col++) {
        classMasks[col] = utils.getIndexByValue(class_idx, class_idxUR.domain[col]);
        classSizes[col] = utils.findSum(classMasks[col]);
    }
    // run
    for (int row = 0; row < numClusters; row++) {
        boolean[] a = utils.getIndexByValue(idx, idxUR.domain[row]);
        int clusterSize = utils.findSum(a);
        for (int col = 0; col < numClasses; col++) {
            boolean[] ab = utils.findAandB(a, classMasks[col]);
            int sum_ab = utils.findSum(ab);
            // size of the union: |A| + |B| - |A and B|
            double div = clusterSize + classSizes[col] - sum_ab;
            if (div != 0) {
                res[row][col] = sum_ab / div;
            } else {
                System.out.println("ClusterUtils: findJaccardIndex: Divizion by Zero!");
                res[row][col] = Double.POSITIVE_INFINITY;
            }
        }
    }
    // output
    if (verbose) {
        System.out.println("Jaccard coefficient:");
        // print header
        System.out.printf("\t");
        for (int j = 0; j < numClasses; j++) System.out.printf("%d\t", class_idxUR.domain[j]);
        System.out.printf("\n");
        // print Jaccard matrix
        for (int i = 0; i < numClusters; i++) {
            System.out.printf("%d.\t", idxUR.domain[i]);
            for (int j = 0; j < numClasses; j++) {
                System.out.printf("%5.4f\t", res[i][j]);
            }
            System.out.printf("\n");
        }
    }
    return res;
}
/**
 * Finds the correspondence between the clusters of membership vector {@code idx1} and those
 * of membership vector {@code idx2}, and renames the cluster labels of {@code idx2} so that
 * matched clusters carry the label used in {@code idx1}.
 * <p>
 * Clusters are matched by handing the Jaccard matrix of the two vectors to
 * {@code HungarianAlgorithm.hgAlgorithm} in "max" mode; each row of the returned match matrix
 * is read as a pair (position in the idx1 domain, position in the idx2 domain).
 *
 * @param idx1 reference membership vector
 * @param idx2 membership vector to relabel; it is not modified
 * @return a relabeled copy of {@code idx2}, or {@code null} when the vectors differ in length
 *         or in number of distinct labels
 */
public int[] findClusterCorrespondence(int[] idx1, int[] idx2) {
    int n = idx1.length;
    if (n != idx2.length) {
        System.out.println("Cluster Utils: findClusterCorrespondence: idx1 and idx2 has to have same number of elements");
        return null;
    }
    // find unique values in idx1 and idx2
    UniqueResult<Integer> ur_idx1 = utils.findUnique(idx1);
    UniqueResult<Integer> ur_idx2 = utils.findUnique(idx2);
    if (ur_idx1.domain.length != ur_idx2.domain.length) {
        System.out.println("Cluster Utils: findClusterCorrespondence: idx1 and idx2 has different number of clusters");
        return null;
    }
    // find Jaccard coefficient
    double[][] J = findJaccardIndex(idx1, idx2);
    // use Hungarian method - use any kind of Hungarian algorithm
    HungarianAlgorithm h = new HungarianAlgorithm();
    @SuppressWarnings("static-access")
    int[][] match = h.hgAlgorithm(J, "max");
    // print match matrix
    if (verbose) {
        System.out.println("Hunguarian Cost Matrix");
        utils.printMatrix(match);
    }
    // relabel the idx2
    if (verbose) System.out.println("Class correspondents matrix:");
    // start from idx2 itself: labels that already agree with their match must keep the
    // idx2 value at every position, and only mismatching labels are rewritten below
    int[] target = idx2.clone();
    for (int i = 0; i < match.length; i++) {
        int idx1_index = ur_idx1.domain[match[i][0]];
        int idx2_index = ur_idx2.domain[match[i][1]];
        if (verbose) System.out.printf("%d\t%d", idx1_index, idx2_index);
        if (idx1_index != idx2_index) {
            if (verbose) System.out.printf("\t(will replace all %d in idx2 by %d)", idx2_index, idx1_index);
            // the mask is taken from the untouched idx2, so earlier renames cannot cascade
            boolean[] index = utils.getIndexByValue(idx2, idx2_index);
            for (int j = 0; j < n; j++) {
                if (index[j]) target[j] = idx1_index;
            }
        }
        if (verbose) System.out.printf("\n");
    }
    if (verbose) {
        int v_tt = 15;
        if (target.length < v_tt) v_tt = target.length;
        System.out.printf("Printing first %d elements of input and output arrays\n", v_tt);
        System.out.printf("IDX1\tIDX2\tFINAL IDX\n");
        for (int i = 0; i < v_tt; i++) System.out.printf("%d\t%d\t%d\n", idx1[i], idx2[i], target[i]);
    }
    return target;
}
// converters
/**
 * Converts a data matrix into a membership vector with labels that start from 0.
 * Only the first column of {@code data} is read; each value is truncated to {@code int} and
 * then replaced by its position in the domain returned by {@code CommonUtils.findUnique}
 * (the order of that domain is defined by the helper).
 *
 * @param data data matrix whose first column holds the cluster labels
 * @return int[] idx - membership vector with one label per row, or {@code null} when
 *         {@code data} is {@code null} or has no rows
 */
public int[] convertDataMatrixToIDX(double[][] data) {
    if (data == null) {
        return null;
    }
    int n = data.length;
    if (n == 0) {
        return null;
    }
    // convert from double[] to int[]
    int[] m = new int[n];
    for (int i = 0; i < n; i++) {
        m[i] = (int) data[i][0];
    }
    // make sure the cluster index starts from 0: map every distinct value to its position
    UniqueResult<Integer> ur = utils.findUnique(m);
    Integer[] domain = ur.domain;
    Map<Integer, Integer> positionOf = new HashMap<Integer, Integer>();
    for (int i = 0; i < domain.length; i++) {
        positionOf.put(domain[i], i);
    }
    // single pass over the points instead of one full scan per distinct value
    int[] idx = new int[n];
    for (int j = 0; j < n; j++) {
        Integer position = positionOf.get(m[j]);
        if (position != null) {
            idx[j] = position;
        }
    }
    return idx;
}
}