ClusterUtils.java example

Explorer
phd_code-master
- src
  - ids
package ids.clustering.utils;

import ids.clustering.model.Clusters;
import ids.clustering.model.Distance;
import ids.utils.CommonUtils;
import ids.utils.HungarianAlgorithm;
import ids.utils.SearchResult;
import ids.utils.UniqueResult;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.logging.Logger;

@SuppressWarnings("serial")
public class ClusterUtils implements Serializable {

	private boolean verbose;
	private Logger log;
	private CommonUtils utils;
	
	public ClusterUtils(boolean verbose) {
		this.verbose = verbose;
		if (verbose) log = Logger.getLogger(getClass().getName());
		utils = new CommonUtils(verbose);
	}
	
	/**
	 * Finds value of the objective function
	 * @param data - input data matrix
	 * @param centroids - data centroids
	 * @param idx - cluster memberships
	 * @param distance - type of distance measure
	 */
	public double getKMeansObjectiveFunction(double[][] data, double[][] centroids, int[] idx, Distance distance) {
		// error check
		if (data == null) return -1.0;
		int n = data.length;
		if (n == 0) return 0;
		
		if (idx.length != n) {
			System.out.println("The length of the membership vector is different that number of points in the data set.");
			return -1.0;
		}
		
		// find objective function
		double res = 0;
		for (int i = 0; i < n; i++) res = res + utils.getDistance(data[i], centroids[idx[i]], distance);
		return res;
	}
	
	/**
	 * Randomly generates cluster centroids
	 * @param data - input data
	 * @param k - number of centroids to generate
	 * @return
	 */
	public double[][] generateRandomClusterCentroids(double[][] data, int k) {
		if (data == null) return null;
		int n = data.length;
		if (n == 0) return null;
		int m = data[0].length;
		
		// get number of points
		double[][] centroids = new double[k][m];
		int[] points = utils.getRandomPermutation(n, k);
		for (int i = 0; i<k; i++) {
			centroids[i] = data[points[i]];
		}
		
		return centroids;
	}
	
	/**
	 * Returns cluster centroid`s indices
	 * @param data - input data
	 * @param centroids - double[][] centroids
	 * @param distance - Distance measure
	 * @return indices
	 */
	public int[] getClusterCentroidsIndices(double[][] data, double[][] centroids, Distance distance) {
		int k = centroids.length;
		int[] indices = new int[k];
		for (int i = 0; i < k; i++) {
			double[] pd = utils.getDistance(data, centroids[i], distance);
			SearchResult<Double> sr = utils.getMinValue(pd);
			indices[i] = sr.getIndex();
			
			if (verbose) {
				System.out.printf("%d centroid:\n", i);
				Arrays.toString(centroids[i]);
				System.out.printf("Close point %d\n", indices[i]);
				Arrays.toString(data[indices[i]]);
			}
		}
		return indices;
	}
	
	public double[][] getClusterCentroidsByIndices(double[][] data, int[] centroidsIndices) {
		return utils.getRows(data, centroidsIndices);
	}
	
	public Clusters getClusterCentoids(double[][] data, int[] idx, int k, Distance distance) {
		int n = data.length;
		if (n == 0) return null;
		int dim = data[0].length;
		return getClusterCentoids(data, n, dim, idx, k, distance);
	}
	public Clusters getClusterCentoids(double[][] data, int n, int dim, int[] idx, int k, Distance distance) {
		double[][] centroids = new double[k][dim];
		int[] clusterSizes = new int[k];
		boolean generateNewClusters = false;
		
		// for each cluster
		for (int i = 0; i < k; i++) {
			// get the data, which belong to cluster i
			Set<Integer> cp = utils.getIndicesByValue(idx, i);
			int number_points_i = cp.size();
			
			// save trigger
			if (number_points_i == 0) { // sometimes this happens specially with k-modes algorithm
				generateNewClusters = true;
				break; // exit from the for loop
			}
			
			// save cluster size
			clusterSizes[i] = number_points_i;
			double[][] data_i = new double[number_points_i][dim];
			int index = 0;
			for (Integer j : cp) {
				data_i[index] = data[j];
				index++;
			}
			
			// find centroids
			if (distance==Distance.SQEUCLIDEAN) {
				centroids[i] = utils.findMeanVector(data_i); 
			} else if (distance==Distance.EUCLIDEAN) {
				centroids[i] = utils.findMeanVector(data_i);
			} else if (distance==Distance.COSINE) {
				// same as before but we need to normalize the centroids
				double[] temp = utils.findMeanVector(data_i);
				centroids[i] = utils.findNorm(temp, dim);
			} else if (distance == Distance.MATCH) {
				// find the mode of the data_i
				for (int j = 0; j < dim; j++) { // for every feature
					double[] col_vector = new double[number_points_i];
					
					if (number_points_i==0) System.out.println("0 - Points");
					
					for (int p = 0; p < number_points_i; p++) col_vector[p] = data_i[p][j];
					// find all unique value and their frequencies
					UniqueResult<Double> ur = utils.findUnique(col_vector);
					SearchResult<Integer> sr = utils.getMaxValue(ur.frequency);
					centroids[i][j] = ur.domain[sr.getIndex()];
				}
			} else {
				System.out.println("Unknow distance");
				if (verbose) log.severe("Unknown distance");
			}
		}		
		
		// safety trigger
		if (generateNewClusters) {
			// randomly generate k centroids and find its cluster size
			int[] indices = utils.getRandomPermutation(data.length, k);
			// get centroids
			for (int i = 0; i < k; i++) {
				centroids[i] = data[indices[i]];
			}
			// get membership
			int[] n_idx = getClusterMemberships(data, centroids, distance);
			for (int i = 0; i < k; i++) {
				Set<Integer> cp = utils.getIndicesByValue(n_idx, i);
				clusterSizes[i] = cp.size();
			}
		}
		
		return new Clusters(centroids, clusterSizes);
	}
	
	// return the data part that belongs to the current cluster
	public SearchResult<Double> getClusterData(double[][] data, int[] idx, int i) {
		int dim = data[0].length;		
		// get the data, which belong to cluster i
		Set<Integer> cp = utils.getIndicesByValue(idx, i);
		int number_points_i = cp.size();
		List<Integer> points_i = null;
		double[][] data_i = null;
		// if there are points
		if (number_points_i > 0) {
			points_i = new ArrayList<Integer>(cp);
			data_i = new double[number_points_i][dim];
			int index = 0;
			for (Integer j : cp) {
				data_i[index] = data[j];
				index++;
			}
		}
		return new SearchResult<Double>(data_i, points_i);
	}
	
	// cluster membership
	public int[] getClusterMemberships(double[][] data, double[][] centroids, Distance distance) {
		int n = data.length;
		//int dim = data[0].length;
		int k = centroids.length;
		int[] idx = new int[n];
		double objF = 0;
		
		for (int i = 0; i<n; i++) {
			double f_min = Double.MAX_VALUE;
			int j_min = 0;
			
			// assign points to clusters
			for (int j=0; j<k; j++) {
				double f = utils.getDistance(data[i], centroids[j], distance);
				if (f<f_min) {
					f_min = f;
					j_min = j;
				}
			}
			idx[i] = j_min;
			objF += f_min;
		}
		
		if (verbose) log.info("The objective function is " + objF);
		return idx;
	}
	
	/**
	 * Returns Jaccard Coefficient between membership vector idx and class distribution class_idx.
	 * Note: Function assumes that cluster and class indices start from index 0
	 * @param idx - cluster membership vector
	 * @param class_idx - class distribution
	 * @param numClusters - number of clusters
	 * @param numClasses - number of classes
	 * @return matrix of Jaccard Coefficients
	 */
	public double[][] findJaccardIndex(int[] idx, int[] class_idx, int numClusters, int numClasses) {
		double[][] res = new double[numClusters][numClasses];
		for (int i = 0; i < numClusters; i++) {
			boolean[] a = utils.getIndexByValue(idx, i);
			for (int j = 0; j < numClasses; j++) {
				boolean[] b = utils.getIndexByValue(class_idx, j);
				boolean[] ab = utils.findAandB(a, b);
				int sum_ab = utils.findSum(ab);
				double div = (utils.findSum(a) + utils.findSum(b) - sum_ab);
				if (div != 0) {
					res[i][j] = sum_ab/div;
				} else {
					System.out.println("ClusterUtils: findJaccardIndex: Divizion by Zero!");
					res[i][j] = Double.MIN_VALUE;
				}
			}
		}
		return res;
	}
	
	/**
	 * Returns Jaccard Coefficient between membership vector idx and class distribution class_idx.
	 * Function automatically find number of clusters and classes
	 * @param idx - cluster membership vector
	 * @param class_idx - class distribution
	 * @param numClusters - number of clusters
	 * @param numClasses - number of classes
	 * @return matrix of Jaccard Coefficients
	 */
	public double[][] findJaccardIndex(int[] idx, int[] class_idx) {
		// find domain for membership vector
		UniqueResult<Integer> idxUR = utils.findUnique(idx);
		int numClusters = idxUR.domain.length;
		
		// find domain for class distribution
		UniqueResult<Integer> class_idxUR = utils.findUnique(class_idx);
		int numClasses = class_idxUR.domain.length;
		
		double[][] res = new double[numClusters][numClasses];
		int row = -1;
		int col = -1;
		
		// run
		for (Integer i : idxUR.domain) {
			col = -1;
			row++;
			boolean[] a = utils.getIndexByValue(idx, i);
			for (Integer j : class_idxUR.domain) {
				col++;
				boolean[] b = utils.getIndexByValue(class_idx, j);
				boolean[] ab = utils.findAandB(a, b);
				int sum_ab = utils.findSum(ab);
				double div = (utils.findSum(a) + utils.findSum(b) - sum_ab);
				if (div != 0) {
					res[row][col] = sum_ab/div;
				} else {
					System.out.println("ClusterUtils: findJaccardIndex: Divizion by Zero!");
					res[row][col] = Double.POSITIVE_INFINITY;
				}
			}
		}
		
		// output
		if (verbose) {
			System.out.println("Jaccard coefficient:");
			// print header
			System.out.printf("\t");
			for (int j = 0; j < numClasses; j++) System.out.printf("%d\t", class_idxUR.domain[j]);
			System.out.printf("\n");
			// print Jaccard matrix
			for (int i = 0; i < numClusters; i++) {
				System.out.printf("%d.\t", idxUR.domain[i]);
				for (int j = 0; j < numClasses; j++) {
					System.out.printf("%5.4f\t", res[i][j]);
				}
				System.out.printf("\n");
			}
		}		
		
		return res;
	}
	
	/**
	 * Finds cluster correspondence between cluster membership idx1 and cluster membership idx2 and renames cluster labels in
	 * idx2 to the same as in idx1
	 * @param idx1
	 * @param idx2
	 * @return renamed idx2 (with the same cluster indices as in idx1)
	 */
	public int[] findClusterCorrespondence(int[] idx1, int[] idx2) {
		int n = idx1.length;
		if (n != idx2.length) {
			System.out.println("Cluster Utils: findClusterCorrespondence: idx1 and idx2 has to have same number of elements");
			return null;
		}
		
		// find unique values in idx1 and idx2
		UniqueResult<Integer> ur_idx1 = utils.findUnique(idx1);
		UniqueResult<Integer> ur_idx2 = utils.findUnique(idx2);
		if (ur_idx1.domain.length != ur_idx2.domain.length) {
			System.out.println("Cluster Utils: findClusterCorrespondence: idx1 and idx2 has different number of clusters");
			return null;
		}
		
		// find Jaccard coefficient
		double[][] J = findJaccardIndex(idx1, idx2);
		
		// use Hungarian method - use any kind of Hungarian algorithm
		HungarianAlgorithm h = new HungarianAlgorithm();
		@SuppressWarnings("static-access")
		int[][] match = h.hgAlgorithm(J, "max");
		
		// print match matrix
		if (verbose) {
			System.out.println("Hunguarian Cost Matrix");
			utils.printMatrix(match);
		}
		
		// relabel the idx2
		if (verbose) System.out.println("Class correspondents matrix:");
		int[] target = idx1.clone();
		for (int i = 0; i < match.length; i++) {
			int idx1_index = ur_idx1.domain[match[i][0]];
			int idx2_index = ur_idx2.domain[match[i][1]];
			if (verbose) System.out.printf("%d\t%d", idx1_index, idx2_index);			
			if (idx1_index != idx2_index) {
				if (verbose) System.out.printf("\t(will replace all %d in idx2 by %d)", idx2_index, idx1_index);
				boolean[] index = utils.getIndexByValue(idx2, idx2_index);
				for (int j = 0; j < n; j++) {
					if (index[j]) target[j] = idx1_index;
				}
			}
			if (verbose) System.out.printf("\n");
		}
		
		if (verbose) {
			int v_tt = 15;
			if (target.length < v_tt) v_tt = target.length;
			System.out.printf("Printing first %d elements of input and output arrays\n", v_tt);
			System.out.printf("IDX1\tIDX2\tFINAL IDX\n");
			for (int i = 0; i < v_tt; i++) System.out.printf("%d\t%d\t%d\n", idx1[i], idx2[i], target[i]);
		}
		
		return target;
	}

	// converters
	/**
	 * Converts data matrix nxdim to nx1 membership vector
	 * @return int[] idx - membership vector
	 */
	public int[] convertDataMatrixToIDX(double[][] data) {
		int n = data.length;
		if (n==0) return null;
		
		// convert from double[] to int[]
		int[] m = new int[n];
		for (int i = 0; i < n; i++) m[i] = (int)data[i][0];
		
		// make sure the cluster index starts from 0
		UniqueResult<Integer> ur = utils.findUnique(m);
		Integer[] domain = ur.domain;
		int[] idx = new int[n];
		for (int i = 0; i < domain.length; i++) {
			boolean[] index = utils.getIndexByValue(m, domain[i]);
			for (int j = 0; j < n; j++) {
				if (index[j]) idx[j] = i;
			}
		}
		return idx;
	}
}