package edu.cmu.sphinx.decoder.adaptation;
import java.util.ArrayList;
import java.util.Random;
import org.apache.commons.math3.util.FastMath;
import edu.cmu.sphinx.linguist.acoustic.tiedstate.Loader;
import edu.cmu.sphinx.linguist.acoustic.tiedstate.Pool;
/**
 * Groups the gaussians of an acoustic model into a fixed number of classes.
 * The mean vectors obtained from {@link Loader#getMeansPool()} are clustered
 * with the "k-means" algorithm, using the Euclidean distance as criterion.
 * The clustering is computed once, in the constructor.
 *
 * @author Bogdan Petcu
 */
public class ClusteredDensityFileData {

    /** Upper bound on the number of k-means refinement passes. */
    private static final int MAX_ITERATIONS = 30;

    private final int numberOfClusters;

    /** Cluster index assigned to each entry of the means pool. */
    private final int[] correspondingClass;

    /**
     * Clusters the means provided by the loader.
     *
     * @param loader
     *            source of the means pool to be clustered
     * @param numberOfClusters
     *            number of clusters to build, must be at least 1
     * @throws IllegalArgumentException
     *             if numberOfClusters is not positive or the means pool is
     *             empty
     */
    public ClusteredDensityFileData(Loader loader, int numberOfClusters) {
        if (numberOfClusters < 1) {
            throw new IllegalArgumentException(
                    "numberOfClusters must be positive, got "
                            + numberOfClusters);
        }
        this.numberOfClusters = numberOfClusters;
        this.correspondingClass = kMeansClustering(loader.getMeansPool(),
                numberOfClusters, MAX_ITERATIONS);
    }

    public int getNumberOfClusters() {
        return this.numberOfClusters;
    }

    /**
     * Used for accessing the index that is specific to a gaussian.
     *
     * @param gaussian
     *            provided in a i * numStates + gaussianIndex form.
     * @return class index
     */
    public int getClassIndex(int gaussian) {
        return correspondingClass[gaussian];
    }

    /**
     * Computes euclidean distance between 2 n-dimensional points.
     *
     * @param a
     *            - n-dimensional "a" point
     * @param b
     *            - n-dimensional "b" point
     * @return the euclidean distance between a and b.
     */
    private static float euclideanDistance(float[] a, float[] b) {
        double sum = 0;
        for (int i = 0; i < a.length; i++) {
            double diff = a[i] - b[i];
            sum += diff * diff;
        }
        return (float) FastMath.sqrt(sum);
    }

    /**
     * Checks if the two float arrays have the same components.
     *
     * @param a
     *            - float array a
     * @param b
     *            - float array b
     * @return true if values from a are equal to the ones in b, else false.
     */
    private static boolean isEqual(float[] a, float[] b) {
        if (a.length != b.length) {
            return false;
        }
        for (int i = 0; i < a.length; i++) {
            // Primitive comparison on purpose: 0.0f equals -0.0f and a NaN
            // component never compares equal.
            if (a[i] != b[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Finds the centroid that is closest to the given point.
     *
     * @param centroids
     *            current centroids, must not be empty
     * @param point
     *            vector to classify
     * @return index of the nearest centroid; ties go to the lowest index
     */
    private static int nearestCentroid(ArrayList<float[]> centroids,
            float[] point) {
        int nearest = 0;
        float min = euclideanDistance(centroids.get(0), point);
        for (int c = 1; c < centroids.size(); c++) {
            float distance = euclideanDistance(centroids.get(c), point);
            if (distance < min) {
                min = distance;
                nearest = c;
            }
        }
        return nearest;
    }

    /**
     * Performs k-means-clustering algorithm for clustering gaussians.
     * Clustering is done using euclidean distance criterion. Iteration stops
     * as soon as no centroid changes or after maxIterations passes.
     *
     * The initial centroids are drawn at random from the pool with
     * replacement, so the same vector may seed more than one cluster. A
     * cluster that receives no vectors in a pass keeps its previous centroid.
     *
     * @param means
     *            pool of mean vectors to cluster
     * @param numberOfClusters
     *            number of clusters to build, at least 1
     * @param maxIterations
     *            maximum number of refinement passes
     * @return for every entry of the pool, the index of its cluster
     * @throws IllegalArgumentException
     *             if the pool is empty
     */
    private static int[] kMeansClustering(Pool<float[]> means,
            int numberOfClusters, int maxIterations) {
        int numberOfElements = means.size();
        if (numberOfElements == 0) {
            throw new IllegalArgumentException(
                    "cannot cluster an empty means pool");
        }
        // The centroid size is taken from the first vector of the pool.
        int dimension = means.get(0).length;

        // Allocated up front so that a result exists even when no pass runs.
        int[] assignment = new int[numberOfElements];

        ArrayList<float[]> centroids = new ArrayList<float[]>(numberOfClusters);
        Random randomGenerator = new Random();
        for (int c = 0; c < numberOfClusters; c++) {
            centroids.add(means.get(randomGenerator.nextInt(numberOfElements)));
        }

        boolean converged = false;
        for (int pass = 0; pass < maxIterations && !converged; pass++) {
            // Running per-cluster sums replace a per-cluster copy of every
            // assigned vector; only k * dimension floats are needed per pass.
            float[][] sums = new float[numberOfClusters][dimension];
            int[] count = new int[numberOfClusters];

            for (int i = 0; i < numberOfElements; i++) {
                float[] point = means.get(i);
                int nearest = nearestCentroid(centroids, point);
                assignment[i] = nearest;
                count[nearest]++;
                float[] sum = sums[nearest];
                for (int k = 0; k < dimension; k++) {
                    sum[k] += point[k];
                }
            }

            converged = true;
            for (int c = 0; c < numberOfClusters; c++) {
                if (count[c] == 0) {
                    // Empty cluster: its centroid is left untouched.
                    continue;
                }
                float[] updated = sums[c];
                for (int k = 0; k < dimension; k++) {
                    updated[k] /= count[c];
                }
                if (!isEqual(updated, centroids.get(c))) {
                    converged = false;
                }
                centroids.set(c, updated);
            }
        }
        return assignment;
    }
}