/*
	Copyright (c) 2009-2011 Speech Group at Informatik 5,
	Univ. Erlangen-Nuremberg, GERMANY
	Korbinian Riedhammer
	Tobias Bocklet

	This file is part of the Java Speech Toolkit (JSTK).

	The JSTK is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	The JSTK is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with the JSTK. If not, see <http://www.gnu.org/licenses/>.
*/
package de.fau.cs.jstk.stat;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import org.apache.log4j.Logger;

import de.fau.cs.jstk.exceptions.TrainingException;
import de.fau.cs.jstk.stat.Density.Flags;
import de.fau.cs.jstk.stat.MleDensityAccumulator.MleOptions;
import de.fau.cs.jstk.trans.PCA;
import de.fau.cs.jstk.util.Arithmetics;
import de.fau.cs.jstk.util.Distances;

/**
 * Collection of algorithms to initialize a codebook.
 * 
 * @author sikoried
 */
public abstract class Initialization {
	public static Logger logger = Logger.getLogger(Initialization.class);
	
	/**
	 * Compute a fast initialization for a mixture density. Available strategies
	 * are "sequential[_N]" (put the next N samples in the next cluster, rotate
	 * through the clusters as long as there are samples), "random[_N]" (as
	 * sequential, just with random cluster allocation), "uniform" (as sequential,
	 * however N is adjusted so that all data is distributed linearly over the
	 * clusters), and "uniform_N" (as uniform, but at most N samples are kept per
	 * cluster).
	 * @param data
	 * @param nd number of densities
	 * @param diagonalCovariances use diagonal covariances?
	 * @param method initialization method (see main doc)
	 * @return
	 * @throws TrainingException
	 */
	public static Mixture fastInit(List<Sample> data, int nd, boolean diagonalCovariances, String method) 
			throws TrainingException {
		boolean random = method.startsWith("random");
		
		// compute number of samples per chunk (default: 1)
		int n = 1;
		int max = Integer.MAX_VALUE;
		if (method.startsWith("sequential_"))
			n = Integer.parseInt(method.substring("sequential_".length()));
		else if (method.startsWith("random_"))
			n = Integer.parseInt(method.substring("random_".length()));
		else if (method.equals("uniform"))
			n = data.size() / nd;
		else if (method.startsWith("uniform_")) {
			n = data.size() / nd;
			max = Integer.parseInt(method.substring("uniform_".length()));
			System.err.println("maximally using " + max + " samples per cluster");
		}
		
		logger.info("Initialization.fastInit(): This will be a " + method + " initialization; chunk-size=" + n);
		
		// init the lists
		LinkedList<LinkedList<Sample>> lists = new LinkedList<LinkedList<Sample>>();
		for (int i = 0; i < nd; ++i)
			lists.add(new LinkedList<Sample>());
		
		// distribute the data
		int ndx = 0;
		int fd = data.get(0).x.length;
		while (data.size() > 0) {
			int actual = ndx % nd;
			
			if (random)
				actual = (int)(Math.random() * nd);
			
			for (int i = 0; i < n && data.size() > 0; ++i) {
				Sample sample = data.remove(0);
				if (i < max)
					lists.get(actual).add(sample);
			}
			
			ndx++;
		}
		
		// ensure that each list holds enough samples (at least 3) and count the total
		int norm = 0;
		for (int i = 0; i < nd; ++i) {
			if (lists.get(i).size() < 3)
				throw new TrainingException("Failed in partitioning data (there was a list with less than 3 samples)");
			else
				norm += lists.get(i).size();
		}
		
		logger.info("Initialization.fastInit(): average number of samples per cluster: " + ((double) norm / nd));
		
		// ML-estimate the components
		logger.info("Initialization.fastInit(): computing ML estimates...");
		Mixture estimate = new Mixture(fd, nd, diagonalCovariances);
		for (int i = 0; i < nd; ++i) {
			estimate.components[i] = Trainer.ml(lists.get(i), diagonalCovariances);
			estimate.components[i].apr = (double) lists.get(i).size() / norm;
		}
		
		return estimate;
	}
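	/*
	 * Illustrative sketch (not part of the original API): a possible call of
	 * fastInit for a 4-component diagonal-covariance mixture using the "uniform"
	 * strategy. Note that fastInit consumes the sample list, so a copy is passed
	 * in case the caller still needs the data afterwards.
	 */
	private static Mixture exampleFastInit(List<Sample> data) throws TrainingException {
		return fastInit(new LinkedList<Sample>(data), 4, true, "uniform");
	}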
	/**
	 * Produce an LBG clustering for the cached data by splitting the largest
	 * cluster first. The resulting Mixture has weights based on the cluster
	 * sizes.
	 * @param data
	 * @param numc number of clusters requested
	 * @param eps value to shift the cluster centroids on split, (1. +- eps)*c
	 * @param conv convergence value for centroid re-estimation (e.g. 0.0001)
	 * @param diag diagonal or full covariances
	 * @return
	 */
	public static Mixture lbg(List<Sample> data, int numc, double eps, double conv, boolean diag) 
			throws ClassNotFoundException {
		// dimension
		int dim = data.get(0).x.length;
		double scale = 1. / (data.size() * dim);
		
		// initial centroid (diagonal or full, depending on diag)
		Density c0 = Trainer.ml(data, diag);
		
		// compute diversity
		double d0 = 0.;
		for (Sample s : data)
			d0 += scale * Arithmetics.norm2(-1., s.x, c0.mue);
		
		ArrayList<Density> c = new ArrayList<Density>();
		c.add(c0);
		
		int [] occ = new int [] { data.size() };
		
		// iterate over the splitting procedure
		while (c.size() < numc) {
			// split the largest cluster
			int p = 0;
			int v = occ[0];
			for (int i = 1; i < occ.length; ++i)
				if (occ[i] > v) {
					p = i;
					v = occ[i];
				}
			
			// split
			Density ca = c.get(p);
			Arithmetics.smul2(ca.mue, 1. + eps);
			
			Density cb = (diag 
					? new DensityDiagonal((DensityDiagonal) ca) 
					: new DensityFull((DensityFull) ca));
			Arithmetics.smul2(cb.mue, 1. - eps);
			
			c.add(cb);
			
			double d1 = d0;
			double d2 = 0.;
			
			occ = new int [c.size()];
			
			// keep updating the clusters until they converge
			while (Math.abs((d1 - d2) / d1) > conv) {
				// init accumulators
				MleDensityAccumulator [] mlea = new MleDensityAccumulator [c.size()];
				for (int i = 0; i < mlea.length; ++i)
					mlea[i] = new MleDensityAccumulator(c.get(i).fd, 
							diag ? DensityDiagonal.class : DensityFull.class);
				
				// cluster by distance and accumulate
				for (Sample s : data) {
					int pos = 0;
					double min = Arithmetics.norm2(-1., s.x, c.get(0).mue);
					
					Iterator<Density> it = c.iterator();
					it.next();
					
					int i = 1;
					while (it.hasNext()) {
						double d = Arithmetics.norm2(-1., s.x, it.next().mue);
						if (d < min) {
							min = d;
							pos = i;
						}
						++i;
					}
					
					s.y = (short) pos;
					mlea[pos].accumulate(1.0, s.x);
					occ[pos]++;
				}
				
				// update clusters
				double sum = 0;
				for (int i = 0; i < mlea.length; ++i)
					sum += mlea[i].occ;
				
				for (int i = 0; i < mlea.length; ++i) {
					Density nd = c.get(i).clone();
					MleDensityAccumulator.MleUpdate(c.get(i), MleOptions.pDefaultOptions, 
							Flags.fAllParams, mlea[i], mlea[i].occ / sum, nd);
					c.get(i).fill(nd);
				}
				
				d1 = d2;
				d2 = 0.;
				for (Sample s : data)
					d2 += scale * Arithmetics.norm2(-1., s.x, c.get(s.y).mue);
			}
		}
		
		// construct codebook
		Mixture m = new Mixture(dim, c.size(), diag);
		Iterator<Density> it = c.iterator();
		for (int i = 0; i < c.size(); ++i) {
			Density dd = it.next();
			m.components[i].fill(dd.apr, dd.mue, dd.cov);
		}
		
		return m;
	}
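	/*
	 * Illustrative sketch (hypothetical parameter values): LBG clustering into 8
	 * clusters with diagonal covariances, shifting the split centroids by 1 %
	 * and re-estimating until the relative change in distortion falls below 1e-4.
	 */
	private static Mixture exampleLbg(List<Sample> data) throws ClassNotFoundException {
		return lbg(data, 8, 0.01, 1e-4, true);
	}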
	/**
	 * Perform a simple k-means clustering on the data. The number of clusters
	 * needs to be specified in advance.
	 * @param data
	 * @param nd number of clusters
	 * @param diagonalCovariances
	 * @return Primitive MixtureDensity
	 * @throws TrainingException
	 */
	public static Mixture kMeansClustering(List<Sample> data, int nd, boolean diagonalCovariances) 
			throws TrainingException {
		final long MAX_ITERATIONS = 100;
		final double MIN_ERROR = 1e-5;
		
		logger.info("Initialization.kMeansClustering(): convergence at MIN_ERROR=" + MIN_ERROR + " or MAX_ITERATIONS=" + MAX_ITERATIONS);
		
		if (data.size() < nd)
			throw new TrainingException("More clusters than samples in data set!");
		
		int fd = data.get(0).x.length;
		
		Mixture md1 = new Mixture(fd, nd, diagonalCovariances);
		Mixture md2 = new Mixture(fd, nd, diagonalCovariances);
		
		LinkedList<LinkedList<Sample>> clusters = new LinkedList<LinkedList<Sample>>();
		for (int i = 0; i < nd; ++i)
			clusters.add(new LinkedList<Sample>());
		
		// randomly distribute the data to the clusters
		LinkedList<Sample> copy = new LinkedList<Sample>(data);
		int z = 0;
		while (copy.size() > 0) {
			int pick = (int)(Math.random() * copy.size());
			clusters.get(z++ % nd).add(copy.remove(pick));
		}
		
		for (int i = 0; i < nd; ++i)
			md1.components[i] = Trainer.ml(clusters.get(i), diagonalCovariances);
		
		clusters.clear();
		
		// run k-means
		double c = 1.;
		long iteration = 0;
		while (c > MIN_ERROR && iteration < MAX_ITERATIONS) {
			logger.info("Initialization.kMeansClustering(): Iteration " + (iteration++) + "/" + MAX_ITERATIONS + " (c=" + c + ")");
			
			for (int i = 0; i < nd; ++i)
				clusters.add(new LinkedList<Sample>());
			
			// assign the data to the centroids
			for (Sample s : data)
				clusters.get(assignToCluster(s.x, md1.components)).add(s);
			
			// as we are seeking a partition, fill empty clusters by splitting the
			// biggest cluster in two parts
			for (int i = 0; i < nd; ++i) {
				if (clusters.get(i).size() == 0) {
					// get largest partition
					int c_i = -1;
					int c_s = -1;
					for (int j = 0; j < clusters.size(); ++j) {
						if (clusters.get(j).size() > c_s) {
							c_i = j;
							c_s = clusters.get(j).size();
						}
					}
					
					// split the largest partition in two halves
					LinkedList<Sample> old = clusters.get(c_i);
					LinkedList<Sample> split1 = new LinkedList<Sample>(old.subList(0, c_s / 2));
					LinkedList<Sample> split2 = new LinkedList<Sample>(old.subList(c_s / 2, old.size()));
					
					clusters.set(i, split1);
					clusters.set(c_i, split2);
				}
			}
			
			// compute ML statistics using the assigned sets
			for (int i = 0; i < nd; ++i) {
				md2.components[i] = Trainer.ml(clusters.get(i), diagonalCovariances);
				md2.components[i].apr = (double) clusters.get(i).size() / data.size();
			}
			
			// compute change
			c = 0;
			for (int i = 0; i < nd; ++i)
				c += Distances.euclidean(md1.components[i].mue, md2.components[i].mue);
			
			// set new iteration
			md1 = md2;
			md2 = new Mixture(fd, nd, diagonalCovariances);
			clusters.clear();
		}
		
		return md1;
	}
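	/*
	 * Illustrative sketch (hypothetical parameter values): k-means clustering
	 * into 16 clusters with full covariances; the resulting mixture weights
	 * reflect the relative cluster sizes.
	 */
	private static Mixture exampleKMeans(List<Sample> data) throws TrainingException {
		return kMeansClustering(data, 16, false);
	}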
	/**
	 * Compute the ID of the nearest centroid.
	 * 
	 * @param x
	 * @param d components
	 * @return
	 */
	public static int assignToCluster(double[] x, Density[] d) {
		int id = 0;
		double dist = Distances.euclidean(x, d[0].mue);
		
		for (int i = 1; i < d.length; ++i) {
			double dist2 = Distances.euclidean(x, d[i].mue);
			if (dist2 < dist) {
				dist = dist2;
				id = i;
			}
		}
		
		return id;
	}
	
	/**
	 * Compute the ID of the nearest centroid.
	 * 
	 * @param x
	 * @param list
	 * @return
	 */
	public static int assignToCluster(double[] x, List<Density> list) {
		int id = list.get(0).id;
		double dist = Distances.euclidean(x, list.get(0).mue);
		
		for (int i = 1; i < list.size(); ++i) {
			double dist2 = Distances.euclidean(x, list.get(i).mue);
			if (dist2 < dist) {
				dist = dist2;
				id = list.get(i).id;
			}
		}
		
		return id;
	}
	
	private static boolean matchesCluster1(double [] x, double [] c1, double [] c2) {
		double dist1 = Distances.euclidean(x, c1);
		double dist2 = Distances.euclidean(x, c2);
		return dist1 < dist2;
	}
	
	/**
	 * Perform a Gaussian-means (G-means) clustering on the given data set. The
	 * number of mixture components is learned by testing each cluster for
	 * normality and splitting those clusters that violate this assumption
	 * (see also Hamerly03-LTK).
	 * 
	 * @param data
	 * @param alpha significance level in {0.1,0.05,0.025,0.01}
	 * @param maxc maximum number of clusters
	 * @param diagonalCovariances
	 * @return
	 */
	public static Mixture gMeansClustering(List<Sample> data, double alpha, int maxc, boolean diagonalCovariances) 
			throws TrainingException {
		int fd = data.get(0).x.length;
		
		LinkedList<LinkedList<Sample>> clusters = new LinkedList<LinkedList<Sample>>();
		
		// initialize with only one center
		Mixture md1 = new Mixture(fd, 1, diagonalCovariances);
		Mixture md2 = new Mixture(fd, 1, diagonalCovariances);
		
		md1.components[0] = Trainer.ml(data, diagonalCovariances);
		
		while (true) {
			// first, run k-means using the current centroids
			double ch = 1.;
			while (ch > 1e-10) {
				// ensure clean clusters
				clusters.clear();
				for (int i = 0; i < md1.nd; ++i)
					clusters.add(new LinkedList<Sample>());
				
				// assign the data to the centroids
				for (Sample s : data)
					clusters.get(assignToCluster(s.x, md1.components)).add(s);
				
				// compute ML statistics using these sets
				for (int i = 0; i < md1.nd; ++i) {
					md2.components[i] = Trainer.ml(clusters.get(i), diagonalCovariances);
					md2.components[i].apr = (double) clusters.get(i).size() / data.size();
				}
				
				// compute change
				ch = 0;
				for (int i = 0; i < md2.nd; ++i)
					ch += Distances.euclidean(md1.components[i].mue, md2.components[i].mue);
				
				// set new iteration
				md1 = md2;
				md2 = new Mixture(fd, md1.nd, diagonalCovariances);
				
				// cluster information is kept for the A-D test -> no cleanup here!
			}
			
			if (md1.nd >= maxc)
				break;
			
			/* Anderson-Darling test to verify the assumption that each cluster
			 * is generated by a normal distribution. The idea is to measure the
			 * difference between a "real" Gaussian distribution and the observed
			 * distribution. Depending on the significance value alpha, the
			 * hypothesis (distribution = Gaussian) is accepted or rejected.
			 * 
			 * To split the clusters, the new centroids are computed using the
			 * previous common centroid and adding/subtracting the principal
			 * component of the data. This should give the best result.
			 */
			LinkedList<Density> centroids = new LinkedList<Density>();
			for (int i = 0; i < md1.nd; ++i) {
				// copy the affected data
				double[] co = md1.components[i].mue.clone();
				LinkedList<Sample> work = new LinkedList<Sample>();
				for (Sample s : clusters.get(i))
					work.add(new Sample(s));
				
				// compute the PCA
				PCA pca = new PCA(work.get(0).x.length);
				pca.accumulate(work);
				pca.estimate();
				
				// starting from the centroid, follow the principal component
				double[] m = Arithmetics.smul1(pca.getProjection()[0], 
						Math.sqrt(2. * pca.getEigenvalues()[0] / Math.PI));
				double[] c1 = Arithmetics.vadd1(co, m);
				double[] c2 = Arithmetics.vsub1(co, m);
				
				// run a basic 2-means
				double [] _c1 = new double [c1.length];
				double [] _c2 = new double [c2.length];
				int _cnt1 = 0, _cnt2 = 0;
				double _ch = 1.;
				while (_ch > 1e-10) {
					// compute new means
					for (Sample _s : clusters.get(i)) {
						if (matchesCluster1(_s.x, c1, c2)) {
							Arithmetics.vadd2(_c1, _s.x);
							_cnt1++;
						} else {
							Arithmetics.vadd2(_c2, _s.x);
							_cnt2++;
						}
					}
					
					// normalize, compute change
					_ch = 0;
					for (int j = 0; j < _c1.length; ++j) {
						_c1[j] /= (double) _cnt1;
						_c2[j] /= (double) _cnt2;
						
						_ch += Math.abs(c1[j] - _c1[j]);
						_ch += Math.abs(c2[j] - _c2[j]);
						
						c1[j] = _c1[j];
						_c1[j] = 0;
						c2[j] = _c2[j];
						_c2[j] = 0;
					}
					
					_cnt1 = _cnt2 = 0;
				}
				
				/* Reduce the dimension of the features by projecting the data onto
				 * the principal component and apply the distribution test to this
				 * 'flat' version of the cluster.
				 */
				double[] ps = new double[work.size()];
				double[] v = Arithmetics.vsub1(c1, c2);
				double vn = Arithmetics.norm2(v);
				for (int j = 0; j < ps.length; ++j)
					ps[j] = Arithmetics.dotp(clusters.get(i).get(j).x, v) / vn;
				
				if (DistributionTest.andersonDarlingNormal(ps, alpha) || centroids.size() == maxc - 1) {
					// cluster seems to be normally distributed
					centroids.add(md1.components[i]);
				} else {
					// cluster needs to be split
					Density d1 = diagonalCovariances ? new DensityDiagonal(md1.fd) : new DensityFull(md1.fd);
					Density d2 = diagonalCovariances ? new DensityDiagonal(md1.fd) : new DensityFull(md1.fd);
					
					System.arraycopy(c1, 0, d1.mue, 0, c1.length);
					System.arraycopy(c2, 0, d2.mue, 0, c2.length);
					
					// copy covariance
					centroids.add(d1);
					centroids.add(d2);
				}
				
				if (centroids.size() >= maxc)
					break;
			}
			
			// stop criterion: no more new clusters, the current md1 will do
			if (md1.components.length == centroids.size())
				break;
			
			// copy the new centroids
			md1 = new Mixture(md1.fd, centroids.size(), diagonalCovariances);
			md2 = new Mixture(md1.fd, centroids.size(), diagonalCovariances);
			for (int i = 0; i < centroids.size(); ++i)
				md1.components[i] = centroids.get(i);
			
			logger.info("retrying with " + centroids.size() + " clusters");
		}
		
		for (Density d : md1.components)
			d.update();
		
		return md1;
	}
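	/*
	 * Illustrative sketch (hypothetical parameter values): G-means clustering at
	 * significance level 0.05 with at most 32 diagonal-covariance clusters; the
	 * number of components is determined by the Anderson-Darling normality test.
	 */
	private static Mixture exampleGMeans(List<Sample> data) throws TrainingException {
		return gMeansClustering(data, 0.05, 32, true);
	}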
	/**
	 * Abstract DensityRanker: ranks components by a score where a higher score
	 * indicates a better splitting candidate; after sorting, the best splitting
	 * candidate comes first.
	 * 
	 * @author sikoried
	 */
	static abstract class DensityRanker implements Comparator<Density> {
		HashMap<Integer, Double> scores = new HashMap<Integer, Double>();
		
		/** sorts descending by score, i.e. the best splitting candidate first */
		public int compare(Density d1, Density d2) {
			double diff = scores.get(d2.id) - scores.get(d1.id);
			if (diff < 0.)
				return -1;
			else if (diff > 0.)
				return 1;
			else
				return 0;
		}
	}
	
	static class ClusterSizeRanker extends DensityRanker {
		/**
		 * Rank the clusters by the number of assigned samples. Split the one with
		 * the most samples first.
		 * 
		 * @param clusters
		 * @param assignment
		 */
		ClusterSizeRanker(List<Density> clusters, HashMap<Integer, LinkedList<Sample>> assignment) {
			logger.info("ClusterSizeRanker: Sorting the clusters by number of assigned samples");
			for (Density d : clusters)
				scores.put(d.id, (double) assignment.get(d.id).size());
		}
	}
	
	/**
	 * Rank components by their largest absolute covariance entry. Fastest comparator.
	 * 
	 * @author sikoried
	 */
	static class CovarianceRanker extends DensityRanker {
		CovarianceRanker(List<Density> clusters) {
			logger.info("CovarianceRanker: computing scores");
			for (Density d : clusters) {
				double s = 0.;
				for (double c : d.cov)
					if (s < Math.abs(c))
						s = Math.abs(c);
				scores.put(d.id, s);
			}
		}
	}
	
	/**
	 * Rank components by the sum of the eigenvalues of the covariance matrix.
	 * 
	 * @author sikoried
	 */
	static class SumOfEigenvaluesRanker extends DensityRanker {
		SumOfEigenvaluesRanker(List<Density> clusters) {
			logger.info("SumOfEigenvaluesRanker: computing scores");
			long timestamp = System.currentTimeMillis();
			for (Density d : clusters) {
				int fd = d.fd;
				Jama.Matrix cm = new Jama.Matrix(fd, fd);
				if (d instanceof DensityDiagonal) {
					for (int i = 0; i < fd; ++i)
						cm.set(i, i, d.cov[i]);
				} else {
					// lower triangular matrix...
					int k = 0;
					for (int i = 0; i < fd; ++i)
						for (int j = 0; j <= i; ++j) {
							cm.set(i, j, d.cov[k]);
							cm.set(j, i, d.cov[k]);
							k++;
						}
				}
				
				double [] ev = cm.eig().getRealEigenvalues();
				double sum = 0.;
				
				// sum over the eigenvalues
				for (int i = 0; i < fd; ++i)
					sum += ev[i];
				
				scores.put(d.id, sum);
			}
			logger.info("SumOfEigenvaluesRanker: " + clusters.size() + " scores in " 
					+ String.format("%.2f", (System.currentTimeMillis() - timestamp) / 1000.) + " sec");
		}
	}
	
	/**
	 * Rank components by comparing (largest_ev - smallest_ev) of the covariance.
	 * 
	 * @author sikoried
	 */
	static class EigenvalueDifferenceRanker extends DensityRanker {
		EigenvalueDifferenceRanker(LinkedList<Density> clusters) {
			logger.info("EigenvalueDifferenceRanker: computing scores");
			long timestamp = System.currentTimeMillis();
			for (Density d : clusters) {
				int fd = d.fd;
				Jama.Matrix cm = new Jama.Matrix(fd, fd);
				if (d instanceof DensityDiagonal) {
					for (int i = 0; i < fd; ++i)
						cm.set(i, i, d.cov[i]);
				} else {
					// lower triangular matrix...
					int k = 0;
					for (int i = 0; i < fd; ++i)
						for (int j = 0; j <= i; ++j) {
							cm.set(i, j, d.cov[k]);
							cm.set(j, i, d.cov[k]);
							k++;
						}
				}
				
				double [] ev = cm.eig().getRealEigenvalues();
				double sum = ev[fd-1] - ev[0];
				
				scores.put(d.id, sum);
			}
			logger.info("EigenvalueDifferenceRanker: " + clusters.size() + " scores in " 
					+ String.format("%.2f", (System.currentTimeMillis() - timestamp) / 1000.) + " sec");
		}
	}
	
	/**
	 * Rank components by the largest eigenvalue of the covariance matrix.
	 * 
	 * @author sikoried
	 */
	static class EigenvalueRanker extends DensityRanker {
		EigenvalueRanker(List<Density> clusters) {
			logger.info("EigenvalueRanker: computing scores");
			long timestamp = System.currentTimeMillis();
			for (Density d : clusters) {
				int fd = d.fd;
				Jama.Matrix cm = new Jama.Matrix(fd, fd);
				if (d instanceof DensityDiagonal) {
					for (int i = 0; i < fd; ++i)
						cm.set(i, i, d.cov[i]);
				} else {
					// lower triangular matrix...
					int k = 0;
					for (int i = 0; i < fd; ++i)
						for (int j = 0; j <= i; ++j) {
							cm.set(i, j, d.cov[k]);
							cm.set(j, i, d.cov[k]);
							k++;
						}
				}
				
				double [] ev = cm.eig().getRealEigenvalues();
				scores.put(d.id, ev[fd-1]);
			}
			logger.info("EigenvalueRanker: " + clusters.size() + " scores in " 
					+ String.format("%.2f", (System.currentTimeMillis() - timestamp) / 1000.) + " sec");
		}
	}
	
	/**
	 * Rank components by their Anderson-Darling statistic. The larger this value
	 * is, the stronger the assigned data deviates from a normal distribution and
	 * the better a splitting candidate the cluster is.
	 * 
	 * @author sikoried
	 */
	static class AndersonDarlingRanker extends DensityRanker {
		/**
		 * Compute and cache the Anderson-Darling statistics for the given clusters
		 * and their assigned samples.
		 * 
		 * @param clusters
		 * @param assignment
		 */
		AndersonDarlingRanker(List<Density> clusters, HashMap<Integer, LinkedList<Sample>> assignment) {
			logger.info("AndersonDarlingRanker: computing AD statistics");
			long timestamp = System.currentTimeMillis();
			
			// compute the AD statistic for each cluster
			for (Density d : clusters) {
				// copy the data
				LinkedList<Sample> work = new LinkedList<Sample>();
				for (Sample s : assignment.get(d.id))
					work.add(new Sample(s));
				
				if (work.size() == 0) {
					scores.put(d.id, 0.);
					continue;
				}
				
				// do the PCA
				PCA pca = new PCA(work.get(0).x.length);
				pca.accumulate(work);
				pca.estimate();
				
				// transform all samples
				double [] transformedData = new double [work.size()];
				double [] xt = new double [1];
				for (int i = 0; i < transformedData.length; ++i) {
					pca.transform(work.get(i).x, xt);
					transformedData[i] = xt[0];
				}
				
				double ad = DistributionTest.andersonDarlingNormalCriticalValue(transformedData);
				scores.put(d.id, ad);
			}
			
			logger.info("AndersonDarlingRanker: " + clusters.size() + " scores in " 
					+ String.format("%.2f", (System.currentTimeMillis() - timestamp) / 1000.) + " sec");
		}
	}
	
	public static enum DensityRankingMethod {
		/** largest (absolute) covariance value -- fastest */
		COVARIANCE,
		
		/** sum of the eigenvalues of the covariance matrix */
		SUM_EIGENVALUE,
		
		/** compare (largest EV - smallest EV) */
		EV_DIFFERENCE,
		
		/** compare the largest EV only */
		EV,
		
		/** compare by Anderson-Darling statistics */
		AD_STATISTIC,
		
		/** compare by number of assigned samples */
		NUM_SAMPLES
	}
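	/*
	 * Illustrative sketch (not part of the original API): the rankers above are
	 * ordinary Comparators, so a list of ML-estimated densities can be sorted
	 * such that the best splitting candidate comes first.
	 */
	private static void exampleRankClusters(List<Density> clusters) {
		Collections.sort(clusters, new CovarianceRanker(clusters));
	}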
	/**
	 * Perform a hierarchical Gaussian clustering: beginning with only one density,
	 * the worst density according to the ranker is split until the maximum number
	 * of clusters is reached or no splittable cluster remains.
	 * 
	 * @param data List of data samples
	 * @param maxc Maximum number of clusters
	 * @param diagonalCovariances
	 * @param rank ranking method used to select the next cluster to split
	 * @return ready-to-use MixtureDensity
	 * @throws TrainingException
	 */
	public static Mixture hierarchicalGaussianClustering(List<Sample> data, int maxc, 
			boolean diagonalCovariances, DensityRankingMethod rank) throws TrainingException {
		LinkedList<Density> clusters = new LinkedList<Density>();
		HashMap<Integer, LinkedList<Sample>> assignment = new HashMap<Integer, LinkedList<Sample>>();
		HashSet<Integer> unsplittable = new HashSet<Integer>();
		
		// id for the next cluster
		int nextId = 1;
		
		// remember the number of samples
		int num_samples = data.size();
		int fd = data.get(0).x.length;
		
		// initialization
		clusters.add(diagonalCovariances ? new DensityDiagonal(fd) : new DensityFull(fd));
		assignment.put(0, new LinkedList<Sample>());
		
		// iterate until the desired number of clusters is reached
		while (clusters.size() < maxc) {
			// step 1: distribute the samples to the clusters
			for (LinkedList<Sample> ll : assignment.values())
				ll.clear();
			for (Sample s : data)
				assignment.get(assignToCluster(s.x, clusters)).add(s);
			
			// step 2: estimate the components
			for (Density d : clusters) {
				logger.info("cluster " + d.id + " has " + assignment.get(d.id).size() + " samples");
				Density tmp = Trainer.ml(assignment.get(d.id), diagonalCovariances);
				d.fill(tmp.apr, tmp.mue, tmp.cov);
			}
			
			// generate a ranker: after sorting, the best splitting candidate comes first
			DensityRanker ranker = null;
			switch (rank) {
			case COVARIANCE:
				ranker = new CovarianceRanker(clusters);
				break;
			case SUM_EIGENVALUE:
				ranker = new SumOfEigenvaluesRanker(clusters);
				break;
			case EV_DIFFERENCE:
				ranker = new EigenvalueDifferenceRanker(clusters);
				break;
			case AD_STATISTIC:
				ranker = new AndersonDarlingRanker(clusters, assignment);
				break;
			case EV:
				ranker = new EigenvalueRanker(clusters);
				break;
			case NUM_SAMPLES:
				ranker = new ClusterSizeRanker(clusters, assignment);
				break;
			}
			
			// rank the estimates by quality
			logger.info("Initialization.hierarchicalGaussianClustering(): sorting clusters by quality...");
			Collections.sort(clusters, ranker);
			
			// pick the worst cluster for splitting (if not unsplittable)
			int ind;
			Density split = null;
			for (ind = 0; ind < clusters.size(); ind++) {
				split = clusters.get(ind);
				if (!unsplittable.contains(split.id)) {
					if (ranker instanceof AndersonDarlingRanker) {
						if (ranker.scores.get(split.id) > 2.)
							break;
					} else
						break;
				} else
					split = null;
			}
			
			// any candidate?
			if (split == null) {
				logger.info("Initialization.hierarchicalGaussianClustering(): no splittable cluster remaining -- stop");
				break;
			}
			
			logger.info("Initialization.hierarchicalGaussianClustering(): splitting cluster_" + split.id + "...");
			
			// get a copy of the data to do the PCA
			LinkedList<Sample> work = new LinkedList<Sample>();
			for (Sample s : assignment.get(split.id))
				work.add(new Sample(s));
			
			PCA pca = new PCA(work.get(0).x.length);
			pca.accumulate(work);
			pca.estimate();
			
			// we need lists for the new components
			LinkedList<Sample> ass1 = new LinkedList<Sample>();
			LinkedList<Sample> ass2 = new LinkedList<Sample>();
			
			// starting from the old mean, follow the principal component
			double[] shift = Arithmetics.smul1(pca.getProjection()[0], 
					Math.sqrt(2. * pca.getEigenvalues()[0] / Math.PI));
			double[] c1 = Arithmetics.vadd1(split.mue, shift);
			double[] c2 = Arithmetics.vsub1(split.mue, shift);
			
			// distribute the cluster's samples to the splits
			for (Sample _s : assignment.get(split.id)) {
				if (matchesCluster1(_s.x, c1, c2))
					ass1.add(_s);
				else
					ass2.add(_s);
			}
			
			// check whether we produced a cluster that is too small
			if (ass1.size() < 50 || ass2.size() < 50) {
				unsplittable.add(split.id);
				continue;
			}
			
			// compute the ML estimates using the assignments
			logger.info("Initialization.hierarchicalGaussianClustering(): estimate new clusters...");
			Density d1 = Trainer.ml(ass1, diagonalCovariances);
			Density d2 = Trainer.ml(ass2, diagonalCovariances);
			
			// remove the split cluster and its assignments
			clusters.remove(ind);
			assignment.remove(split.id);
			
			// IDs for the new components
			int id1 = split.id;
			int id2 = nextId++;
			
			d1.id = id1;
			d2.id = id2;
			
			clusters.add(d1);
			clusters.add(d2);
			
			assignment.put(id1, ass1);
			assignment.put(id2, ass2);
			
			logger.info("Initialization.hierarchicalGaussianClustering(): num_samples_cluster_" + id1 + " " + ass1.size());
			logger.info("Initialization.hierarchicalGaussianClustering(): num_samples_cluster_" + id2 + " " + ass2.size());
			logger.info("Initialization.hierarchicalGaussianClustering(): number_of_clusters = " + clusters.size());
		}
		
		// build the mixture density, estimate the priors from the assignments
		Mixture md = new Mixture(fd, clusters.size(), diagonalCovariances);
		clusters.toArray(md.components);
		for (Density d : md.components) {
			d.apr = (double) assignment.get(d.id).size() / num_samples;
			d.update();
		}
		
		return md;
	}
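	/*
	 * Illustrative sketch (hypothetical parameter values): hierarchical Gaussian
	 * clustering into at most 64 diagonal-covariance components, splitting the
	 * cluster with the largest covariance entry first.
	 */
	private static Mixture exampleHierarchicalClustering(List<Sample> data) throws TrainingException {
		return hierarchicalGaussianClustering(data, 64, true, DensityRankingMethod.COVARIANCE);
	}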
}