// PolynomialHierarchicalClusteringTrainer.java example
//
// Explorer
// marytts-master
/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.machinelearning;

import java.awt.Color;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;

import javax.swing.JFrame;

import marytts.signalproc.display.FunctionGraph;
import marytts.util.math.Polynomial;

/**
 * Hierarchical clustering training algorithm
 * 
 * Reference: Stephen C. Johnson, 1967, "Hierarchical clustering schemes", Proc. Psychometrika, Vol. 32 No. 3, pp. 241-254.
 *
 * This version is adapted to work with a distance function between polynomials.
 *
 * @author Sathish Pammi
 */
public class PolynomialHierarchicalClusteringTrainer {

	// Magnitude used to seed the best-so-far value when searching for the pair of clusters to merge.
	private static final double INFINITE = 10000000.0;
	// Target number of clusters used by the no-argument clustering() overload.
	private static final int CLUSTER_DEFAULT_SIZE = 5;
	// Identifiers of all samples; each identifier is the sample's index in the polynomials array, written as a string.
	private HashSet<String> dataPointSet;
	// Current clusters; starts with one cluster per sample and shrinks as clusters are merged.
	private ArrayList<Cluster> clusterList;
	// Pairwise sample measures, keyed by "<i>_<j>" where i and j are sample indices.
	private HashMap<String, Double> distanceTableMap;
	private boolean isSimilarityMeasure; // true - similarity measure
											// false - dissimilarity measure
	// Seed for the best-so-far value in the merge search; setSimilarityMeasure sets it to +INFINITE or -INFINITE.
	private double MINDISTANCE;
	// The input samples, exactly the array handed to the constructor (not copied).
	Polynomial[] polynomials;

	/**
	 * Creates a trainer for the given samples, computes all pairwise sample measures and places every sample in a cluster
	 * of its own.
	 * 
	 * @param polynomials
	 *            the polynomials to cluster; at least three are required. The array is stored as is, not copied.
	 * @throws NullPointerException
	 *             if the polynomial array is null
	 * @throws IllegalArgumentException
	 *             if fewer than three polynomials are given
	 */
	public PolynomialHierarchicalClusteringTrainer(Polynomial[] polynomials) {
		if (polynomials == null) {
			throw new NullPointerException("Input polynomial array should not be null");
		}
		if (polynomials.length < 3) {
			throw new IllegalArgumentException("Number of samples for clustering should be more than two.");
		}

		this.polynomials = polynomials;
		this.clusterList = new ArrayList<Cluster>();
		this.distanceTableMap = new HashMap<String, Double>();
		this.dataPointSet = new HashSet<String>();

		// Order matters: the measure type is fixed first, then the pairwise table and the sample identifiers are
		// filled, and only then can the one-sample clusters be created from those identifiers.
		setSimilarityMeasure(true);
		computeSampleDistances();
		initializeClustering();
	}

	/**
	 * Computes the linkage value between two clusters from the pairwise sample measures stored in the distance table.
	 * <p>
	 * Every combination of one sample from each cluster is looked up under the key {@code "x_y"} and, failing that,
	 * {@code "y_x"}; combinations found under neither key are skipped.
	 * 
	 * @param xCluster
	 *            first cluster of sample identifiers
	 * @param yCluster
	 *            second cluster of sample identifiers
	 * @param linkageType
	 *            'Short' (smallest pairwise value), 'Complete' (largest pairwise value) or 'Average' (mean of the pairwise
	 *            values)
	 * @return the linkage value. For 'Short' and 'Complete', NaN entries are ignored and NaN is returned when no usable
	 *         entry exists; for 'Average' the result is NaN when no entry was found or when any entry is NaN.
	 * @throws NullPointerException
	 *             if either cluster is null
	 * @throws IllegalArgumentException
	 *             if linkageType is other than 'Complete', 'Average' or 'Short'
	 */
	private double getClusterDistance(Cluster xCluster, Cluster yCluster, String linkageType) {
		if (xCluster == null || yCluster == null) {
			throw new NullPointerException("Input clusters should not be null");
		}

		boolean shortLinkage = "Short".equals(linkageType);
		boolean completeLinkage = "Complete".equals(linkageType);
		boolean averageLinkage = "Average".equals(linkageType);
		if (!shortLinkage && !completeLinkage && !averageLinkage) {
			throw new IllegalArgumentException("Only Short, Complete, or Average linkage clustering supported");
		}

		double total = 0.0;
		int found = 0;
		double smallest = Double.NaN;
		double largest = Double.NaN;

		for (String xPoint : xCluster.getAllDataPoints()) {
			for (String yPoint : yCluster.getAllDataPoints()) {
				// The pair may be stored under either key order.
				Double stored = distanceTableMap.get(xPoint + "_" + yPoint);
				if (stored == null) {
					stored = distanceTableMap.get(yPoint + "_" + xPoint);
				}
				if (stored == null) {
					continue; // no entry for this pair
				}

				double value = stored.doubleValue();
				total = value + total;
				found++;

				// NaN entries count towards the average but never become the minimum or maximum.
				if (!Double.isNaN(value)) {
					if (Double.isNaN(smallest) || value < smallest) {
						smallest = value;
					}
					if (Double.isNaN(largest) || value > largest) {
						largest = value;
					}
				}
			}
		}

		if (shortLinkage) {
			return smallest;
		}
		if (completeLinkage) {
			return largest;
		}
		// 'Average'
		return total / found;
	}

	/**
	 * Creates one single-element cluster for every sample identifier in the data point set and appends it to the cluster
	 * list.
	 */
	private void initializeClustering() {
		assert dataPointSet != null;
		assert clusterList != null;

		for (String dataPoint : dataPointSet) {
			ArrayList<String> singleton = new ArrayList<String>();
			singleton.add(dataPoint);
			clusterList.add(new Cluster(singleton));
		}
	}

	/**
	 * Registers every sample and fills the pairwise table.
	 * <p>
	 * For each sample index {@code i} the identifier {@code "i"} is added to the data point set, and for every ordered
	 * pair {@code (i, j)}, including {@code i == j}, the value of
	 * {@link Polynomial#polynomialPearsonProductMomentCorr} on the two coefficient arrays is stored under the key
	 * {@code "i_j"}.
	 */
	private void computeSampleDistances() {

		assert polynomials != null;
		assert polynomials.length > 2;
		assert dataPointSet != null;
		assert distanceTableMap != null;

		final int observations = polynomials.length;

		for (int i = 0; i < observations; i++) {
			dataPointSet.add(String.valueOf(i));
			for (int j = 0; j < observations; j++) {
				double measure = Polynomial.polynomialPearsonProductMomentCorr(polynomials[i].coeffs, polynomials[j].coeffs);
				// Values go straight into the table; no separate n-by-n matrix is kept.
				distanceTableMap.put(i + "_" + j, Double.valueOf(measure));
			}
		}
	}

	/**
	 * Reports which kind of measure this trainer is configured for.
	 * 
	 * @return true if the pairwise values are treated as a similarity measure, false if they are treated as a
	 *         dissimilarity measure
	 */
	private boolean hasSimilarityMeasure() {
		return isSimilarityMeasure;
	}

	/**
	 * Computes the linkage value between two clusters using 'Average' linkage.
	 * 
	 * @param xCluster
	 *            first cluster of sample identifiers
	 * @param yCluster
	 *            second cluster of sample identifiers
	 * @return the mean of the pairwise values between the two clusters
	 * @throws NullPointerException
	 *             if either cluster is null
	 */
	private double getClusterDistance(Cluster xCluster, Cluster yCluster) {
		final String defaultLinkage = "Average";
		return getClusterDistance(xCluster, yCluster, defaultLinkage);
	}

	/**
	 * Runs the clustering with the default target number of clusters ({@code CLUSTER_DEFAULT_SIZE}) and 'Average'
	 * linkage.
	 */
	private void clustering() {
		final String defaultLinkage = "Average";
		clustering(CLUSTER_DEFAULT_SIZE, defaultLinkage);
	}

	/**
	 * Runs the clustering down to the given number of clusters using 'Average' linkage.
	 * 
	 * @param tagetClusterSize
	 *            number of clusters to stop at
	 */
	private void clustering(int tagetClusterSize) {
		final String defaultLinkage = "Average";
		clustering(tagetClusterSize, defaultLinkage);
	}

	/**
	 * Repeatedly merges two clusters until only the requested number of clusters remains, then prints the result.
	 * <p>
	 * In every round all pairs of current clusters are scored with the given linkage. When the trainer is flagged as
	 * using a similarity measure, the pair with the smallest linkage value is merged; otherwise the pair with the largest
	 * value is merged. A pair is only eligible if its value beats the seed value {@code MINDISTANCE}, so pairs whose
	 * linkage value is NaN are never chosen.
	 * 
	 * @param tagetClusterSize
	 *            number of clusters to stop at
	 * @param linkageType
	 *            the linkage type used for Hierarchical clustering. Possible values are 'Average', 'Complete', and 'Short'
	 * @throws IllegalStateException
	 *             if more clusters than requested remain but no pair is eligible for merging
	 */
	private void clustering(int tagetClusterSize, String linkageType) {

		assert clusterList != null;

		// NOTE(review): with the similarity flag set, the smallest value wins. Whether that is the intended direction
		// depends on what Polynomial.polynomialPearsonProductMomentCorr returns - confirm against that method.
		final boolean smallestWins = hasSimilarityMeasure();

		while (clusterList.size() > tagetClusterSize) {

			// Reset the selection every round so a round without an eligible pair cannot reuse stale indices.
			int bestFirst = -1;
			int bestSecond = -1;
			double bestValue = this.MINDISTANCE;

			for (int j = 0; j < clusterList.size(); j++) {
				Cluster first = clusterList.get(j);
				for (int k = j + 1; k < clusterList.size(); k++) {
					double value = getClusterDistance(first, clusterList.get(k), linkageType);
					boolean improves = smallestWins ? value < bestValue : value > bestValue;
					if (improves) {
						bestValue = value;
						bestFirst = j;
						bestSecond = k;
					}
				}
			}

			if (bestFirst < 0) {
				throw new IllegalStateException("No pair of clusters is eligible for merging; " + clusterList.size()
						+ " clusters remain but " + tagetClusterSize + " were requested");
			}

			clusterList.get(bestFirst).mergeCluster(clusterList.get(bestSecond)); // merge two clusters
			// bestSecond > bestFirst, so removing by index leaves the merged cluster in place.
			clusterList.remove(bestSecond);
		}
		printClusterData();
	}

	/**
	 * Writes the number of clusters to standard output, followed by each cluster's running number (starting at 1) and a
	 * line listing its sample identifiers, each followed by a space.
	 */
	private void printClusterData() {

		assert clusterList != null;

		System.out.println("Total No of Clusters: " + clusterList.size());

		int clusterNumber = 0;
		for (Cluster cluster : clusterList) {
			clusterNumber++;
			System.out.println("Cluster Number : " + clusterNumber);

			StringBuilder memberLine = new StringBuilder();
			for (String dataPoint : cluster.getAllDataPoints()) {
				memberLine.append(dataPoint).append(" ");
			}
			System.out.println(memberLine.toString());
		}

	}

	/**
	 * Records the kind of measure in use and seeds {@code MINDISTANCE} accordingly: {@code INFINITE} for a similarity
	 * measure, {@code -INFINITE} for a dissimilarity measure.
	 * 
	 * @param isSimilarityMeasure
	 *            true for a similarity measure, false for a dissimilarity measure
	 */
	private void setSimilarityMeasure(boolean isSimilarityMeasure) {
		this.isSimilarityMeasure = isSimilarityMeasure;
		this.MINDISTANCE = isSimilarityMeasure ? INFINITE : -INFINITE;
	}

	/**
	 * Clusters the polynomials with a hierarchical (agglomerative) procedure and returns the resulting clusters.
	 * <p>
	 * The pairwise table and the one-sample-per-cluster starting state are prepared by the constructor. This method
	 * merges clusters until the target number is reached, then builds one {@link PolynomialCluster} per remaining cluster
	 * from its member polynomials and their mean.
	 * <p>
	 * The merge step modifies the trainer's cluster list, so a later call starts from the already merged clusters rather
	 * than from the original samples.
	 * 
	 * @param tagetClusterSize
	 *            the target number of clusters; must be at least one and smaller than the current number of clusters
	 * @param linkageType
	 *            the linkage type used for Hierarchical clustering ('Average', 'Complete', or 'Short')
	 * 
	 * @return the trained clusters, one array element per cluster
	 * @throws IllegalArgumentException
	 *             if the target cluster size is less than one or not less than the current number of clusters, or if
	 *             linkageType is not one of the supported values
	 */
	public PolynomialCluster[] train(int tagetClusterSize, String linkageType) {

		// A target below one can never be reached by merging and would otherwise fail deep inside the merge loop.
		if (tagetClusterSize < 1) {
			throw new IllegalArgumentException("target cluster size should be at least one, but was " + tagetClusterSize);
		}

		if (clusterList.size() <= tagetClusterSize) {
			throw new IllegalArgumentException("taget cluster size should be less than number of samples");
		}

		if (!("Short".equals(linkageType) || "Complete".equals(linkageType) || "Average".equals(linkageType))) {
			throw new IllegalArgumentException("Only Short, Complete, or Average linkage clustering supported");
		}

		clustering(tagetClusterSize, linkageType);

		// if below condition fails, it is a BUG
		assert clusterList.size() == tagetClusterSize : "After clustering, number of clusters and the target cluster size should be same, but now the number of clusters are "
				+ clusterList.size();

		// Now fill the clusters with their means and members:
		PolynomialCluster[] clusters = new PolynomialCluster[tagetClusterSize];

		for (int i = 0; i < tagetClusterSize; i++) {

			ArrayList<String> dataPoints = clusterList.get(i).getAllDataPoints();
			Polynomial[] members = new Polynomial[dataPoints.size()];

			for (int j = 0; j < members.length; j++) {
				// Each data point is the sample's index in the polynomials array, stored as a string.
				members[j] = this.polynomials[Integer.parseInt(dataPoints.get(j))];
			}

			Polynomial meanMembers = Polynomial.mean(members);
			clusters[i] = new PolynomialCluster(meanMembers, members);

		}

		return clusters;
	}

	/**
	 * Demo entry point: clusters random polynomials and shows each resulting cluster in a window for five seconds.
	 * <p>
	 * One hundred polynomials of order three with coefficients from {@link Math#random()} are clustered into
	 * {@code numClusters} clusters using 'Average' linkage. For every cluster the mean polynomial and all member
	 * polynomials are added to a {@link FunctionGraph}. If the thread is interrupted while waiting, the interrupt flag is
	 * restored and the display loop ends.
	 * 
	 * @param args
	 *            ignored
	 */
	public static void main(String[] args) {
		// Test clustering with random polynomials, and visualise result
		int order = 3;
		int numPolynomials = 100;
		int numClusters = 5;

		// Initialise with random data:
		Polynomial[] ps = new Polynomial[numPolynomials];
		for (int i = 0; i < numPolynomials; i++) {
			double[] coeffs = new double[order + 1];
			for (int c = 0; c < coeffs.length; c++) {
				coeffs[c] = Math.random();
			}
			ps[i] = new Polynomial(coeffs);
		}

		PolynomialHierarchicalClusteringTrainer phCT = new PolynomialHierarchicalClusteringTrainer(ps);
		// Train:
		PolynomialCluster[] clusters = phCT.train(numClusters, "Average");

		// Visualise:
		FunctionGraph clusterGraph = new FunctionGraph(0, 1, new double[1]);
		clusterGraph.setYMinMax(0, 5);
		clusterGraph.setPrimaryDataSeriesStyle(Color.BLUE, FunctionGraph.DRAW_DOTS, FunctionGraph.DOT_FULLCIRCLE);
		JFrame jf = clusterGraph.showInJFrame("", false, true);
		for (int i = 0; i < clusters.length; i++) {
			double[] meanValues = clusters[i].getMeanPolynomial().generatePolynomialValues(100, 0, 1);
			clusterGraph.updateData(0, 1. / meanValues.length, meanValues);

			Polynomial[] members = clusters[i].getClusterMembers();
			for (int m = 0; m < members.length; m++) {
				double[] pred = members[m].generatePolynomialValues(meanValues.length, 0, 1);
				clusterGraph.addDataSeries(pred, Color.GRAY, FunctionGraph.DRAW_LINE, -1);
				jf.repaint();
			}

			jf.setTitle("Cluster " + (i + 1) + " of " + clusters.length + ": " + members.length + " members");
			jf.repaint();

			try {
				Thread.sleep(5000);
			} catch (InterruptedException ie) {
				// Keep the interrupt visible to whoever runs this thread and stop cycling through clusters.
				Thread.currentThread().interrupt();
				break;
			}
		}
	}

	/**
	 * A class that contains samples of a cluster
	 * 
	 * @author sathish
	 *
	 */
	class Cluster {

		private ArrayList<String> dataPoints;
		private int clusterSize;

		/**
		 * Cluster constructor
		 * 
		 * @param dataSet
		 *            a arraylist of samples
		 * @throws NullPointerException
		 *             if input dataset is null
		 */
		public Cluster(ArrayList<String> dataSet) {
			if (dataSet == null) {
				throw new NullPointerException("Input dataset for a cluster should not be null");
			}
			this.dataPoints = dataSet;
			this.clusterSize = dataSet.size();
		}

		/**
		 * To return all datapoints in this cluster
		 * 
		 * @return ArrayList<String> of datapoints
		 */
		public ArrayList<String> getAllDataPoints() {
			return this.dataPoints;
		}

		/**
		 * Given cluster will be merged into this cluster
		 * 
		 * @param xCluster
		 *            a cluster that contains a set of polynomial coeffs.
		 * @throws NullPointerException
		 *             if given cluster is null
		 */
		public void mergeCluster(Cluster xCluster) {

			if (xCluster == null) {
				throw new NullPointerException("Input cluster should not be null");
			}

			ArrayList<String> xDataPoints = xCluster.getAllDataPoints();
			Iterator<String> it = xDataPoints.iterator();
			while (it.hasNext()) {
				this.dataPoints.add(it.next());
			}
			this.clusterSize = this.dataPoints.size();
		}

	}

}