ClusteredDataGenerator.java example

Explorer
marytts-master
/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.machinelearning;

import java.util.Arrays;

import marytts.util.math.MathUtils;

/**
 * Generates one-dimensional clustered data for testing machine learning algorithms.
 * <p>
 * For every cluster a fixed number of samples is obtained from {@code MathUtils.random} and then passed to
 * {@code MathUtils.adjustMean} and {@code MathUtils.adjustVariance} together with that cluster's target mean and
 * variance. The samples of all clusters are stored back to back in {@link #data}: cluster {@code i} occupies the
 * indices {@code [i * n, (i + 1) * n)}, where {@code n} is the number of samples per cluster.
 * <p>
 * Every call to one of the {@code init} methods (and therefore every constructor) prints the target and measured
 * statistics to standard output.
 * 
 * @author Oytun Türk
 */
public class ClusteredDataGenerator {
	/** Number of samples per cluster used when the caller does not specify one. */
	public static final int DEFAULT_NUM_SAMPLES_IN_CLUSTERS = 50;
	/** Number of clusters created by the no-argument constructor. */
	public static final int DEFAULT_NUM_CLUSTERS = 10;
	/** Mean of the first cluster; cluster {@code i} (zero-based) gets {@code (i + 1)} times this value. */
	public static final double DEFAULT_INIT_MEAN = 10.0;
	/** Target variance applied to every cluster when the caller does not specify one. */
	public static final double DEFAULT_VARIANCE = 1.0;
	/** Generated samples, cluster after cluster; replaced on every call to an {@code init} method. */
	public double[] data;

	/**
	 * Creates {@link #DEFAULT_NUM_CLUSTERS} clusters with means {@code DEFAULT_INIT_MEAN, 2 * DEFAULT_INIT_MEAN, ...},
	 * the default variance and the default number of samples per cluster.
	 */
	public ClusteredDataGenerator() {
		init(multiplesOf(DEFAULT_INIT_MEAN, DEFAULT_NUM_CLUSTERS));
	}

	/**
	 * Creates {@code numClusters} clusters using {@link #DEFAULT_INIT_MEAN} and {@link #DEFAULT_VARIANCE}.
	 * 
	 * @param numClusters
	 *            number of clusters to generate
	 * @param numSamplesInClusters
	 *            number of samples generated for each cluster
	 */
	public ClusteredDataGenerator(int numClusters, int numSamplesInClusters) {
		this(numClusters, numSamplesInClusters, DEFAULT_INIT_MEAN);
	}

	/**
	 * Creates {@code numClusters} clusters using {@link #DEFAULT_VARIANCE}.
	 * 
	 * @param numClusters
	 *            number of clusters to generate
	 * @param numSamplesInClusters
	 *            number of samples generated for each cluster
	 * @param initMean
	 *            mean of the first cluster; cluster {@code i} (zero-based) gets {@code (i + 1) * initMean}
	 */
	public ClusteredDataGenerator(int numClusters, int numSamplesInClusters, double initMean) {
		this(numClusters, numSamplesInClusters, initMean, DEFAULT_VARIANCE);
	}

	/**
	 * Creates {@code numClusters} clusters whose means are consecutive multiples of {@code initMean}.
	 * 
	 * @param numClusters
	 *            number of clusters to generate
	 * @param numSamplesInClusters
	 *            number of samples generated for each cluster
	 * @param initMean
	 *            mean of the first cluster; cluster {@code i} (zero-based) gets {@code (i + 1) * initMean}
	 * @param variance
	 *            target variance shared by all clusters
	 */
	public ClusteredDataGenerator(int numClusters, int numSamplesInClusters, double initMean, double variance) {
		init(multiplesOf(initMean, numClusters), variance, numSamplesInClusters);
	}

	/**
	 * Creates one cluster per entry of {@code clusterMeans} using {@link #DEFAULT_VARIANCE} and
	 * {@link #DEFAULT_NUM_SAMPLES_IN_CLUSTERS}.
	 * 
	 * @param clusterMeans
	 *            target mean of each cluster
	 */
	public ClusteredDataGenerator(double[] clusterMeans) {
		this(clusterMeans, DEFAULT_VARIANCE);
	}

	/**
	 * Creates one cluster per entry of {@code clusterMeans} with a shared variance and
	 * {@link #DEFAULT_NUM_SAMPLES_IN_CLUSTERS} samples each.
	 * 
	 * @param clusterMeans
	 *            target mean of each cluster
	 * @param variance
	 *            target variance shared by all clusters
	 */
	public ClusteredDataGenerator(double[] clusterMeans, double variance) {
		init(clusterMeans, variance);
	}

	/**
	 * Creates one cluster per entry of {@code clusterMeans} with individual variances and
	 * {@link #DEFAULT_NUM_SAMPLES_IN_CLUSTERS} samples each.
	 * 
	 * @param clusterMeans
	 *            target mean of each cluster
	 * @param variances
	 *            target variance of each cluster; needs at least as many entries as {@code clusterMeans}
	 */
	public ClusteredDataGenerator(double[] clusterMeans, double[] variances) {
		init(clusterMeans, variances, DEFAULT_NUM_SAMPLES_IN_CLUSTERS);
	}

	/**
	 * Creates one cluster per entry of {@code clusterMeans} with individual variances.
	 * 
	 * @param clusterMeans
	 *            target mean of each cluster
	 * @param variances
	 *            target variance of each cluster; needs at least as many entries as {@code clusterMeans}
	 * @param numSamplesPerCluster
	 *            number of samples generated for each cluster
	 */
	public ClusteredDataGenerator(double[] clusterMeans, double[] variances, int numSamplesPerCluster) {
		init(clusterMeans, variances, numSamplesPerCluster);
	}

	/**
	 * Regenerates {@link #data} using {@link #DEFAULT_VARIANCE} and {@link #DEFAULT_NUM_SAMPLES_IN_CLUSTERS}.
	 * 
	 * @param clusterMeans
	 *            target mean of each cluster
	 */
	public void init(double[] clusterMeans) {
		init(clusterMeans, DEFAULT_VARIANCE);
	}

	/**
	 * Regenerates {@link #data} with a shared variance and {@link #DEFAULT_NUM_SAMPLES_IN_CLUSTERS} samples per
	 * cluster.
	 * 
	 * @param clusterMeans
	 *            target mean of each cluster
	 * @param variance
	 *            target variance shared by all clusters
	 */
	public void init(double[] clusterMeans, double variance) {
		init(clusterMeans, variance, DEFAULT_NUM_SAMPLES_IN_CLUSTERS);
	}

	/**
	 * Regenerates {@link #data} with a shared variance.
	 * 
	 * @param clusterMeans
	 *            target mean of each cluster
	 * @param variance
	 *            target variance shared by all clusters
	 * @param numSamplesPerCluster
	 *            number of samples generated for each cluster (this is a sample count, not a cluster count; the
	 *            number of clusters is {@code clusterMeans.length})
	 */
	public void init(double[] clusterMeans, double variance, int numSamplesPerCluster) {
		double[] variances = new double[clusterMeans.length];
		Arrays.fill(variances, variance);

		init(clusterMeans, variances, numSamplesPerCluster);
	}

	/**
	 * Regenerates {@link #data}: for each cluster, {@code numSamplesPerCluster} samples are generated, adjusted to the
	 * cluster's target mean and variance, and copied into the cluster's slot of {@link #data}. Per-cluster and
	 * overall statistics are printed to standard output.
	 * 
	 * @param clusterMeans
	 *            target mean of each cluster; its length determines the number of clusters
	 * @param variances
	 *            target variance of each cluster; entries beyond {@code clusterMeans.length} are ignored
	 * @param numSamplesPerCluster
	 *            number of samples generated for each cluster
	 * @throws IllegalArgumentException
	 *             if {@code numSamplesPerCluster} is negative, if {@code variances} has fewer entries than
	 *             {@code clusterMeans}, or if the total number of samples does not fit into an array
	 */
	public void init(double[] clusterMeans, double[] variances, int numSamplesPerCluster) {
		if (numSamplesPerCluster < 0) {
			throw new IllegalArgumentException("numSamplesPerCluster must not be negative: " + numSamplesPerCluster);
		}
		// Fail before touching 'data' instead of running out of bounds halfway through the loop.
		if (variances.length < clusterMeans.length) {
			throw new IllegalArgumentException("Need one variance per cluster mean: got " + variances.length
					+ " variances for " + clusterMeans.length + " means");
		}
		// Compute the size in long so an int overflow cannot silently produce a too-small array.
		long totalSamples = (long) numSamplesPerCluster * clusterMeans.length;
		if (totalSamples > Integer.MAX_VALUE) {
			throw new IllegalArgumentException("Too many samples requested: " + totalSamples);
		}

		data = new double[(int) totalSamples];
		for (int i = 0; i < clusterMeans.length; i++) {
			double[] samples = MathUtils.random(numSamplesPerCluster);
			// The return values (if any) are ignored, so this relies on the helpers modifying 'samples' in place
			// - NOTE(review): confirm against MathUtils.
			MathUtils.adjustMean(samples, clusterMeans[i]);
			MathUtils.adjustVariance(samples, variances[i]);
			System.arraycopy(samples, 0, data, i * numSamplesPerCluster, numSamplesPerCluster);
			System.out.println("Target mean=" + clusterMeans[i] + " Target variance=" + variances[i] + " - Mean="
					+ MathUtils.mean(samples) + " Variance=" + MathUtils.variance(samples));
		}

		// Statistics over the concatenation of all clusters.
		double overallMean = MathUtils.mean(data);
		double overallVariance = MathUtils.variance(data, overallMean);
		System.out.println(overallMean + " " + overallVariance);
	}

	/**
	 * Builds the array {@code step, 2 * step, ..., count * step}.
	 * 
	 * @param step
	 *            first value and distance between consecutive values
	 * @param count
	 *            number of values to produce
	 * @return a new array of length {@code count}
	 */
	private static double[] multiplesOf(double step, int count) {
		double[] values = new double[count];
		for (int i = 0; i < count; i++) {
			values[i] = (i + 1) * step;
		}
		return values;
	}

	/**
	 * Demo entry point: builds a generator with the default settings, which prints the generated statistics to
	 * standard output.
	 * 
	 * @param args
	 *            ignored
	 */
	public static void main(String[] args) {
		new ClusteredDataGenerator();
	}
}