/** * Copyright 2007 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.machinelearning; import java.util.Arrays; import marytts.util.math.MathUtils; /** * * Generates clustered data for testing machine learning algorithms * * @author Oytun Türk */ public class ClusteredDataGenerator { public static final int DEFAULT_NUM_SAMPLES_IN_CLUSTERS = 50; public static final int DEFAULT_NUM_CLUSTERS = 10; public static final double DEFAULT_INIT_MEAN = 10.0; public static final double DEFAULT_VARIANCE = 1.0; public double[] data; public ClusteredDataGenerator() { double[] clusterMeans = new double[DEFAULT_NUM_CLUSTERS]; for (int i = 0; i < DEFAULT_NUM_CLUSTERS; i++) clusterMeans[i] = (i + 1) * 10.0; init(clusterMeans); } public ClusteredDataGenerator(int numClusters, int numSamplesInClusters) { this(numClusters, numSamplesInClusters, DEFAULT_INIT_MEAN); } public ClusteredDataGenerator(int numClusters, int numSamplesInClusters, double initMean) { this(numClusters, numSamplesInClusters, initMean, DEFAULT_VARIANCE); } public ClusteredDataGenerator(int numClusters, int numSamplesInClusters, double initMean, double variance) { double[] clusterMeans = new double[numClusters]; for (int i = 0; i < numClusters; i++) clusterMeans[i] = (i + 1) * initMean; init(clusterMeans, variance, numSamplesInClusters); } public ClusteredDataGenerator(double[] clusterMeans) { this(clusterMeans, DEFAULT_VARIANCE); } public ClusteredDataGenerator(double[] clusterMeans, double variance) { init(clusterMeans, variance); } public ClusteredDataGenerator(double[] clusterMeans, double[] variances) { init(clusterMeans, variances, DEFAULT_NUM_SAMPLES_IN_CLUSTERS); } public ClusteredDataGenerator(double[] clusterMeans, double[] variances, int numSamplesPerCluster) { init(clusterMeans, variances, numSamplesPerCluster); } public void init(double[] clusterMeans) { init(clusterMeans, DEFAULT_VARIANCE); } public void init(double[] clusterMeans, double variance) { init(clusterMeans, variance, DEFAULT_NUM_SAMPLES_IN_CLUSTERS); } public void init(double[] clusterMeans, double variance, int numClusters) { double[] variances = new double[clusterMeans.length]; Arrays.fill(variances, variance); init(clusterMeans, variances, numClusters); } public void init(double[] clusterMeans, double[] variances, int numSamplesPerCluster) { data = new double[numSamplesPerCluster * clusterMeans.length]; for (int i = 0; i < clusterMeans.length; i++) { double[] tmp = MathUtils.random(numSamplesPerCluster); MathUtils.adjustMean(tmp, clusterMeans[i]); MathUtils.adjustVariance(tmp, variances[i]); System.arraycopy(tmp, 0, data, i * numSamplesPerCluster, numSamplesPerCluster); System.out.println("Target mean=" + String.valueOf(clusterMeans[i]) + " Target variance=" + String.valueOf(variances[i]) + " - Mean=" + String.valueOf(MathUtils.mean(tmp)) + " Variance=" + String.valueOf(MathUtils.variance(tmp))); } double m = MathUtils.mean(data); double v = MathUtils.variance(data, m); System.out.println(String.valueOf(m) + " " + String.valueOf(v)); } public static void main(String[] args) { ClusteredDataGenerator c = new ClusteredDataGenerator(); } }