/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * EMDataGenerator.java * Copyright (C) 2002 Mark Hall * */ package weka.gui.boundaryvisualizer; import weka.core.*; import weka.clusterers.*; import java.io.*; import java.util.Random; /** * Class that uses EM to build a probabilistic clustering model of * supplied input data and then generates new random instances based * that model. * * @author <a href="mailto:mhall@cs.waikato.ac.nz">Mark Hall</a> * @version $Revision: 1.1.1.1 $ */ public class EMDataGenerator implements DataGenerator, Serializable { // the instances to cluster private Instances m_instancesToCluster; // the clusterer private EM m_clusterer; // number of clusters generated by EM private int m_numClusters = -1; // parameters of normal distributions for each attribute in each cluster private double [][][] m_normalDistributions; // prior probabilities for each cluster private double [] m_clusterPriors; // random number seed private int m_seed = 1; // random number generator private Random m_random; // the cluster from which to generate the next instance from private int m_clusterToGenerateFrom = 0; // which dimensions to use for computing a weight for each generated // instance private boolean [] m_weightingDimensions; // the values for the weighting dimensions to use for computing the weight // for the next instance to be generated private double [] m_weightingValues; // created once only - for generating instances fast private Instance m_instance; private double [] m_instanceVals; // cumulative distribution for cluster priors private double [] m_cumDist; private static double m_normConst = Math.sqrt(2*Math.PI); /** * Builds the data generator * * @param inputInstances Instances to construct the clusterer with * @exception Exception if an error occurs */ public void buildGenerator(Instances inputInstances) throws Exception { m_clusterer = new EM(); m_random = new Random(m_seed); m_clusterToGenerateFrom = 0; m_instancesToCluster = inputInstances; m_clusterer.buildClusterer(m_instancesToCluster); m_numClusters = m_clusterer.numberOfClusters(); m_normalDistributions = m_clusterer.getClusterModelsNumericAtts(); m_clusterPriors = m_clusterer.getClusterPriors(); System.err.println(m_clusterer); m_instanceVals = new double [m_instancesToCluster.numAttributes()]; m_instance = new Instance(1.0, m_instanceVals); // Compute cumulative distribution for cluster priors m_cumDist = computeCumulativeDistribution(m_clusterPriors); } /** * Return a cumulative distribution from a discrete distribution * * @param dist the distribution to use * @return the cumulative distribution */ private double [] computeCumulativeDistribution(double [] dist) { double [] cumDist = new double[dist.length]; double sum = 0; for (int i = 0; i < dist.length; i++) { sum += dist[i]; cumDist[i] = sum; } return cumDist; } /** * Generate a new instance. Returns the instance in an brand new * Instance object. * * @return an <code>Instance</code> value * @exception Exception if an error occurs */ public Instance generateInstance() throws Exception { return generateInstance(false); } /** * Generate a new instance. Reuses an existing instance object to * speed up the process. * * @return an <code>Instance</code> value * @exception Exception if an error occurs */ public Instance generateInstanceFast() throws Exception { return generateInstance(true); } /** * Randomly generates an instance from one cluster's model. Successive * calls to this method cycle through the clusters * * @return an <code>Instance</code> value * @exception Exception if an error occurs */ private Instance generateInstance(boolean fast) throws Exception { if (m_clusterer == null) { throw new Exception("Generator has not been built yet!"); } Instance newInst; if (fast) { newInst = m_instance; } else { m_instanceVals = new double [m_instancesToCluster.numAttributes()]; newInst = new Instance(1.0, m_instanceVals); } // choose cluster to generate from double randomCluster = m_random.nextDouble(); for (int i = 0; i < m_cumDist.length; i++) { if (randomCluster <= m_cumDist[i]) { m_clusterToGenerateFrom = i; break; } } if (m_weightingDimensions.length != m_instancesToCluster.numAttributes()) { throw new Exception("Weighting dimension array != num attributes!"); } // set instance values and weight double weight = 1; for (int i = 0; i < m_instancesToCluster.numAttributes(); i++) { if (!m_weightingDimensions[i]) { if (m_instancesToCluster.attribute(i).isNumeric()) { double val = m_random.nextGaussian(); // System.err.println("val "+val); // de-standardize with respect to this normal distribution val *= m_normalDistributions[m_clusterToGenerateFrom][i][1]; val += m_normalDistributions[m_clusterToGenerateFrom][i][0]; // newInst.setValue(i, val); m_instanceVals[i] = val; } else { // nominal attribute } } else { weight *= normalDens(m_weightingValues[i], m_normalDistributions[m_clusterToGenerateFrom][i][0], m_normalDistributions[m_clusterToGenerateFrom][i][1]); m_instanceVals[i] = m_weightingValues[i]; } } newInst.setWeight(weight); // advance the cluster // m_clusterToGenerateFrom = (m_clusterToGenerateFrom + 1) % m_numClusters; return newInst; } /** * Set which dimensions to use when computing a weight for the next * instance to generate * * @param dims an array of booleans indicating which dimensions to use */ public void setWeightingDimensions(boolean [] dims) { m_weightingDimensions = dims; } /** * Set the values for the weighting dimensions to be used when computing * the weight for the next instance to be generated * * @param vals an array of doubles containing the values of the * weighting dimensions (corresponding to the entries that are set to * true throw setWeightingDimensions) */ public void setWeightingValues(double [] vals) { m_weightingValues = vals; } /** * Density function of normal distribution. * @param x input value * @param mean mean of distribution * @param stdDev standard deviation of distribution */ private double normalDens (double x, double mean, double stdDev) { double diff = x - mean; return (1/(m_normConst*stdDev))*Math.exp(-(diff*diff/(2*stdDev*stdDev))); } /** * Return the EM model of the data * * @return an <code>EM</code> value */ public EM getEMModel() { return m_clusterer; } /** * Return the number of clusters generated by EM * * @return an <code>int</code> value */ public int getNumGeneratingModels() { return m_numClusters; } /** * Return the number of the cluster from which the next instance * will be generated from * * @return an <code>int</code> value */ public int getClusterUsedToGenerateLastInstanceFrom() { return m_clusterToGenerateFrom; } /** * Main method for tesing this class * * @param args a <code>String[]</code> value */ public static void main(String [] args) { try { Reader r = null; if (args.length != 1) { throw new Exception("Usage: EMDataGenerator <filename>"); } else { r = new BufferedReader(new FileReader(args[0])); Instances insts = new Instances(r); EMDataGenerator dg = new EMDataGenerator(); dg.buildGenerator(insts); Instances header = new Instances(insts,0); System.out.println(header); for (int i = 0; i < insts.numInstances(); i++) { Instance newInst = dg.generateInstance(); newInst.setDataset(header); System.out.println(newInst); } } } catch (Exception ex) { ex.printStackTrace(); } } }