EM.java example

Explorer
wekax-master
- weka-3-6-2
- wekaUT
  - GetAllSubPackages.java
  - weka
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    EM.java
 *    Copyright (C) 1999 Mark Hall
 *
 */

package  weka.clusterers;

import  java.io.*;
import  java.util.*;
import  weka.core.*;
import  weka.estimators.*;

/**
 * Simple EM (expectation maximisation) class. <p>
 * 
 * EM assigns a probability distribution to each instance which
 * indicates the probability of it belonging to each of the clusters.
 * EM can decide how many clusters to create by cross validation, or you
 * may specify apriori how many clusters to generate. <p>
 * <br>
 * The cross validation performed to determine the number of clusters
 * is done in the following steps:<br>
 * 1. the number of clusters is set to 1<br>
 * 2. the training set is split randomly into 10 folds.<br>
 * 3. EM is performed 10 times using the 10 folds the usual CV way.<br>
 * 4. the loglikelihood is averaged over all 10 results.<br>
 * 5. if loglikelihood has increased the number of clusters is increased by 1
 * and the program continues at step 2. <br>
 *<br>
 * The number of folds is fixed to 10, as long as the number of instances in
 * the training set is not smaller 10. If this is the case the number of folds
 * is set equal to the number of instances.<p>
 *
 * Valid options are:<p>
 *
 * -V <br>
 * Verbose. <p>
 *
 * -N <number of clusters> <br>
 * Specify the number of clusters to generate. If omitted,
 * EM will use cross validation to select the number of clusters
 * automatically. <p>
 *
 * -I <max iterations> <br>
 * Terminate after this many iterations if EM has not converged. <p>
 *
 * -S <seed> <br>
 * Specify random number seed. <p>
 *
 * -M <num> <br>
 * Set the minimum allowable standard deviation for normal density calculation.
 * <p>
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision: 1.1.1.1 $
 */
public class EM
  extends DistributionClusterer
  implements OptionHandler
{

  /** hold the discrete estimators for each cluster */
  private Estimator m_model[][];

  /** hold the normal estimators for each cluster */
  private double m_modelNormal[][][];

  /** default minimum standard deviation */
  private double m_minStdDev = 1e-6;

  /** hold the weights of each instance for each cluster */
  private double m_weights[][];

  /** the prior probabilities for clusters */
  private double m_priors[];

  /** the loglikelihood of the data */
  private double m_loglikely;

  /** training instances */
  private Instances m_theInstances = null;

  /** number of clusters selected by the user or cross validation */
  private int m_num_clusters;

  /** the initial number of clusters requested by the user--- -1 if
      xval is to be used to find the number of clusters */
  private int m_initialNumClusters;

  /** number of attributes */
  private int m_num_attribs;

  /** number of training instances */
  private int m_num_instances;

  /** maximum iterations to perform */
  private int m_max_iterations;

  /** attribute min values */
  private double [] m_minValues;

  /** attribute max values */
  private double [] m_maxValues;

  /** random numbers and seed */
  private Random m_rr;
  private int m_rseed;

  /** Constant for normal distribution. */
  private static double m_normConst = Math.sqrt(2*Math.PI);

  /** Verbose? */
  private boolean m_verbose;

  /**
   * Returns a string describing this clusterer
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "Cluster data using expectation maximization";
  }


  /**
   * Returns an enumeration describing the available options.. <p>
   *
   * Valid options are:<p>
   *
   * -V <br>
   * Verbose. <p>
   *
   * -N <number of clusters> <br>
   * Specify the number of clusters to generate. If omitted,
   * EM will use cross validation to select the number of clusters
   * automatically. <p>
   *
   * -I <max iterations> <br>
   * Terminate after this many iterations if EM has not converged. <p>
   *
   * -S <seed> <br>
   * Specify random number seed. <p>
   *
   * -M <num> <br>
   *  Set the minimum allowable standard deviation for normal density 
   * calculation. <p>
   *
   * @return an enumeration of all the available options.
   *
   **/
  public Enumeration listOptions () {
    Vector newVector = new Vector(6);
    newVector.addElement(new Option("\tnumber of clusters. If omitted or" 
				    + "\n\t-1 specified, then cross " 
				    + "validation is used to\n\tselect the " 
				    + "number of clusters.", "N", 1
				    , "-N <num>"));
    newVector.addElement(new Option("\tmax iterations.\n(default 100)", "I"
				    , 1, "-I <num>"));
    newVector.addElement(new Option("\trandom number seed.\n(default 1)"
				    , "S", 1, "-S <num>"));
    newVector.addElement(new Option("\tverbose.", "V", 0, "-V"));
    newVector.addElement(new Option("\tminimum allowable standard deviation "
				    +"for normal density computation "
				    +"\n\t(default 1e-6)"
				    ,"M",1,"-M <num>"));
    return  newVector.elements();
  }


  /**
   * Parses a given list of options.
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   *
   **/
  public void setOptions (String[] options)
    throws Exception {
    resetOptions();
    setDebug(Utils.getFlag('V', options));
    String optionString = Utils.getOption('I', options);

    if (optionString.length() != 0) {
      setMaxIterations(Integer.parseInt(optionString));
    }

    optionString = Utils.getOption('N', options);

    if (optionString.length() != 0) {
      setNumClusters(Integer.parseInt(optionString));
    }

    optionString = Utils.getOption('S', options);

    if (optionString.length() != 0) {
      setSeed(Integer.parseInt(optionString));
    }

    optionString = Utils.getOption('M', options);
    if (optionString.length() != 0) {
      setMinStdDev((new Double(optionString)).doubleValue());
    }
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String minStdDevTipText() {
    return "set minimum allowable standard deviation";
  }

  /**
   * Set the minimum value for standard deviation when calculating
   * normal density. Reducing this value can help prevent arithmetic
   * overflow resulting from multiplying large densities (arising from small
   * standard deviations) when there are many singleton or near singleton
   * values.
   * @param m minimum value for standard deviation
   */
  public void setMinStdDev(double m) {
    m_minStdDev = m;
  }

  /**
   * Get the minimum allowable standard deviation.
   * @return the minumum allowable standard deviation
   */
  public double getMinStdDev() {
    return m_minStdDev;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String seedTipText() {
    return "random number seed";
  }


  /**
   * Set the random number seed
   *
   * @param s the seed
   */
  public void setSeed (int s) {
    m_rseed = s;
  }


  /**
   * Get the random number seed
   *
   * @return the seed
   */
  public int getSeed () {
    return  m_rseed;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String numClustersTipText() {
    return "set number of clusters. -1 to select number of clusters "
      +"automatically by cross validation.";
  }

  /**
   * Set the number of clusters (-1 to select by CV).
   *
   * @param n the number of clusters
   * @exception Exception if n is 0
   */
  public void setNumClusters (int n)
    throws Exception {
    
    if (n == 0) {
      throw  new Exception("Number of clusters must be > 0. (or -1 to " 
			   + "select by cross validation).");
    }

    if (n < 0) {
      m_num_clusters = -1;
      m_initialNumClusters = -1;
    }
    else {
      m_num_clusters = n;
      m_initialNumClusters = n;
    }
  }


  /**
   * Get the number of clusters
   *
   * @return the number of clusters.
   */
  public int getNumClusters () {
    return  m_initialNumClusters;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String maxIterationsTipText() {
    return "maximum number of iterations";
  }

  /**
   * Set the maximum number of iterations to perform
   *
   * @param i the number of iterations
   * @exception Exception if i is less than 1
   */
  public void setMaxIterations (int i)
    throws Exception {
    if (i < 1) {
      throw  new Exception("Maximum number of iterations must be > 0!");
    }

    m_max_iterations = i;
  }


  /**
   * Get the maximum number of iterations
   *
   * @return the number of iterations
   */
  public int getMaxIterations () {
    return  m_max_iterations;
  }


  /**
   * Set debug mode - verbose output
   *
   * @param v true for verbose output
   */
  public void setDebug (boolean v) {
    m_verbose = v;
  }


  /**
   * Get debug mode
   *
   * @return true if debug mode is set
   */
  public boolean getDebug () {
    return  m_verbose;
  }


  /**
   * Gets the current settings of EM.
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions () {
    String[] options = new String[9];
    int current = 0;

    if (m_verbose) {
      options[current++] = "-V";
    }

    options[current++] = "-I";
    options[current++] = "" + m_max_iterations;
    options[current++] = "-N";
    options[current++] = "" + getNumClusters();
    options[current++] = "-S";
    options[current++] = "" + m_rseed;
    options[current++] = "-M";
    options[current++] = ""+getMinStdDev();

    while (current < options.length) {
      options[current++] = "";
    }

    return  options;
  }

  /**
   * Initialise estimators and storage.
   *
   * @param inst the instances
   * @param num_cl the number of clusters
   **/
  private void EM_Init (Instances inst, int num_cl)
    throws Exception {
    int i, j, k;
    m_weights = new double[inst.numInstances()][num_cl];
    m_model = new DiscreteEstimator[num_cl][m_num_attribs];
    m_modelNormal = new double[num_cl][m_num_attribs][3];
    m_priors = new double[num_cl];

    for (i = 0; i < num_cl; i++) {
      for (j = 0; j < m_num_attribs; j++) {
	if (inst.attribute(j).isNominal()) {
	  m_model[i][j] = new DiscreteEstimator(m_theInstances.
						attribute(j).numValues()
						, true);
	  for (k=0; k<m_theInstances.attribute(j).numValues(); k++) {
	    m_model[i][j].addValue(k, 10*m_rr.nextDouble());
	  }
	}
	else {
	  double delta_init = m_maxValues[j]-m_minValues[j];
	  m_modelNormal[i][j][0] = m_minValues[j]+delta_init*m_rr.nextDouble();
	  m_modelNormal[i][j][1] = delta_init/(2*num_cl);
	  m_modelNormal[i][j][2] = 1.0;
	}
      }
    }
    
    // initially equal priors
    for (j = 0; j < num_cl; j++) {
      m_priors[j] += 1.0;
    }
    Utils.normalize(m_priors);
  }


  /**
   * calculate prior probabilites for the clusters
   *
   * @param inst the instances
   * @param num_cl the number of clusters
   * @exception Exception if priors can't be calculated
   **/
  private void estimate_priors (Instances inst, int num_cl)
    throws Exception {
    for (int i = 0; i < num_cl; i++) {
      m_priors[i] = 0.0;
    }

    for (int i = 0; i < inst.numInstances(); i++) {
      for (int j = 0; j < num_cl; j++) {
        m_priors[j] += m_weights[i][j];
      }
    }

    Utils.normalize(m_priors);
  }


  /**
   * Density function of normal distribution.
   * @param x input value
   * @param mean mean of distribution
   * @param stdDev standard deviation of distribution
   */
  private double normalDens (double x, double mean, double stdDev) {
    double diff = x - mean;
   
    return  (1/(m_normConst*stdDev))*Math.exp(-(diff*diff/(2*stdDev*stdDev)));
  }


  /**
   * New probability estimators for an iteration
   *
   * @param num_cl the numbe of clusters
   */
  private void new_estimators (int num_cl) {
    for (int i = 0; i < num_cl; i++) {
      for (int j = 0; j < m_num_attribs; j++) {
        if (m_theInstances.attribute(j).isNominal()) {
          m_model[i][j] = new DiscreteEstimator(m_theInstances.
						attribute(j).numValues()
						, true);
        }
        else {
          m_modelNormal[i][j][0] = m_modelNormal[i][j][1] = 
	    m_modelNormal[i][j][2] = 0.0;
        }
      }
    }
  }


  /**
   * The M step of the EM algorithm.
   * @param inst the training instances
   * @param num_cl the number of clusters
   */
  private void M (Instances inst, int num_cl)
    throws Exception {
    int i, j, l;
    new_estimators(num_cl);

    for (i = 0; i < num_cl; i++) {
      for (j = 0; j < m_num_attribs; j++) {
        for (l = 0; l < inst.numInstances(); l++) {
          if (!inst.instance(l).isMissing(j)) {
            if (inst.attribute(j).isNominal()) {
              m_model[i][j].addValue(inst.instance(l).value(j), 
				     m_weights[l][i]);
            }
            else {
              m_modelNormal[i][j][0] += (inst.instance(l).value(j) * 
					 m_weights[l][i]);
              m_modelNormal[i][j][2] += m_weights[l][i];
              m_modelNormal[i][j][1] += (inst.instance(l).value(j) * 
					 inst.instance(l).value(j)*m_weights[l][i]);
            }
          }
        }
      }
    }
    
    // calcualte mean and std deviation for numeric attributes
    for (j = 0; j < m_num_attribs; j++) {
      if (!inst.attribute(j).isNominal()) {
        for (i = 0; i < num_cl; i++) {
          if (m_modelNormal[i][j][2] < 0) {
            m_modelNormal[i][j][1] = 0;
          } else {

	  // variance
	    m_modelNormal[i][j][1] = (m_modelNormal[i][j][1] - 
				      (m_modelNormal[i][j][0] * 
				       m_modelNormal[i][j][0] / 
				       m_modelNormal[i][j][2])) / 
	      m_modelNormal[i][j][2];

	    // std dev      
	    m_modelNormal[i][j][1] = Math.sqrt(m_modelNormal[i][j][1]); 

	    if (m_modelNormal[i][j][1] <= m_minStdDev 
		|| Double.isNaN(m_modelNormal[i][j][1])) {
	      m_modelNormal[i][j][1] = 
		m_minStdDev;
	    }
	    
	    // mean
	    if (m_modelNormal[i][j][2] > 0.0) {
	      m_modelNormal[i][j][0] /= m_modelNormal[i][j][2];
	    }
	  }        
        }
      }
    }
  }


  /**
   * The E step of the EM algorithm. Estimate cluster membership 
   * probabilities.
   *
   * @param inst the training instances
   * @param num_cl the number of clusters
   * @return the average log likelihood
   */
  private double E (Instances inst, int num_cl)
    throws Exception {
    int i, j, l;
    double prob;
    double loglk = 0.0;

    for (l = 0; l < inst.numInstances(); l++) {
      for (i = 0; i < num_cl; i++) {
	m_weights[l][i] = m_priors[i];
      }
      for (j = 0; j < m_num_attribs; j++) {
	double max = 0;
	for (i = 0; i < num_cl; i++) {
	  
          if (!inst.instance(l).isMissing(j)) {
            if (inst.attribute(j).isNominal()) {
              m_weights[l][i] *= 
		m_model[i][j].getProbability(inst.instance(l).value(j));
	      
            }
            else {
              // numeric attribute
              m_weights[l][i] *= normalDens(inst.instance(l).value(j), 
					    m_modelNormal[i][j][0], 
					    m_modelNormal[i][j][1]);
	      if (Double.isInfinite(m_weights[l][i])) {
		throw new Exception("Joint density has overflowed. Try "
				    +"increasing the minimum allowable "
				    +"standard deviation for normal "
				    +"density calculation.");
	      }
            }
	    if (m_weights[l][i] > max) {
	      max = m_weights[l][i];
	    }
          }
        }
	if (max > 0 && max < 1e-75) { // check for underflow
	  for (int zz = 0; zz < num_cl; zz++) {
	    // rescale
	    m_weights[l][zz] *= 1e75;
	  }
	}
      }
      
      double temp1 = 0;
      
      for (i = 0; i < num_cl; i++) {
        temp1 += m_weights[l][i];
      }
      
      if (temp1 > 0) {
        loglk += Math.log(temp1);
      }
      
      // normalise the weights for this instance
      try {
	Utils.normalize(m_weights[l]);
      } catch (Exception e) {
	throw new Exception("An instance has zero cluster memberships. Try "
			    +"increasing the minimum allowable "
			    +"standard deviation for normal "
			    +"density calculation.");
      }
    }
    
    // reestimate priors
    estimate_priors(inst, num_cl);
    return  loglk/inst.numInstances();
  }
  
  
  /**
   * Constructor.
   *
   **/
  public EM () {
    resetOptions();
  }


  /**
   * Reset to default options
   */
  protected void resetOptions () {
    m_minStdDev = 1e-6;
    m_max_iterations = 100;
    m_rseed = 100;
    m_num_clusters = -1;
    m_initialNumClusters = -1;
    m_verbose = false;
  }

  /**
   * Return the normal distributions for the cluster models
   *
   * @return a <code>double[][][]</code> value
   */
  public double [][][] getClusterModelsNumericAtts() {
    return m_modelNormal;
  }

  /**
   * Return the priors for the clusters
   *
   * @return a <code>double[]</code> value
   */
  public double [] getClusterPriors() {
    return m_priors;
  }

  /**
   * Outputs the generated clusters into a string.
   */
  public String toString () {
    StringBuffer text = new StringBuffer();
    text.append("\nEM\n==\n");
    if (m_initialNumClusters == -1) {
      text.append("\nNumber of clusters selected by cross validation: "
		  +m_num_clusters+"\n");
    } else {
      text.append("\nNumber of clusters: " + m_num_clusters + "\n");
    }

    for (int j = 0; j < m_num_clusters; j++) {
      text.append("\nCluster: " + j + " Prior probability: " 
		  + Utils.doubleToString(m_priors[j], 4) + "\n\n");

      for (int i = 0; i < m_num_attribs; i++) {
        text.append("Attribute: " + m_theInstances.attribute(i).name() + "\n");

        if (m_theInstances.attribute(i).isNominal()) {
          if (m_model[j][i] != null) {
            text.append(m_model[j][i].toString());
          }
        }
        else {
          text.append("Normal Distribution. Mean = " 
		      + Utils.doubleToString(m_modelNormal[j][i][0], 4) 
		      + " StdDev = " 
		      + Utils.doubleToString(m_modelNormal[j][i][1], 4) 
		      + "\n");
        }
      }
    }

    return  text.toString();
  }


  /**
   * verbose output for debugging
   * @param inst the training instances
   */
  private void EM_Report (Instances inst) {
    int i, j, l, m;
    System.out.println("======================================");

    for (j = 0; j < m_num_clusters; j++) {
      for (i = 0; i < m_num_attribs; i++) {
	System.out.println("Clust: " + j + " att: " + i + "\n");

	if (m_theInstances.attribute(i).isNominal()) {
	  if (m_model[j][i] != null) {
	    System.out.println(m_model[j][i].toString());
	  }
	}
	else {
	  System.out.println("Normal Distribution. Mean = " 
			     + Utils.doubleToString(m_modelNormal[j][i][0]
						    , 8, 4) 
			     + " StandardDev = " 
			     + Utils.doubleToString(m_modelNormal[j][i][1]
						    , 8, 4) 
			     + " WeightSum = " 
			     + Utils.doubleToString(m_modelNormal[j][i][2]
						    , 8, 4));
	}
      }
    }
    
    for (l = 0; l < inst.numInstances(); l++) {
      m = Utils.maxIndex(m_weights[l]);
      System.out.print("Inst " + Utils.doubleToString((double)l, 5, 0) 
		       + " Class " + m + "\t");
      for (j = 0; j < m_num_clusters; j++) {
	System.out.print(Utils.doubleToString(m_weights[l][j], 7, 5) + "  ");
      }
      System.out.println();
    }
  }


  /**
   * estimate the number of clusters by cross validation on the training
   * data.
   *
   * @return the number of clusters selected
   */
  private int CVClusters ()
    throws Exception {
    double CVLogLikely = -Double.MAX_VALUE;
    double templl, tll;
    boolean CVincreased = true;
    int num_cl = 1;
    int i;
    Random cvr;
    Instances trainCopy;
    int numFolds = (m_theInstances.numInstances() < 10) 
      ? m_theInstances.numInstances() 
      : 10;

    while (CVincreased) {
      CVincreased = false;
      cvr = new Random(m_rseed);
      trainCopy = new Instances(m_theInstances);
      trainCopy.randomize(cvr);
      // theInstances.stratify(10);
      templl = 0.0;

      for (i = 0; i < numFolds; i++) {
	Instances cvTrain = trainCopy.trainCV(numFolds, i);
	Instances cvTest = trainCopy.testCV(numFolds, i);
	EM_Init(cvTrain, num_cl);
	iterate(cvTrain, num_cl, false);
	tll = E(cvTest, num_cl);

	if (m_verbose) {
	  System.out.println("# clust: " + num_cl + " Fold: " + i 
			     + " Loglikely: " + tll);
	}

	templl += tll;
      }

      templl /= (double)numFolds;

      if (m_verbose) {
	System.out.println("===================================" 
			   + "==============\n# clust: " 
			   + num_cl 
			   + " Mean Loglikely: " 
			   + templl 
			   + "\n================================" 
			   + "=================");
      }

      if (templl > CVLogLikely) {
	CVLogLikely = templl;
	CVincreased = true;
	num_cl++;
      }
    }

    if (m_verbose) {
      System.out.println("Number of clusters: " + (num_cl - 1));
    }

    return  num_cl - 1;
  }


  /**
   * Returns the number of clusters.
   *
   * @return the number of clusters generated for a training dataset.
   * @exception Exception if number of clusters could not be returned
   * successfully
   */
  public int numberOfClusters ()
    throws Exception {
    if (m_num_clusters == -1) {
      throw  new Exception("Haven't generated any clusters!");
    }

    return  m_num_clusters;
  }

 /**
  * Updates the minimum and maximum values for all the attributes
  * based on a new instance.
  *
  * @param instance the new instance
  */
  private void updateMinMax(Instance instance) {
    
    for (int j = 0; j < m_theInstances.numAttributes(); j++) {
      if (!instance.isMissing(j)) {
	if (Double.isNaN(m_minValues[j])) {
	  m_minValues[j] = instance.value(j);
	  m_maxValues[j] = instance.value(j);
	} else {
	  if (instance.value(j) < m_minValues[j]) {
	    m_minValues[j] = instance.value(j);
	  } else {
	    if (instance.value(j) > m_maxValues[j]) {
	      m_maxValues[j] = instance.value(j);
	    }
	  }
	}
      }
    }
  }
  
  /**
   * Generates a clusterer. Has to initialize all fields of the clusterer
   * that are not being set via options.
   *
   * @param data set of instances serving as training data 
   * @exception Exception if the clusterer has not been 
   * generated successfully
   */
  public void buildClusterer (Instances data)
    throws Exception {
    if (data.checkForStringAttributes()) {
      throw  new Exception("Can't handle string attributes!");
    }
    
    m_theInstances = data;
    
    // calculate min and max values for attributes
    m_minValues = new double [m_theInstances.numAttributes()];
    m_maxValues = new double [m_theInstances.numAttributes()];
    for (int i = 0; i < m_theInstances.numAttributes(); i++) {
      m_minValues[i] = m_maxValues[i] = Double.NaN;
    }
    for (int i = 0; i < m_theInstances.numInstances(); i++) {
      updateMinMax(m_theInstances.instance(i));
    }

    doEM();
    
    // save memory
    m_theInstances = new Instances(m_theInstances,0);
  }

  /**
   * Computes the density for a given instance.
   * 
   * @param inst the instance to compute the density for
   * @return the density.
   * @exception Exception if the density could not be computed
   * successfully
   */
  public double densityForInstance(Instance inst) throws Exception {
    return Utils.sum(weightsForInstance(inst));
  }

  /**
   * Predicts the cluster memberships for a given instance.
   *
   * @param data set of test instances
   * @param instance the instance to be assigned a cluster.
   * @return an array containing the estimated membership 
   * probabilities of the test instance in each cluster (this 
   * should sum to at most 1)
   * @exception Exception if distribution could not be 
   * computed successfully
   */
  public double[] distributionForInstance (Instance inst)
    throws Exception {
    double [] distrib = weightsForInstance(inst);
    Utils.normalize(distrib);
    return distrib;
  }

  /**
   * Returns the weights (indicating cluster membership) for a given instance
   * 
   * @param inst the instance to be assigned a cluster
   * @return an array of weights
   * @exception Exception if weights could not be computed
   */
  protected double[] weightsForInstance(Instance inst)
    throws Exception {

    int i, j;
    double prob;
    double[] wghts = new double[m_num_clusters];

    for (i = 0; i < m_num_clusters; i++) {
      prob = 1.0;

      for (j = 0; j < m_num_attribs; j++) {
	if (!inst.isMissing(j)) {
	  if (inst.attribute(j).isNominal()) {
	    prob *= m_model[i][j].getProbability(inst.value(j));
	  }
	  else { // numeric attribute
	    prob *= normalDens(inst.value(j), 
			       m_modelNormal[i][j][0], 
			       m_modelNormal[i][j][1]);
	  }
	}
      }

      wghts[i] = (prob*m_priors[i]);
    }

    return  wghts;
  }


  /**
   * Perform the EM algorithm
   */
  private void doEM ()
    throws Exception {
    if (m_verbose) {
      System.out.println("Seed: " + m_rseed);
    }

    m_rr = new Random(m_rseed);

    // throw away numbers to avoid problem of similar initial numbers
    // from a similar seed
    for (int i=0; i<10; i++) m_rr.nextDouble();

    m_num_instances = m_theInstances.numInstances();
    m_num_attribs = m_theInstances.numAttributes();

    if (m_verbose) {
      System.out.println("Number of instances: " 
			 + m_num_instances 
			 + "\nNumber of atts: " 
			 + m_num_attribs 
			 + "\n");
    }

    // setDefaultStdDevs(theInstances);
    // cross validate to determine number of clusters?
    if (m_initialNumClusters == -1) {
      if (m_theInstances.numInstances() > 9) {
	m_num_clusters = CVClusters();
      } else {
	m_num_clusters = 1;
      }
    }

    // fit full training set
    EM_Init(m_theInstances, m_num_clusters);
    m_loglikely = iterate(m_theInstances, m_num_clusters, m_verbose);
  }


  /**
   * iterates the E and M steps until the log likelihood of the data
   * converges.
   *
   * @param inst the training instances.
   * @param num_cl the number of clusters.
   * @param report be verbose.
   * @return the log likelihood of the data
   */
  private double iterate (Instances inst, int num_cl, boolean report)
    throws Exception {
    int i;
    double llkold = 0.0;
    double llk = 0.0;

    if (report) {
      EM_Report(inst);
    }

    for (i = 0; i < m_max_iterations; i++) {
      llkold = llk;
      llk = E(inst, num_cl);

      if (report) {
	System.out.println("Loglikely: " + llk);
      }

      if (i > 0) {
	if ((llk - llkold) < 1e-6) {
	  break;
	}
      }
      M(inst, num_cl);
    }

    if (report) {
      EM_Report(inst);
    }

    return  llk;
  }


  // ============
  // Test method.
  // ============
  /**
   * Main method for testing this class.
   *
   * @param argv should contain the following arguments: <p>
   * -t training file [-T test file] [-N number of clusters] [-S random seed]
   */
  public static void main (String[] argv) {
    try {
      System.out.println(ClusterEvaluation.
			 evaluateClusterer(new EM(), argv));
    }
    catch (Exception e) {
      System.out.println(e.getMessage());
      e.printStackTrace();
    }
  }
}