/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * sIB.java
 * Copyright (C) 2008 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.clusterers;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.matrix.Matrix;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

/**
 <!-- globalinfo-start -->
 * Cluster data using the sequential information bottleneck algorithm.<br/>
 * <br/>
 * Note: only the hard clustering scheme is supported. sIB assigns each instance to the cluster that has the minimum cost/distance to the instance. The trade-off beta is set to infinity, so 1/beta is zero.<br/>
 * <br/>
 * For more information, see:<br/>
 * <br/>
 * Noam Slonim, Nir Friedman, Naftali Tishby: Unsupervised document classification using sequential information maximization. In: Proceedings of the 25th International ACM SIGIR Conference on Research and Development in Information Retrieval, 129-136, 2002.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * &#64;inproceedings{Slonim2002,
 *    author = {Noam Slonim and Nir Friedman and Naftali Tishby},
 *    booktitle = {Proceedings of the 25th International ACM SIGIR Conference on Research and Development in Information Retrieval},
 *    pages = {129-136},
 *    title = {Unsupervised document classification using sequential information maximization},
 *    year = {2002}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -I &lt;num&gt;
 *  maximum number of iterations
 *  (default 100).</pre>
 *
 * <pre> -M &lt;num&gt;
 *  minimum number of changes in a single iteration
 *  (default 0).</pre>
 *
 * <pre> -N &lt;num&gt;
 *  number of clusters.
 *  (default 2).</pre>
 *
 * <pre> -R &lt;num&gt;
 *  number of restarts.
 *  (default 5).</pre>
 *
 * <pre> -U
 *  set not to normalize the data
 *  (default true).</pre>
 *
 * <pre> -V
 *  set to output debug info
 *  (default false).</pre>
 *
 * <pre> -S &lt;num&gt;
 *  Random number seed.
 *  (default 1)</pre>
 *
 <!-- options-end -->
 *
 * @author Noam Slonim
 * @author <a href="mailto:lh92@cs.waikato.ac.nz">Anna Huang</a>
 * @version $Revision: 5538 $
 */
public class sIB
  extends RandomizableClusterer
  implements TechnicalInformationHandler {
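  // ---------------------------------------------------------------------
  // Usage sketch (added commentary, not part of the original source).
  // A minimal way to build this clusterer from code, assuming a numeric
  // (e.g. term-frequency) dataset in a file "data.arff" -- the file name
  // is illustrative only:
  //
  //   Instances data = new Instances(
  //       new java.io.BufferedReader(new java.io.FileReader("data.arff")));
  //   sIB clusterer = new sIB();
  //   clusterer.setNumClusters(3);    // -N 3
  //   clusterer.setNumRestarts(10);   // -R 10
  //   clusterer.buildClusterer(data);
  //   int cluster = clusterer.clusterInstance(data.instance(0));
  //
  // The same options apply on the command line, e.g.:
  //   java weka.clusterers.sIB -N 3 -R 10 -t data.arff
  // ---------------------------------------------------------------------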
  /** for serialization */
  private static final long serialVersionUID = -8652125897352654213L;

  /**
   * Inner class holding the statistics of the input data.
   *
   * @see Serializable
   */
  private class Input
    implements Serializable, RevisionHandler {

    /** for serialization */
    static final long serialVersionUID = -2464453171263384037L;

    /** Prior probability of each instance */
    private double[] Px;

    /** Prior probability of each attribute */
    private double[] Py;

    /** Joint distribution of attribute and instance */
    private Matrix Pyx;

    /** P[y|x] */
    private Matrix Py_x;

    /** Mutual information between the instances and the attributes */
    private double Ixy;

    /** Entropy of the attributes */
    private double Hy;

    /** Entropy of the instances */
    private double Hx;

    /** Sum of all values in the dataset */
    private double sumVals;

    /**
     * Returns the revision string.
     *
     * @return the revision
     */
    public String getRevision() {
      return RevisionUtils.extract("$Revision: 5538 $");
    }
  }

  /**
   * Inner class handling a whole partition.
   *
   * @see Serializable
   */
  private class Partition
    implements Serializable, RevisionHandler {

    /** for serialization */
    static final long serialVersionUID = 4957194978951259946L;

    /** Cluster assignment for each instance */
    private int[] Pt_x;

    /** Prior probability of each cluster */
    private double[] Pt;

    /** sIB equation score, used to evaluate the quality of the partition */
    private double L;

    /** Number of changes during the generation of this partition */
    private int counter;

    /** Attribute probabilities for each cluster */
    private Matrix Py_t;

    /**
     * Creates a new empty <code>Partition</code> instance.
     */
    public Partition() {
      Pt_x = new int[m_numInstances];
      for (int i = 0; i < m_numInstances; i++) {
        Pt_x[i] = -1;
      }
      Pt = new double[m_numCluster];
      Py_t = new Matrix(m_numAttributes, m_numCluster);
      counter = 0;
    }

    /**
     * Finds all the instances that have been assigned to cluster i.
     *
     * @param i index of the cluster
     * @return an ArrayList of the instance ids that have been assigned to cluster i
     */
    private ArrayList<Integer> find(int i) {
      ArrayList<Integer> indices = new ArrayList<Integer>();
      for (int x = 0; x < Pt_x.length; x++) {
        if (Pt_x[x] == i) {
          indices.add(x);
        }
      }
      return indices;
    }

    /**
     * Finds the size of cluster i.
     *
     * @param i index of the cluster
     * @return the size of cluster i
     */
    private int size(int i) {
      int count = 0;
      for (int x = 0; x < Pt_x.length; x++) {
        if (Pt_x[x] == i) {
          count++;
        }
      }
      return count;
    }

    /**
     * Copies the current partition into T.
     *
     * @param T the target partition object
     */
    private void copy(Partition T) {
      if (T == null) {
        T = new Partition();
      }
      System.arraycopy(Pt_x, 0, T.Pt_x, 0, Pt_x.length);
      System.arraycopy(Pt, 0, T.Pt, 0, Pt.length);
      T.L = L;
      T.counter = counter;

      double[][] mArray = Py_t.getArray();
      double[][] tgtArray = T.Py_t.getArray();
      for (int i = 0; i < mArray.length; i++) {
        System.arraycopy(mArray[i], 0, tgtArray[i], 0, mArray[0].length);
      }
    }

    /**
     * Outputs the current partition.
     *
     * @return a string that describes the partition
     */
    public String toString() {
      StringBuffer text = new StringBuffer();
      text.append("score (L) : " + Utils.doubleToString(L, 4) + "\n");
      text.append("number of changes : " + counter + "\n");
      for (int i = 0; i < m_numCluster; i++) {
        text.append("\nCluster " + i + "\n");
        text.append("size : " + size(i) + "\n");
        text.append("prior prob : " + Utils.doubleToString(Pt[i], 4) + "\n");
      }
      return text.toString();
    }

    /**
     * Returns the revision string.
     *
     * @return the revision
     */
    public String getRevision() {
      return RevisionUtils.extract("$Revision: 5538 $");
    }
  }
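  // Note on the partition score L (added commentary, not in the original):
  // sIB seeks the hard partition T that maximizes the mutual information
  // I(T;Y) between clusters T and attributes Y. The score stored in
  // Partition.L is computed by sIB_local_MI() as
  //   L = H(Y) + H(T) - H(Y,T) = I(T;Y),
  // so a larger L means the clusters retain more information about the
  // attribute (term) distribution.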
  /** Training data */
  private Instances m_data;

  /** Number of clusters */
  private int m_numCluster = 2;

  /** Number of restarts */
  private int m_numRestarts = 5;

  /** Verbose? */
  private boolean m_verbose = false;

  /** Uniform prior probability of the documents */
  private boolean m_uniformPrior = true;

  /** Max number of iterations during each restart */
  private int m_maxLoop = 100;

  /** Minimum number of changes */
  private int m_minChange = 0;

  /** Globally replace missing values */
  private ReplaceMissingValues m_replaceMissing;

  /** Number of instances */
  private int m_numInstances;

  /** Number of attributes */
  private int m_numAttributes;

  /** Randomly generates the initial partition */
  private Random random;

  /** Holds the best partition built */
  private Partition bestT;

  /** Holds the statistics about the input dataset */
  private Input input;

  /**
   * Generates a clusterer.
   *
   * @param data the training instances
   * @throws Exception if something goes wrong
   */
  public void buildClusterer(Instances data) throws Exception {
    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_replaceMissing = new ReplaceMissingValues();
    Instances instances = new Instances(data);
    instances.setClassIndex(-1);
    m_replaceMissing.setInputFormat(instances);
    data = weka.filters.Filter.useFilter(instances, m_replaceMissing);
    instances = null;

    // initialize all fields that are not being set via options
    m_data = data;
    m_numInstances = m_data.numInstances();
    m_numAttributes = m_data.numAttributes();
    random = new Random(getSeed());

    // initialize the statistics of the input training data
    input = sIB_ProcessInput();

    // object to hold the best partition
    bestT = new Partition();

    // the real clustering
    double bestL = Double.NEGATIVE_INFINITY;
    for (int k = 0; k < m_numRestarts; k++) {
      if (m_verbose) {
        System.out.format("restart number %s...\n", k);
      }

      // initialize the partition and optimize it
      Partition tmpT = sIB_InitT(input);
      tmpT = sIB_OptimizeT(tmpT, input);

      // if a better partition is found, save it
      if (tmpT.L > bestL) {
        tmpT.copy(bestT);
        bestL = bestT.L;
      }

      if (m_verbose) {
        System.out.println("\nPartition status : ");
        System.out.println("------------------");
        System.out.println(tmpT.toString() + "\n");
      }
    }

    if (m_verbose) {
      System.out.println("\nBest Partition");
      System.out.println("===============");
      System.out.println(bestT.toString());
    }

    // save memory
    m_data = new Instances(m_data, 0);
  }

  /**
   * Clusters a given instance. This is the method defined in the Clusterer
   * interface; it simply returns the cluster assigned to the instance.
   */
  public int clusterInstance(Instance instance) throws Exception {
    double prior = (double) 1 / input.sumVals;
    double[] distances = new double[m_numCluster];
    for (int i = 0; i < m_numCluster; i++) {
      double Pnew = bestT.Pt[i] + prior;
      double pi1 = prior / Pnew;
      double pi2 = bestT.Pt[i] / Pnew;
      distances[i] = Pnew * JS(instance, i, pi1, pi2);
    }
    return Utils.minIndex(distances);
  }
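  // Added commentary: both clusterInstance() above and the training-time
  // re-assignment use the merge cost from Slonim et al. (2002),
  //   d(x, t) = (p(x) + p(t)) * JS_{pi1, pi2}[ p(y|x), p(y|t) ],
  // where JS is the weighted Jensen-Shannon divergence, and the instance is
  // assigned to the cluster with the smallest cost. At test time the
  // instance prior p(x) is approximated by 1 / sumVals.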
  /**
   * Processes the input and computes the statistics of the training data.
   *
   * @return an Input object which holds the statistics about the training data
   */
  private Input sIB_ProcessInput() {
    double valSum = 0.0;
    for (int i = 0; i < m_numInstances; i++) {
      valSum = 0.0;
      for (int v = 0; v < m_data.instance(i).numValues(); v++) {
        valSum += m_data.instance(i).valueSparse(v);
      }
      if (valSum <= 0) {
        if (m_verbose) {
          System.out.format("Instance %s sum of value = %s <= 0, removed.\n",
              i, valSum);
        }
        m_data.delete(i);
        m_numInstances--;
        i--; // stay at this index: the next instance has shifted into this slot
      }
    }

    // get the term-document matrix
    Input input = new Input();
    input.Py_x = getTransposedNormedMatrix(m_data);
    if (m_uniformPrior) {
      input.Pyx = input.Py_x.copy();
      normalizePrior(m_data);
    } else {
      input.Pyx = getTransposedMatrix(m_data);
    }
    input.sumVals = getTotalSum(m_data);
    input.Pyx.timesEquals((double) 1 / input.sumVals);

    // prior probability of documents, i.e. sum the columns of the Pyx matrix
    input.Px = new double[m_numInstances];
    for (int i = 0; i < m_numInstances; i++) {
      for (int j = 0; j < m_numAttributes; j++) {
        input.Px[i] += input.Pyx.get(j, i);
      }
    }

    // prior probability of terms, i.e. sum the rows of the Pyx matrix
    input.Py = new double[m_numAttributes];
    for (int i = 0; i < input.Pyx.getRowDimension(); i++) {
      for (int j = 0; j < input.Pyx.getColumnDimension(); j++) {
        input.Py[i] += input.Pyx.get(i, j);
      }
    }

    MI(input.Pyx, input);
    return input;
  }

  /**
   * Initializes the partition.
   *
   * @param input object holding the statistics of the training data
   * @return the initialized partition
   */
  private Partition sIB_InitT(Input input) {
    Partition T = new Partition();
    int avgSize = (int) Math.ceil((double) m_numInstances / m_numCluster);

    ArrayList<Integer> permInstsIdx = new ArrayList<Integer>();
    ArrayList<Integer> unassigned = new ArrayList<Integer>();
    for (int i = 0; i < m_numInstances; i++) {
      unassigned.add(i);
    }
    while (unassigned.size() != 0) {
      int t = random.nextInt(unassigned.size());
      permInstsIdx.add(unassigned.get(t));
      unassigned.remove(t);
    }
    for (int i = 0; i < m_numCluster; i++) {
      int r2 = avgSize > permInstsIdx.size() ? permInstsIdx.size() : avgSize;
      for (int j = 0; j < r2; j++) {
        T.Pt_x[permInstsIdx.get(j)] = i;
      }
      for (int j = 0; j < r2; j++) {
        permInstsIdx.remove(0);
      }
    }

    // initialize the prior prob of each cluster, and the probability
    // of each attribute within the cluster
    for (int i = 0; i < m_numCluster; i++) {
      ArrayList<Integer> indices = T.find(i);
      for (int j = 0; j < indices.size(); j++) {
        T.Pt[i] += input.Px[indices.get(j)];
      }

      double[][] mArray = input.Pyx.getArray();
      for (int j = 0; j < m_numAttributes; j++) {
        double sum = 0.0;
        for (int k = 0; k < indices.size(); k++) {
          sum += mArray[j][indices.get(k)];
        }
        sum /= T.Pt[i];
        T.Py_t.set(j, i, sum);
      }
    }

    if (m_verbose) {
      System.out.println("Initializing...");
    }
    return T;
  }
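  // Added commentary: the optimization below is the "sequential" step of
  // sIB. In each pass every instance x is drawn out of its current cluster
  // (reduce_x), the merge cost d(x, t) is computed for every cluster, and
  // x is put into the cheapest one (updateAssignment). Since each such move
  // can only improve the objective, the score L converges to a local
  // maximum; restarts are used to escape poor local optima.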
  /**
   * Optimizes the partition.
   *
   * @param tmpT partition to be optimized
   * @param input object describing the statistics of the training dataset
   * @return the optimized partition
   */
  private Partition sIB_OptimizeT(Partition tmpT, Input input) {
    boolean done = false;
    int change = 0, loopCounter = 0;
    if (m_verbose) {
      System.out.println("Optimizing...");
      System.out.println("-------------");
    }
    while (!done) {
      change = 0;
      for (int i = 0; i < m_numInstances; i++) {
        int old_t = tmpT.Pt_x[i];
        // If the current cluster has only one instance left, leave it.
        if (tmpT.size(old_t) == 1) {
          if (m_verbose) {
            System.out.format("cluster %s has only 1 doc remaining\n", old_t);
          }
          continue;
        }

        // draw the instance out of its previous cluster
        reduce_x(i, old_t, tmpT, input);

        // re-cluster the instance
        int new_t = clusterInstance(i, input, tmpT);
        if (new_t != old_t) {
          change++;
          updateAssignment(i, new_t, tmpT, input.Px[i], input.Py_x);
        }
      }

      tmpT.counter += change;
      if (m_verbose) {
        System.out.format("iteration %s , changes : %s\n", loopCounter, change);
      }
      done = checkConvergence(change, loopCounter);
      loopCounter++;
    }

    // compute the sIB score
    tmpT.L = sIB_local_MI(tmpT.Py_t, tmpT.Pt);
    if (m_verbose) {
      System.out.format("score (L) : %s \n", Utils.doubleToString(tmpT.L, 4));
    }
    return tmpT;
  }

  /**
   * Draws an instance out of a cluster.
   *
   * @param instIdx index of the instance to be drawn out
   * @param t index of the cluster which the instance previously belonged to
   * @param T the current working partition
   * @param input the input statistics
   */
  private void reduce_x(int instIdx, int t, Partition T, Input input) {
    // Update the prior probability of the cluster
    ArrayList<Integer> indices = T.find(t);
    double sum = 0.0;
    for (int i = 0; i < indices.size(); i++) {
      if (indices.get(i) == instIdx)
        continue;
      sum += input.Px[indices.get(i)];
    }
    T.Pt[t] = sum;

    if (T.Pt[t] < 0) {
      System.out.format("Warning: probability < 0 (%s)\n", T.Pt[t]);
      T.Pt[t] = 0;
    }

    // Update the probability of each attribute in the cluster
    double[][] mArray = input.Pyx.getArray();
    for (int i = 0; i < m_numAttributes; i++) {
      sum = 0.0;
      for (int j = 0; j < indices.size(); j++) {
        if (indices.get(j) == instIdx)
          continue;
        sum += mArray[i][indices.get(j)];
      }
      T.Py_t.set(i, t, sum / T.Pt[t]);
    }
  }

  /**
   * Puts an instance into a new cluster and updates the partition.
   *
   * @param instIdx index of the instance to be updated
   * @param newt index of the new cluster this instance has been assigned to
   * @param T the current working partition
   * @param Px the prior probability of the instance
   * @param Py_x the conditional probabilities of the attributes given the instance
   */
  private void updateAssignment(int instIdx, int newt, Partition T, double Px,
      Matrix Py_x) {
    T.Pt_x[instIdx] = newt;

    // update the probability of the attributes in the cluster
    double mass = Px + T.Pt[newt];
    double pi1 = Px / mass;
    double pi2 = T.Pt[newt] / mass;
    for (int i = 0; i < m_numAttributes; i++) {
      T.Py_t.set(i, newt, pi1 * Py_x.get(i, instIdx) + pi2 * T.Py_t.get(i, newt));
    }
    T.Pt[newt] = mass;
  }

  /**
   * Checks whether the current iteration has converged.
   *
   * @param change number of changes in the current iteration
   * @param loops number of iterations done
   * @return true if the iteration has converged, false otherwise
   */
  private boolean checkConvergence(int change, int loops) {
    if (change <= m_minChange || loops >= m_maxLoop) {
      if (m_verbose) {
        System.out.format("\nsIB converged after %s iterations with %s changes\n",
            loops, change);
      }
      return true;
    }
    return false;
  }
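  // Added commentary: updateAssignment() above implements the standard
  // distributional merge of {x} into cluster t:
  //   p(t_new)     = p(x) + p(t)
  //   p(y | t_new) = pi1 * p(y|x) + pi2 * p(y|t),
  // with pi1 = p(x) / p(t_new) and pi2 = p(t) / p(t_new), so the cluster's
  // attribute distribution remains a properly weighted mixture.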
  /**
   * Clusters an instance into the nearest cluster.
   *
   * @param instIdx index of the instance to be clustered
   * @param input object which describes the statistics of the training dataset
   * @param T the current working partition
   * @return index of the cluster that has the minimum distance to the instance
   */
  private int clusterInstance(int instIdx, Input input, Partition T) {
    double[] distances = new double[m_numCluster];
    for (int i = 0; i < m_numCluster; i++) {
      double Pnew = input.Px[instIdx] + T.Pt[i];
      double pi1 = input.Px[instIdx] / Pnew;
      double pi2 = T.Pt[i] / Pnew;
      distances[i] = Pnew * JS(instIdx, input, T, i, pi1, pi2);
    }
    return Utils.minIndex(distances);
  }

  /**
   * Computes the JS divergence between an instance and a cluster, used for
   * training data.
   *
   * @param instIdx index of the instance
   * @param input statistics of the input data
   * @param T the whole partition
   * @param t index of the cluster
   * @param pi1 weight of the instance distribution in the merged distribution
   * @param pi2 weight of the cluster distribution in the merged distribution
   * @return the JS divergence
   */
  private double JS(int instIdx, Input input, Partition T, int t, double pi1,
      double pi2) {
    if (Math.min(pi1, pi2) <= 0) {
      System.out.format(
          "Warning: zero or negative weights in JS calculation! (pi1 %s, pi2 %s)\n",
          pi1, pi2);
      return 0;
    }
    Instance inst = m_data.instance(instIdx);
    double kl1 = 0.0, kl2 = 0.0, tmp = 0.0;
    for (int i = 0; i < inst.numValues(); i++) {
      tmp = input.Py_x.get(inst.index(i), instIdx);
      if (tmp != 0) {
        kl1 += tmp * Math.log(tmp / (tmp * pi1 + pi2 * T.Py_t.get(inst.index(i), t)));
      }
    }
    for (int i = 0; i < m_numAttributes; i++) {
      if ((tmp = T.Py_t.get(i, t)) != 0) {
        kl2 += tmp * Math.log(tmp / (input.Py_x.get(i, instIdx) * pi1 + pi2 * tmp));
      }
    }
    return pi1 * kl1 + pi2 * kl2;
  }
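  // Added commentary: both JS() overloads (the one above for training data,
  // the one below for test instances) compute the weighted Jensen-Shannon
  // divergence
  //   JS_{pi1,pi2}(p, q) = pi1 * KL(p || m) + pi2 * KL(q || m),
  //   m = pi1 * p + pi2 * q,
  // where p = p(y|x) is the instance's attribute distribution and
  // q = p(y|t) is the cluster's. Zero-valued terms may be skipped because
  // 0 * log(0 / z) is taken as 0.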
  /**
   * Computes the JS divergence between an instance and a cluster, used for
   * test data.
   *
   * @param inst instance to be clustered
   * @param t index of the cluster
   * @param pi1 weight of the instance distribution in the merged distribution
   * @param pi2 weight of the cluster distribution in the merged distribution
   * @return the JS divergence
   */
  private double JS(Instance inst, int t, double pi1, double pi2) {
    if (Math.min(pi1, pi2) <= 0) {
      System.out.format(
          "Warning: zero or negative weights in JS calculation! (pi1 %s, pi2 %s)\n",
          pi1, pi2);
      return 0;
    }
    double sum = Utils.sum(inst.toDoubleArray());
    double kl1 = 0.0, kl2 = 0.0, tmp = 0.0;
    for (int i = 0; i < inst.numValues(); i++) {
      tmp = inst.valueSparse(i) / sum;
      if (tmp != 0) {
        kl1 += tmp * Math.log(tmp / (tmp * pi1 + pi2 * bestT.Py_t.get(inst.index(i), t)));
      }
    }
    for (int i = 0; i < m_numAttributes; i++) {
      if ((tmp = bestT.Py_t.get(i, t)) != 0) {
        kl2 += tmp * Math.log(tmp / (inst.value(i) * pi1 / sum + pi2 * tmp));
      }
    }
    return pi1 * kl1 + pi2 * kl2;
  }

  /**
   * Computes the sIB score.
   *
   * @param m a term-cluster matrix, where m[i, j] is the probability of term i
   *          given cluster j
   * @param Pt an array of cluster prior probabilities
   * @return the sIB score, which indicates the quality of the partition
   */
  private double sIB_local_MI(Matrix m, double[] Pt) {
    double Hy = 0.0, Ht = 0.0;
    for (int i = 0; i < Pt.length; i++) {
      Ht += Pt[i] * Math.log(Pt[i]);
    }
    Ht = -Ht;

    for (int i = 0; i < m_numAttributes; i++) {
      double Py = 0.0;
      for (int j = 0; j < m_numCluster; j++) {
        Py += m.get(i, j) * Pt[j];
      }
      if (Py == 0)
        continue;
      Hy += Py * Math.log(Py);
    }
    Hy = -Hy;

    double Hyt = 0.0, tmp = 0.0;
    for (int i = 0; i < m.getRowDimension(); i++) {
      for (int j = 0; j < m.getColumnDimension(); j++) {
        if ((tmp = m.get(i, j)) == 0 || Pt[j] == 0) {
          continue;
        }
        tmp *= Pt[j];
        Hyt += tmp * Math.log(tmp);
      }
    }
    return Hy + Ht + Hyt;
  }

  /**
   * Gets the sum of all values in the dataset.
   *
   * @param data set of instances to handle
   * @return the sum of all the attribute values for all the instances in the dataset
   */
  private double getTotalSum(Instances data) {
    double sum = 0.0;
    for (int i = 0; i < data.numInstances(); i++) {
      for (int v = 0; v < data.instance(i).numValues(); v++) {
        sum += data.instance(i).valueSparse(v);
      }
    }
    return sum;
  }

  /**
   * Transposes the document-term matrix to a term-document matrix.
   *
   * @param data instances with document-term info
   * @return a term-document matrix transposed from the input dataset
   */
  private Matrix getTransposedMatrix(Instances data) {
    double[][] temp = new double[data.numAttributes()][data.numInstances()];
    for (int i = 0; i < data.numInstances(); i++) {
      Instance inst = data.instance(i);
      for (int v = 0; v < inst.numValues(); v++) {
        temp[inst.index(v)][i] = inst.valueSparse(v);
      }
    }
    Matrix My_x = new Matrix(temp);
    return My_x;
  }

  /**
   * Normalizes the document vectors.
   *
   * @param data instances to be normalized
   */
  private void normalizePrior(Instances data) {
    for (int i = 0; i < data.numInstances(); i++) {
      // store the normalized copy back into the dataset; the original code
      // discarded the return value, which left the data unchanged
      data.set(i, normalizeInstance(data.instance(i)));
    }
  }

  /**
   * Normalizes an instance so that its values sum to one.
   *
   * @param inst instance to be normalized
   * @return a new Instance with normalized values
   */
  private Instance normalizeInstance(Instance inst) {
    double[] vals = inst.toDoubleArray();
    double sum = Utils.sum(vals);
    for (int i = 0; i < vals.length; i++) {
      vals[i] /= sum;
    }
    return new DenseInstance(inst.weight(), vals);
  }

  /**
   * Builds a transposed, per-instance-normalized (term-document) matrix.
   *
   * @param data instances with document-term info
   * @return a term-document matrix whose columns each sum to one
   */
  private Matrix getTransposedNormedMatrix(Instances data) {
    Matrix matrix = new Matrix(data.numAttributes(), data.numInstances());
    for (int i = 0; i < data.numInstances(); i++) {
      double[] vals = data.instance(i).toDoubleArray();
      double sum = Utils.sum(vals);
      for (int v = 0; v < vals.length; v++) {
        vals[v] /= sum;
        matrix.set(v, i, vals[v]);
      }
    }
    return matrix;
  }
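  // Added commentary: MI() below uses the identity
  //   I(X;Y) = H(X) + H(Y) - H(X,Y)
  //          = H(X) + H(Y) + sum_{x,y} p(x,y) * log p(x,y),
  // accumulating the joint term over the sparse non-zero entries only.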
  /**
   * Computes the MI between instances and attributes.
   *
   * @param m the term-document matrix
   * @param input object that describes the statistics about the training data
   */
  private void MI(Matrix m, Input input) {
    int minDimSize = m.getColumnDimension() < m.getRowDimension() ?
        m.getColumnDimension() : m.getRowDimension();
    if (minDimSize < 2) {
      System.err.println("Warning : This is not a JOINT distribution");
      input.Hx = Entropy(m);
      input.Hy = 0;
      input.Ixy = 0;
      return;
    }

    input.Hx = Entropy(input.Px);
    input.Hy = Entropy(input.Py);

    double entropy = input.Hx + input.Hy;
    for (int i = 0; i < m_numInstances; i++) {
      Instance inst = m_data.instance(i);
      for (int v = 0; v < inst.numValues(); v++) {
        double tmp = m.get(inst.index(v), i);
        if (tmp <= 0)
          continue;
        entropy += tmp * Math.log(tmp);
      }
    }
    input.Ixy = entropy;
    if (m_verbose) {
      System.out.println("Ixy = " + input.Ixy);
    }
  }

  /**
   * Computes the entropy of an array of probabilities.
   *
   * @param probs array of non-negative and normalized probabilities
   * @return the entropy value
   */
  private double Entropy(double[] probs) {
    for (int i = 0; i < probs.length; i++) {
      if (probs[i] <= 0) {
        if (m_verbose) {
          System.out.println("Warning: Non-positive probability.");
        }
        return Double.NaN;
      }
    }

    // could be unnormalized, when normalization is not specified
    if (Math.abs(Utils.sum(probs) - 1) >= 1e-6) {
      if (m_verbose) {
        System.out.println("Warning: Not normalized.");
      }
      return Double.NaN;
    }

    double mi = 0.0;
    for (int i = 0; i < probs.length; i++) {
      mi += probs[i] * Math.log(probs[i]);
    }
    mi = -mi;
    return mi;
  }

  /**
   * Computes the entropy of a matrix of probabilities.
   *
   * @param p a matrix with non-negative and normalized probabilities
   * @return the entropy value
   */
  private double Entropy(Matrix p) {
    double mi = 0;
    for (int i = 0; i < p.getRowDimension(); i++) {
      for (int j = 0; j < p.getColumnDimension(); j++) {
        if (p.get(i, j) == 0) {
          continue;
        }
        // each entropy term is p * log(p); the original used '+' instead of '*'
        mi += p.get(i, j) * Math.log(p.get(i, j));
      }
    }
    mi = -mi;
    return mi;
  }

  /**
   * Parses a given list of options. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -I &lt;num&gt;
   *  maximum number of iterations
   *  (default 100).</pre>
   *
   * <pre> -M &lt;num&gt;
   *  minimum number of changes in a single iteration
   *  (default 0).</pre>
   *
   * <pre> -N &lt;num&gt;
   *  number of clusters.
   *  (default 2).</pre>
   *
   * <pre> -R &lt;num&gt;
   *  number of restarts.
   *  (default 5).</pre>
   *
   * <pre> -U
   *  set not to normalize the data
   *  (default true).</pre>
   *
   * <pre> -V
   *  set to output debug info
   *  (default false).</pre>
   *
   * <pre> -S &lt;num&gt;
   *  Random number seed.
   *  (default 1)</pre>
   *
   <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    String optionString = Utils.getOption('I', options);
    if (optionString.length() != 0) {
      setMaxIterations(Integer.parseInt(optionString));
    }
    optionString = Utils.getOption('M', options);
    if (optionString.length() != 0) {
      setMinChange(Integer.parseInt(optionString));
    }
    optionString = Utils.getOption('N', options);
    if (optionString.length() != 0) {
      setNumClusters(Integer.parseInt(optionString));
    }
    optionString = Utils.getOption('R', options);
    if (optionString.length() != 0) {
      setNumRestarts(Integer.parseInt(optionString));
    }
    setNotUnifyNorm(Utils.getFlag('U', options));
    setDebug(Utils.getFlag('V', options));

    super.setOptions(options);
  }
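  // Added commentary: a typical command line combining these options might
  // look like the following (the file name is illustrative only):
  //   java weka.clusterers.sIB -N 5 -R 10 -I 200 -M 0 -S 42 -t documents.arff
  // -U would switch off the per-instance normalization and -V would print
  // per-iteration progress.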
  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector<Option> result = new Vector<Option>();

    result.addElement(new Option("\tmaximum number of iterations\n"
        + "\t(default 100).", "I", 1, "-I <num>"));
    result.addElement(new Option(
        "\tminimum number of changes in a single iteration\n"
        + "\t(default 0).", "M", 1, "-M <num>"));
    result.addElement(new Option("\tnumber of clusters.\n"
        + "\t(default 2).", "N", 1, "-N <num>"));
    result.addElement(new Option("\tnumber of restarts.\n"
        + "\t(default 5).", "R", 1, "-R <num>"));
    result.addElement(new Option("\tset not to normalize the data\n"
        + "\t(default true).", "U", 0, "-U"));
    result.addElement(new Option("\tset to output debug info\n"
        + "\t(default false).", "V", 0, "-V"));

    Enumeration en = super.listOptions();
    while (en.hasMoreElements())
      result.addElement((Option) en.nextElement());

    return result.elements();
  }

  /**
   * Gets the current settings.
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions() {
    Vector<String> result = new Vector<String>();

    result.add("-I");
    result.add("" + getMaxIterations());
    result.add("-M");
    result.add("" + getMinChange());
    result.add("-N");
    result.add("" + getNumClusters());
    result.add("-R");
    result.add("" + getNumRestarts());
    if (getNotUnifyNorm()) {
      result.add("-U");
    }
    if (getDebug()) {
      result.add("-V");
    }

    String[] options = super.getOptions();
    for (int i = 0; i < options.length; i++) {
      result.add(options[i]);
    }

    return result.toArray(new String[result.size()]);
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String debugTipText() {
    return "If set to true, clusterer may output additional info to the console.";
  }

  /**
   * Sets debug mode - verbose output.
   *
   * @param v true for verbose output
   */
  public void setDebug(boolean v) {
    m_verbose = v;
  }

  /**
   * Gets debug mode.
   *
   * @return true if debug mode is set
   */
  public boolean getDebug() {
    return m_verbose;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property
   */
  public String maxIterationsTipText() {
    return "set maximum number of iterations (default 100)";
  }

  /**
   * Sets the max number of iterations.
   *
   * @param i max number of iterations
   */
  public void setMaxIterations(int i) {
    m_maxLoop = i;
  }

  /**
   * Gets the max number of iterations.
   *
   * @return max number of iterations
   */
  public int getMaxIterations() {
    return m_maxLoop;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property
   */
  public String minChangeTipText() {
    return "set minimum number of changes (default 0)";
  }

  /**
   * Sets the minimum number of changes.
   *
   * @param m the minimum number of changes
   */
  public void setMinChange(int m) {
    m_minChange = m;
  }

  /**
   * Gets the minimum number of changes.
   *
   * @return the minimum number of changes
   */
  public int getMinChange() {
    return m_minChange;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property
   */
  public String numClustersTipText() {
    return "set number of clusters (default 2)";
  }

  /**
   * Sets the number of clusters.
   *
   * @param n number of clusters
   */
  public void setNumClusters(int n) {
    m_numCluster = n;
  }

  /**
   * Gets the number of clusters.
   *
   * @return the number of clusters
   */
  public int getNumClusters() {
    return m_numCluster;
  }

  /**
   * Returns the number of clusters.
   *
   * @return the number of clusters
   */
  public int numberOfClusters() {
    return m_numCluster;
  }
  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property
   */
  public String numRestartsTipText() {
    return "set number of restarts (default 5)";
  }

  /**
   * Sets the number of restarts.
   *
   * @param i number of restarts
   */
  public void setNumRestarts(int i) {
    m_numRestarts = i;
  }

  /**
   * Gets the number of restarts.
   *
   * @return number of restarts
   */
  public int getNumRestarts() {
    return m_numRestarts;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property
   */
  public String notUnifyNormTipText() {
    return "set whether not to normalize each instance to a uniform prior probability (e.g. 1).";
  }

  /**
   * Sets whether to skip normalizing the instances to a uniform prior
   * probability before building the clusterer.
   *
   * @param b true to skip the normalization, false to normalize
   */
  public void setNotUnifyNorm(boolean b) {
    m_uniformPrior = !b;
  }

  /**
   * Gets whether the instances are normalized to a uniform prior probability
   * before building the clusterer.
   *
   * @return true if the normalization is skipped, false otherwise
   */
  public boolean getNotUnifyNorm() {
    return !m_uniformPrior;
  }

  /**
   * Returns a string describing this clusterer.
   *
   * @return a description of the clusterer suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return
        "Cluster data using the sequential information bottleneck algorithm.\n\n"
      + "Note: only the hard clustering scheme is supported. sIB assigns each "
      + "instance to the cluster that has the minimum cost/distance to the "
      + "instance. The trade-off beta is set to infinity, so 1/beta is zero.\n\n"
      + "For more information, see:\n\n"
      + getTechnicalInformation().toString();
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   *
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result;

    result = new TechnicalInformation(Type.INPROCEEDINGS);
    result.setValue(Field.AUTHOR, "Noam Slonim and Nir Friedman and Naftali Tishby");
    result.setValue(Field.YEAR, "2002");
    result.setValue(Field.TITLE, "Unsupervised document classification using sequential information maximization");
    result.setValue(Field.BOOKTITLE, "Proceedings of the 25th International ACM SIGIR Conference on Research and Development in Information Retrieval");
    result.setValue(Field.PAGES, "129-136");

    return result;
  }

  /**
   * Returns the default capabilities of the clusterer.
   *
   * @return the capabilities of this clusterer
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();
    result.disableAll();
    result.enable(Capability.NO_CLASS);

    // attributes
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    return result;
  }

  public String toString() {
    StringBuffer text = new StringBuffer();
    text.append("\nsIB\n===\n");
    text.append("\nNumber of clusters: " + m_numCluster + "\n");

    for (int j = 0; j < m_numCluster; j++) {
      text.append("\nCluster: " + j + " Size : " + bestT.size(j)
          + " Prior probability: " + Utils.doubleToString(bestT.Pt[j], 4) + "\n\n");
      for (int i = 0; i < m_numAttributes; i++) {
        text.append("Attribute: " + m_data.attribute(i).name() + "\n");
        text.append("Probability given the cluster = "
            + Utils.doubleToString(bestT.Py_t.get(i, j), 4) + "\n");
      }
    }
    return text.toString();
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 5538 $");
  }

  public static void main(String[] argv) {
    runClusterer(new sIB(), argv);
  }
}