DatasetAmbiguity.java example

Explorer
weka-master
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    DatasetAmbiguity.java
 *    Copyright (C) 2009 University of Waikato, Hamilton, New Zealand
 *
 */

package wekaexamples.core;

import weka.core.Instance;
import weka.core.InstanceComparator;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

import java.util.TreeSet;

/**
 * Helper class for determining ambiguity in datasets. It outputs the total
 * count of instances in the dataset first. Then, the number of unique
 * instances, taking all attributes including the class attribute into
 * account. Finally, the number of unique instances, this time excluding
 * the class attribute.
 * <p>
 * The difference between the first two numbers tells one how many duplicates
 * are in the data. The difference between the last two numbers indicates
 * how many instances "at least" have the exact same data, but differ in the class
 * attribute. "At least", as an instance with varying class values can
 * also have duplicates.
 *
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision$
 */
public class DatasetAmbiguity {

  /**
   * Counts the distinct instances in the dataset by inserting every instance
   * into a sorted set ordered by an {@link InstanceComparator}; instances the
   * comparator considers equal collapse into a single set entry.
   *
   * @param data		the dataset to inspect
   * @param includeClass	flag handed to the {@link InstanceComparator}
   * 				constructor: true to take the class attribute
   * 				into account, false to leave it out
   * @return			the number of distinct instances
   */
  private static int countUnique(Instances data, boolean includeClass) {
    TreeSet<Instance> unique =
      new TreeSet<Instance>(new InstanceComparator(includeClass));
    for (int i = 0; i < data.numInstances(); i++) {
      unique.add(data.instance(i));
    }
    return unique.size();
  }

  /**
   * Expects a dataset as first parameter. Last attribute is assumed to be
   * the class attribute. Any additional parameters are ignored.
   *
   * @param args	the command-line attributes
   * @throws IllegalArgumentException	if no dataset was supplied
   * @throws Exception	if something goes wrong
   */
  public static void main(String[] args) throws Exception {
    // fail with a readable usage message rather than an
    // ArrayIndexOutOfBoundsException when the dataset argument is missing
    if ((args == null) || (args.length < 1)) {
      throw new IllegalArgumentException(
	"Usage: " + DatasetAmbiguity.class.getName() + " <dataset>");
    }

    // load data
    Instances data = DataSource.read(args[0]);
    data.setClassIndex(data.numAttributes() - 1);

    // output total number of instances
    int total = data.numInstances();
    System.out.println("Total #instances: " + total);

    // output total number of unique instances (incl. class)
    int uniqueWithClass = countUnique(data, true);
    System.out.println("Unique #instances (incl. class): " + uniqueWithClass);

    // output total number of unique instances (excl. class)
    int uniqueWithoutClass = countUnique(data, false);
    System.out.println("Unique #instances (excl. class): " + uniqueWithoutClass);

    // output summary
    System.out.println();
    System.out.println("# of duplicate instances (exact): " + (total - uniqueWithClass));
    System.out.println("# of instances with different class (at least): " + (uniqueWithClass - uniqueWithoutClass));
  }
}