/*********************************************************************** This file is part of KEEL-software, the Data Mining tool for regression, classification, clustering, pattern mining and so on. Copyright (C) 2004-2010 F. Herrera (herrera@decsai.ugr.es) L. S�nchez (luciano@uniovi.es) J. Alcal�-Fdez (jalcala@decsai.ugr.es) S. Garc�a (sglopez@ujaen.es) A. Fern�ndez (alberto.fernandez@ujaen.es) J. Luengo (julianlm@decsai.ugr.es) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/ **********************************************************************/ /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * Instances.java * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand * */ package keel.Algorithms.SVM.SMO.core; import keel.Dataset.*; import org.core.Randomize; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.io.Serializable; import java.util.Enumeration; import java.util.Random; import java.util.Vector; /** * Class for handling an ordered set of weighted instances. <p> * * Typical usage: <p> * <pre> * import keel.Algorithms.SVM.SMO.core.converters.ConverterUtils.DataSource; * ... * * // Read all the instances in the file (ARFF, CSV, XRFF, ...) * DataSource source = new DataSource(filename); * Instances instances = source.getDataSet(); * * // Make the last attribute be the class * instances.setClassIndex(instances.numAttributes() - 1); * * // Print header and instances. * System.out.println("\nDataset:\n"); * System.out.println(instances); * * ... * </pre><p> * * All methods that change a set of instances are safe, ie. a change * of a set of instances does not affect any other sets of * instances. All methods that change a datasets's attribute * information clone the dataset before it is changed. * * @author Eibe Frank (eibe@cs.waikato.ac.nz) * @author Len Trigg (trigg@cs.waikato.ac.nz) * @author FracPete (fracpete at waikato dot ac dot nz) * @version $Revision: 1.1 $ */ //Modified to work with KEEL attributes, instead of Weka ones (Julian) public class Instances implements Serializable { /** for serialization */ static final long serialVersionUID = -19412345060742748L; /** The filename extension that should be used for arff files */ public final static String FILE_EXTENSION = ".arff"; /** The filename extension that should be used for bin. serialized instances files */ public final static String SERIALIZED_OBJ_FILE_EXTENSION = ".bsi"; /** The keyword used to denote the start of an arff header */ public final static String ARFF_RELATION = "@relation"; /** The keyword used to denote the start of the arff data section */ public final static String ARFF_DATA = "@data"; /** The dataset's name. */ protected /*@spec_public non_null@*/ String m_RelationName; /** The attribute information. */ protected /*@spec_public non_null@*/ FastVector m_Attributes; /* public invariant (\forall int i; 0 <= i && i < m_Attributes.size(); m_Attributes.elementAt(i) != null); */ /** The instances. */ protected /*@spec_public non_null@*/ FastVector m_Instances; /** The class attribute's index */ protected int m_ClassIndex; //@ protected invariant classIndex() == m_ClassIndex; /** The lines read so far in case of incremental loading. Since the * StreamTokenizer will be re-initialized with every instance that is read, * we have to keep track of the number of lines read so far. * @see #readInstance(Reader) */ protected int m_Lines = 0; /** * Reads an ARFF file from a reader, and assigns a weight of * one to each instance. Lets the index of the class * attribute be undefined (negative). * * @param reader the reader * @throws IOException if the ARFF file is not read * successfully */ // public Instances(/*@non_null@*/Reader reader) throws IOException { // ArffReader arff = new ArffReader(reader); // Instances dataset = arff.getData(); // initialize(dataset, dataset.numInstances()); // dataset.copyInstances(0, this, dataset.numInstances()); // compactify(); // } /** * Reads the header of an ARFF file from a reader and * reserves space for the given number of instances. Lets * the class index be undefined (negative). * * @param reader the reader * @param capacity the capacity * @throws IllegalArgumentException if the header is not read successfully * or the capacity is negative. * @throws IOException if there is a problem with the reader. * @deprecated instead of using this method in conjunction with the * <code>readInstance(Reader)</code> method, one should use the * <code>ArffLoader</code> or <code>DataSource</code> class instead. * @see weka.core.converters.ArffLoader * @see weka.core.converters.ConverterUtils.DataSource */ //@ requires capacity >= 0; //@ ensures classIndex() == -1; // @Deprecated public Instances(/*@non_null@*/Reader reader, int capacity) // throws IOException { // ArffReader arff = new ArffReader(reader, 0); // Instances header = arff.getStructure(); // initialize(header, capacity); // m_Lines = arff.getLineNo(); // } /** * Constructor copying all instances and references to * the header information from the given set of instances. * * @param dataset the set to be copied */ public Instances(/*@non_null@*/Instances dataset) { this(dataset, dataset.numInstances()); dataset.copyInstances(0, this, dataset.numInstances()); } /** * Constructor creating an empty set of instances. Copies references * to the header information from the given set of instances. Sets * the capacity of the set of instances to 0 if its negative. * * @param dataset the instances from which the header * information is to be taken * @param capacity the capacity of the new dataset */ public Instances(/*@non_null@*/Instances dataset, int capacity) { initialize(dataset, capacity); } /** * initializes with the header information of the given dataset and sets * the capacity of the set of instances. * * @param dataset the dataset to use as template * @param capacity the number of rows to reserve */ protected void initialize(Instances dataset, int capacity) { if (capacity < 0) capacity = 0; // Strings only have to be "shallow" copied because // they can't be modified. m_ClassIndex = dataset.m_ClassIndex; m_RelationName = dataset.m_RelationName; m_Attributes = dataset.m_Attributes; m_Instances = new FastVector(capacity); } /** * Creates a new set of instances by copying a * subset of another set. * * @param source the set of instances from which a subset * is to be created * @param first the index of the first instance to be copied * @param toCopy the number of instances to be copied * @throws IllegalArgumentException if first and toCopy are out of range */ //@ requires 0 <= first; //@ requires 0 <= toCopy; //@ requires first + toCopy <= source.numInstances(); public Instances(/*@non_null@*/Instances source, int first, int toCopy) { this(source, toCopy); if ((first < 0) || ((first + toCopy) > source.numInstances())) { throw new IllegalArgumentException("Parameters first and/or toCopy out "+ "of range"); } source.copyInstances(first, this, toCopy); } /** * Creates an empty set of instances. Uses the given * attribute information. Sets the capacity of the set of * instances to 0 if its negative. Given attribute information * must not be changed after this constructor has been used. * * @param name the name of the relation * @param attInfo the attribute information * @param capacity the capacity of the set */ public Instances(/*@non_null@*/String name, /*@non_null@*/FastVector attInfo, int capacity) { m_RelationName = name; m_ClassIndex = -1; m_Attributes = attInfo; // for (int i = 0; i < numAttributes(); i++) { // attribute(i).setIndex(i); // } m_Instances = new FastVector(capacity); } /** * Create a copy of the structure, but "cleanse" string types (i.e. * doesn't contain references to the strings seen in the past). * Also cleanses all relational attributes. * * @return a copy of the instance structure. */ // public Instances stringFreeStructure() { // FastVector atts = (FastVector)m_Attributes.copy(); // for (int i = 0 ; i < atts.size(); i++) { // Attribute att = (Attribute)atts.elementAt(i); // if (att.type() == Attribute.STRING) { // atts.setElementAt(new Attribute(att.name(), (FastVector)null), i); // } else if (att.type() == Attribute.RELATIONAL) { // atts.setElementAt(new Attribute(att.name(), new Instances(att.relation(), 0)), i); // } // } // Instances result = new Instances(relationName(), atts, 0); // result.m_ClassIndex = m_ClassIndex; // return result; // } /** * Adds one instance to the end of the set. * Shallow copies instance before it is added. Increases the * size of the dataset if it is not large enough. Does not * check if the instance is compatible with the dataset. * Note: String or relational values are not transferred. * * @param instance the instance to be added */ public void add(/*@non_null@*/ Instance instance) { Instance newInstance = (Instance)instance.copy(); newInstance.setDataset(this); m_Instances.addElement(newInstance); } /** * Returns an attribute. * * @param index the attribute's index (index starts with 0) * @return the attribute at the given position */ //@ requires 0 <= index; //@ requires index < m_Attributes.size(); //@ ensures \result != null; public /*@pure@*/ Attribute attribute(int index) { return (Attribute) m_Attributes.elementAt(index); } /** * Returns an attribute given its name. If there is more than * one attribute with the same name, it returns the first one. * Returns null if the attribute can't be found. * * @param name the attribute's name * @return the attribute with the given name, null if the * attribute can't be found */ public /*@pure@*/ Attribute attribute(String name) { for (int i = 0; i < numAttributes(); i++) { if (attribute(i).getName().equals(name)) { return attribute(i); } } return null; } /** * Checks for attributes of the given type in the dataset * * @param attType the attribute type to look for * @return true if attributes of the given type are present */ public boolean checkForAttributeType(int attType) { int i = 0; while (i < m_Attributes.size()) { if (attribute(i++).getType() == attType) { return true; } } return false; } /** * Checks for string attributes in the dataset * * @return true if string attributes are present, false otherwise */ public /*@pure@*/ boolean checkForStringAttributes() { return checkForAttributeType(Attribute.NOMINAL); } /** * Checks if the given instance is compatible * with this dataset. Only looks at the size of * the instance and the ranges of the values for * nominal and string attributes. * * @param instance the instance to check * @return true if the instance is compatible with the dataset */ public /*@pure@*/ boolean checkInstance(Instance instance) { if (instance.numAttributes() != numAttributes()) { return false; } for (int i = 0; i < numAttributes(); i++) { if (instance.isMissing(i)) { continue; } else if (attribute(i).getType()==Attribute.NOMINAL) { if (!(Utils.eq(instance.value(i), (double)(int)instance.value(i)))) { return false; } else if (Utils.sm(instance.value(i), 0) || Utils.gr(instance.value(i), attribute(i).getNumNominalValues())) { return false; } } } return true; } /** * Returns the class attribute. * * @return the class attribute * @throws UnassignedClassException if the class is not set */ //@ requires classIndex() >= 0; public /*@pure@*/ Attribute classAttribute() { if (m_ClassIndex < 0) { throw new UnassignedClassException("Class index is negative (not set)!"); } return attribute(m_ClassIndex); } /** * Returns the class attribute's index. Returns negative number * if it's undefined. * * @return the class index as an integer */ // ensures \result == m_ClassIndex; public /*@pure@*/ int classIndex() { return m_ClassIndex; } /** * Compactifies the set of instances. Decreases the capacity of * the set so that it matches the number of instances in the set. */ public void compactify() { m_Instances.trimToSize(); } /** * Removes all instances from the set. */ public void delete() { m_Instances = new FastVector(); } /** * Removes an instance at the given position from the set. * * @param index the instance's position (index starts with 0) */ //@ requires 0 <= index && index < numInstances(); public void delete(int index) { m_Instances.removeElementAt(index); } /** * Deletes an attribute at the given position * (0 to numAttributes() - 1). A deep copy of the attribute * information is performed before the attribute is deleted. * * @param position the attribute's position (position starts with 0) * @throws IllegalArgumentException if the given index is out of range * or the class attribute is being deleted */ //@ requires 0 <= position && position < numAttributes(); //@ requires position != classIndex(); public void deleteAttributeAt(int position) { if ((position < 0) || (position >= m_Attributes.size())) { throw new IllegalArgumentException("Index out of range"); } if (position == m_ClassIndex) { throw new IllegalArgumentException("Can't delete class attribute"); } freshAttributeInfo(); if (m_ClassIndex > position) { m_ClassIndex--; } m_Attributes.removeElementAt(position); for (int i = position; i < m_Attributes.size(); i++) { Attribute current = (Attribute)m_Attributes.elementAt(i); // current.setIndex(current.index() - 1); } for (int i = 0; i < numInstances(); i++) { instance(i).forceDeleteAttributeAt(position); } } /** * Deletes all attributes of the given type in the dataset. A deep copy of * the attribute information is performed before an attribute is deleted. * * @param attType the attribute type to delete * @throws IllegalArgumentException if attribute couldn't be * successfully deleted (probably because it is the class attribute). */ public void deleteAttributeType(int attType) { int i = 0; while (i < m_Attributes.size()) { if (attribute(i).getType() == attType) { deleteAttributeAt(i); } else { i++; } } } /** * Deletes all string attributes in the dataset. A deep copy of the attribute * information is performed before an attribute is deleted. * * @throws IllegalArgumentException if string attribute couldn't be * successfully deleted (probably because it is the class attribute). * @see #deleteAttributeType(int) */ public void deleteStringAttributes() { deleteAttributeType(Attribute.NOMINAL); } /** * Removes all instances with missing values for a particular * attribute from the dataset. * * @param attIndex the attribute's index (index starts with 0) */ //@ requires 0 <= attIndex && attIndex < numAttributes(); public void deleteWithMissing(int attIndex) { FastVector newInstances = new FastVector(numInstances()); for (int i = 0; i < numInstances(); i++) { if (!instance(i).isMissing(attIndex)) { newInstances.addElement(instance(i)); } } m_Instances = newInstances; } /** * Removes all instances with missing values for a particular * attribute from the dataset. * * @param att the attribute */ public void deleteWithMissing(/*@non_null@*/ Attribute att) { deleteWithMissing(indexOf(att)); } /** * Removes all instances with a missing class value * from the dataset. * * @throws UnassignedClassException if class is not set */ public void deleteWithMissingClass() { if (m_ClassIndex < 0) { throw new UnassignedClassException("Class index is negative (not set)!"); } deleteWithMissing(m_ClassIndex); } /** * Returns an enumeration of all the attributes. * * @return enumeration of all the attributes. */ public /*@non_null pure@*/ Enumeration enumerateAttributes() { return m_Attributes.elements(m_ClassIndex); } /** * Returns an enumeration of all instances in the dataset. * * @return enumeration of all instances in the dataset */ public /*@non_null pure@*/ Enumeration enumerateInstances() { return m_Instances.elements(); } /** * Checks if two headers are equivalent. * * @param dataset another dataset * @return true if the header of the given dataset is equivalent * to this header */ public /*@pure@*/ boolean equalHeaders(Instances dataset){ // Check class and all attributes if (m_ClassIndex != dataset.m_ClassIndex) { return false; } if (m_Attributes.size() != dataset.m_Attributes.size()) { return false; } for (int i = 0; i < m_Attributes.size(); i++) { if (!(attribute(i).equals(dataset.attribute(i)))) { return false; } } return true; } /** * Returns the first instance in the set. * * @return the first instance in the set */ //@ requires numInstances() > 0; public /*@non_null pure@*/ Instance firstInstance() { return (Instance)m_Instances.firstElement(); } /** * Returns a random number generator. The initial seed of the random * number generator depends on the given seed and the hash code of * a string representation of a instances chosen based on the given * seed. * * @param seed the given seed * @return the random number generator */ public Random getRandomNumberGenerator(long seed) { Random r = new Random(seed); r.setSeed(instance(r.nextInt(numInstances())).toString().hashCode() + seed); return r; } /** * Inserts an attribute at the given position (0 to * numAttributes()) and sets all values to be missing. * Shallow copies the attribute before it is inserted, and performs * a deep copy of the existing attribute information. * * @param att the attribute to be inserted * @param position the attribute's position (position starts with 0) * @throws IllegalArgumentException if the given index is out of range */ //@ requires 0 <= position; //@ requires position <= numAttributes(); public void insertAttributeAt(/*@non_null@*/ Attribute att, int position) { if ((position < 0) || (position > m_Attributes.size())) { throw new IllegalArgumentException("Index out of range"); } freshAttributeInfo(); // att.setIndex(position); m_Attributes.insertElementAt(att, position); for (int i = position + 1; i < m_Attributes.size(); i++) { Attribute current = (Attribute)m_Attributes.elementAt(i); // current.setIndex(indexOf(current) + 1); } for (int i = 0; i < numInstances(); i++) { instance(i).forceInsertAttributeAt(position); } if (m_ClassIndex >= position) { m_ClassIndex++; } } /** * Returns the instance at the given position. * * @param index the instance's index (index starts with 0) * @return the instance at the given position */ //@ requires 0 <= index; //@ requires index < numInstances(); public /*@non_null pure@*/ Instance instance(int index) { return (Instance)m_Instances.elementAt(index); } /** * Returns the kth-smallest attribute value of a numeric attribute. * Note that calling this method will change the order of the data! * * @param att the Attribute object * @param k the value of k * @return the kth-smallest value */ public double kthSmallestValue(Attribute att, int k) { return kthSmallestValue(indexOf(att), k); } /** * Returns the kth-smallest attribute value of a numeric attribute. * Note that calling this method will change the order of the data! * The number of non-missing values in the data must be as least * as last as k for this to work. * * @param attIndex the attribute's index * @param k the value of k * @return the kth-smallest value */ public double kthSmallestValue(int attIndex, int k) { if (attribute(attIndex).getType()==Attribute.NOMINAL) { throw new IllegalArgumentException("Instances: attribute must be numeric to compute kth-smallest value."); } int i,j; // move all instances with missing values to end j = numInstances() - 1; i = 0; while (i <= j) { if (instance(j).isMissing(attIndex)) { j--; } else { if (instance(i).isMissing(attIndex)) { swap(i,j); j--; } i++; } } if ((k < 0) || (k > j)) { throw new IllegalArgumentException("Instances: value for k for computing kth-smallest value too large."); } return instance(select(attIndex, 0, j, k)).value(attIndex); } /** * Returns the last instance in the set. * * @return the last instance in the set */ //@ requires numInstances() > 0; public /*@non_null pure@*/ Instance lastInstance() { return (Instance)m_Instances.lastElement(); } /** * Returns the mean (mode) for a numeric (nominal) attribute as * a floating-point value. Returns 0 if the attribute is neither nominal nor * numeric. If all values are missing it returns zero. * * @param attIndex the attribute's index (index starts with 0) * @return the mean or the mode */ public /*@pure@*/ double meanOrMode(int attIndex) { double result, found; int [] counts; if (!(attribute(attIndex).getType()==Attribute.NOMINAL)) { result = found = 0; for (int j = 0; j < numInstances(); j++) { if (!instance(j).isMissing(attIndex)) { found += instance(j).weight(); result += instance(j).weight()*instance(j).value(attIndex); } } if (found <= 0) { return 0; } else { return result / found; } } else if (attribute(attIndex).getType()==Attribute.NOMINAL) { counts = new int[attribute(attIndex).getNumNominalValues()]; for (int j = 0; j < numInstances(); j++) { if (!instance(j).isMissing(attIndex)) { counts[(int) instance(j).value(attIndex)] += instance(j).weight(); } } return (double)Utils.maxIndex(counts); } else { return 0; } } /** * Returns the mean (mode) for a numeric (nominal) attribute as a * floating-point value. Returns 0 if the attribute is neither * nominal nor numeric. If all values are missing it returns zero. * * @param att the attribute * @return the mean or the mode */ public /*@pure@*/ double meanOrMode(Attribute att) { return meanOrMode(indexOf(att)); } /** * Returns the number of attributes. * * @return the number of attributes as an integer */ //@ ensures \result == m_Attributes.size(); public /*@pure@*/ int numAttributes() { return m_Attributes.size(); } /** * Returns the number of class labels. * * @return the number of class labels as an integer if the class * attribute is nominal, 1 otherwise. * @throws UnassignedClassException if the class is not set */ //@ requires classIndex() >= 0; public /*@pure@*/ int numClasses() { if (m_ClassIndex < 0) { throw new UnassignedClassException("Class index is negative (not set)!"); } if (!(classAttribute().getType()==Attribute.NOMINAL)) { return 1; } else { return classAttribute().getNumNominalValues(); } } /** * Returns the number of distinct values of a given attribute. * Returns the number of instances if the attribute is a * string attribute. The value 'missing' is not counted. * * @param attIndex the attribute (index starts with 0) * @return the number of distinct values of a given attribute */ //@ requires 0 <= attIndex; //@ requires attIndex < numAttributes(); public /*@pure@*/ int numDistinctValues(int attIndex) { if (!(attribute(attIndex).getType()==Attribute.NOMINAL)) { double [] attVals = attributeToDoubleArray(attIndex); int [] sorted = Utils.sort(attVals); double prev = 0; int counter = 0; for (int i = 0; i < sorted.length; i++) { Instance current = instance(sorted[i]); if (current.isMissing(attIndex)) { break; } if ((i == 0) || (current.value(attIndex) > prev)) { prev = current.value(attIndex); counter++; } } return counter; } else { return attribute(attIndex).getNumNominalValues(); } } /** * Returns the number of distinct values of a given attribute. * Returns the number of instances if the attribute is a * string attribute. The value 'missing' is not counted. * * @param att the attribute * @return the number of distinct values of a given attribute */ public /*@pure@*/ int numDistinctValues(/*@non_null@*/Attribute att) { return numDistinctValues(indexOf(att)); } /** * Returns the number of instances in the dataset. * * @return the number of instances in the dataset as an integer */ //@ ensures \result == m_Instances.size(); public /*@pure@*/ int numInstances() { return m_Instances.size(); } /** * Shuffles the instances in the set so that they are ordered * randomly. * * @param random a random number generator */ public void randomize(Random random) { for (int j = numInstances() - 1; j > 0; j--) swap(j, random.nextInt(j+1)); } /** * Shuffles the instances in the set so that they are ordered * randomly. * It uses the Randomize method of KEEL */ public void randomize() { for (int j = numInstances() - 1; j > 0; j--) swap(j, Randomize.Randint(0, j+1)); } /** * Reads a single instance from the reader and appends it * to the dataset. Automatically expands the dataset if it * is not large enough to hold the instance. This method does * not check for carriage return at the end of the line. * * @param reader the reader * @return false if end of file has been reached * @throws IOException if the information is not read * successfully * @deprecated instead of using this method in conjunction with the * <code>readInstance(Reader)</code> method, one should use the * <code>ArffLoader</code> or <code>DataSource</code> class instead. * @see weka.core.converters.ArffLoader * @see weka.core.converters.ConverterUtils.DataSource */ // @Deprecated public boolean readInstance(Reader reader) throws IOException { // // ArffReader arff = new ArffReader(reader, this, m_Lines, 1); // Instance inst = arff.readInstance(arff.getData(), false); // m_Lines = arff.getLineNo(); // if (inst != null) { // add(inst); // return true; // } // else { // return false; // } // } /** * Returns the relation's name. * * @return the relation's name as a string */ //@ ensures \result == m_RelationName; public /*@pure@*/ String relationName() { return m_RelationName; } /** * Renames an attribute. This change only affects this * dataset. * * @param att the attribute's index (index starts with 0) * @param name the new name */ // public void renameAttribute(int att, String name) { // // Attribute newAtt = attribute(att).copy(name); // FastVector newVec = new FastVector(numAttributes()); // // for (int i = 0; i < numAttributes(); i++) { // if (i == att) { // newVec.addElement(newAtt); // } else { // newVec.addElement(attribute(i)); // } // } // m_Attributes = newVec; // } /** * Renames an attribute. This change only affects this * dataset. * * @param att the attribute * @param name the new name */ // public void renameAttribute(Attribute att, String name) { // // renameAttribute(indexOf(att), name); // } /** * Renames the value of a nominal (or string) attribute value. This * change only affects this dataset. * * @param att the attribute's index (index starts with 0) * @param val the value's index (index starts with 0) * @param name the new name */ // public void renameAttributeValue(int att, int val, String name) { // // Attribute newAtt = (Attribute)attribute(att).copy(); // FastVector newVec = new FastVector(numAttributes()); // //// newAtt.setValue(val, name); // for (int i = 0; i < numAttributes(); i++) { // if (i == att) { // newVec.addElement(newAtt); // } else { // newVec.addElement(attribute(i)); // } // } // m_Attributes = newVec; // } /** * Renames the value of a nominal (or string) attribute value. This * change only affects this dataset. * * @param att the attribute * @param val the value * @param name the new name */ // public void renameAttributeValue(Attribute att, String val, // String name) { // // int v = indexOfValue(att,val); //// int v = att.indexOfValue(val); // if (v == -1) throw new IllegalArgumentException(val + " not found"); // renameAttributeValue(indexOf(att), v, name); // } /** * Creates a new dataset of the same size using random sampling * with replacement. * * @param random a random number generator * @return the new dataset */ public Instances resample(Random random) { Instances newData = new Instances(this, numInstances()); while (newData.numInstances() < numInstances()) { newData.add(instance(random.nextInt(numInstances()))); } return newData; } /** * Creates a new dataset of the same size using random sampling * with replacement according to the current instance weights. The * weights of the instances in the new dataset are set to one. * * @param random a random number generator * @return the new dataset */ public Instances resampleWithWeights(Random random) { double [] weights = new double[numInstances()]; for (int i = 0; i < weights.length; i++) { weights[i] = instance(i).weight(); } return resampleWithWeights(random, weights); } /** * Creates a new dataset of the same size using random sampling * with replacement according to the given weight vector. The * weights of the instances in the new dataset are set to one. * The length of the weight vector has to be the same as the * number of instances in the dataset, and all weights have to * be positive. * * @param random a random number generator * @param weights the weight vector * @return the new dataset * @throws IllegalArgumentException if the weights array is of the wrong * length or contains negative weights. */ public Instances resampleWithWeights(Random random, double[] weights) { if (weights.length != numInstances()) { throw new IllegalArgumentException("weights.length != numInstances."); } Instances newData = new Instances(this, numInstances()); if (numInstances() == 0) { return newData; } double[] probabilities = new double[numInstances()]; double sumProbs = 0, sumOfWeights = Utils.sum(weights); for (int i = 0; i < numInstances(); i++) { sumProbs += random.nextDouble(); probabilities[i] = sumProbs; } Utils.normalize(probabilities, sumProbs / sumOfWeights); // Make sure that rounding errors don't mess things up probabilities[numInstances() - 1] = sumOfWeights; int k = 0; int l = 0; sumProbs = 0; while ((k < numInstances() && (l < numInstances()))) { if (weights[l] < 0) { throw new IllegalArgumentException("Weights have to be positive."); } sumProbs += weights[l]; while ((k < numInstances()) && (probabilities[k] <= sumProbs)) { newData.add(instance(l)); newData.instance(k).setWeight(1); k++; } l++; } return newData; } /** * Sets the class attribute. * * @param att attribute to be the class */ public void setClass(Attribute att) { m_ClassIndex = indexOf(att); } /** * Sets the class index of the set. * If the class index is negative there is assumed to be no class. * (ie. it is undefined) * * @param classIndex the new class index (index starts with 0) * @throws IllegalArgumentException if the class index is too big or < 0 */ public void setClassIndex(int classIndex) { if (classIndex >= numAttributes()) { throw new IllegalArgumentException("Invalid class index: " + classIndex); } m_ClassIndex = classIndex; } /** * Sets the relation's name. * * @param newName the new relation name. */ public void setRelationName(/*@non_null@*/String newName) { m_RelationName = newName; } /** * Sorts the instances based on an attribute. For numeric attributes, * instances are sorted in ascending order. For nominal attributes, * instances are sorted based on the attribute label ordering * specified in the header. Instances with missing values for the * attribute are placed at the end of the dataset. * * @param attIndex the attribute's index (index starts with 0) */ public void sort(int attIndex) { int i,j; // move all instances with missing values to end j = numInstances() - 1; i = 0; while (i <= j) { if (instance(j).isMissing(attIndex)) { j--; } else { if (instance(i).isMissing(attIndex)) { swap(i,j); j--; } i++; } } quickSort(attIndex, 0, j); } /** * Sorts the instances based on an attribute. For numeric attributes, * instances are sorted into ascending order. For nominal attributes, * instances are sorted based on the attribute label ordering * specified in the header. Instances with missing values for the * attribute are placed at the end of the dataset. * * @param att the attribute */ public void sort(Attribute att) { sort(indexOf(att)); } /** * Stratifies a set of instances according to its class values * if the class attribute is nominal (so that afterwards a * stratified cross-validation can be performed). * * @param numFolds the number of folds in the cross-validation * @throws UnassignedClassException if the class is not set */ public void stratify(int numFolds) { if (numFolds <= 0) { throw new IllegalArgumentException("Number of folds must be greater than 1"); } if (m_ClassIndex < 0) { throw new UnassignedClassException("Class index is negative (not set)!"); } if (classAttribute().getType()==Attribute.NOMINAL) { // sort by class int index = 1; while (index < numInstances()) { Instance instance1 = instance(index - 1); for (int j = index; j < numInstances(); j++) { Instance instance2 = instance(j); if ((instance1.classValue() == instance2.classValue()) || (instance1.classIsMissing() && instance2.classIsMissing())) { swap(index,j); index++; } } index++; } stratStep(numFolds); } } /** * Computes the sum of all the instances' weights. * * @return the sum of all the instances' weights as a double */ public /*@pure@*/ double sumOfWeights() { double sum = 0; for (int i = 0; i < numInstances(); i++) { sum += instance(i).weight(); } return sum; } /** * Creates the test set for one fold of a cross-validation on * the dataset. * * @param numFolds the number of folds in the cross-validation. Must * be greater than 1. * @param numFold 0 for the first fold, 1 for the second, ... * @return the test set as a set of weighted instances * @throws IllegalArgumentException if the number of folds is less than 2 * or greater than the number of instances. */ //@ requires 2 <= numFolds && numFolds < numInstances(); //@ requires 0 <= numFold && numFold < numFolds; public Instances testCV(int numFolds, int numFold) { int numInstForFold, first, offset; Instances test; if (numFolds < 2) { throw new IllegalArgumentException("Number of folds must be at least 2!"); } if (numFolds > numInstances()) { throw new IllegalArgumentException("Can't have more folds than instances!"); } numInstForFold = numInstances() / numFolds; if (numFold < numInstances() % numFolds){ numInstForFold++; offset = numFold; }else offset = numInstances() % numFolds; test = new Instances(this, numInstForFold); first = numFold * (numInstances() / numFolds) + offset; copyInstances(first, test, numInstForFold); return test; } /** * Returns the dataset as a string in ARFF format. Strings * are quoted if they contain whitespace characters, or if they * are a question mark. * * @return the dataset in ARFF format as a string */ public String toString() { StringBuffer text = new StringBuffer(); text.append(ARFF_RELATION).append(" "). append(Utils.quote(m_RelationName)).append("\n\n"); for (int i = 0; i < numAttributes(); i++) { text.append(attribute(i)).append("\n"); } text.append("\n").append(ARFF_DATA).append("\n"); text.append(stringWithoutHeader()); return text.toString(); } /** * Returns the instances in the dataset as a string in ARFF format. Strings * are quoted if they contain whitespace characters, or if they * are a question mark. * * @return the dataset in ARFF format as a string */ protected String stringWithoutHeader() { StringBuffer text = new StringBuffer(); for (int i = 0; i < numInstances(); i++) { text.append(instance(i)); if (i < numInstances() - 1) { text.append('\n'); } } return text.toString(); } /** * Creates the training set for one fold of a cross-validation * on the dataset. * * @param numFolds the number of folds in the cross-validation. Must * be greater than 1. * @param numFold 0 for the first fold, 1 for the second, ... * @return the training set * @throws IllegalArgumentException if the number of folds is less than 2 * or greater than the number of instances. */ //@ requires 2 <= numFolds && numFolds < numInstances(); //@ requires 0 <= numFold && numFold < numFolds; public Instances trainCV(int numFolds, int numFold) { int numInstForFold, first, offset; Instances train; if (numFolds < 2) { throw new IllegalArgumentException("Number of folds must be at least 2!"); } if (numFolds > numInstances()) { throw new IllegalArgumentException("Can't have more folds than instances!"); } numInstForFold = numInstances() / numFolds; if (numFold < numInstances() % numFolds) { numInstForFold++; offset = numFold; }else offset = numInstances() % numFolds; train = new Instances(this, numInstances() - numInstForFold); first = numFold * (numInstances() / numFolds) + offset; copyInstances(0, train, first); copyInstances(first + numInstForFold, train, numInstances() - first - numInstForFold); return train; } /** * Creates the training set for one fold of a cross-validation * on the dataset. The data is subsequently randomized based * on the given random number generator. * * @param numFolds the number of folds in the cross-validation. Must * be greater than 1. * @param numFold 0 for the first fold, 1 for the second, ... * @param random the random number generator * @return the training set * @throws IllegalArgumentException if the number of folds is less than 2 * or greater than the number of instances. */ //@ requires 2 <= numFolds && numFolds < numInstances(); //@ requires 0 <= numFold && numFold < numFolds; public Instances trainCV(int numFolds, int numFold, Random random) { Instances train = trainCV(numFolds, numFold); train.randomize(random); return train; } /** * Computes the variance for a numeric attribute. * * @param attIndex the numeric attribute (index starts with 0) * @return the variance if the attribute is numeric * @throws IllegalArgumentException if the attribute is not numeric */ public /*@pure@*/ double variance(int attIndex) { double sum = 0, sumSquared = 0, sumOfWeights = 0; if (attribute(attIndex).getType()==Attribute.NOMINAL) { throw new IllegalArgumentException("Can't compute variance because attribute is " + "not numeric!"); } for (int i = 0; i < numInstances(); i++) { if (!instance(i).isMissing(attIndex)) { sum += instance(i).weight() * instance(i).value(attIndex); sumSquared += instance(i).weight() * instance(i).value(attIndex) * instance(i).value(attIndex); sumOfWeights += instance(i).weight(); } } if (sumOfWeights <= 1) { return 0; } double result = (sumSquared - (sum * sum / sumOfWeights)) / (sumOfWeights - 1); // We don't like negative variance if (result < 0) { return 0; } else { return result; } } /** * Computes the variance for a numeric attribute. * * @param att the numeric attribute * @return the variance if the attribute is numeric * @throws IllegalArgumentException if the attribute is not numeric */ public /*@pure@*/ double variance(Attribute att) { return variance(indexOf(att)); } /** * Calculates summary statistics on the values that appear in this * set of instances for a specified attribute. * * @param index the index of the attribute to summarize (index starts with 0) * @return an AttributeStats object with it's fields calculated. */ //@ requires 0 <= index && index < numAttributes(); // public AttributeStats attributeStats(int index) { // // AttributeStats result = new AttributeStats(); // if (attribute(index).isNominal()) { // result.nominalCounts = new int [attribute(index).numValues()]; // } // if (attribute(index).isNumeric()) { // result.numericStats = new weka.experiment.Stats(); // } // result.totalCount = numInstances(); // // double [] attVals = attributeToDoubleArray(index); // int [] sorted = Utils.sort(attVals); // int currentCount = 0; // double prev = Instance.missingValue(); // for (int j = 0; j < numInstances(); j++) { // Instance current = instance(sorted[j]); // if (current.isMissing(index)) { // result.missingCount = numInstances() - j; // break; // } // if (current.value(index) == prev) { // currentCount++; // } else { // result.addDistinct(prev, currentCount); // currentCount = 1; // prev = current.value(index); // } // } // result.addDistinct(prev, currentCount); // result.distinctCount--; // So we don't count "missing" as a value // return result; // } /** * Gets the value of all instances in this dataset for a particular * attribute. Useful in conjunction with Utils.sort to allow iterating * through the dataset in sorted order for some attribute. * * @param index the index of the attribute. * @return an array containing the value of the desired attribute for * each instance in the dataset. */ //@ requires 0 <= index && index < numAttributes(); public /*@pure@*/ double [] attributeToDoubleArray(int index) { double [] result = new double[numInstances()]; for (int i = 0; i < result.length; i++) { result[i] = instance(i).value(index); } return result; } /** * Generates a string summarizing the set of instances. Gives a breakdown * for each attribute indicating the number of missing/discrete/unique * values and other information. * * @return a string summarizing the dataset */ // public String toSummaryString() { // // StringBuffer result = new StringBuffer(); // result.append("Relation Name: ").append(relationName()).append('\n'); // result.append("Num Instances: ").append(numInstances()).append('\n'); // result.append("Num Attributes: ").append(numAttributes()).append('\n'); // result.append('\n'); // // result.append(Utils.padLeft("", 5)).append(Utils.padRight("Name", 25)); // result.append(Utils.padLeft("Type", 5)).append(Utils.padLeft("Nom", 5)); // result.append(Utils.padLeft("Int", 5)).append(Utils.padLeft("Real", 5)); // result.append(Utils.padLeft("Missing", 12)); // result.append(Utils.padLeft("Unique", 12)); // result.append(Utils.padLeft("Dist", 6)).append('\n'); // for (int i = 0; i < numAttributes(); i++) { // Attribute a = attribute(i); // AttributeStats as = attributeStats(i); // result.append(Utils.padLeft("" + (i + 1), 4)).append(' '); // result.append(Utils.padRight(a.name(), 25)).append(' '); // long percent; // switch (a.type()) { // case Attribute.NOMINAL: // result.append(Utils.padLeft("Nom", 4)).append(' '); // percent = Math.round(100.0 * as.intCount / as.totalCount); // result.append(Utils.padLeft("" + percent, 3)).append("% "); // result.append(Utils.padLeft("" + 0, 3)).append("% "); // percent = Math.round(100.0 * as.realCount / as.totalCount); // result.append(Utils.padLeft("" + percent, 3)).append("% "); // break; // case Attribute.NUMERIC: // result.append(Utils.padLeft("Num", 4)).append(' '); // result.append(Utils.padLeft("" + 0, 3)).append("% "); // percent = Math.round(100.0 * as.intCount / as.totalCount); // result.append(Utils.padLeft("" + percent, 3)).append("% "); // percent = Math.round(100.0 * as.realCount / as.totalCount); // result.append(Utils.padLeft("" + percent, 3)).append("% "); // break; // case Attribute.DATE: // result.append(Utils.padLeft("Dat", 4)).append(' '); // result.append(Utils.padLeft("" + 0, 3)).append("% "); // percent = Math.round(100.0 * as.intCount / as.totalCount); // result.append(Utils.padLeft("" + percent, 3)).append("% "); // percent = Math.round(100.0 * as.realCount / as.totalCount); // result.append(Utils.padLeft("" + percent, 3)).append("% "); // break; // case Attribute.STRING: // result.append(Utils.padLeft("Str", 4)).append(' '); // percent = Math.round(100.0 * as.intCount / as.totalCount); // result.append(Utils.padLeft("" + percent, 3)).append("% "); // result.append(Utils.padLeft("" + 0, 3)).append("% "); // percent = Math.round(100.0 * as.realCount / as.totalCount); // result.append(Utils.padLeft("" + percent, 3)).append("% "); // break; // case Attribute.RELATIONAL: // result.append(Utils.padLeft("Rel", 4)).append(' '); // percent = Math.round(100.0 * as.intCount / as.totalCount); // result.append(Utils.padLeft("" + percent, 3)).append("% "); // result.append(Utils.padLeft("" + 0, 3)).append("% "); // percent = Math.round(100.0 * as.realCount / as.totalCount); // result.append(Utils.padLeft("" + percent, 3)).append("% "); // break; // default: // result.append(Utils.padLeft("???", 4)).append(' '); // result.append(Utils.padLeft("" + 0, 3)).append("% "); // percent = Math.round(100.0 * as.intCount / as.totalCount); // result.append(Utils.padLeft("" + percent, 3)).append("% "); // percent = Math.round(100.0 * as.realCount / as.totalCount); // result.append(Utils.padLeft("" + percent, 3)).append("% "); // break; // } // result.append(Utils.padLeft("" + as.missingCount, 5)).append(" /"); // percent = Math.round(100.0 * as.missingCount / as.totalCount); // result.append(Utils.padLeft("" + percent, 3)).append("% "); // result.append(Utils.padLeft("" + as.uniqueCount, 5)).append(" /"); // percent = Math.round(100.0 * as.uniqueCount / as.totalCount); // result.append(Utils.padLeft("" + percent, 3)).append("% "); // result.append(Utils.padLeft("" + as.distinctCount, 5)).append(' '); // result.append('\n'); // } // return result.toString(); // } /** * Copies instances from one set to the end of another * one. * * @param from the position of the first instance to be copied * @param dest the destination for the instances * @param num the number of instances to be copied */ //@ requires 0 <= from && from <= numInstances() - num; //@ requires 0 <= num; protected void copyInstances(int from, /*@non_null@*/ Instances dest, int num) { for (int i = 0; i < num; i++) { dest.add(instance(from + i)); } } /** * Replaces the attribute information by a clone of * itself. */ protected void freshAttributeInfo() { m_Attributes = (FastVector) m_Attributes.copyElements(); } /** * Returns string including all instances, their weights and * their indices in the original dataset. * * @return description of instance and its weight as a string */ protected /*@pure@*/ String instancesAndWeights(){ StringBuffer text = new StringBuffer(); for (int i = 0; i < numInstances(); i++) { text.append(instance(i) + " " + instance(i).weight()); if (i < numInstances() - 1) { text.append("\n"); } } return text.toString(); } /** * Partitions the instances around a pivot. Used by quicksort and * kthSmallestValue. * * @param attIndex the attribute's index (index starts with 0) * @param l the first index of the subset (index starts with 0) * @param r the last index of the subset (index starts with 0) * * @return the index of the middle element */ //@ requires 0 <= attIndex && attIndex < numAttributes(); //@ requires 0 <= left && left <= right && right < numInstances(); protected int partition(int attIndex, int l, int r) { double pivot = instance((l + r) / 2).value(attIndex); while (l < r) { while ((instance(l).value(attIndex) < pivot) && (l < r)) { l++; } while ((instance(r).value(attIndex) > pivot) && (l < r)) { r--; } if (l < r) { swap(l, r); l++; r--; } } if ((l == r) && (instance(r).value(attIndex) > pivot)) { r--; } return r; } /** * Implements quicksort according to Manber's "Introduction to * Algorithms". * * @param attIndex the attribute's index (index starts with 0) * @param left the first index of the subset to be sorted (index starts with 0) * @param right the last index of the subset to be sorted (index starts with 0) */ //@ requires 0 <= attIndex && attIndex < numAttributes(); //@ requires 0 <= first && first <= right && right < numInstances(); protected void quickSort(int attIndex, int left, int right) { if (left < right) { int middle = partition(attIndex, left, right); quickSort(attIndex, left, middle); quickSort(attIndex, middle + 1, right); } } /** * Implements computation of the kth-smallest element according * to Manber's "Introduction to Algorithms". * * @param attIndex the attribute's index (index starts with 0) * @param left the first index of the subset (index starts with 0) * @param right the last index of the subset (index starts with 0) * @param k the value of k * * @return the index of the kth-smallest element */ //@ requires 0 <= attIndex && attIndex < numAttributes(); //@ requires 0 <= first && first <= right && right < numInstances(); protected int select(int attIndex, int left, int right, int k) { if (left == right) { return left; } else { int middle = partition(attIndex, left, right); if ((middle - left + 1) >= k) { return select(attIndex, left, middle, k); } else { return select(attIndex, middle + 1, right, k - (middle - left + 1)); } } } /** * Help function needed for stratification of set. * * @param numFolds the number of folds for the stratification */ protected void stratStep (int numFolds){ FastVector newVec = new FastVector(m_Instances.capacity()); int start = 0, j; // create stratified batch while (newVec.size() < numInstances()) { j = start; while (j < numInstances()) { newVec.addElement(instance(j)); j = j + numFolds; } start++; } m_Instances = newVec; } /** * Swaps two instances in the set. * * @param i the first instance's index (index starts with 0) * @param j the second instance's index (index starts with 0) */ //@ requires 0 <= i && i < numInstances(); //@ requires 0 <= j && j < numInstances(); public void swap(int i, int j){ m_Instances.swap(i, j); } /** * Merges two sets of Instances together. The resulting set will have * all the attributes of the first set plus all the attributes of the * second set. The number of instances in both sets must be the same. * * @param first the first set of Instances * @param second the second set of Instances * @return the merged set of Instances * @throws IllegalArgumentException if the datasets are not the same size */ public static Instances mergeInstances(Instances first, Instances second) { if (first.numInstances() != second.numInstances()) { throw new IllegalArgumentException("Instance sets must be of the same size"); } // Create the vector of merged attributes FastVector newAttributes = new FastVector(); for (int i = 0; i < first.numAttributes(); i++) { newAttributes.addElement(first.attribute(i)); } for (int i = 0; i < second.numAttributes(); i++) { newAttributes.addElement(second.attribute(i)); } // Create the set of Instances Instances merged = new Instances(first.relationName() + '_' + second.relationName(), newAttributes, first.numInstances()); // Merge each instance for (int i = 0; i < first.numInstances(); i++) { merged.add(first.instance(i).mergeInstance(second.instance(i))); } return merged; } /** * Method for testing this class. * * @param argv should contain one element: the name of an ARFF file */ //@ requires argv != null; //@ requires argv.length == 1; //@ requires argv[0] != null; // public static void test(String [] argv) { // // Instances instances, secondInstances, train, test, empty; // Random random = new Random(2); // Reader reader; // int start, num; // FastVector testAtts, testVals; // int i,j; // // try{ // if (argv.length > 1) { // throw (new Exception("Usage: Instances [<filename>]")); // } // // // Creating set of instances from scratch // testVals = new FastVector(2); // testVals.addElement("first_value"); // testVals.addElement("second_value"); // testAtts = new FastVector(2); // testAtts.addElement(new Attribute("nominal_attribute", testVals)); // testAtts.addElement(new Attribute("numeric_attribute")); // instances = new Instances("test_set", testAtts, 10); // instances.add(new Instance(instances.numAttributes())); // instances.add(new Instance(instances.numAttributes())); // instances.add(new Instance(instances.numAttributes())); // instances.setClassIndex(0); // System.out.println("\nSet of instances created from scratch:\n"); // System.out.println(instances); // // if (argv.length == 1) { // String filename = argv[0]; // reader = new FileReader(filename); // // // Read first five instances and print them // System.out.println("\nFirst five instances from file:\n"); // instances = new Instances(reader, 1); // instances.setClassIndex(instances.numAttributes() - 1); // i = 0; // while ((i < 5) && (instances.readInstance(reader))) { // i++; // } // System.out.println(instances); // // // Read all the instances in the file // reader = new FileReader(filename); // instances = new Instances(reader); // // // Make the last attribute be the class // instances.setClassIndex(instances.numAttributes() - 1); // // // Print header and instances. // System.out.println("\nDataset:\n"); // System.out.println(instances); // System.out.println("\nClass index: "+instances.classIndex()); // } // // // Test basic methods based on class index. // System.out.println("\nClass name: "+instances.classAttribute().name()); // System.out.println("\nClass index: "+instances.classIndex()); // System.out.println("\nClass is nominal: " + // instances.classAttribute().isNominal()); // System.out.println("\nClass is numeric: " + // instances.classAttribute().isNumeric()); // System.out.println("\nClasses:\n"); // for (i = 0; i < instances.numClasses(); i++) { // System.out.println(instances.classAttribute().value(i)); // } // System.out.println("\nClass values and labels of instances:\n"); // for (i = 0; i < instances.numInstances(); i++) { // Instance inst = instances.instance(i); // System.out.print(inst.classValue() + "\t"); // System.out.print(inst.toString(inst.classIndex())); // if (instances.instance(i).classIsMissing()) { // System.out.println("\tis missing"); // } else { // System.out.println(); // } // } // // // Create random weights. // System.out.println("\nCreating random weights for instances."); // for (i = 0; i < instances.numInstances(); i++) { // instances.instance(i).setWeight(random.nextDouble()); // } // // // Print all instances and their weights (and the sum of weights). // System.out.println("\nInstances and their weights:\n"); // System.out.println(instances.instancesAndWeights()); // System.out.print("\nSum of weights: "); // System.out.println(instances.sumOfWeights()); // // // Insert an attribute // secondInstances = new Instances(instances); // Attribute testAtt = new Attribute("Inserted"); // secondInstances.insertAttributeAt(testAtt, 0); // System.out.println("\nSet with inserted attribute:\n"); // System.out.println(secondInstances); // System.out.println("\nClass name: " // + secondInstances.classAttribute().name()); // // // Delete the attribute // secondInstances.deleteAttributeAt(0); // System.out.println("\nSet with attribute deleted:\n"); // System.out.println(secondInstances); // System.out.println("\nClass name: " // + secondInstances.classAttribute().name()); // // // Test if headers are equal // System.out.println("\nHeaders equal: "+ // instances.equalHeaders(secondInstances) + "\n"); // // // Print data in internal format. // System.out.println("\nData (internal values):\n"); // for (i = 0; i < instances.numInstances(); i++) { // for (j = 0; j < instances.numAttributes(); j++) { // if (instances.instance(i).isMissing(j)) { // System.out.print("? "); // } else { // System.out.print(instances.instance(i).value(j) + " "); // } // } // System.out.println(); // } // // // Just print header // System.out.println("\nEmpty dataset:\n"); // empty = new Instances(instances, 0); // System.out.println(empty); // System.out.println("\nClass name: "+empty.classAttribute().name()); // // // Create copy and rename an attribute and a value (if possible) // if (empty.classAttribute().isNominal()) { // Instances copy = new Instances(empty, 0); // copy.renameAttribute(copy.classAttribute(), "new_name"); // copy.renameAttributeValue(copy.classAttribute(), // copy.classAttribute().value(0), // "new_val_name"); // System.out.println("\nDataset with names changed:\n" + copy); // System.out.println("\nOriginal dataset:\n" + empty); // } // // // Create and prints subset of instances. // start = instances.numInstances() / 4; // num = instances.numInstances() / 2; // System.out.print("\nSubset of dataset: "); // System.out.println(num + " instances from " + (start + 1) // + ". instance"); // secondInstances = new Instances(instances, start, num); // System.out.println("\nClass name: " // + secondInstances.classAttribute().name()); // // // Print all instances and their weights (and the sum of weights). // System.out.println("\nInstances and their weights:\n"); // System.out.println(secondInstances.instancesAndWeights()); // System.out.print("\nSum of weights: "); // System.out.println(secondInstances.sumOfWeights()); // // // Create and print training and test sets for 3-fold // // cross-validation. // System.out.println("\nTrain and test folds for 3-fold CV:"); // if (instances.classAttribute().isNominal()) { // instances.stratify(3); // } // for (j = 0; j < 3; j++) { // train = instances.trainCV(3,j, new Random(1)); // test = instances.testCV(3,j); // // // Print all instances and their weights (and the sum of weights). // System.out.println("\nTrain: "); // System.out.println("\nInstances and their weights:\n"); // System.out.println(train.instancesAndWeights()); // System.out.print("\nSum of weights: "); // System.out.println(train.sumOfWeights()); // System.out.println("\nClass name: "+train.classAttribute().name()); // System.out.println("\nTest: "); // System.out.println("\nInstances and their weights:\n"); // System.out.println(test.instancesAndWeights()); // System.out.print("\nSum of weights: "); // System.out.println(test.sumOfWeights()); // System.out.println("\nClass name: "+test.classAttribute().name()); // } // // // Randomize instances and print them. // System.out.println("\nRandomized dataset:"); // instances.randomize(random); // // // Print all instances and their weights (and the sum of weights). // System.out.println("\nInstances and their weights:\n"); // System.out.println(instances.instancesAndWeights()); // System.out.print("\nSum of weights: "); // System.out.println(instances.sumOfWeights()); // // // Sort instances according to first attribute and // // print them. // System.out.print("\nInstances sorted according to first attribute:\n "); // instances.sort(0); // // // Print all instances and their weights (and the sum of weights). // System.out.println("\nInstances and their weights:\n"); // System.out.println(instances.instancesAndWeights()); // System.out.print("\nSum of weights: "); // System.out.println(instances.sumOfWeights()); // } catch (Exception e) { // e.printStackTrace(); // } // } /** * Main method for this class. The following calls are possible: * <ul> * <li> * <code>weka.core.Instances</code> help<br/> * prints a short list of possible commands. * </li> * <li> * <code>weka.core.Instances</code> <filename><br/> * prints a summary of a set of instances. * </li> * <li> * <code>weka.core.Instances</code> merge <filename1> <filename2><br/> * merges the two datasets (must have same number of instances) and * outputs the results on stdout. * </li> * <li> * <code>weka.core.Instances</code> append <filename1> <filename2><br/> * appends the second dataset to the first one (must have same headers) and * outputs the results on stdout. * </li> * <li> * <code>weka.core.Instances</code> randomize <seed> <filename><br/> * randomizes the dataset with the given seed and outputs the result on stdout. * </li> * </ul> * * @param args the commandline parameters */ // public static void main(String[] args) { // // try { // Instances i; // // read from stdin and print statistics // if (args.length == 0) { // DataSource source = new DataSource(System.in); // i = source.getDataSet(); // System.out.println(i.toSummaryString()); // } // // read file and print statistics // else if ((args.length == 1) && (!args[0].equals("-h")) && (!args[0].equals("help"))) { // DataSource source = new DataSource(args[0]); // i = source.getDataSet(); // System.out.println(i.toSummaryString()); // } // // read two files, merge them and print result to stdout // else if ((args.length == 3) && (args[0].toLowerCase().equals("merge"))) { // DataSource source1 = new DataSource(args[1]); // DataSource source2 = new DataSource(args[2]); // i = Instances.mergeInstances(source1.getDataSet(), source2.getDataSet()); // System.out.println(i); // } // // read two files, append them and print result to stdout // else if ((args.length == 3) && (args[0].toLowerCase().equals("append"))) { // DataSource source1 = new DataSource(args[1]); // DataSource source2 = new DataSource(args[2]); // if (!source1.getStructure().equalHeaders(source2.getStructure())) // throw new Exception("The two datasets have different headers!"); // Instances structure = source1.getStructure(); // System.out.println(source1.getStructure()); // while (source1.hasMoreElements(structure)) // System.out.println(source1.nextElement(structure)); // structure = source2.getStructure(); // while (source2.hasMoreElements(structure)) // System.out.println(source2.nextElement(structure)); // } // // read file and seed value, randomize data and print result to stdout // else if ((args.length == 3) && (args[0].toLowerCase().equals("randomize"))) { // DataSource source = new DataSource(args[2]); // i = source.getDataSet(); // i.randomize(new Random(Integer.parseInt(args[1]))); // System.out.println(i); // } // // wrong parameters // else { // System.err.println( // "\nUsage:\n" // + "\tweka.core.Instances help\n" // + "\tweka.core.Instances <filename>\n" // + "\tweka.core.Instances merge <filename1> <filename2>\n" // + "\tweka.core.Instances append <filename1> <filename2>\n" // + "\tweka.core.Instances randomize <seed> <filename>\n" // ); // System.exit(1); // } // } // catch (Exception ex) { // ex.printStackTrace(); // System.err.println(ex.getMessage()); // } // } protected int indexOf(Attribute att){ int index = -1; for(int i=0;i<Attributes.getNumAttributes() && index==-1;i++){ if(Attributes.getAttribute(i)==att) index = i; } return index; } protected int indexOfValue(Attribute att,String value){ Vector vals = att.getNominalValuesList(); String s; int index = -1; for(int i=0;i<vals.size() && index == -1;i++){ s = (String)vals.get(i); if(value.compareTo(s)==0) index = i; } return index; } }