/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * PropositionalToMultiInstance.java * Copyright (C) 2005 University of Waikato, Hamilton, New Zealand * */ package weka.filters.unsupervised.attribute; import weka.core.Attribute; import weka.core.Capabilities; import weka.core.FastVector; import weka.core.Instance; import weka.core.Instances; import weka.core.Option; import weka.core.OptionHandler; import weka.core.RelationalLocator; import weka.core.RevisionUtils; import weka.core.StringLocator; import weka.core.Utils; import weka.core.Capabilities.Capability; import weka.filters.Filter; import weka.filters.UnsupervisedFilter; import java.util.Enumeration; import java.util.Random; import java.util.Vector; import weka.core.DenseInstance; /** <!-- globalinfo-start --> * Converts the propositional instance dataset into multi-instance dataset (with relational attribute). When normalize or standardize a multi-instance dataset, a MIToSingleInstance filter can be applied first to convert the multi-instance dataset into propositional instance dataset. After normalization or standardization, may use this PropositionalToMultiInstance filter to convert the data back to multi-instance format.<br/> * <br/> * Note: the first attribute of the original propositional instance dataset must be a nominal attribute which is expected to be bagId attribute. * <p/> <!-- globalinfo-end --> * <!-- options-start --> * Valid options are: <p/> * * <pre> -S <num> * The seed for the randomization of the order of bags. (default 1)</pre> * * <pre> -R * Randomizes the order of the produced bags after the generation. (default off)</pre> * <!-- options-end --> * * @author Lin Dong (ld21@cs.waikato.ac.nz) * @version $Revision: 5547 $ * @see MultiInstanceToPropositional */ public class PropositionalToMultiInstance extends Filter implements OptionHandler, UnsupervisedFilter { /** for serialization */ private static final long serialVersionUID = 5825873573912102482L; /** the seed for randomizing, default is 1 */ protected int m_Seed = 1; /** whether to randomize the output data */ protected boolean m_Randomize = false; /** Indices of string attributes in the bag */ protected StringLocator m_BagStringAtts = null; /** Indices of relational attributes in the bag */ protected RelationalLocator m_BagRelAtts = null; /** * Returns a string describing this filter * * @return a description of the filter suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "Converts the propositional instance dataset into multi-instance " + "dataset (with relational attribute). When normalize or standardize a " + "multi-instance dataset, a MIToSingleInstance filter can be applied " + "first to convert the multi-instance dataset into propositional " + "instance dataset. After normalization or standardization, may use " + "this PropositionalToMultiInstance filter to convert the data back to " + "multi-instance format.\n\n" + "Note: the first attribute of the original propositional instance " + "dataset must be a nominal attribute which is expected to be bagId " + "attribute."; } /** * Returns an enumeration describing the available options * * @return an enumeration of all the available options */ public Enumeration listOptions() { Vector result = new Vector(); result.addElement(new Option( "\tThe seed for the randomization of the order of bags." + "\t(default 1)", "S", 1, "-S <num>")); result.addElement(new Option( "\tRandomizes the order of the produced bags after the generation." + "\t(default off)", "R", 0, "-R")); return result.elements(); } /** * Parses a given list of options. <p/> * <!-- options-start --> * Valid options are: <p/> * * <pre> -S <num> * The seed for the randomization of the order of bags. (default 1)</pre> * * <pre> -R * Randomizes the order of the produced bags after the generation. (default off)</pre> * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String tmpStr; setRandomize(Utils.getFlag('R', options)); tmpStr = Utils.getOption('S', options); if (tmpStr.length() != 0) setSeed(Integer.parseInt(tmpStr)); else setSeed(1); } /** * Gets the current settings of the classifier. * * @return an array of strings suitable for passing to setOptions */ public String [] getOptions() { Vector result; result = new Vector(); result.add("-S"); result.add("" + getSeed()); if (m_Randomize) result.add("-R"); return (String[]) result.toArray(new String[result.size()]); } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String seedTipText() { return "The random seed used by the random number generator"; } /** * Sets the new seed for randomizing the order of the generated data * * @param value the new seed value */ public void setSeed(int value) { m_Seed = value; } /** * Returns the current seed value for randomizing the order of the generated * data * * @return the current seed value */ public int getSeed() { return m_Seed; } /** * Sets whether the order of the generated data is randomized * * @param value whether to randomize or not */ public void setRandomize(boolean value) { m_Randomize = value; } /** * Gets whether the order of the generated is randomized * * @return true if the order is randomized */ public boolean getRandomize() { return m_Randomize; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String randomizeTipText() { return "Whether the order of the generated data is randomized."; } /** * Returns the Capabilities of this filter. * * @return the capabilities of this object * @see Capabilities */ public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // attributes result.enable(Capability.NOMINAL_ATTRIBUTES); result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable(Capability.DATE_ATTRIBUTES); result.enable(Capability.STRING_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); // class result.enableAllClasses(); result.enable(Capability.MISSING_CLASS_VALUES); result.enable(Capability.NO_CLASS); return result; } /** * Sets the format of the input instances. * * @param instanceInfo an Instances object containing the input * instance structure (any instances contained in the object are * ignored - only the structure is required). * @return true if the outputFormat may be collected immediately * @throws Exception if the input format can't be set * successfully */ public boolean setInputFormat(Instances instanceInfo) throws Exception { if (instanceInfo.attribute(0).type()!= Attribute.NOMINAL) { throw new Exception("The first attribute type of the original propositional instance dataset must be Nominal!"); } super.setInputFormat(instanceInfo); /* create a new output format (multi-instance format) */ Instances newData = instanceInfo.stringFreeStructure(); Attribute attBagIndex = (Attribute) newData.attribute(0).copy(); Attribute attClass = (Attribute) newData.classAttribute().copy(); // remove the bagIndex attribute newData.deleteAttributeAt(0); // remove the class attribute newData.setClassIndex(-1); newData.deleteAttributeAt(newData.numAttributes() - 1); FastVector attInfo = new FastVector(3); attInfo.addElement(attBagIndex); attInfo.addElement(new Attribute("bag", newData)); // relation-valued attribute attInfo.addElement(attClass); Instances data = new Instances("Multi-Instance-Dataset", attInfo, 0); data.setClassIndex(data.numAttributes() - 1); super.setOutputFormat(data.stringFreeStructure()); m_BagStringAtts = new StringLocator(data.attribute(1).relation()); m_BagRelAtts = new RelationalLocator(data.attribute(1).relation()); return true; } /** * adds a new bag out of the given data and adds it to the output * * @param input the intput dataset * @param output the dataset this bag is added to * @param bagInsts the instances in this bag * @param bagIndex the bagIndex of this bag * @param classValue the associated class value * @param bagWeight the weight of the bag */ protected void addBag( Instances input, Instances output, Instances bagInsts, int bagIndex, double classValue, double bagWeight) { // copy strings/relational values for (int i = 0; i < bagInsts.numInstances(); i++) { RelationalLocator.copyRelationalValues( bagInsts.instance(i), false, input, m_InputRelAtts, bagInsts, m_BagRelAtts); StringLocator.copyStringValues( bagInsts.instance(i), false, input, m_InputStringAtts, bagInsts, m_BagStringAtts); } int value = output.attribute(1).addRelation(bagInsts); Instance newBag = new DenseInstance(output.numAttributes()); newBag.setValue(0, bagIndex); newBag.setValue(2, classValue); newBag.setValue(1, value); newBag.setWeight(bagWeight); newBag.setDataset(output); output.add(newBag); } /** * Adds an output instance to the queue. The derived class should use this * method for each output instance it makes available. * * @param instance the instance to be added to the queue. */ protected void push(Instance instance) { if (instance != null) { super.push(instance); // set correct references } } /** * Signify that this batch of input to the filter is finished. * If the filter requires all instances prior to filtering, * output() may now be called to retrieve the filtered instances. * * @return true if there are instances pending output * @throws IllegalStateException if no input structure has been defined */ public boolean batchFinished() { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } Instances input = getInputFormat(); input.sort(0); // make sure that bagID is sorted Instances output = getOutputFormat(); Instances bagInsts = output.attribute(1).relation(); Instance inst = new DenseInstance(bagInsts.numAttributes()); inst.setDataset(bagInsts); double bagIndex = input.instance(0).value(0); double classValue = input.instance(0).classValue(); double bagWeight = 0.0; // Convert pending input instances for(int i = 0; i < input.numInstances(); i++) { double currentBagIndex = input.instance(i).value(0); // copy the propositional instance value, except the bagIndex and the class value for (int j = 0; j < input.numAttributes() - 2; j++) inst.setValue(j, input.instance(i).value(j + 1)); inst.setWeight(input.instance(i).weight()); if (currentBagIndex == bagIndex){ bagInsts.add(inst); bagWeight += inst.weight(); } else{ addBag(input, output, bagInsts, (int) bagIndex, classValue, bagWeight); bagInsts = bagInsts.stringFreeStructure(); bagInsts.add(inst); bagIndex = currentBagIndex; classValue = input.instance(i).classValue(); bagWeight = inst.weight(); } } // reach the last instance, create and add the last bag addBag(input, output, bagInsts, (int) bagIndex, classValue, bagWeight); if (getRandomize()) output.randomize(new Random(getSeed())); for (int i = 0; i < output.numInstances(); i++) push(output.instance(i)); // Free memory flushInput(); m_NewBatch = true; m_FirstBatchDone = true; return (numPendingOutput() != 0); } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 5547 $"); } /** * Main method for running this filter. * * @param args should contain arguments to the filter: * use -h for help */ public static void main(String[] args) { runFilter(new PropositionalToMultiInstance(), args); } }