/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * DataGenerator.java * Copyright (C) 2005-2012 University of Waikato, Hamilton, New Zealand * */ package weka.datagenerators; import java.io.FileOutputStream; import java.io.PrintWriter; import java.io.Serializable; import java.io.StringWriter; import java.util.Enumeration; import java.util.HashSet; import java.util.Hashtable; import java.util.Random; import java.util.Vector; import weka.core.Instance; import weka.core.Instances; import weka.core.Option; import weka.core.OptionHandler; import weka.core.Randomizable; import weka.core.RevisionHandler; import weka.core.Utils; /** * Abstract superclass for data generators that generate data for * classifiers and clusterers. * * @author FracPete (fracpete at waikato dot ac dot nz) * @version $Revision: 8034 $ */ public abstract class DataGenerator implements OptionHandler, Randomizable, Serializable, RevisionHandler { /** for serialization */ private static final long serialVersionUID = -3698585946221802578L; /** Debugging mode */ protected boolean m_Debug = false; /** The format for the generated dataset */ protected Instances m_DatasetFormat = null; /** Relation name the dataset should have */ protected String m_RelationName = ""; /** Number of instances that should be produced into the dataset * this number is by default m_NumExamples, * but can be reset by the generator */ protected int m_NumExamplesAct; /** default output (is printed to stdout after generation) */ protected transient StringWriter m_DefaultOutput = new StringWriter(); /** PrintWriter for outputting the generated data */ protected transient PrintWriter m_Output = new PrintWriter(m_DefaultOutput); /** random number generator seed*/ protected int m_Seed; /** random number generator*/ protected Random m_Random = null; /** flag, that indicates whether the relationname is currently assembled */ protected boolean m_CreatingRelationName = false; /** a black list for options not to be listed (for derived generators) * in the makeOptionString method * @see #makeOptionString(DataGenerator) */ protected static HashSet m_OptionBlacklist; static { m_OptionBlacklist = new HashSet(); } /** * initializes with default settings. <br/> * Note: default values are set via a default<name> method. These * default methods are also used in the listOptions method and in the * setOptions method. Why? Derived generators can override the return value * of these default methods, to avoid exceptions. */ public DataGenerator() { clearBlacklist(); setNumExamplesAct(defaultNumExamplesAct()); setSeed(defaultSeed()); } /** * creates a vector out of the enumeration from the listOptions of the * super class. Only a "convenience" method. * @param enm the Enumeration to dump into a vector * @return the elements of the enumeration in a vector */ protected Vector enumToVector(Enumeration enm) { Vector result; result = new Vector(); while (enm.hasMoreElements()) result.add(enm.nextElement()); return result; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options */ public Enumeration listOptions() { Vector result; result = new Vector(); result.addElement(new Option( "\tPrints this help.", "h", 1, "-h")); result.addElement(new Option( "\tThe name of the output file, otherwise the generated data is\n" + "\tprinted to stdout.", "o", 1, "-o <file>")); result.addElement(new Option( "\tThe name of the relation.", "r", 1, "-r <name>")); result.addElement(new Option( "\tWhether to print debug informations.", "d", 0, "-d")); result.addElement(new Option( "\tThe seed for random function (default " + defaultSeed() + ")", "S", 1, "-S")); return result.elements(); } /** * Parses a list of options for this object. <p/> * * For list of valid options see class description. <p/> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String tmpStr; // remove unwanted options options = removeBlacklist(options); tmpStr = Utils.getOption('r', options); if (tmpStr.length() != 0) setRelationName(Utils.unquote(tmpStr)); else setRelationName(""); tmpStr = Utils.getOption('o', options); if (tmpStr.length() != 0) setOutput(new PrintWriter(new FileOutputStream(tmpStr))); else if (getOutput() == null) throw new Exception("No Output defined!"); setDebug(Utils.getFlag('d', options)); tmpStr = Utils.getOption('S', options); if (tmpStr.length() != 0) setSeed(Integer.parseInt(tmpStr)); else setSeed(defaultSeed()); } /** * Gets the current settings of the datagenerator RDG1. Removing of * blacklisted options has to be done in the derived class, that defines * the blacklist-entry. * * @return an array of strings suitable for passing to setOptions * @see #removeBlacklist(String[]) */ public String[] getOptions() { Vector result; result = new Vector(); // to avoid endless loop if (!m_CreatingRelationName) { result.add("-r"); result.add(Utils.quote(getRelationNameToUse())); } if (getDebug()) result.add("-d"); result.add("-S"); result.add("" + getSeed()); return (String[]) result.toArray(new String[result.size()]); } /** * Initializes the format for the dataset produced. * Must be called before the generateExample or generateExamples * methods are used. Also sets a default relation name in case * the current relation name is empty. * * @return the format for the dataset * @throws Exception if the generating of the format failed * @see #defaultRelationName() */ public Instances defineDataFormat() throws Exception { if (getRelationName().length() == 0) setRelationName(defaultRelationName()); return m_DatasetFormat; } /** * Generates one example of the dataset. * * @return the generated example * @throws Exception if the format of the dataset is not yet defined * @throws Exception if the generator only works with generateExamples * which means in non single mode */ public abstract Instance generateExample() throws Exception; /** * Generates all examples of the dataset. * * @return the generated dataset * @throws Exception if the format of the dataset is not yet defined * @throws Exception if the generator only works with generateExample, * which means in single mode */ public abstract Instances generateExamples() throws Exception; /** * Generates a comment string that documentates the data generator. * By default this string is added at the beginning of the produced output * as ARFF file type, next after the options. * * @return string contains info about the generated rules * @throws Exception if the generating of the documentation fails */ public abstract String generateStart () throws Exception; /** * Generates a comment string that documentates the data generator. * By default this string is added at the end of the produced output * as ARFF file type. * * @return string contains info about the generated rules * @throws Exception if the generating of the documentation fails */ public abstract String generateFinished () throws Exception; /** * Return if single mode is set for the given data generator * mode depends on option setting and or generator type. * * @return single mode flag * @throws Exception if mode is not set yet */ public abstract boolean getSingleModeFlag () throws Exception; /** * Sets the debug flag. * @param debug the new debug flag */ public void setDebug(boolean debug) { m_Debug = debug; } /** * Gets the debug flag. * @return the debug flag */ public boolean getDebug() { return m_Debug; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String debugTipText() { return "Whether the generator is run in debug mode or not."; } /** * Sets the relation name the dataset should have. * @param relationName the new relation name */ public void setRelationName(String relationName) { m_RelationName = relationName; } /** * returns a relation name based on the options * * @return a relation name based on the options */ protected String defaultRelationName() { StringBuffer result; String[] options; String option; int i; m_CreatingRelationName = true; result = new StringBuffer(this.getClass().getName()); options = getOptions(); for (i = 0; i < options.length; i++) { option = options[i].trim(); if (i > 0) result.append("_"); result.append(option.replaceAll(" ", "_")); } m_CreatingRelationName = false; return result.toString(); } /** * returns the relation name to use, i.e., in case the currently set * relation name is empty, a generic one is returned. Must be used in * defineDataFormat() * @return the relation name * @see #defaultRelationName() * @see #defineDataFormat() */ protected String getRelationNameToUse() { String result; result = getRelationName(); if (result.length() == 0) result = defaultRelationName(); return result; } /** * Gets the relation name the dataset should have. * @return the relation name the dataset should have */ public String getRelationName() { return m_RelationName; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String relationNameTipText() { return "The relation name of the generated data (if empty, a generic one will be supplied)."; } /** * returns the default number of actual examples * * @return the default number of actual examples */ protected int defaultNumExamplesAct() { return 0; } /** * Sets the number of examples the dataset should have. * @param numExamplesAct the new number of examples */ protected void setNumExamplesAct(int numExamplesAct) { m_NumExamplesAct = numExamplesAct; } /** * Gets the number of examples the dataset should have. * @return the number of examples the dataset should have */ public int getNumExamplesAct() { return m_NumExamplesAct; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ protected String numExamplesActTipText() { return "The actual number of examples to generate."; } /** * Sets the print writer. * @param newOutput the new print writer */ public void setOutput(PrintWriter newOutput) { m_Output = newOutput; m_DefaultOutput = null; } /** * Gets the print writer. * @return print writer object */ public PrintWriter getOutput() { return m_Output; } /** * Gets the string writer, which is used for outputting to stdout. * A workaround for the problem of closing stdout when closing the * associated Printwriter. * @return print string writer object */ public StringWriter defaultOutput() { return m_DefaultOutput; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String outputTipText() { return "The output writer to use for printing the generated data."; } /** * Sets the format of the dataset that is to be generated. * @param newFormat the new dataset format of the dataset */ public void setDatasetFormat(Instances newFormat) { m_DatasetFormat = new Instances(newFormat, 0); } /** * Gets the format of the dataset that is to be generated. * @return the dataset format of the dataset */ public Instances getDatasetFormat() { if (m_DatasetFormat != null) return new Instances(m_DatasetFormat, 0); else return null; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String formatTipText() { return "The data format to use."; } /** * returns the default seed * * @return the default seed */ protected int defaultSeed() { return 1; } /** * Gets the random number seed. * * @return the random number seed. */ public int getSeed() { return m_Seed; } /** * Sets the random number seed. * * @param newSeed the new random number seed. */ public void setSeed(int newSeed) { m_Seed = newSeed; m_Random = new Random(newSeed); } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String seedTipText() { return "The seed value for the random number generator."; } /** * Gets the random generator. * * @return the random generator */ public Random getRandom() { if (m_Random == null) m_Random = new Random (getSeed()); return m_Random; } /** * Sets the random generator. * * @param newRandom is the random generator. */ public void setRandom(Random newRandom) { m_Random = newRandom; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String randomTipText() { return "The random number generator to use."; } /** * Returns a string representing the dataset in the instance queue. * @return the string representing the output data format */ protected String toStringFormat() { if (m_DatasetFormat == null) return ""; return m_DatasetFormat.toString(); } /** * removes all entries from the options blacklist */ protected static void clearBlacklist() { m_OptionBlacklist.clear(); } /** * adds the given option, e.g., for "-V" use "V", to the blacklist of options * that are not to be output via the makeOptionString method * @param option the option to exclude from listing * @see #makeOptionString(DataGenerator) */ protected static void addToBlacklist(String option) { m_OptionBlacklist.add(option); } /** * checks, whether the given option is in the blacklist of options not to * be output by makeOptionString * @param option the option to check * @return true if the option is on the blacklist * @see #makeOptionString(DataGenerator) */ protected static boolean isOnBlacklist(String option) { return m_OptionBlacklist.contains(option); } /** * removes all the options from the options array that are blacklisted * * @param options the options to remove from the blacklist * @return the processed options array */ protected String[] removeBlacklist(String[] options) { Enumeration enm; Hashtable pool; Option option; // retrieve options that are on blacklist enm = listOptions(); pool = new Hashtable(); while (enm.hasMoreElements()) { option = (Option) enm.nextElement(); if (isOnBlacklist(option.name())) pool.put(option.name(), option); } // remove options enm = pool.keys(); while (enm.hasMoreElements()) { option = (Option) pool.get(enm.nextElement()); try { if (option.numArguments() == 0) Utils.getFlag(option.name(), options); else Utils.getOption(option.name(), options); } catch (Exception e) { e.printStackTrace(); } } return options; } /** * returns all the options in a string * * @param generator the DataGenerator to return all the options for * @return the assembled option string */ protected static String makeOptionString(DataGenerator generator) { StringBuffer result; Enumeration enm; Option option; result = new StringBuffer(); result.append("\nData Generator options:\n\n"); enm = generator.listOptions(); while (enm.hasMoreElements()) { option = (Option) enm.nextElement(); // skip option if on blacklist if (isOnBlacklist(option.name())) continue; result.append(option.synopsis() + "\n" + option.description() + "\n"); } return result.toString(); } /** * Calls the data generator. * * @param generator one of the data generators * @param options options of the data generator * @throws Exception if there was an error in the option list */ public static void makeData(DataGenerator generator, String[] options) throws Exception { boolean printhelp; Vector unknown; int i; // help? printhelp = (Utils.getFlag('h', options)); // read options if (!printhelp) { try { options = generator.removeBlacklist(options); generator.setOptions(options); // check for left-over options, but don't raise exception unknown = new Vector(); for (i = 0; i < options.length; i++) { if (options[i].length() != 0) unknown.add(options[i]); } if (unknown.size() > 0) { System.out.print("Unknown options:"); for (i = 0; i < unknown.size(); i++) System.out.print(" " + unknown.get(i)); System.out.println(); } } catch (Exception e) { e.printStackTrace(); printhelp = true; } } if (printhelp) { System.out.println(makeOptionString(generator)); return; } // define dataset format // computes actual number of examples to be produced generator.setDatasetFormat(generator.defineDataFormat()); // get print writer PrintWriter output = generator.getOutput(); // output of options output.println("%"); output.println("% Commandline"); output.println("%"); output.println("% " + generator.getClass().getName() + " " + Utils.joinOptions(generator.getOptions())); output.println("%"); // comment at beginning of ARFF File String commentAtStart = generator.generateStart(); if (commentAtStart.length() > 0) { output.println("%"); output.println("% Prologue"); output.println("%"); output.println(commentAtStart.trim()); output.println("%"); } // ask data generator which mode boolean singleMode = generator.getSingleModeFlag(); // start data producer if (singleMode) { // output of dataset header output.println(generator.toStringFormat()); for (i = 0; i < generator.getNumExamplesAct(); i++) { // over all examples to be produced Instance inst = generator.generateExample(); output.println(inst); } } else { // generator produces all instances at once Instances dataset = generator.generateExamples(); // output of dataset output.println(dataset); } // comment at end of ARFF File String commentAtEnd = generator.generateFinished(); if (commentAtEnd.length() > 0) { output.println("%"); output.println("% Epilogue"); output.println("%"); output.println(commentAtEnd.trim()); output.println("%"); } output.flush(); output.close(); // print result to stdout? if (generator.defaultOutput() != null) System.out.println(generator.defaultOutput().toString()); } /** * runs the datagenerator instance with the given options. * * @param datagenerator the datagenerator to run * @param options the commandline options */ protected static void runDataGenerator(DataGenerator datagenerator, String[] options) { try { DataGenerator.makeData(datagenerator, options); } catch (Exception e) { if ( (e.getMessage() != null) && (e.getMessage().indexOf("Data Generator options") == -1) ) e.printStackTrace(); else System.err.println(e.getMessage()); } } }