/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * Agrawal.java * Copyright (C) 2005-2012 University of Waikato, Hamilton, New Zealand * */ package weka.datagenerators.classifiers.classification; import java.util.Enumeration; import java.util.Random; import java.util.Vector; import weka.core.Attribute; import weka.core.DenseInstance; import weka.core.FastVector; import weka.core.Instance; import weka.core.Instances; import weka.core.Option; import weka.core.RevisionUtils; import weka.core.SelectedTag; import weka.core.Tag; import weka.core.TechnicalInformation; import weka.core.TechnicalInformation.Field; import weka.core.TechnicalInformation.Type; import weka.core.TechnicalInformationHandler; import weka.core.Utils; import weka.datagenerators.ClassificationGenerator; /** <!-- globalinfo-start --> * Generates a people database and is based on the paper by Agrawal et al.:<br/> * R. Agrawal, T. Imielinski, A. Swami (1993). Database Mining: A Performance Perspective. IEEE Transactions on Knowledge and Data Engineering. 5(6):914-925. URL http://www.almaden.ibm.com/software/quest/Publications/ByDate.html. * <p/> <!-- globalinfo-end --> * <!-- technical-bibtex-start --> * BibTeX: * <pre> * @article{Agrawal1993, * author = {R. Agrawal and T. Imielinski and A. Swami}, * journal = {IEEE Transactions on Knowledge and Data Engineering}, * note = {Special issue on Learning and Discovery in Knowledge-Based Databases}, * number = {6}, * pages = {914-925}, * title = {Database Mining: A Performance Perspective}, * volume = {5}, * year = {1993}, * URL = {http://www.almaden.ibm.com/software/quest/Publications/ByDate.html}, * PDF = {http://www.almaden.ibm.com/software/quest/Publications/papers/tkde93.pdf} * } * </pre> * <p/> <!-- technical-bibtex-end --> * <!-- options-start --> * Valid options are: <p/> * * <pre> -h * Prints this help.</pre> * * <pre> -o <file> * The name of the output file, otherwise the generated data is * printed to stdout.</pre> * * <pre> -r <name> * The name of the relation.</pre> * * <pre> -d * Whether to print debug informations.</pre> * * <pre> -S * The seed for random function (default 1)</pre> * * <pre> -n <num> * The number of examples to generate (default 100)</pre> * * <pre> -F <num> * The function to use for generating the data. (default 1)</pre> * * <pre> -B * Whether to balance the class.</pre> * * <pre> -P <num> * The perturbation factor. (default 0.05)</pre> * <!-- options-end --> * * @author Richard Kirkby (rkirkby at cs dot waikato dot ac dot nz) * @author FracPete (fracpete at waikato dot ac dot nz) * @version $Revision: 8034 $ */ public class Agrawal extends ClassificationGenerator implements TechnicalInformationHandler { /** for serialization */ static final long serialVersionUID = 2254651939636143025L; /** * the interface for the class functions */ protected interface ClassFunction { /** * returns a class value based on the given inputs * @param salary the salary * @param commission the commission * @param age the age * @param elevel the education level * @param car * @param zipcode the zip code * @param hvalue * @param hyears * @param loan */ public long determineClass(double salary, double commission, int age, int elevel, int car, int zipcode, double hvalue, int hyears, double loan); } /** * built in functions are based on the paper (page 924), * which turn out to be functions pred20 thru pred29 in the public c code */ protected static ClassFunction[] builtInFunctions = { // function 1 new ClassFunction() { public long determineClass(double salary, double commission, int age, int elevel, int car, int zipcode, double hvalue, int hyears, double loan) { if (age < 40 || 60 <= age) return 0; else return 1; } }, // function 2 new ClassFunction() { public long determineClass(double salary, double commission, int age, int elevel, int car, int zipcode, double hvalue, int hyears, double loan) { if (age < 40) if (50000 <= salary && salary <= 100000) return 0; else return 1; else if (age < 60) // && age >= 40 if (75000 <= salary && salary <= 125000) return 0; else return 1; else // age >= 60 if (25000 <= salary && salary <= 75000) return 0; else return 1; } }, // function 3 new ClassFunction() { public long determineClass(double salary, double commission, int age, int elevel, int car, int zipcode, double hvalue, int hyears, double loan) { if (age < 40) if (elevel == 0 || elevel == 1) return 0; else return 1; else if (age < 60) // && age >= 40 if (elevel == 1 || elevel == 2 || elevel == 3) return 0; else return 1; else // age >= 60 if (elevel == 2 || elevel == 3 || elevel == 4) return 0; else return 1; } }, // function 4 new ClassFunction() { public long determineClass(double salary, double commission, int age, int elevel, int car, int zipcode, double hvalue, int hyears, double loan) { if (age < 40) if (elevel == 0 || elevel == 1) if (25000 <= salary && salary <= 75000) return 0; else return 1; else if (50000 <= salary && salary <= 100000) return 0; else return 1; else if (age < 60) // && age >= 40 if (elevel == 1 || elevel == 2 || elevel == 3) if (50000 <= salary && salary <= 100000) return 0; else return 1; else if (75000 <= salary && salary <= 125000) return 0; else return 1; else // age >= 60 if (elevel == 2 || elevel == 3 || elevel == 4) if (50000 <= salary && salary <= 100000) return 0; else return 1; else if (25000 <= salary && salary <= 75000) return 0; else return 1; } }, // function 5 new ClassFunction() { public long determineClass(double salary, double commission, int age, int elevel, int car, int zipcode, double hvalue, int hyears, double loan) { if (age < 40) if (50000 <= salary && salary <= 100000) if (100000 <= loan && loan <= 300000) return 0; else return 1; else if (200000 <= loan && loan <= 400000) return 0; else return 1; else if (age < 60) // && age >= 40 if (75000 <= salary && salary <= 125000) if (200000 <= loan && loan <= 400000) return 0; else return 1; else if (300000 <= loan && loan <= 500000) return 0; else return 1; else // age >= 60 if (25000 <= salary && salary <= 75000) if (300000 <= loan && loan <= 500000) return 0; else return 1; else if (100000 <= loan && loan <= 300000) return 0; else return 1; } }, // function 6 new ClassFunction() { public long determineClass(double salary, double commission, int age, int elevel, int car, int zipcode, double hvalue, int hyears, double loan) { double totalSalary = salary + commission; if (age < 40) if (50000 <= totalSalary && totalSalary <= 100000) return 0; else return 1; else if (age < 60) // && age >= 40 if (75000 <= totalSalary && totalSalary <= 125000) return 0; else return 1; else // age >= 60 if (25000 <= totalSalary && totalSalary <= 75000) return 0; else return 1; } }, // function 7 new ClassFunction() { public long determineClass(double salary, double commission, int age, int elevel, int car, int zipcode, double hvalue, int hyears, double loan) { double disposable = (2.0 * (salary + commission) / 3.0 - loan / 5.0 - 20000.0); return disposable > 0 ? 0 : 1; } }, // function 8 new ClassFunction() { public long determineClass(double salary, double commission, int age, int elevel, int car, int zipcode, double hvalue, int hyears, double loan) { double disposable = (2.0 * (salary + commission) / 3.0 - 5000.0 * (double) elevel - 20000.0); return disposable > 0 ? 0 : 1; } }, // function 9 new ClassFunction() { public long determineClass(double salary, double commission, int age, int elevel, int car, int zipcode, double hvalue, int hyears, double loan) { double disposable = (2.0 * (salary + commission) / 3.0 - 5000.0 * (double) elevel - loan / 5.0 - 10000.0); return disposable > 0 ? 0 : 1; } }, // function 10 new ClassFunction() { public long determineClass(double salary, double commission, int age, int elevel, int car, int zipcode, double hvalue, int hyears, double loan) { double equity = 0.0; if (hyears >= 20) equity = hvalue * ((double) hyears - 20.0) / 10.0; double disposable = (2.0 * (salary + commission) / 3.0 - 5000.0 * (double) elevel + equity / 5.0 - 10000.0); return disposable > 0 ? 0 : 1; } } }; /** function 1 */ public final static int FUNCTION_1 = 1; /** function 2 */ public final static int FUNCTION_2 = 2; /** function 3 */ public final static int FUNCTION_3 = 3; /** function 4 */ public final static int FUNCTION_4 = 4; /** function 5 */ public final static int FUNCTION_5 = 5; /** function 6 */ public final static int FUNCTION_6 = 6; /** function 7 */ public final static int FUNCTION_7 = 7; /** function 8 */ public final static int FUNCTION_8 = 8; /** function 9 */ public final static int FUNCTION_9 = 9; /** function 10 */ public final static int FUNCTION_10 = 10; /** the funtion tags */ public static final Tag[] FUNCTION_TAGS = { new Tag(FUNCTION_1, "Function 1"), new Tag(FUNCTION_2, "Function 2"), new Tag(FUNCTION_3, "Function 3"), new Tag(FUNCTION_4, "Function 4"), new Tag(FUNCTION_5, "Function 5"), new Tag(FUNCTION_6, "Function 6"), new Tag(FUNCTION_7, "Function 7"), new Tag(FUNCTION_8, "Function 8"), new Tag(FUNCTION_9, "Function 9"), new Tag(FUNCTION_10, "Function 10"), }; /** the function to use for generating the data */ protected int m_Function; /** whether to balance the class */ protected boolean m_BalanceClass; /** the perturabation fraction */ protected double m_PerturbationFraction; /** used for balancing the class */ protected boolean m_nextClassShouldBeZero; /** the last class label that was generated */ protected double m_lastLabel; /** * initializes the generator with default values */ public Agrawal() { super(); setFunction(defaultFunction()); setBalanceClass(defaultBalanceClass()); setPerturbationFraction(defaultPerturbationFraction()); } /** * Returns a string describing this data generator. * * @return a description of the data generator suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "Generates a people database and is based on the paper by Agrawal " + "et al.:\n" + getTechnicalInformation().toString(); } /** * Returns an instance of a TechnicalInformation object, containing * detailed information about the technical background of this class, * e.g., paper reference or book this class is based on. * * @return the technical information about this class */ public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.ARTICLE); result.setValue(Field.AUTHOR, "R. Agrawal and T. Imielinski and A. Swami"); result.setValue(Field.YEAR, "1993"); result.setValue(Field.TITLE, "Database Mining: A Performance Perspective"); result.setValue(Field.JOURNAL, "IEEE Transactions on Knowledge and Data Engineering"); result.setValue(Field.VOLUME, "5"); result.setValue(Field.NUMBER, "6"); result.setValue(Field.PAGES, "914-925"); result.setValue(Field.NOTE, "Special issue on Learning and Discovery in Knowledge-Based Databases"); result.setValue(Field.URL, "http://www.almaden.ibm.com/software/quest/Publications/ByDate.html"); result.setValue(Field.PDF, "http://www.almaden.ibm.com/software/quest/Publications/papers/tkde93.pdf"); return result; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options */ public Enumeration listOptions() { Vector result = enumToVector(super.listOptions()); result.add(new Option( "\tThe function to use for generating the data. (default " + defaultFunction().getSelectedTag().getID() + ")", "F", 1, "-F <num>")); result.add(new Option( "\tWhether to balance the class.", "B", 0, "-B")); result.add(new Option( "\tThe perturbation factor. (default " + defaultPerturbationFraction() + ")", "P", 1, "-P <num>")); return result.elements(); } /** * Parses a list of options for this object. <p/> * <!-- options-start --> * Valid options are: <p/> * * <pre> -h * Prints this help.</pre> * * <pre> -o <file> * The name of the output file, otherwise the generated data is * printed to stdout.</pre> * * <pre> -r <name> * The name of the relation.</pre> * * <pre> -d * Whether to print debug informations.</pre> * * <pre> -S * The seed for random function (default 1)</pre> * * <pre> -n <num> * The number of examples to generate (default 100)</pre> * * <pre> -F <num> * The function to use for generating the data. (default 1)</pre> * * <pre> -B * Whether to balance the class.</pre> * * <pre> -P <num> * The perturbation factor. (default 0.05)</pre> * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String tmpStr; super.setOptions(options); tmpStr = Utils.getOption('F', options); if (tmpStr.length() != 0) setFunction(new SelectedTag(Integer.parseInt(tmpStr), FUNCTION_TAGS)); else setFunction(defaultFunction()); setBalanceClass(Utils.getFlag('B', options)); tmpStr = Utils.getOption('P', options); if (tmpStr.length() != 0) setPerturbationFraction(Double.parseDouble(tmpStr)); else setPerturbationFraction(defaultPerturbationFraction()); } /** * Gets the current settings of the datagenerator. * * @return an array of strings suitable for passing to setOptions */ public String[] getOptions() { Vector result; String[] options; int i; result = new Vector(); options = super.getOptions(); for (i = 0; i < options.length; i++) result.add(options[i]); result.add("-F"); result.add("" + m_Function); if (getBalanceClass()) result.add("-B"); result.add("-P"); result.add("" + getPerturbationFraction()); return (String[]) result.toArray(new String[result.size()]); } /** * returns the default function * * @return the default function */ protected SelectedTag defaultFunction() { return new SelectedTag(FUNCTION_1, FUNCTION_TAGS); } /** * Gets the function for generating the data. * * @return the function. * @see #FUNCTION_TAGS */ public SelectedTag getFunction() { return new SelectedTag(m_Function, FUNCTION_TAGS); } /** * Sets the function for generating the data. * * @param value the function. * @see #FUNCTION_TAGS */ public void setFunction(SelectedTag value) { if (value.getTags() == FUNCTION_TAGS) m_Function = value.getSelectedTag().getID(); } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String functionTipText() { return "The function to use for generating the data."; } /** * returns the default for balancing the class * * @return the default for balancing the class */ protected boolean defaultBalanceClass() { return false; } /** * Gets whether the class is balanced. * * @return whether the class is balanced. */ public boolean getBalanceClass() { return m_BalanceClass; } /** * Sets whether the class is balanced. * * @param value whether to balance the class. */ public void setBalanceClass(boolean value) { m_BalanceClass = value; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String balanceClassTipText() { return "Whether to balance the class."; } /** * returns the default perturbation fraction * * @return the default perturbation fraction */ protected double defaultPerturbationFraction() { return 0.05; } /** * Gets the perturbation fraction. * * @return the perturbation fraction. */ public double getPerturbationFraction() { return m_PerturbationFraction; } /** * Sets the perturbation fraction. * * @param value the perturbation fraction. */ public void setPerturbationFraction(double value) { if ( (value >= 0.0) && (value <= 1.0) ) m_PerturbationFraction = value; else throw new IllegalArgumentException( "Perturbation fraction must be in [0,1] (provided: " + value + ")!"); } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String perturbationFractionTipText() { return "The perturbation fraction: 0 <= fraction <= 1."; } /** * Return if single mode is set for the given data generator * mode depends on option setting and or generator type. * * @return single mode flag * @throws Exception if mode is not set yet */ public boolean getSingleModeFlag() throws Exception { return true; } /** * Initializes the format for the dataset produced. * Must be called before the generateExample or generateExamples * methods are used. * Re-initializes the random number generator with the given seed. * * @return the format for the dataset * @throws Exception if the generating of the format failed * @see #getSeed() */ public Instances defineDataFormat() throws Exception { FastVector atts; FastVector attValues; int i; m_Random = new Random(getSeed()); m_nextClassShouldBeZero = true; m_lastLabel = Double.NaN; // number of examples is the same as given per option setNumExamplesAct(getNumExamples()); // set up attributes atts = new FastVector(); atts.addElement(new Attribute("salary")); atts.addElement(new Attribute("commission")); attValues = new FastVector(); atts.addElement(new Attribute("age")); attValues = new FastVector(); for (i = 0; i < 5; i++) attValues.addElement("" + i); atts.addElement(new Attribute("elevel", attValues)); attValues = new FastVector(); for (i = 1; i <= 20; i++) attValues.addElement("" + i); atts.addElement(new Attribute("car", attValues)); attValues = new FastVector(); for (i = 0; i < 9; i++) attValues.addElement("" + i); atts.addElement(new Attribute("zipcode", attValues)); atts.addElement(new Attribute("hvalue")); atts.addElement(new Attribute("hyears")); atts.addElement(new Attribute("loan")); attValues = new FastVector(); for (i = 0; i < 2; i++) attValues.addElement("" + i); atts.addElement(new Attribute("group", attValues)); // dataset m_DatasetFormat = new Instances(getRelationNameToUse(), atts, 0); return m_DatasetFormat; } /** * perturbs the given value * * @param val the value to perturb * @param min the minimum * @param max the maximum * @return the perturbed value */ protected double perturbValue(double val, double min, double max) { return perturbValue(val, max - min, min, max); } /** * perturbs the given value * * @param val the value to perturb * @param range the range for the perturbation * @param min the minimum * @param max the maximum * @return the perturbed value */ protected double perturbValue(double val, double range, double min, double max) { val += range * (2.0 * (getRandom().nextDouble() - 0.5)) * getPerturbationFraction(); if (val < min) val = min; else if (val > max) val = max; return val; } /** * Generates one example of the dataset. * * @return the generated example * @throws Exception if the format of the dataset is not yet defined * @throws Exception if the generator only works with generateExamples * which means in non single mode */ public Instance generateExample() throws Exception { Instance result; double salary; double commission; double hvalue; double loan; int age; int elevel; int car; int zipcode; int hyears; boolean desiredClassFound; double[] atts; Random random; ClassFunction classFunction; result = null; random = getRandom(); if (m_DatasetFormat == null) throw new Exception("Dataset format not defined."); salary = 0; commission = 0; hvalue = 0; loan = 0; age = 0; elevel = 0; car = 0; zipcode = 0; hyears = 0; desiredClassFound = false; classFunction = builtInFunctions[m_Function - 1]; while (!desiredClassFound) { // generate attributes salary = 20000.0 + 130000.0 * random.nextDouble(); commission = (salary >= 75000.0) ? 0 : (10000.0 + 65000.0 * random.nextDouble()); age = 20 + random.nextInt(61); elevel = random.nextInt(5); car = 1 + random.nextInt(20); zipcode = random.nextInt(9); hvalue = (9.0 - (double) zipcode) * 100000.0 * (0.5 + random.nextDouble()); hyears = 1 + random.nextInt(30); loan = random.nextDouble() * 500000.0; // determine class m_lastLabel = classFunction.determineClass(salary, commission, age, elevel, car, zipcode, hvalue, hyears, loan); if (!getBalanceClass()) { desiredClassFound = true; } else { // balance the classes if ( ( m_nextClassShouldBeZero && (m_lastLabel == 0)) || (!m_nextClassShouldBeZero && (m_lastLabel == 1)) ) { desiredClassFound = true; m_nextClassShouldBeZero = !m_nextClassShouldBeZero; } // else keep searching } } // perturb values if (getPerturbationFraction() > 0.0) { salary = perturbValue(salary, 20000, 150000); if (commission > 0) commission = perturbValue(commission, 10000, 75000); age = (int) Math.round(perturbValue(age, 20, 80)); hvalue = perturbValue( hvalue, (9.0 - (double) zipcode) * 100000.0, 0, 135000); hyears = (int) Math.round(perturbValue(hyears, 1, 30)); loan = perturbValue(loan, 0, 500000); } // create instance atts = new double[m_DatasetFormat.numAttributes()]; atts[0] = salary; atts[1] = commission; atts[2] = age; atts[3] = elevel; atts[4] = car - 1; atts[5] = zipcode; atts[6] = hvalue; atts[7] = hyears; atts[8] = loan; atts[9] = m_lastLabel; result = new DenseInstance(1.0, atts); result.setDataset(m_DatasetFormat); return result; } /** * Generates all examples of the dataset. Re-initializes the random number * generator with the given seed, before generating instances. * * @return the generated dataset * @throws Exception if the format of the dataset is not yet defined * @throws Exception if the generator only works with generateExample, * which means in single mode * @see #getSeed() */ public Instances generateExamples() throws Exception { Instances result; int i; result = new Instances(m_DatasetFormat, 0); m_Random = new Random(getSeed()); for (i = 0; i < getNumExamplesAct(); i++) result.add(generateExample()); return result; } /** * Generates a comment string that documentates the data generator. * By default this string is added at the beginning of the produced output * as ARFF file type, next after the options. * * @return string contains info about the generated rules */ public String generateStart () { return ""; } /** * Generates a comment string that documentats the data generator. * By default this string is added at the end of theproduces output * as ARFF file type. * * @return string contains info about the generated rules * @throws Exception if the generating of the documentaion fails */ public String generateFinished() throws Exception { return ""; } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 8034 $"); } /** * Main method for executing this class. * * @param args should contain arguments for the data producer: */ public static void main(String[] args) { runDataGenerator(new Agrawal(), args); } }