/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * KDDataGenerator.java * Copyright (C) 2002 Mark Hall * */ package weka.gui.boundaryvisualizer; import weka.core.*; import java.util.Random; import java.io.*; /** * KDDataGenerator. Class that uses kernels to generate new random * instances based on a supplied set of instances. * * @author <a href="mailto:mhall@cs.waikato.ac.nz">Mark Hall</a> * @version $Revision: 1.1.1.1 $ * @since 1.0 * @see DataGenerator * @see Serializable */ public class KDDataGenerator implements DataGenerator, Serializable { // the instances to use private Instances m_instances; // standard deviations of the normal distributions for numeric attributes in // each KD estimator private double [] m_standardDeviations; // global means or modes to use for missing values private double [] m_globalMeansOrModes; // minimum standard deviation for numeric attributes private double m_minStdDev = 1e-5; // Laplace correction for discrete distributions private double m_laplaceConst = 1.0; // random number seed private int m_seed = 1; // random number generator private Random m_random; // the kernel estimator from which to generate the next instance from private int m_KDToGenerateFrom; // which dimensions to use for computing a weight for each generated // instance private boolean [] m_weightingDimensions; // the values for the weighting dimensions to use for computing the weight // for the next instance to be generated private double [] m_weightingValues; // created once only - for generating instances fast private Instance m_instance; private double [] m_instanceVals; private static double m_normConst = Math.sqrt(2*Math.PI); /** * Initialize the generator using the supplied instances * * @param inputInstances the instances to use as the basis of the kernels * @exception Exception if an error occurs */ public void buildGenerator(Instances inputInstances) throws Exception { m_random = new Random(m_seed); m_KDToGenerateFrom = 0; m_instances = inputInstances; m_standardDeviations = new double [m_instances.numAttributes()]; m_globalMeansOrModes = new double [m_instances.numAttributes()]; if (m_weightingDimensions == null) { m_weightingDimensions = new boolean[m_instances.numAttributes()]; } for (int i = 0; i < m_instances.numAttributes(); i++) { if (m_instances.attribute(i).isNumeric()) { // global standard deviations double var = m_instances.variance(i); if (var == 0) { var = m_minStdDev; } else { var = Math.sqrt(var); // heuristic to take into account # instances and dimensions double adjust = Math.pow((double) m_instances.numInstances(), 1.0 / m_instances.numAttributes()); // double adjust = m_instances.numInstances(); var /= adjust; } m_standardDeviations[i] = var; } else { m_globalMeansOrModes[i] = m_instances.meanOrMode(i); } } m_instanceVals = new double [m_instances.numAttributes()]; m_instance = new Instance(1.0, m_instanceVals); } /** * Return a cumulative distribution from a discrete distribution * * @param dist the distribution to use * @return the cumulative distribution */ private double [] computeCumulativeDistribution(double [] dist) { double [] cumDist = new double[dist.length]; double sum = 0; for (int i = 0; i < dist.length; i++) { sum += dist[i]; cumDist[i] = sum; } return cumDist; } /** * Generate a new instance. Returns the instance in an brand new * Instance object. * * @return an <code>Instance</code> value * @exception Exception if an error occurs */ public Instance generateInstance() throws Exception { return generateInstance(false); } /** * Generate a new instance. Reuses an existing instance object to * speed up the process. * * @return an <code>Instance</code> value * @exception Exception if an error occurs */ public Instance generateInstanceFast() throws Exception { return generateInstance(true); } /** * Generates a new instance using one kernel estimator. Each successive * call to this method incremets the index of the kernel to use. * * @param fast generate the instance quickly * @return the new random instance * @exception Exception if an error occurs */ private Instance generateInstance(boolean fast) throws Exception { if (m_weightingDimensions.length != m_instances.numAttributes()) { throw new Exception("Weighting dimension array != num attributes!"); } Instance newInst; if (fast) { newInst = m_instance; } else { m_instanceVals = new double [m_instances.numAttributes()]; newInst = new Instance(1.0, m_instanceVals); } double weight = 1; for (int i = 0; i < m_instances.numAttributes(); i++) { if (!m_weightingDimensions[i]) { if (m_instances.attribute(i).isNumeric()) { double mean = 0; double val = m_random.nextGaussian(); if (!m_instances.instance(m_KDToGenerateFrom).isMissing(i)) { mean = m_instances.instance(m_KDToGenerateFrom).value(i); } else { mean = m_globalMeansOrModes[i]; } val *= m_standardDeviations[i]; val += mean; m_instanceVals[i] = val; } else { // nominal attribute double [] dist = new double[m_instances.attribute(i).numValues()]; for (int j = 0; j < dist.length; j++) { dist[j] = m_laplaceConst; } if (!m_instances.instance(m_KDToGenerateFrom).isMissing(i)) { dist[(int)m_instances.instance(m_KDToGenerateFrom).value(i)]++; } else { dist[(int)m_globalMeansOrModes[i]]++; } Utils.normalize(dist); double [] cumDist = computeCumulativeDistribution(dist); double randomVal = m_random.nextDouble(); int instVal = 0; for (int j = 0; j < cumDist.length; j++) { if (randomVal <= cumDist[j]) { instVal = j; break; } } m_instanceVals[i] = (double)instVal; } } else { double mean = 0; if (!m_instances.instance(m_KDToGenerateFrom).isMissing(i)) { mean = m_instances.instance(m_KDToGenerateFrom).value(i); } else { mean = m_globalMeansOrModes[i]; } double wm = 1.0; if (m_instances.attribute(i).isNumeric()) { wm = normalDens(m_weightingValues[i], mean, m_standardDeviations[i]); } else { wm = (1.0 + m_laplaceConst) / (m_instances.attribute(i).numValues() * m_laplaceConst); } if (wm > 0) { weight *= wm; } m_instanceVals[i] = m_weightingValues[i]; } } newInst.setWeight(weight); // next kernel to generate from m_KDToGenerateFrom++; m_KDToGenerateFrom %= m_instances.numInstances(); return newInst; } /** * Density function of normal distribution. * @param x input value * @param mean mean of distribution * @param stdDev standard deviation of distribution */ private double normalDens (double x, double mean, double stdDev) { double diff = x - mean; return (1/(m_normConst*stdDev))*Math.exp(-(diff*diff/(2*stdDev*stdDev))); } /** * Set which dimensions to use when computing a weight for the next * instance to generate * * @param dims an array of booleans indicating which dimensions to use */ public void setWeightingDimensions(boolean [] dims) { m_weightingDimensions = dims; } /** * Set the values for the weighting dimensions to be used when computing * the weight for the next instance to be generated * * @param vals an array of doubles containing the values of the * weighting dimensions (corresponding to the entries that are set to * true throw setWeightingDimensions) */ public void setWeightingValues(double [] vals) { m_weightingValues = vals; } /** * Return the number of kernels (there is one per training instance) * * @return the number of kernels */ public int getNumGeneratingModels() { if (m_instances != null) { return m_instances.numInstances(); } return 0; } /** * Main method for tesing this class * * @param args a <code>String[]</code> value */ public static void main(String [] args) { try { Reader r = null; if (args.length != 1) { throw new Exception("Usage: KDDataGenerator <filename>"); } else { r = new BufferedReader(new FileReader(args[0])); Instances insts = new Instances(r); KDDataGenerator dg = new KDDataGenerator(); dg.buildGenerator(insts); Instances header = new Instances(insts,0); System.out.println(header); for (int i = 0; i < insts.numInstances(); i++) { Instance newInst = dg.generateInstance(); newInst.setDataset(header); System.out.println(newInst); } } } catch (Exception ex) { ex.printStackTrace(); } } }