/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * KDDataGenerator.java * Copyright (C) 2002 University of Waikato, Hamilton, New Zealand * */ package weka.gui.boundaryvisualizer; import weka.core.Attribute; import weka.core.Instance; import weka.core.Instances; import weka.core.Utils; import java.io.Serializable; import java.util.Random; /** * KDDataGenerator. Class that uses kernels to generate new random * instances based on a supplied set of instances. * * @author <a href="mailto:mhall@cs.waikato.ac.nz">Mark Hall</a> * @version $Revision: 1.7 $ * @since 1.0 * @see DataGenerator * @see Serializable */ public class KDDataGenerator implements DataGenerator, Serializable { /** for serialization */ private static final long serialVersionUID = -958573275606402792L; /** the instances to use */ private Instances m_instances; /** standard deviations of the normal distributions for numeric attributes in * each KD estimator */ private double [] m_standardDeviations; /** global means or modes to use for missing values */ private double [] m_globalMeansOrModes; /** minimum standard deviation for numeric attributes */ private double m_minStdDev = 1e-5; /** Laplace correction for discrete distributions */ private double m_laplaceConst = 1.0; /** random number seed */ private int m_seed = 1; /** random number generator */ private Random m_random; /** which dimensions to use for computing a weight for each generated * instance */ private boolean [] m_weightingDimensions; /** the values for the weighting dimensions to use for computing the weight * for the next instance to be generated */ private double [] m_weightingValues; private static double m_normConst = Math.sqrt(2*Math.PI); /** Number of neighbours to use for kernel bandwidth */ private int m_kernelBandwidth = 3; /** standard deviations for numeric attributes computed from the * m_kernelBandwidth nearest neighbours for each kernel. */ private double [][] m_kernelParams; /** The minimum values for numeric attributes. */ protected double [] m_Min; /** The maximum values for numeric attributes. */ protected double [] m_Max; /** * Initialize the generator using the supplied instances * * @param inputInstances the instances to use as the basis of the kernels * @throws Exception if an error occurs */ public void buildGenerator(Instances inputInstances) throws Exception { m_random = new Random(m_seed); m_instances = inputInstances; m_standardDeviations = new double [m_instances.numAttributes()]; m_globalMeansOrModes = new double [m_instances.numAttributes()]; if (m_weightingDimensions == null) { m_weightingDimensions = new boolean[m_instances.numAttributes()]; } /* for (int i = 0; i < m_instances.numAttributes(); i++) { if (i != m_instances.classIndex()) { if (m_instances.attribute(i).isNumeric()) { // global standard deviations double var = m_instances.variance(i); if (var == 0) { var = m_minStdDev; } else { var = Math.sqrt(var); // heuristic to take into account # instances and dimensions double adjust = Math.pow((double) m_instances.numInstances(), 1.0 / m_instances.numAttributes()); // double adjust = m_instances.numInstances(); var /= adjust; } m_standardDeviations[i] = var; } else { m_globalMeansOrModes[i] = m_instances.meanOrMode(i); } } } */ for (int i = 0; i < m_instances.numAttributes(); i++) { if (i != m_instances.classIndex()) { m_globalMeansOrModes[i] = m_instances.meanOrMode(i); } } m_kernelParams = new double [m_instances.numInstances()][m_instances.numAttributes()]; computeParams(); } public double [] getWeights() { double [] weights = new double[m_instances.numInstances()]; for (int k = 0; k < m_instances.numInstances(); k++) { double weight = 1; for (int i = 0; i < m_instances.numAttributes(); i++) { if (m_weightingDimensions[i]) { double mean = 0; if (!m_instances.instance(k).isMissing(i)) { mean = m_instances.instance(k).value(i); } else { mean = m_globalMeansOrModes[i]; } double wm = 1.0; // wm = normalDens(m_weightingValues[i], mean, m_standardDeviations[i]); wm = normalDens(m_weightingValues[i], mean, m_kernelParams[k][i]); weight *= wm; } } weights[k] = weight; } return weights; } /** * Return a cumulative distribution from a discrete distribution * * @param dist the distribution to use * @return the cumulative distribution */ private double [] computeCumulativeDistribution(double [] dist) { double [] cumDist = new double[dist.length]; double sum = 0; for (int i = 0; i < dist.length; i++) { sum += dist[i]; cumDist[i] = sum; } return cumDist; } /** * Generates a new instance using one kernel estimator. Each successive * call to this method incremets the index of the kernel to use. * * @return the new random instance * @throws Exception if an error occurs */ public double [][] generateInstances(int [] indices) throws Exception { double [][] values = new double[m_instances.numInstances()][]; for (int k = 0; k < indices.length; k++) { values[indices[k]] = new double[m_instances.numAttributes()]; for (int i = 0; i < m_instances.numAttributes(); i++) { if ((!m_weightingDimensions[i]) && (i != m_instances.classIndex())) { if (m_instances.attribute(i).isNumeric()) { double mean = 0; double val = m_random.nextGaussian(); if (!m_instances.instance(indices[k]).isMissing(i)) { mean = m_instances.instance(indices[k]).value(i); } else { mean = m_globalMeansOrModes[i]; } val *= m_kernelParams[indices[k]][i]; val += mean; values[indices[k]][i] = val; } else { // nominal attribute double [] dist = new double[m_instances.attribute(i).numValues()]; for (int j = 0; j < dist.length; j++) { dist[j] = m_laplaceConst; } if (!m_instances.instance(indices[k]).isMissing(i)) { dist[(int)m_instances.instance(indices[k]).value(i)]++; } else { dist[(int)m_globalMeansOrModes[i]]++; } Utils.normalize(dist); double [] cumDist = computeCumulativeDistribution(dist); double randomVal = m_random.nextDouble(); int instVal = 0; for (int j = 0; j < cumDist.length; j++) { if (randomVal <= cumDist[j]) { instVal = j; break; } } values[indices[k]][i] = (double)instVal; } } } } return values; } /** * Density function of normal distribution. * @param x input value * @param mean mean of distribution * @param stdDev standard deviation of distribution */ private double normalDens (double x, double mean, double stdDev) { double diff = x - mean; return (1/(m_normConst*stdDev))*Math.exp(-(diff*diff/(2*stdDev*stdDev))); } /** * Set which dimensions to use when computing a weight for the next * instance to generate * * @param dims an array of booleans indicating which dimensions to use */ public void setWeightingDimensions(boolean [] dims) { m_weightingDimensions = dims; } /** * Set the values for the weighting dimensions to be used when computing * the weight for the next instance to be generated * * @param vals an array of doubles containing the values of the * weighting dimensions (corresponding to the entries that are set to * true throw setWeightingDimensions) */ public void setWeightingValues(double [] vals) { m_weightingValues = vals; } /** * Return the number of kernels (there is one per training instance) * * @return the number of kernels */ public int getNumGeneratingModels() { if (m_instances != null) { return m_instances.numInstances(); } return 0; } /** * Set the kernel bandwidth (number of nearest neighbours to cover) * * @param kb an <code>int</code> value */ public void setKernelBandwidth(int kb) { m_kernelBandwidth = kb; } /** * Get the kernel bandwidth * * @return an <code>int</code> value */ public int getKernelBandwidth() { return m_kernelBandwidth; } /** * Initializes a new random number generator using the * supplied seed. * * @param seed an <code>int</code> value */ public void setSeed(int seed) { m_seed = seed; m_random = new Random(m_seed); } /** * Calculates the distance between two instances * * @param test the first instance * @param train the second instance * @return the distance between the two given instances, between 0 and 1 */ private double distance(Instance first, Instance second) { double diff, distance = 0; for(int i = 0; i < m_instances.numAttributes(); i++) { if (i == m_instances.classIndex()) { continue; } double firstVal = m_globalMeansOrModes[i]; double secondVal = m_globalMeansOrModes[i]; switch (m_instances.attribute(i).type()) { case Attribute.NUMERIC: // If attribute is numeric if (!first.isMissing(i)) { firstVal = first.value(i); } if (!second.isMissing(i)) { secondVal = second.value(i); } diff = norm(firstVal,i) - norm(secondVal,i); break; default: diff = 0; break; } distance += diff * diff; } return Math.sqrt(distance); } /** * Normalizes a given value of a numeric attribute. * * @param x the value to be normalized * @param i the attribute's index */ private double norm(double x,int i) { if (Double.isNaN(m_Min[i]) || Utils.eq(m_Max[i], m_Min[i])) { return 0; } else { return (x - m_Min[i]) / (m_Max[i] - m_Min[i]); } } /** * Updates the minimum and maximum values for all the attributes * based on a new instance. * * @param instance the new instance */ private void updateMinMax(Instance instance) { for (int j = 0; j < m_instances.numAttributes(); j++) { if (!instance.isMissing(j)) { if (Double.isNaN(m_Min[j])) { m_Min[j] = instance.value(j); m_Max[j] = instance.value(j); } else if (instance.value(j) < m_Min[j]) { m_Min[j] = instance.value(j); } else if (instance.value(j) > m_Max[j]) { m_Max[j] = instance.value(j); } } } } private void computeParams() throws Exception { // Calculate the minimum and maximum values m_Min = new double [m_instances.numAttributes()]; m_Max = new double [m_instances.numAttributes()]; for (int i = 0; i < m_instances.numAttributes(); i++) { m_Min[i] = m_Max[i] = Double.NaN; } for (int i = 0; i < m_instances.numInstances(); i++) { updateMinMax(m_instances.instance(i)); } double [] distances = new double[m_instances.numInstances()]; for (int i = 0; i < m_instances.numInstances(); i++) { Instance current = m_instances.instance(i); for (int j = 0; j < m_instances.numInstances(); j++) { distances[j] = distance(current, m_instances.instance(j)); } int [] sorted = Utils.sort(distances); int k = m_kernelBandwidth; double bandwidth = distances[sorted[k]]; // Check for bandwidth zero if (bandwidth <= 0) { for (int j = k + 1; j < sorted.length; j++) { if (distances[sorted[j]] > bandwidth) { bandwidth = distances[sorted[j]]; break; } } if (bandwidth <= 0) { throw new Exception("All training instances coincide with " +"test instance!"); } } for (int j = 0; j < m_instances.numAttributes(); j++) { if ((m_Max[j] - m_Min[j]) > 0) { m_kernelParams[i][j] = bandwidth * (m_Max[j] - m_Min[j]); } } } } }