package weka.classifiers.bayes; import weka.classifiers.Classifier; import weka.classifiers.DistributionClassifier; import weka.classifiers.Evaluation; import weka.classifiers.UpdateableClassifier; import java.io.*; import java.util.*; import weka.core.*; import weka.estimators.*; /** * Class for a Naive Bayes classifier using estimator classes. Numeric * estimator precision values are chosen based on analysis of the * training data. For this reason, the classifier is not an * UpdateableClassifier (which in typical usage are initialized with zero * training instances) -- if you need the UpdateableClassifier functionality, * use the NaiveBayesUpdateable classifier. The NaiveBayesUpdateable * classifier will use a default precision of 0.1 for numeric attributes * when buildClassifier is called with zero training instances. * <p> * For more information on Naive Bayes classifiers, see<p> * * George H. John and Pat Langley (1995). <i>Estimating * Continuous Distributions in Bayesian Classifiers</i>. Proceedings * of the Eleventh Conference on Uncertainty in Artificial * Intelligence. pp. 338-345. Morgan Kaufmann, San Mateo.<p> * * Valid options are:<p> * * -K <br> * Use kernel estimation for modelling numeric attributes rather than * a single normal distribution.<p> * * @author Len Trigg (trigg@cs.waikato.ac.nz) * @author Eibe Frank (eibe@cs.waikato.ac.nz) * @version $Revision: 1.2 $ */ public class NaiveBayes extends DistributionClassifier implements OptionHandler, WeightedInstancesHandler { /** The attribute estimators. */ protected Estimator [][] m_Distributions; /** The class estimator. */ protected Estimator m_ClassDistribution; /** * Whether to use kernel density estimator rather than normal distribution * for numeric attributes */ protected boolean m_UseKernelEstimator; /** The number of classes (or 1 for numeric class) */ protected int m_NumClasses; /** * The dataset header for the purposes of printing out a semi-intelligible * model */ protected Instances m_Instances; /*** The precision parameter used for numeric attributes */ protected static final double DEFAULT_NUM_PRECISION = 0.01; /** * Return class-feature-value distributions. */ public Estimator [][]getDistributions(){ return m_Distributions; } /** * Generates the classifier. * * @param instances set of instances serving as training data * @exception Exception if the classifier has not been generated * successfully */ public void buildClassifier(Instances instances) throws Exception { if (instances.checkForStringAttributes()) { throw new UnsupportedAttributeTypeException("Cannot handle string attributes!"); } if (instances.classAttribute().isNumeric()) { throw new UnsupportedClassTypeException("Naive Bayes: Class is numeric!"); } m_NumClasses = instances.numClasses(); if (m_NumClasses < 0) { throw new Exception ("Dataset has no class attribute"); } // Copy the instances m_Instances = new Instances(instances); // Reserve space for the distributions m_Distributions=new Estimator[m_Instances.numAttributes()-1][m_Instances.numClasses()]; m_ClassDistribution=new DiscreteEstimator(m_Instances.numClasses(),true); int attIndex = 0; Enumeration enum = m_Instances.enumerateAttributes(); while (enum.hasMoreElements()) { Attribute attribute = (Attribute) enum.nextElement(); // If the attribute is numeric, determine the estimator // numeric precision from differences between adjacent values double numPrecision = DEFAULT_NUM_PRECISION; if (attribute.type() == Attribute.NUMERIC) { m_Instances.sort(attribute); if ((m_Instances.numInstances() > 0) && !m_Instances.instance(0).isMissing(attribute)) { double lastVal = m_Instances.instance(0).value(attribute); double currentVal, deltaSum = 0; int distinct = 0; for (int i = 1; i < m_Instances.numInstances(); i++) { Instance currentInst = m_Instances.instance(i); if (currentInst.isMissing(attribute)) { break; } currentVal = currentInst.value(attribute); if (currentVal != lastVal) { deltaSum += currentVal - lastVal; lastVal = currentVal; distinct++; } } if (distinct > 0) { numPrecision = deltaSum / distinct; } } } for (int j = 0; j < m_Instances.numClasses(); j++) { switch (attribute.type()) { case Attribute.NUMERIC: if (m_UseKernelEstimator) { m_Distributions[attIndex][j] = new KernelEstimator(numPrecision); } else { m_Distributions[attIndex][j] = new NormalEstimator(numPrecision); } break; case Attribute.NOMINAL: m_Distributions[attIndex][j]=new DiscreteEstimator(attribute.numValues(),true); break; default: throw new Exception("Attribute type unknown to NaiveBayes"); } } attIndex++; } // Compute counts Enumeration enumInsts = m_Instances.enumerateInstances(); while (enumInsts.hasMoreElements()) { Instance instance = (Instance) enumInsts.nextElement(); updateClassifier(instance); } // Save space m_Instances = new Instances(m_Instances, 0); } /** * Updates the classifier with the given instance. * * @param instance the new training instance to include in the model * @exception Exception if the instance could not be incorporated in * the model. */ public void updateClassifier(Instance instance) throws Exception { if (!instance.classIsMissing()) { Enumeration enumAtts = m_Instances.enumerateAttributes(); int attIndex = 0; while (enumAtts.hasMoreElements()) { Attribute attribute = (Attribute) enumAtts.nextElement(); if (!instance.isMissing(attribute)) { m_Distributions[attIndex][(int)instance.classValue()].addValue(instance.value(attribute),instance.weight()); } attIndex++; } m_ClassDistribution.addValue(instance.classValue(), instance.weight()); } } /** * Calculates the class membership probabilities for the given test * instance. * * @param instance the instance to be classified * @return predicted class probability distribution * @exception Exception if there is a problem generating the prediction */ public double [] distributionForInstance(Instance instance) throws Exception { double [] probs = new double[m_NumClasses]; for (int j = 0; j < m_NumClasses; j++) { probs[j] = m_ClassDistribution.getProbability(j); } Enumeration enumAtts = instance.enumerateAttributes(); int attIndex = 0; while(enumAtts.hasMoreElements()){ Attribute attribute=(Attribute)enumAtts.nextElement(); if(!instance.isMissing(attribute)){ double temp,max=0; for(int j=0;j<m_NumClasses;j++){ temp=Math.max(1e-75,m_Distributions[attIndex][j].getProbability(instance.value(attribute))); probs[j]*=temp; if(probs[j]>max)max=probs[j]; if(Double.isNaN(probs[j])){ throw new Exception("NaN returned from estimator for attribute " +attribute.name()+":\n" +m_Distributions[attIndex][j].toString()); } } if((max>0)&&(max<1e-75)){//Danger of probability underflow for(int j=0;j<m_NumClasses;j++)probs[j]*=1e75; } } attIndex++; } // Display probabilities Utils.normalize(probs); return probs; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ public Enumeration listOptions() { Vector newVector = new Vector(1); newVector.addElement( new Option("\tUse kernel density estimator rather than normal\n" +"\tdistribution for numeric attributes", "K", 0,"-K")); return newVector.elements(); } /** * Parses a given list of options. Valid options are:<p> * * -K <br> * Use kernel estimation for modelling numeric attributes rather than * a single normal distribution.<p> * * @param options the list of options as an array of strings * @exception Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { m_UseKernelEstimator = Utils.getFlag('K', options); Utils.checkForRemainingOptions(options); } /** * Gets the current settings of the classifier. * * @return an array of strings suitable for passing to setOptions */ public String [] getOptions() { String [] options = new String [1]; int current = 0; if (m_UseKernelEstimator) { options[current++] = "-K"; } while (current < options.length) { options[current++] = ""; } return options; } /** * Returns a description of the classifier. * * @return a description of the classifier as a string. */ public String toString() { StringBuffer text = new StringBuffer(); text.append("Naive Bayes Classifier"); if (m_Instances == null) { text.append(": No model built yet."); } else { try { for (int i = 0; i < m_Distributions[0].length; i++) { text.append("\n\nClass " + m_Instances.classAttribute().value(i) + ": Prior probability = " + Utils. doubleToString(m_ClassDistribution.getProbability(i), 4, 2) + "\n\n"); Enumeration enumAtts = m_Instances.enumerateAttributes(); int attIndex = 0; while (enumAtts.hasMoreElements()) { Attribute attribute = (Attribute) enumAtts.nextElement(); text.append(attribute.name() + ": " + m_Distributions[attIndex][i]); attIndex++; } } } catch (Exception ex) { text.append(ex.getMessage()); } } return text.toString(); } /** * Gets if kernel estimator is being used. * * @return Value of m_UseKernelEstimatory. */ public boolean getUseKernelEstimator() { return m_UseKernelEstimator; } /** * Sets if kernel estimator is to be used. * * @param v Value to assign to m_UseKernelEstimatory. */ public void setUseKernelEstimator(boolean v) { m_UseKernelEstimator = v; } /** * Main method for testing this class. * * @param argv the options */ public static void main(String [] argv) { try { System.out.println(Evaluation.evaluateModel(new NaiveBayes(), argv)); } catch (Exception e) { e.printStackTrace(); System.err.println(e.getMessage()); } } }