/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * NaiveBayesSimpleSoft.java * Copyright (C) 2003 Ray Mooney * */ package weka.classifiers.sparse; import weka.classifiers.*; import java.io.*; import java.util.*; import weka.core.*; /** * Version of NaiveBayesSimpleSparse that supports training on SoftClassifiedInstances * and WeightedInstances for use with SemiSupEM * * @author Ray Mooney (mooney@cs.utexas.edu) */ public class NaiveBayesSimpleSparseSoft extends NaiveBayesSimpleSparse implements SoftClassifier, OptionHandler, WeightedInstancesHandler { /** * Generates the classifier. * * @param instances set of instances serving as training data * @exception Exception if the classifier has not been generated successfully */ public void buildClassifier(SoftClassifiedInstances instances) throws Exception { if (instances.checkForStringAttributes()) { throw new UnsupportedAttributeTypeException("Sparse Instances are optimized for non-string attributes!"); } if (instances.checkForNominalAttributes()) { throw new UnsupportedAttributeTypeException("Sparse Instances are optimized for non-nominal attributes!"); } if (instances.classAttribute().isNumeric()) { throw new UnsupportedClassTypeException("Sparse Naive Bayes: Class is numeric!"); } if (m_debug) { System.out.println("Training on " + instances.numInstances() + " instances"); } m_instances = instances; m_classIndex = instances.classIndex(); m_numClasses = instances.numClasses(); m_numAttributes = instances.numAttributes(); int numTrainingInstances = 0; // Reserve space m_priors = new double[m_numClasses]; m_condProbs = new double[m_numAttributes][m_numClasses]; double[] totalCounts = new double[m_numClasses]; // stores total count of all tokens in each category // Compute counts and sums Enumeration enumInsts = instances.enumerateInstances(); while (enumInsts.hasMoreElements()) { SparseInstance instance = (SparseInstance) enumInsts.nextElement(); for (int i = 0; i < instance.numValues(); i++) { int attrIdx = instance.index(i); if (attrIdx == m_classIndex) continue; double value = instance.valueSparse(i); if (Instance.isMissingValue(value)) throw new NoSupportForMissingValuesException("Sparse instance should not have missing value"); for (int classIdx = 0; classIdx < m_numClasses; classIdx++) { // increment counts by attribute value weighted by class probability and instance weight double incr = value * ((SoftClassifiedInstance)instance).getClassProbability(classIdx) * instance.weight(); m_condProbs[attrIdx][classIdx] += incr; totalCounts[classIdx] += incr; } } for (int classIdx = 0; classIdx < m_numClasses; classIdx++) { m_priors[classIdx] += ((SoftClassifiedInstance)instance).getClassProbability(classIdx) * instance.weight();; numTrainingInstances += instance.weight(); } } // Compute log probabilities for each attribute for (int i = 0; i < m_numAttributes; i++) { if (i == m_classIndex) continue; double[] countArray = m_condProbs[i]; for(int j = 0; j < m_numClasses; j++){ countArray[j] = Math.log((countArray[j] + (m_m / m_numAttributes))/(totalCounts[j]+ m_m)); } } // Calculate priors for (int i = 0; i < m_numClasses; i++) { m_priors[i] = Math.log((m_priors[i] + (m_m / m_numClasses)) / (numTrainingInstances + m_m)); } if (m_debug) { System.out.print("Priors: ["); for (int i = 0; i < m_priors.length; i++) System.out.print(m_priors[i] + "(" + Math.exp(m_priors[i]) + ") "); System.out.println("]"); } } public String globalInfo() { return "SoftClassifier version of NaiveBayesSimpleSparse for use with SemiSupEM"; } }