VectorNaiveBayesCategorizer.java example

Explorer
Foundry-master
- Components
/*
 * File:                VectorNaiveBayesCategorizer.java
 * Authors:             Justin Basilico
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry Learning Core
 * 
 * Copyright November 24, 2010, Sandia Corporation.
 * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive 
 * license for use of this work by or on behalf of the U.S. Government. Export 
 * of this program may require a license from the United States Government. 
 */

package gov.sandia.cognition.learning.algorithm.bayes;

import gov.sandia.cognition.collection.CollectionUtil;
import gov.sandia.cognition.learning.algorithm.AbstractBatchAndIncrementalLearner;
import gov.sandia.cognition.learning.algorithm.IncrementalLearner;
import gov.sandia.cognition.learning.algorithm.SupervisedBatchLearner;
import gov.sandia.cognition.learning.data.DatasetUtil;
import gov.sandia.cognition.learning.data.InputOutputPair;
import gov.sandia.cognition.learning.data.DefaultWeightedValueDiscriminant;
import gov.sandia.cognition.learning.function.categorization.Categorizer;
import gov.sandia.cognition.learning.function.categorization.DiscriminantCategorizer;
import gov.sandia.cognition.math.LogMath;
import gov.sandia.cognition.math.RingAccumulator;
import gov.sandia.cognition.math.matrix.Vector;
import gov.sandia.cognition.math.matrix.VectorInputEvaluator;
import gov.sandia.cognition.math.matrix.Vectorizable;
import gov.sandia.cognition.statistics.DataDistribution;
import gov.sandia.cognition.statistics.DistributionEstimator;
import gov.sandia.cognition.statistics.UnivariateProbabilityDensityFunction;
import gov.sandia.cognition.statistics.distribution.DefaultDataDistribution;
import gov.sandia.cognition.statistics.distribution.UnivariateGaussian;
import gov.sandia.cognition.util.AbstractCloneableSerializable;
import gov.sandia.cognition.util.ObjectUtil;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * A naive Bayesian categorizer that takes an input vector and applies an
 * independent scalar probability density function to each of its elements.
 *
 * @param   <CategoryType>
 *      The output category type for the categorizer. Must implement equals and
 *      hash code.
 * @param   <DistributionType>
 *      The type of the distributions used to compute the conditionals for each
 *      dimension.
 * @author  Justin Basilico
 * @since   3.1
 */
public class VectorNaiveBayesCategorizer<CategoryType, DistributionType extends UnivariateProbabilityDensityFunction>
    extends AbstractCloneableSerializable
    implements Categorizer<Vectorizable, CategoryType>,
        VectorInputEvaluator<Vectorizable, CategoryType>,
        DiscriminantCategorizer<Vectorizable, CategoryType, Double>
{

    /** The prior distribution for the categorizer. The fraction it reports
     *  for a category is the starting point of that category's
     *  log-posterior in {@code computeLogPosterior}. */
    protected DataDistribution<CategoryType> priors;

    /** The mapping of category to the conditional distribution for the category
     *  with one probability density function for each dimension. Its key set
     *  is what {@code getCategories()} returns. */
    protected Map<CategoryType, List<DistributionType>> conditionals;

    /**
     * Creates a new {@code VectorNaiveBayesCategorizer} that starts out with a
     * fresh {@code DefaultDataDistribution} as its prior and an empty
     * {@code LinkedHashMap} as its conditionals.
     */
    public VectorNaiveBayesCategorizer()
    {
        this(
            new DefaultDataDistribution<CategoryType>(),
            new LinkedHashMap<CategoryType, List<DistributionType>>());
    }

    /**
     * Creates a new {@code VectorNaiveBayesCategorizer} from an existing prior
     * and existing conditionals. Both arguments are passed to the matching
     * setters unchanged.
     *
     * @param   priors
     *      The prior distribution over the categories.
     * @param   conditionals
     *      The mapping of category to its list of conditional distributions,
     *      one per input dimension.
     */
    public VectorNaiveBayesCategorizer(
        final DataDistribution<CategoryType> priors,
        final Map<CategoryType, List<DistributionType>> conditionals)
    {
        // The superclass constructor runs implicitly before these calls.
        setPriors(priors);
        setConditionals(conditionals);
    }

    /**
     * Creates a copy of this categorizer. The priors are passed through
     * {@code ObjectUtil.cloneSafe} and the conditionals are rebuilt as a new
     * {@code LinkedHashMap} whose per-category lists come from
     * {@code ObjectUtil.cloneSmartElementsAsArrayList}, so the copy does not
     * share its map with this object.
     *
     * @return
     *      The copy. Its conditionals are {@code null} when this object's
     *      conditionals are {@code null}.
     */
    @Override
    public VectorNaiveBayesCategorizer<CategoryType, DistributionType> clone()
    {
        @SuppressWarnings("unchecked")
        final VectorNaiveBayesCategorizer<CategoryType, DistributionType> clone =
            (VectorNaiveBayesCategorizer<CategoryType, DistributionType>) super.clone();

        clone.priors = ObjectUtil.cloneSafe(this.priors);

        if (this.conditionals == null)
        {
            // The constructor and setConditionals accept null without
            // complaint, so a null map must not make cloning fail.
            clone.conditionals = null;
        }
        else
        {
            clone.conditionals =
                new LinkedHashMap<CategoryType, List<DistributionType>>(
                this.conditionals.size());
            for (Map.Entry<CategoryType, List<DistributionType>> entry
                : this.conditionals.entrySet())
            {
                clone.conditionals.put(entry.getKey(),
                    ObjectUtil.cloneSmartElementsAsArrayList(
                    entry.getValue()));
            }
        }

        return clone;
    }

    /**
     * Picks the category with the largest log-posterior score for the given
     * input, using {@code computeLogPosterior} for each category returned by
     * {@code getCategories()}.
     *
     * @param   input
     *      The input to categorize. It is converted to a vector once.
     * @return
     *      The best-scoring category, or {@code null} when there are no
     *      categories to choose from.
     */
    @Override
    public CategoryType evaluate(
        final Vectorizable input)
    {
        final Vector features = input.convertToVector();

        // Only the numerator of the class probability formula is needed to
        // rank the categories, because the denominator is shared by all of
        // them.
        CategoryType bestCategory = null;
        double bestLogScore = Double.NEGATIVE_INFINITY;
        for (CategoryType candidate : this.getCategories())
        {
            final double logScore =
                this.computeLogPosterior(features, candidate);

            // The first candidate is always taken; later ones must beat the
            // best score recorded so far.
            final boolean isNewBest =
                (bestCategory == null) || (logScore > bestLogScore);
            if (isNewBest)
            {
                bestCategory = candidate;
                bestLogScore = logScore;
            }
        }

        return bestCategory;
    }

    /**
     * Picks the category with the largest log-posterior score and pairs it
     * with a discriminant: the best log score minus the log-space sum of the
     * scores of all categories.
     *
     * @param   input
     *      The input to categorize. It is converted to a vector once.
     * @return
     *      The best-scoring category together with its discriminant value.
     */
    @Override
    public DefaultWeightedValueDiscriminant<CategoryType> evaluateWithDiscriminant(
        final Vectorizable input)
    {
        final Vector features = input.convertToVector();

        // Track the best category and, at the same time, accumulate the
        // denominator (all scores added together in log space) so that the
        // discriminant is a properly normalized value.
        CategoryType bestCategory = null;
        double bestLogScore = Double.NEGATIVE_INFINITY;
        double logTotal = Double.NEGATIVE_INFINITY;
        for (CategoryType candidate : this.getCategories())
        {
            final double logScore =
                this.computeLogPosterior(features, candidate);

            final boolean isNewBest =
                (bestCategory == null) || (logScore > bestLogScore);
            if (isNewBest)
            {
                bestCategory = candidate;
                bestLogScore = logScore;
            }

            logTotal = LogMath.add(logTotal, logScore);
        }

        // P(y) * P(x|y) / P(x) for the winning category; in log space the
        // division becomes a subtraction.
        final double logNormalizedBest = bestLogScore - logTotal;
        return DefaultWeightedValueDiscriminant.create(
            bestCategory, logNormalizedBest);
    }

    /**
     * Computes the posterior score of the given category for the input by
     * exponentiating the result of {@code computeLogPosterior}.
     *
     * The score is the category's prior fraction combined with the
     * conditional density of each input element. It is not divided by the sum
     * over all categories, so it is the numerator of the class probability
     * formula rather than a normalized probability.
     *
     * @param   input
     *      The input vector.
     * @param   category
     *      The category to compute the posterior for.
     * @return
     *      The exponential of the category's log-posterior score.
     */
    public double computePosterior(
        final Vector input,
        final CategoryType category)
    {
        final double logScore = this.computeLogPosterior(input, category);
        return Math.exp(logScore);
    }

    /**
     * Computes the log-posterior score of the given category for the input:
     * the log of the category's prior fraction plus, for every conditional
     * distribution of the category, the log-density of the input element at
     * the same index.
     *
     * @param   input
     *      The input vector. It is read at indices 0 up to the number of
     *      conditional distributions stored for the category.
     * @param   category
     *      The category to compute the posterior for. It must have an entry
     *      in the conditionals.
     * @return
     *      The log-posterior score.
     */
    public double computeLogPosterior(
        final Vector input,
        final CategoryType category)
    {
        // Start from the log of the prior for the class.
        double logScore = Math.log(this.priors.getFraction(category));

        // Add the log-density of each element under the distribution that
        // belongs to its dimension. The loop covers every distribution stored
        // for the category.
        final List<DistributionType> densities =
            this.conditionals.get(category);
        int dimension = 0;
        for (DistributionType density : densities)
        {
            final double element = input.getElement(dimension);
            logScore += density.logEvaluate(element);
            dimension++;
        }

        return logScore;
    }

    /**
     * Gets the categories known to this categorizer, which are the keys of
     * the conditionals map.
     *
     * @return
     *      The key set of the conditionals map.
     */
    @Override
    public Set<CategoryType> getCategories()
    {
        return conditionals.keySet();
    }

    /**
     * Gets the expected input dimensionality, taken from the length of the
     * first list of conditionals. All lists are expected to have that same
     * length.
     *
     * @return
     *      The size of the first list of conditionals, or 0 when no first
     *      list is available.
     */
    @Override
    public int getInputDimensionality()
    {
        final List<DistributionType> firstList =
            CollectionUtil.getFirst(this.conditionals.values());

        if (firstList == null)
        {
            return 0;
        }

        return firstList.size();
    }

    /**
     * Gets the prior distribution over the categories. The stored object
     * itself is returned, not a copy.
     *
     * @return
     *      The prior distribution held by this categorizer.
     */
    public DataDistribution<CategoryType> getPriors()
    {
        return priors;
    }

    /**
     * Sets the prior distribution over the categories. The reference is
     * stored as given, without copying or validation.
     *
     * @param   priors
     *      The prior distribution to use from now on.
     */
    public void setPriors(
        final DataDistribution<CategoryType> priors)
    {
        this.priors = priors;
    }

    /**
     * Gets the conditional distributions: for every category, the list of
     * probability density functions with one entry per dimension of the
     * vector. The stored map itself is returned, not a copy.
     *
     * @return
     *      The mapping of category to its conditional distributions.
     */
    public Map<CategoryType, List<DistributionType>> getConditionals()
    {
        return conditionals;
    }

    /**
     * Sets the conditional distributions: for every category, the list of
     * probability density functions with one entry per dimension of the
     * vector. The reference is stored as given, without copying or
     * validation.
     *
     * @param   conditionals
     *      The mapping of category to its conditional distributions.
     */
    public void setConditionals(
        final Map<CategoryType, List<DistributionType>> conditionals)
    {
        this.conditionals = conditionals;
    }

    /**
     * A supervised batch learner for a vector Naive Bayes categorizer. It
     * fits one distribution per category and per input dimension with a
     * configurable distribution estimator.
     *
     * @param   <CategoryType>
     *      The output category type for the categorizer. Must implement equals and
     *      hash code.
     * @param   <DistributionType>
     *      The type of distribution that the estimator produces.
     */
    public static class Learner<CategoryType, DistributionType extends UnivariateProbabilityDensityFunction>
        extends AbstractCloneableSerializable
        implements SupervisedBatchLearner<Vectorizable, CategoryType, VectorNaiveBayesCategorizer<CategoryType, DistributionType>>
    {

        /** The estimator for the distribution of each dimension of each category. */
        protected DistributionEstimator<? super Double, ? extends DistributionType> distributionEstimator;

        /**
         * Creates a new {@code Learner} with a null estimator. Because
         * {@code learn} calls the estimator, one has to be set before
         * learning.
         */
        public Learner()
        {
            this(null);
        }

        /**
         * Creates a new {@code Learner} that uses the given distribution
         * estimator.
         *
         * @param   distributionEstimator
         *      The estimator for the distribution of each dimension of each
         *      category.
         */
        public Learner(
            final DistributionEstimator<? super Double, ? extends DistributionType> distributionEstimator)
        {
            super();

            this.setDistributionEstimator(distributionEstimator);
        }

        /**
         * Learns a categorizer from labeled examples. The examples are
         * grouped by output category; for each category and each dimension
         * the element values are collected and passed to the distribution
         * estimator, and the category's prior is incremented by its number of
         * examples.
         *
         * @param   data
         *      The labeled examples to learn from.
         * @return
         *      A new categorizer holding the learned priors and conditionals.
         */
        @Override
        public VectorNaiveBayesCategorizer<CategoryType, DistributionType> learn(
            final Collection<? extends InputOutputPair<? extends Vectorizable, CategoryType>> data)
        {
            // Determine the dimensionality and group the inputs by category.
            final int dimensionality = DatasetUtil.getInputDimensionality(data);
            final Map<CategoryType, List<Vectorizable>> examplesPerCategory =
                DatasetUtil.splitOnOutput(data);

            // The categorizer that is filled in and returned.
            final VectorNaiveBayesCategorizer<CategoryType, DistributionType> result =
                new VectorNaiveBayesCategorizer<CategoryType, DistributionType>();

            // One buffer is reused for the values of every (category,
            // dimension) pair.
            final ArrayList<Double> buffer = new ArrayList<Double>(data.size());

            for (Map.Entry<CategoryType, List<Vectorizable>> entry
                : examplesPerCategory.entrySet())
            {
                final CategoryType category = entry.getKey();
                final List<Vectorizable> examples = entry.getValue();

                // Fit one distribution per dimension for this category.
                final List<DistributionType> fitted =
                    new ArrayList<DistributionType>(dimensionality);
                for (int dimension = 0; dimension < dimensionality; dimension++)
                {
                    // Gather this dimension's value from every example.
                    for (Vectorizable example : examples)
                    {
                        buffer.add(
                            example.convertToVector().getElement(dimension));
                    }

                    // Let the estimator fit a distribution to the values.
                    fitted.add(this.distributionEstimator.learn(buffer));

                    // Empty the buffer for the next dimension.
                    buffer.clear();
                }

                // Record the category's prior weight and its conditionals.
                result.priors.increment(category, examples.size());
                result.conditionals.put(category, fitted);
            }

            return result;
        }

        /**
         * Gets the estimator used for the distribution of each dimension of
         * each category.
         *
         * @return
         *      The distribution estimator; may be null if none was set.
         */
        public DistributionEstimator<? super Double, ? extends DistributionType> getDistributionEstimator()
        {
            return distributionEstimator;
        }

        /**
         * Sets the estimator used for the distribution of each dimension of
         * each category.
         *
         * @param   distributionEstimator
         *      The distribution estimator to use from now on.
         */
        public void setDistributionEstimator(
            final DistributionEstimator<? super Double, ? extends DistributionType> distributionEstimator)
        {
            this.distributionEstimator = distributionEstimator;
        }

    }

    /**
     * A supervised batch learner for a vector Naive Bayes categorizer that
     * fits a Gaussian to each dimension of each category.
     *
     * @param   <CategoryType>
     *      The output category type for the categorizer. Must implement equals and
     *      hash code.
     */
    public static class BatchGaussianLearner<CategoryType>
        extends AbstractCloneableSerializable
        implements SupervisedBatchLearner<Vectorizable, CategoryType, VectorNaiveBayesCategorizer<CategoryType, UnivariateGaussian.PDF>>
    {

        /**
         * Creates a new {@code BatchGaussianLearner}.
         */
        public BatchGaussianLearner()
        {
            super();
        }

        /**
         * Learns a categorizer from labeled examples. The examples are
         * grouped by output category; for each category the per-dimension
         * mean and variance are derived from the sum of the input vectors and
         * the sum of their element-wise squares, and a Gaussian PDF is
         * created from each mean/variance pair. The category's prior is
         * incremented by its number of examples.
         *
         * The variance divides by (count - 1), or by 1 for a category with a
         * single example. A variance that comes out negative is raised to 0.0.
         *
         * @param   data
         *      The labeled examples to learn from.
         * @return
         *      A new categorizer holding the learned priors and conditionals.
         */
        @Override
        public VectorNaiveBayesCategorizer<CategoryType, UnivariateGaussian.PDF> learn(
            final Collection<? extends InputOutputPair<? extends Vectorizable, CategoryType>> data)
        {
            // Split the data by category.
            final int dimensionality = DatasetUtil.getInputDimensionality(data);
            final Map<CategoryType, List<Vectorizable>> examplesPerCategory =
                DatasetUtil.splitOnOutput(data);

            // Create the categorizer to store the result.
            final VectorNaiveBayesCategorizer<CategoryType, UnivariateGaussian.PDF> result =
                new VectorNaiveBayesCategorizer<CategoryType, UnivariateGaussian.PDF>();

            // Go through the categories.
            for (Map.Entry<CategoryType, List<Vectorizable>> entry
                : examplesPerCategory.entrySet())
            {
                final CategoryType category = entry.getKey();
                final List<Vectorizable> examples = entry.getValue();

                // Compute the mean and variance for each dimension in one
                // pass over the examples by accumulating the sum of values
                // and the sum of squared values.
                final RingAccumulator<Vector> sumsAccumulator =
                    new RingAccumulator<Vector>();
                final RingAccumulator<Vector> sumsOfSquaresAccumulator =
                    new RingAccumulator<Vector>();
                for (Vectorizable input : examples)
                {
                    final Vector vector = input.convertToVector();
                    sumsAccumulator.accumulate(vector);
                    sumsOfSquaresAccumulator.accumulate(vector.dotTimes(vector));
                }

                // Transform the accumulators into vectors.
                final Vector sums = sumsAccumulator.getSum();
                final Vector sumsOfSquares = sumsOfSquaresAccumulator.getSum();

                // Figure out the number of instances and the denominator for
                // the variance. A count of 1 uses a denominator of 1 to avoid
                // a divide-by-zero.
                final int count = examples.size();
                final long varianceDenominator =
                    count > 1 ? (count - 1) : 1;
                final List<UnivariateGaussian.PDF> conditionals =
                    new ArrayList<UnivariateGaussian.PDF>(dimensionality);

                for (int i = 0; i < dimensionality; i++)
                {
                    // Figure out the mean and the raw variance.
                    final double sum = sums.getElement(i);
                    final double sumOfSquares = sumsOfSquares.getElement(i);
                    final double mean = sum / count;
                    final double rawVariance = (sumOfSquares - sum * mean)
                        / varianceDenominator;

                    // The subtraction above can lose precision when the two
                    // terms are nearly equal, which may leave a tiny negative
                    // result where the true variance is zero or close to it.
                    // A variance is never negative, so clamp it. NaN is left
                    // as NaN by Math.max.
                    final double variance = Math.max(rawVariance, 0.0);

                    // Create the univariate gaussian PDF.
                    // NOTE(review): a category with one example, or a
                    // dimension that is constant within a category, yields a
                    // variance of exactly 0.0 - confirm the PDF handles that.
                    conditionals.add(
                        new UnivariateGaussian.PDF(mean, variance));
                }

                // Add the category to the priors and its conditional.
                result.priors.increment(category, count);
                result.conditionals.put(category, conditionals);
            }

            return result;
        }

    }

    /**
     * An online (incremental) learner for the Naive Bayes categorizer that
     * uses an incremental distribution learner for the distribution
     * representing each dimension of each category.
     *
     * @param   <CategoryType>
     *      The output category type for the categorizer. Must implement equals and
     *      hash code.
     * @param   <DistributionType>
     *      The type of the distributions used to compute the conditionals for each
     *      dimension.
     * @author  Justin Basilico
     * @since   3.3.0
     */
    public static class OnlineLearner<CategoryType, DistributionType extends UnivariateProbabilityDensityFunction>
        extends AbstractBatchAndIncrementalLearner<InputOutputPair<? extends Vectorizable, CategoryType>, VectorNaiveBayesCategorizer<CategoryType, DistributionType>>
    {
        /** The incremental learner that creates and updates the distribution
         *  of each dimension. Its generic bound requires the learned object
         *  to be a univariate probability density function. */
        protected IncrementalLearner<? super Double, DistributionType> distributionLearner;

        /**
         * Creates a new learner with a null distribution learner. Because
         * {@code update} calls the distribution learner, one has to be set
         * before learning.
         */
        public OnlineLearner()
        {
            this(null);
        }

        /**
         * Creates a new learner that uses the given distribution learner.
         *
         * @param   distributionLearner
         *      The learner for the distribution representing each dimension.
         */
        public OnlineLearner(
            final IncrementalLearner<? super Double, DistributionType> distributionLearner)
        {
            super();

            this.setDistributionLearner(distributionLearner);
        }

        /**
         * Creates the object that incremental learning starts from: a
         * categorizer built with its default constructor.
         *
         * @return
         *      A new, empty categorizer.
         */
        @Override
        public VectorNaiveBayesCategorizer<CategoryType, DistributionType> createInitialLearnedObject()
        {
            return new VectorNaiveBayesCategorizer<CategoryType, DistributionType>();
        }

        /**
         * Updates the categorizer with one labeled example. The prior of the
         * example's category is incremented, the per-dimension distributions
         * are created the first time the category is seen, and each input
         * element is then fed to the distribution of its dimension.
         *
         * @param   target
         *      The categorizer to update in place.
         * @param   data
         *      The labeled example.
         */
        @Override
        public void update(
            final VectorNaiveBayesCategorizer<CategoryType, DistributionType> target,
            final InputOutputPair<? extends Vectorizable, CategoryType> data)
        {
            // Unpack the example.
            final Vector features = data.getInput().convertToVector();
            final CategoryType label = data.getOutput();

            // Count the example towards the prior of its category.
            target.getPriors().increment(label);

            List<DistributionType> densities =
                target.getConditionals().get(label);

            final int dimensionality = features.getDimensionality();
            if (densities == null)
            {
                // First example of this category: start one distribution per
                // dimension and register the list under the category.
                densities = new ArrayList<DistributionType>(dimensionality);

                int created = 0;
                while (created < dimensionality)
                {
                    densities.add(
                        this.distributionLearner.createInitialLearnedObject());
                    created++;
                }

                target.getConditionals().put(label, densities);
            }

            // Feed each element to the distribution of its dimension.
            for (int dimension = 0; dimension < dimensionality; dimension++)
            {
                final DistributionType density = densities.get(dimension);
                final double element = features.getElement(dimension);

                this.distributionLearner.update(density, element);
            }
        }

        /**
         * Gets the learner used for the distribution representing each
         * dimension.
         *
         * @return
         *      The distribution learner; may be null if none was set.
         */
        public IncrementalLearner<? super Double, DistributionType> getDistributionLearner()
        {
            return distributionLearner;
        }

        /**
         * Sets the learner used for the distribution representing each
         * dimension.
         *
         * @param   distributionLearner
         *      The distribution learner to use from now on.
         */
        public void setDistributionLearner(
            final IncrementalLearner<? super Double, DistributionType> distributionLearner)
        {
            this.distributionLearner = distributionLearner;
        }


    }


}