/*
 * File:                DatasetUtil.java
 * Authors:             Kevin R. Dixon
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 *
 * Copyright September 11, 2007, Sandia Corporation. Under the terms of Contract
 * DE-AC04-94AL85000, there is a non-exclusive license for use of this work by
 * or on behalf of the U.S. Government. Export of this program may require a
 * license from the United States Government. See CopyrightHistory.txt for
 * complete details.
 *
 */

package gov.sandia.cognition.learning.data;

import gov.sandia.cognition.collection.DefaultMultiCollection;
import gov.sandia.cognition.collection.MultiCollection;
import gov.sandia.cognition.math.matrix.DimensionalityMismatchException;
import gov.sandia.cognition.math.matrix.VectorFactory;
import gov.sandia.cognition.math.matrix.Matrix;
import gov.sandia.cognition.math.matrix.Vector;
import gov.sandia.cognition.math.matrix.VectorUtil;
import gov.sandia.cognition.math.matrix.Vectorizable;
import gov.sandia.cognition.math.matrix.mtj.SparseMatrixFactoryMTJ;
import gov.sandia.cognition.statistics.DataDistribution;
import gov.sandia.cognition.statistics.distribution.DefaultDataDistribution;
import gov.sandia.cognition.util.DefaultPair;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Static class containing utility methods for handling Collections of data
 * in the learning package.
 *
 * @author Kevin R. Dixon
 * @since  2.0
 */
public class DatasetUtil
{

    /**
     * Appends a bias (constant 1.0) to the end of each Vector in the dataset;
     * the original dataset is unmodified. The resulting Vectors will have one
     * greater dimension and look like: [ x1 x2 ] -> [ x1 x2 1.0 ]
     * @param dataset
     * Dataset to append a bias term to, Vectors can be of different
     * dimensionality
     * @return
     * Dataset with 1.0 appended to each Vector in the dataset
     */
    public static ArrayList<Vector> appendBias(
        Collection<? extends Vector> dataset)
    {
        return DatasetUtil.appendBias(dataset, 1.0);
    }

    /**
     * Appends "biasValue" to the end of each Vector in the dataset;
     * the original dataset is unmodified. The resulting Vectors will have one
     * greater dimension and look like: [ x1 x2 ] -> [ x1 x2 biasValue ]
     * @param dataset
     * Dataset to append a bias term to, Vectors can be of different
     * dimensionality
     * @param biasValue
     * Bias value to append to the samples
     * @return
     * Dataset with "biasValue" appended to each Vector in the dataset
     */
    public static ArrayList<Vector> appendBias(
        Collection<? extends Vector> dataset,
        double biasValue)
    {
        ArrayList<Vector> biasDataset = new ArrayList<Vector>(dataset.size());
        Vector bias = VectorFactory.getDefault().copyValues(biasValue);
        for (Vector data : dataset)
        {
            biasDataset.add(data.stack(bias));
        }

        return biasDataset;
    }

    /**
     * Takes a set of equal-dimension Vector-Vector InputOutputPairs and
     * turns them into a collection of Double-Double InputOutputPairs. This
     * is useful when one can treat each element of the Vector-Vector pairs
     * as independent of the other elements.
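     *
     * <p>
     * A minimal usage sketch; the {@code vectorPairs} variable is
     * illustrative, not part of this class:
     * </p>
     * <pre>{@code
     * Collection<InputOutputPair<Vector, Vector>> vectorPairs = ...;
     * // One scalar dataset per vector element.
     * ArrayList<ArrayList<InputOutputPair<Double, Double>>> perElement =
     *     DatasetUtil.decoupleVectorPairDataset(vectorPairs);
     * }</pre>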
     *
     * @param dataset
     * Collection of Vector-Vector InputOutputPairs. All Vectors (both inputs
     * and outputs) must have equal dimension!!
     * @return
     * ArrayList of ArrayList of Double-Double InputOutputPairs. The outer
     * ArrayList contains a dataset for each element in the vector (and thus
     * there are as many ArrayList<InputOutputPair<Double,Double>> as there
     * are elements in the Vectors). Each ArrayList<InputOutputPair<Double,Double>>
     * has as many elements as the original dataset.
     */
    public static ArrayList<ArrayList<InputOutputPair<Double, Double>>> decoupleVectorPairDataset(
        Collection<? extends InputOutputPair<? extends Vector, ? extends Vector>> dataset)
    {
        int numSamples = dataset.size();
        int M = dataset.iterator().next().getInput().getDimensionality();
        ArrayList<ArrayList<InputOutputPair<Double, Double>>> retval =
            new ArrayList<ArrayList<InputOutputPair<Double, Double>>>(M);
        for (int i = 0; i < M; i++)
        {
            retval.add(new ArrayList<InputOutputPair<Double, Double>>(numSamples));
        }

        for (InputOutputPair<? extends Vector, ? extends Vector> pair : dataset)
        {
            if ((pair.getInput().getDimensionality() != M) ||
                (pair.getOutput().getDimensionality() != M))
            {
                throw new IllegalArgumentException(
                    "All input-output Vectors must have same dimension!");
            }

            for (int i = 0; i < M; i++)
            {
                double x = pair.getInput().getElement(i);
                double y = pair.getOutput().getElement(i);
                InputOutputPair<Double, Double> rowPair;
                if (pair instanceof WeightedInputOutputPair<?, ?>)
                {
                    double weight = ((WeightedInputOutputPair<?, ?>) pair).getWeight();
                    rowPair = new DefaultWeightedInputOutputPair<Double, Double>(
                        x, y, weight);
                }
                else
                {
                    rowPair = new DefaultInputOutputPair<Double, Double>(x, y);
                }
                retval.get(i).add(rowPair);
            }
        }

        return retval;
    }

    /**
     * Takes a dataset of M-dimensional Vectors and turns it into M
     * datasets of Doubles
     * @param dataset
     * M-dimensional Vectors, throws IllegalArgumentException if all Vectors
     * aren't the same dimensionality
     * @return
     * M datasets of dataset.size() Doubles
     */
    static public ArrayList<ArrayList<Double>> decoupleVectorDataset(
        Collection<? extends Vector> dataset)
    {
        int M = dataset.iterator().next().getDimensionality();
        int num = dataset.size();
        ArrayList<ArrayList<Double>> decoupledDatasets =
            new ArrayList<ArrayList<Double>>(M);
        for (int i = 0; i < M; i++)
        {
            decoupledDatasets.add(new ArrayList<Double>(num));
        }

        for (Vector data : dataset)
        {
            if (M != data.getDimensionality())
            {
                throw new IllegalArgumentException(
                    "All vectors in the dataset must be the same size");
            }

            for (int i = 0; i < M; i++)
            {
                decoupledDatasets.get(i).add(data.getElement(i));
            }
        }

        return decoupledDatasets;
    }

    /**
     * Splits a dataset of input-output pairs into two datasets, one for the
     * inputs that have a "true" output and another for the inputs that have
     * a "false" output
     *
     * @param <DataType> The type of the data.
     * @param data
     * Collection of InputOutputPairs to split according to the output flag
     * @return
     * DefaultPair of LinkedLists where the first dataset corresponds to the inputs
     * where the output was "true" and the second dataset corresponds to the
     * inputs where the output was "false"
     */
    public static <DataType> DefaultPair<LinkedList<DataType>, LinkedList<DataType>> splitDatasets(
        Collection<? extends InputOutputPair<? extends DataType, Boolean>> data)
    {
        LinkedList<DataType> dtrue = new LinkedList<DataType>();
        LinkedList<DataType> dfalse = new LinkedList<DataType>();

        for (InputOutputPair<? extends DataType, Boolean> pair : data)
        {
            if (pair.getOutput())
            {
                dtrue.add(pair.getInput());
            }
            else
            {
                dfalse.add(pair.getInput());
            }
        }

        return DefaultPair.create(dtrue, dfalse);
    }

    /**
     * Splits a dataset according to its output value (usually a category) so
     * that all the inputs for that category are given in a list. It maps the
     * category value to its list.
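     *
     * <p>
     * A minimal usage sketch; the {@code labeledData} variable and the
     * "positive" category are illustrative, not part of this class:
     * </p>
     * <pre>{@code
     * Collection<InputOutputPair<Vector, String>> labeledData = ...;
     * Map<String, List<Vector>> byCategory = DatasetUtil.splitOnOutput(labeledData);
     * List<Vector> positives = byCategory.get("positive");
     * }</pre>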
     *
     * @param <InputType>
     * The type of the input values.
     * @param <CategoryType>
     * The type of the output values.
     * @param data
     * The input-output pairs to split.
     * @return
     * A mapping of category to a list of all of the inputs for that
     * category.
     */
    public static <InputType, CategoryType> Map<CategoryType, List<InputType>> splitOnOutput(
        final Iterable<? extends InputOutputPair<? extends InputType, ? extends CategoryType>> data)
    {
        final LinkedHashMap<CategoryType, List<InputType>> result =
            new LinkedHashMap<CategoryType, List<InputType>>();
        for (InputOutputPair<? extends InputType, ? extends CategoryType> example : data)
        {
            final CategoryType category = example.getOutput();
            List<InputType> examplesForCategory = result.get(category);
            if (examplesForCategory == null)
            {
                examplesForCategory = new ArrayList<InputType>();
                result.put(category, examplesForCategory);
            }

            examplesForCategory.add(example.getInput());
        }

        return result;
    }

    /**
     * Computes the outer-product Matrix of the given set of data:
     * XXt = [ x1 x2 ... xn ] * [ x1 x2 ... xn ]^T.
     * The outer-product data Matrix is useful in things like computing the
     * Principal Components Analysis of the dataset. For example, finding the
     * eigenvectors of the outer-product data Matrix is equivalent to finding
     * the left singular Vectors ("U" from the SingularValueDecomposition) of
     * the dataset.
     * Note that if the input dataset has a size of "N" and each Vector in the
     * dataset has "M" dimensions, then the return Matrix (XXt) is an (MxM)
     * Matrix. This method computes the return Matrix without explicitly
     * forming the data matrix, potentially saving quite a lot of memory.
     * @param data
     * Input dataset where each of "N" Vectors has dimension of "M"
     * @return
     * Outer product Matrix of the input dataset, having dimensions (MxM).
     */
    public static Matrix computeOuterProductDataMatrix(
        ArrayList<? extends Vector> data)
    {
        // This is X*transpose(X), where X is (MxN), resulting in an (MxM) matrix:
        // [ x1 x2 ... xn ] * [ x1 x2 ... xn ]^T
        // We're avoiding storing the entire matrices.
        int M = data.iterator().next().getDimensionality();
        Matrix XXt = SparseMatrixFactoryMTJ.INSTANCE.createMatrix(M, M);
        for (int j = 0; j < M; j++)
        {
            for (int i = 0; i < M; i++)
            {
                double sum = 0.0;
                for (int k = 0; k < data.size(); k++)
                {
                    Vector dk = data.get(k);
                    sum += dk.getElement(i) * dk.getElement(j);
                }

                if (sum != 0.0)
                {
                    XXt.setElement(i, j, sum);
                }
            }
        }

        return XXt;
    }

    /**
     * Computes the mean of the output data.
     *
     * @param data The data to compute the mean of the output.
     * @return The mean of the output values of the given data.
     */
    public static double computeOutputMean(
        final Collection<? extends InputOutputPair<?, ? extends Number>> data)
    {
        if (data == null)
        {
            // No data, so zero mean.
            return 0.0;
        }

        // Compute the sum in order to find the mean.
        double sum = 0.0;
        int count = 0;
        for (InputOutputPair<?, ? extends Number> example : data)
        {
            sum += example.getOutput().doubleValue();
            count++;
        }

        if (count <= 0)
        {
            // No elements, so zero mean.
            return 0.0;
        }
        else
        {
            return sum / count;
        }
    }

    /**
     * Computes the weighted mean of the output data, using the weight of
     * each input-output pair.
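     *
     * <p>
     * A minimal usage sketch; the {@code regressionData} variable is
     * illustrative, not part of this class:
     * </p>
     * <pre>{@code
     * Collection<InputOutputPair<Vector, Double>> regressionData = ...;
     * // Pairs implementing WeightedInputOutputPair use their weights;
     * // all other pairs are treated as having weight 1.0.
     * double weightedMean = DatasetUtil.computeWeightedOutputMean(regressionData);
     * }</pre>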
     *
     * @param data The data to compute the weighted mean of the output.
     * @return The weighted mean of the output values of the given data.
     */
    public static double computeWeightedOutputMean(
        final Collection<? extends InputOutputPair<?, ? extends Number>> data)
    {
        if (data == null || data.isEmpty())
        {
            // No data, so zero mean.
            return 0.0;
        }

        // Compute the weighted sum in order to find the mean.
        double sum = 0.0;
        double weightSum = 0.0;
        for (InputOutputPair<?, ? extends Number> example : data)
        {
            final double weight = DatasetUtil.getWeight(example);
            sum += weight * example.getOutput().doubleValue();
            weightSum += weight;
        }

        if (weightSum == 0.0)
        {
            // Zero total weight, so zero mean.
            return 0.0;
        }
        else
        {
            return sum / weightSum;
        }
    }

    /**
     * Computes the variance of the output of a given set of input-output
     * pairs.
     *
     * @param data The data.
     * @return The variance of the output of the data.
     */
    public static double computeOutputVariance(
        final Collection<? extends InputOutputPair<?, ? extends Number>> data)
    {
        if (data == null)
        {
            // No data, so zero variance.
            return 0.0;
        }

        final int count = data.size();
        if (count <= 0)
        {
            // No elements, so zero variance.
            return 0.0;
        }

        // Compute the mean.
        final double mean = computeOutputMean(data);

        // Now compute the variance.
        double sum = 0.0;
        for (InputOutputPair<?, ? extends Number> example : data)
        {
            final double difference = example.getOutput().doubleValue() - mean;
            sum += difference * difference;
        }

        // Compute the variance and return it.
        final double variance = sum / count;
        return variance;
    }

    /**
     * Creates a set containing the unique output values from the given data.
     *
     * @param <OutputType>
     * The type of the output values.
     * @param data
     * The data to collect the unique output values from.
     * @return
     * The set of unique output values. Implemented as a linked hash set.
     */
    public static <OutputType> Set<OutputType> findUniqueOutputs(
        final Iterable<? extends InputOutputPair<?, ? extends OutputType>> data)
    {
        // Create the result set.
        final Set<OutputType> outputs = new LinkedHashSet<OutputType>();

        if (data != null)
        {
            // Go through and add each output.
            for (InputOutputPair<?, ? extends OutputType> example : data)
            {
                outputs.add(example.getOutput());
            }
        }

        return outputs;
    }

    /**
     * Creates a data histogram over the output values from the given data.
     *
     * @param <OutputType>
     * The type of the output values.
     * @param data
     * The data to collect the output values from.
     * @return
     * The histogram of output values.
     */
    public static <OutputType> DataDistribution<OutputType> countOutputValues(
        final Iterable<? extends InputOutputPair<?, ? extends OutputType>> data)
    {
        // Create the result distribution.
        final DataDistribution<OutputType> outputs =
            new DefaultDataDistribution<OutputType>();

        if (data != null)
        {
            // Go through and add each output.
            for (InputOutputPair<?, ? extends OutputType> example : data)
            {
                outputs.increment(example.getOutput());
            }
        }

        return outputs;
    }

    /**
     * Creates a list containing all of the input values from the given data.
     *
     * @param <InputType>
     * The type of the input values.
     * @param data
     * The data to collect the input values from.
     * @return
     * A list containing the input values.
     */
    public static <InputType> List<InputType> inputsList(
        final Iterable<? extends InputOutputPair<? extends InputType, ?>> data)
    {
        // TODO: Make this a proxy object rather than copying the list.
        // Create the result list.
        final ArrayList<InputType> inputs = new ArrayList<InputType>();

        if (data != null)
        {
            // Go through and add each input.
            for (InputOutputPair<? extends InputType, ?> example : data)
            {
                inputs.add(example.getInput());
            }
        }

        return inputs;
    }

    /**
     * Creates a list containing all of the output values from the given data.
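     *
     * <p>
     * A minimal usage sketch, together with {@link #inputsList inputsList};
     * the {@code examples} variable is illustrative, not part of this class:
     * </p>
     * <pre>{@code
     * Collection<InputOutputPair<Vector, Double>> examples = ...;
     * List<Vector> inputs = DatasetUtil.inputsList(examples);
     * List<Double> targets = DatasetUtil.outputsList(examples);
     * }</pre>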
     *
     * @param <OutputType>
     * The type of the output values.
     * @param data
     * The data to collect the output values from.
     * @return
     * A list containing the output values.
     */
    public static <OutputType> List<OutputType> outputsList(
        final Iterable<? extends InputOutputPair<?, ? extends OutputType>> data)
    {
        // TODO: Make this a proxy object rather than copying the list.
        // Create the result list.
        final ArrayList<OutputType> outputs = new ArrayList<OutputType>();

        if (data != null)
        {
            // Go through and add each output.
            for (InputOutputPair<?, ? extends OutputType> example : data)
            {
                outputs.add(example.getOutput());
            }
        }

        return outputs;
    }

    /**
     * Takes a collection and returns a multi-collection version of that
     * collection. If the given collection is a multi-collection, it casts it
     * to that type and returns it. If it is not a multi-collection, it creates
     * a new, singleton multi-collection with the given collection and returns
     * it.
     *
     * @param <EntryType>
     * The entry type of the collection.
     * @param collection
     * A collection.
     * @return
     * A multi-collection version of the given collection.
     */
    public static <EntryType> MultiCollection<EntryType> asMultiCollection(
        final Collection<EntryType> collection)
    {
        if (collection instanceof MultiCollection)
        {
            return (MultiCollection<EntryType>) collection;
        }
        else
        {
            return new DefaultMultiCollection<EntryType>(
                Collections.singletonList(collection));
        }
    }

    /**
     * Takes a collection of {@code Vectorizable} objects and returns a
     * collection of {@code Vector} objects of the same size.
     *
     * @param collection
     * The collection of {@code Vectorizable} objects to convert.
     * @return
     * The corresponding collection of {@code Vector} objects.
     */
    public static Collection<Vector> asVectorCollection(
        final Collection<? extends Vectorizable> collection)
    {
        // TODO: Have this class make a wrapper collection that doesn't actually
        // recreate the whole collection of vectors, but instead calls
        // convertToVector when needed.
        // Convert the values to vector form.
        final Collection<Vector> result = new ArrayList<Vector>(
            collection.size());
        for (Vectorizable value : collection)
        {
            result.add(value.convertToVector());
        }

        return result;
    }

    /**
     * Gets the dimensionality of the input vectors in a given set of
     * input-output pairs. It finds the first non-null vector and returns its
     * dimensionality. If there are no non-null vectors, then -1 is returned.
     *
     * @param data
     * The data to find the input dimensionality of.
     * @return
     * The dimensionality of the first non-null input in the given data.
     * -1 if there are no non-null inputs.
     */
    public static int getInputDimensionality(
        final Iterable<? extends InputOutputPair<? extends Vectorizable, ?>> data)
    {
        if (data != null)
        {
            for (InputOutputPair<? extends Vectorizable, ?> example : data)
            {
                if (example == null)
                {
                    // Keep looking since this was null.
                    continue;
                }

                final Vectorizable input = example.getInput();
                if (input == null)
                {
                    // Keep looking since this was null.
                    continue;
                }

                final Vector vector = input.convertToVector();
                if (vector != null)
                {
                    // Found a non-null vector. Get its dimensionality.
                    return vector.getDimensionality();
                }
                // else - The vector was null. Keep looking.
            }
        }

        // No valid vectors.
        return -1;
    }

    /**
     * Asserts that all of the dimensionalities of the input vectors in the
     * given set of input-output pairs are the same.
     *
     * @param data
     * A collection of input-output pairs.
     * @throws DimensionalityMismatchException
     * If the dimensionalities are not all equal.
     */
    public static void assertInputDimensionalitiesAllEqual(
        final Iterable<? extends InputOutputPair<? extends Vectorizable, ?>> data)
    {
        assertInputDimensionalitiesAllEqual(data, getInputDimensionality(data));
    }

    /**
     * Asserts that all of the dimensionalities of the input vectors in the
     * given set of input-output pairs equal the given dimensionality.
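     *
     * <p>
     * A minimal usage sketch; the {@code trainingData} variable and the
     * dimensionality of 10 are illustrative, not part of this class:
     * </p>
     * <pre>{@code
     * Collection<InputOutputPair<Vector, Double>> trainingData = ...;
     * // Throws DimensionalityMismatchException if any input has a
     * // dimensionality other than 10.
     * DatasetUtil.assertInputDimensionalitiesAllEqual(trainingData, 10);
     * }</pre>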
     *
     * @param data
     * A collection of input-output pairs.
     * @param dimensionality
     * The dimensionality that all the inputs must have.
     * @throws DimensionalityMismatchException
     * If the dimensionalities are not all equal.
     */
    public static void assertInputDimensionalitiesAllEqual(
        final Iterable<? extends InputOutputPair<? extends Vectorizable, ?>> data,
        final int dimensionality)
    {
        if (data != null)
        {
            for (InputOutputPair<? extends Vectorizable, ?> example : data)
            {
                if (example == null)
                {
                    // Ignore null examples.
                    continue;
                }

                final Vectorizable input = example.getInput();
                if (input == null)
                {
                    // Ignore null inputs.
                    continue;
                }

                final Vector vector = input.convertToVector();
                if (vector == null)
                {
                    // Ignore null vectors.
                    continue;
                }

                // Make sure this dimensionality is correct.
                vector.assertDimensionalityEquals(dimensionality);
            }
        }
    }

    /**
     * Gets the dimensionality of the vectors in a given set of data. It finds
     * the first non-null vector and returns its dimensionality. If there are
     * no non-null vectors, then -1 is returned.
     *
     * @param data
     * The data to find the dimensionality of.
     * @return
     * The dimensionality of the first non-null vector in the given data.
     * -1 if there are no non-null vectors.
     */
    public static int getDimensionality(
        final Iterable<? extends Vectorizable> data)
    {
        if (data != null)
        {
            for (Vectorizable example : data)
            {
                if (example == null)
                {
                    // Keep looking since this was null.
                    continue;
                }

                final Vector vector = example.convertToVector();
                if (vector != null)
                {
                    // Found a non-null vector. Get its dimensionality.
                    return vector.getDimensionality();
                }
                // else - The vector was null. Keep looking.
            }
        }

        // No valid vectors.
        return -1;
    }

    /**
     * Asserts that all of the dimensionalities of the vectors in the
     * given set of data are the same.
     *
     * @param data
     * A collection of data.
     * @throws DimensionalityMismatchException
     * If the dimensionalities are not all equal.
     */
    public static void assertDimensionalitiesAllEqual(
        final Iterable<? extends Vectorizable> data)
    {
        VectorUtil.assertDimensionalitiesAllEqual(data, getDimensionality(data));
    }

    /**
     * Gets the weight of a given input-output pair. If it is a weighted
     * input-output pair (implements the {@code WeightedInputOutputPair}
     * interface), then it casts it to retrieve its weight. Otherwise, it
     * returns 1.0.
     *
     * @param pair
     * The pair to get the weight of.
     * @return
     * The weight of the given pair, if it exists, otherwise 1.0.
     */
    public static double getWeight(
        final InputOutputPair<?, ?> pair)
    {
        if (pair instanceof WeightedInputOutputPair<?, ?>)
        {
            return ((WeightedInputOutputPair<?, ?>) pair).getWeight();
        }
        else
        {
            return 1.0;
        }
    }

    /**
     * Gets the weight of a given target-estimate pair. If it is a weighted
     * target-estimate pair (implements the {@code WeightedTargetEstimatePair}
     * interface), then it casts it to retrieve its weight. Otherwise, it
     * returns 1.0.
     *
     * @param pair
     * The pair to get the weight of.
     * @return
     * The weight of the given pair, if it exists, otherwise 1.0.
     */
    public static double getWeight(
        final TargetEstimatePair<?, ?> pair)
    {
        if (pair instanceof WeightedTargetEstimatePair<?, ?>)
        {
            return ((WeightedTargetEstimatePair<?, ?>) pair).getWeight();
        }
        else
        {
            return 1.0;
        }
    }

    /**
     * Gets the sum of the weights of the elements of the dataset. It loops
     * over the items and calls getWeight on each one.
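     *
     * <p>
     * A minimal usage sketch; the {@code examples} variable is illustrative,
     * not part of this class:
     * </p>
     * <pre>{@code
     * Collection<InputOutputPair<Vector, Double>> examples = ...;
     * // Unweighted pairs count as 1.0, so for a fully unweighted dataset
     * // this is simply examples.size().
     * double totalWeight = DatasetUtil.sumWeights(examples);
     * }</pre>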
     *
     * @param data
     * The dataset to compute the sum of the weights of.
     * @return
     * The sum of the weights of the elements in the dataset.
     */
    public static double sumWeights(
        final Collection<? extends InputOutputPair<?, ?>> data)
    {
        double result = 0.0;

        for (InputOutputPair<?, ?> example : data)
        {
            result += getWeight(example);
        }

        return result;
    }

}