package cc.mallet.util; /* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. */ /** * Class of static methods for calculating statistics of a SparseVector sample * packaged in an InstanceList. * * @author Jerod Weinman <A HREF="mailto:weinman@cs.umass.edu">weinman@cs.umass.edu</A> */ import java.util.Arrays; import java.util.Iterator; import cc.mallet.types.*; import gnu.trove.TIntHashSet; public class VectorStats { /** * Returns a <CODE>SparseVector</CODE> whose entries (taken from the union of * those in the instances) are the expected values of those in the * <CODE>InstanceList</CODE>. This implies the returned vector will not have * binary values. */ public static SparseVector mean(InstanceList instances) { if (instances == null || instances.size() == 0) return null; Iterator<Instance> instanceItr = instances.iterator(); SparseVector v; Instance instance; int indices[]; int maxSparseIndex = -1; int maxDenseIndex = -1; // First, we find the union of all the indices used in the instances TIntHashSet hIndices = new TIntHashSet(instances.getDataAlphabet().size()); while (instanceItr.hasNext()) { instance = (Instance) instanceItr.next(); v = (SparseVector) (instance.getData()); indices = v.getIndices(); if (indices != null) { hIndices.addAll(indices); if (indices[indices.length - 1] > maxSparseIndex) maxSparseIndex = indices[indices.length - 1]; } else // dense if (v.numLocations() > maxDenseIndex) maxDenseIndex = v.numLocations() - 1; } if (maxDenseIndex > -1) // dense vectors were present { if (maxSparseIndex > maxDenseIndex) // sparse vectors were present and they had greater indices than // the dense vectors { // therefore, we create sparse vectors and // add all the dense indices for (int i = 0; i <= maxDenseIndex; i++) hIndices.add(i); } else // sparse indices may have been present, but we don't care // since they never had indices that exceeded those of the // dense vectors { return mean(instances, maxDenseIndex + 1); } } // reaching this statement implies we can create a sparse vector return mean(instances, hIndices.toArray()); } /** * Returns a <CODE>SparseVector</CODE> whose entries (dense with the given * number of indices) are the expected values of those in the * <CODE>InstanceList</CODE>. This implies the returned vector will not have * binary values. */ public static SparseVector mean(InstanceList instances, int numIndices) { SparseVector mv = new SparseVector(new double[numIndices], false); return mean(instances, mv); } /** * Returns a <CODE>SparseVector</CODE> whose entries (the given indices) are * the expected values of those in the <CODE>InstanceList</CODE>. This implies * the returned vector will not have binary values. */ public static SparseVector mean(InstanceList instances, int[] indices) { // Create the mean vector with the indices having all zeros, // nothing copied, sorted, and no checks for duplicates. // gdruck@cs.umass.edu // it is faster to sort indices first Arrays.sort(indices); SparseVector mv = new SparseVector(indices, new double[indices.length], // gdruck@cs.umass.edu // it is faster to sort indices first (above) // false, true, false); false, false, false); return mean(instances, mv); } private static SparseVector mean(InstanceList instances, SparseVector meanVector) { if (instances == null || instances.size() == 0) return null; Instance instance; SparseVector v; Iterator<Instance> instanceItr = instances.iterator(); double factor = 1.0 / (double) instances.size(); while (instanceItr.hasNext()) { instance = (Instance) instanceItr.next(); v = (SparseVector) (instance.getData()); meanVector.plusEqualsSparse(v, factor); } return meanVector; } /** * Returns a <CODE>SparseVector</CODE> whose entries (taken from the union of * those in the instances) are the variance of those in the * <CODE>InstanceList</CODE>. This implies the returned vector will not have * binary values. * * @param unbiased * Normalizes by N-1 when true, and by N otherwise. */ public static SparseVector variance(InstanceList instances, boolean unbiased) { return variance(instances, mean(instances), unbiased); } /** * Returns a <CODE>SparseVector</CODE> whose entries (taken from the mean * argument) are the variance of those in the <CODE>InstanceList</CODE>. This * implies the returned vector will not have binary values. * * @param unbiased * Normalizes by N-1 when true, and by N otherwise. */ public static SparseVector variance(InstanceList instances, SparseVector mean, boolean unbiased) { if (instances == null || instances.size() == 0) return null; double factor = 1.0 / (double) (instances.size() - (unbiased ? 1.0 : 0.0)); System.out.println("factor = " + factor); SparseVector v; // var = (x^2 - n*mu^2)/(n-1) SparseVector vv = (SparseVector) mean.cloneMatrix(); vv.timesEqualsSparse(vv, -(double) instances.size() * factor); Iterator<Instance> instanceItr = instances.iterator(); Instance instance; while (instanceItr.hasNext()) { instance = (Instance) instanceItr.next(); v = (SparseVector) ((SparseVector) (instance.getData())).cloneMatrix(); v.timesEqualsSparse(v); vv.plusEqualsSparse(v, factor); } System.out.println("Var:\n" + vv); return vv; } /** Returns unbiased variance */ public static SparseVector variance(InstanceList instances) { return variance(instances, true); } /** Returns unbiased variance of instances having the given mean. */ public static SparseVector variance(InstanceList instances, SparseVector mean) { return variance(instances, mean, true); } /** * Square root of variance. * * @param mean * Mean of the given instances. * @param unbiased * Normalizes variance by N-1 when true, and by N otherwise. * @see variance */ public static SparseVector stddev(InstanceList instances, SparseVector mean, boolean unbiased) { if (instances.size() == 0) return null; SparseVector sv = variance(instances, mean, unbiased); int dim = sv.numLocations(); double val; for (int i = 0; i < dim; i++) { val = sv.valueAtLocation(i); sv.setValueAtLocation(i, Math.sqrt(val)); } return sv; } /** Square root of unbiased variance. */ public static SparseVector stddev(InstanceList instances) { return stddev(instances, true); } /** * Square root of variance. * * @param unbiased * Normalizes variance by N-1 when true, and by N otherwise. * @see variance */ public static SparseVector stddev(InstanceList instances, boolean unbiased) { return stddev(instances, mean(instances), unbiased); } /** Square root of unbiased variance of instances having the given mean */ public static SparseVector stddev(InstanceList instances, SparseVector mean) { return stddev(instances, mean, true); } }