/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * MixtureDistribution.java * Copyright (C) 2002 University of Waikato, Hamilton, New Zealand * */ package weka.classifiers.functions.pace; import weka.core.RevisionHandler; import weka.core.TechnicalInformation; import weka.core.TechnicalInformationHandler; import weka.core.TechnicalInformation.Field; import weka.core.TechnicalInformation.Type; import weka.core.matrix.DoubleVector; import weka.core.matrix.IntVector; /** * Abtract class for manipulating mixture distributions. <p> * * REFERENCES <p> * * Wang, Y. (2000). "A new approach to fitting linear models in high * dimensional spaces." PhD Thesis. Department of Computer Science, * University of Waikato, New Zealand. <p> * * Wang, Y. and Witten, I. H. (2002). "Modeling for optimal probability * prediction." Proceedings of ICML'2002. Sydney. <p> * * @author Yong Wang (yongwang@cs.waikato.ac.nz) * @version $Revision: 1.5 $ */ public abstract class MixtureDistribution implements TechnicalInformationHandler, RevisionHandler { protected DiscreteFunction mixingDistribution; /** The nonnegative-measure-based method */ public static final int NNMMethod = 1; /** The probability-measure-based method */ public static final int PMMethod = 2; // The CDF-based method // public static final int CDFMethod = 3; // The method based on the Kolmogrov and von Mises measure // public static final int ModifiedCDFMethod = 4; /** * Returns an instance of a TechnicalInformation object, containing * detailed information about the technical background of this class, * e.g., paper reference or book this class is based on. * * @return the technical information about this class */ public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; TechnicalInformation additional; result = new TechnicalInformation(Type.PHDTHESIS); result.setValue(Field.AUTHOR, "Wang, Y"); result.setValue(Field.YEAR, "2000"); result.setValue(Field.TITLE, "A new approach to fitting linear models in high dimensional spaces"); result.setValue(Field.SCHOOL, "Department of Computer Science, University of Waikato"); result.setValue(Field.ADDRESS, "Hamilton, New Zealand"); additional = result.add(Type.INPROCEEDINGS); additional.setValue(Field.AUTHOR, "Wang, Y. and Witten, I. H."); additional.setValue(Field.YEAR, "2002"); additional.setValue(Field.TITLE, "Modeling for optimal probability prediction"); additional.setValue(Field.BOOKTITLE, "Proceedings of the Nineteenth International Conference in Machine Learning"); additional.setValue(Field.YEAR, "2002"); additional.setValue(Field.PAGES, "650-657"); additional.setValue(Field.ADDRESS, "Sydney, Australia"); return result; } /** * Gets the mixing distribution * * @return the mixing distribution */ public DiscreteFunction getMixingDistribution() { return mixingDistribution; } /** Sets the mixing distribution * @param d the mixing distribution */ public void setMixingDistribution( DiscreteFunction d ) { mixingDistribution = d; } /** Fits the mixture (or mixing) distribution to the data. The default * method is the nonnegative-measure-based method. * @param data the data, supposedly generated from the mixture model */ public void fit( DoubleVector data ) { fit( data, NNMMethod ); } /** Fits the mixture (or mixing) distribution to the data. * @param data the data supposedly generated from the mixture * @param method the method to be used. Refer to the static final * variables of this class. */ public void fit( DoubleVector data, int method ) { DoubleVector data2 = (DoubleVector) data.clone(); if( data2.unsorted() ) data2.sort(); int n = data2.size(); int start = 0; DoubleVector subset; DiscreteFunction d = new DiscreteFunction(); for( int i = 0; i < n-1; i++ ) { if( separable( data2, start, i, data2.get(i+1) ) && separable( data2, i+1, n-1, data2.get(i) ) ) { subset = (DoubleVector) data2.subvector( start, i ); d.plusEquals( fitForSingleCluster( subset, method ). timesEquals(i - start + 1) ); start = i + 1; } } subset = (DoubleVector) data2.subvector( start, n-1 ); d.plusEquals( fitForSingleCluster( subset, method ). timesEquals(n - start) ); d.sort(); d.normalize(); mixingDistribution = d; } /** * Fits the mixture (or mixing) distribution to the data. The data is * not pre-clustered for computational efficiency. * * @param data the data supposedly generated from the mixture * @param method the method to be used. Refer to the static final * variables of this class. * @return the generated distribution */ public DiscreteFunction fitForSingleCluster( DoubleVector data, int method ) { if( data.size() < 2 ) return new DiscreteFunction( data ); DoubleVector sp = supportPoints( data, 0 ); PaceMatrix fi = fittingIntervals( data ); PaceMatrix pm = probabilityMatrix( sp, fi ); PaceMatrix epm = new PaceMatrix( empiricalProbability( data, fi ). timesEquals( 1. / data.size() ) ); IntVector pvt = (IntVector) IntVector.seq(0, sp.size()-1); DoubleVector weights; switch( method ) { case NNMMethod: weights = pm.nnls( epm, pvt ); break; case PMMethod: weights = pm.nnlse1( epm, pvt ); break; default: throw new IllegalArgumentException("unknown method"); } DoubleVector sp2 = new DoubleVector( pvt.size() ); for( int i = 0; i < sp2.size(); i++ ){ sp2.set( i, sp.get(pvt.get(i)) ); } DiscreteFunction d = new DiscreteFunction( sp2, weights ); d.sort(); d.normalize(); return d; } /** * Return true if a value can be considered for mixture estimatino * separately from the data indexed between i0 and i1 * * @param data the data supposedly generated from the mixture * @param i0 the index of the first element in the group * @param i1 the index of the last element in the group * @param x the value * @return true if a value can be considered */ public abstract boolean separable( DoubleVector data, int i0, int i1, double x ); /** * Contructs the set of support points for mixture estimation. * * @param data the data supposedly generated from the mixture * @param ne the number of extra data that are suppposedly discarded * earlier and not passed into here * @return the set of support points */ public abstract DoubleVector supportPoints( DoubleVector data, int ne ); /** * Contructs the set of fitting intervals for mixture estimation. * * @param data the data supposedly generated from the mixture * @return the set of fitting intervals */ public abstract PaceMatrix fittingIntervals( DoubleVector data ); /** * Contructs the probability matrix for mixture estimation, given a set * of support points and a set of intervals. * * @param s the set of support points * @param intervals the intervals * @return the probability matrix */ public abstract PaceMatrix probabilityMatrix( DoubleVector s, PaceMatrix intervals ); /** * Computes the empirical probabilities of the data over a set of * intervals. * * @param data the data * @param intervals the intervals * @return the empirical probabilities */ public PaceMatrix empiricalProbability( DoubleVector data, PaceMatrix intervals ) { int n = data.size(); int k = intervals.getRowDimension(); PaceMatrix epm = new PaceMatrix( k, 1, 0 ); double point; for( int j = 0; j < n; j ++ ) { for(int i = 0; i < k; i++ ) { point = 0.0; if( intervals.get(i, 0) == data.get(j) || intervals.get(i, 1) == data.get(j) ) point = 0.5; else if( intervals.get(i, 0) < data.get(j) && intervals.get(i, 1) > data.get(j) ) point = 1.0; epm.setPlus( i, 0, point); } } return epm; } /** * Converts to a string * * @return a string representation */ public String toString() { return "The mixing distribution:\n" + mixingDistribution.toString(); } }