/*
 * QualityMeasures.java example
 *
 * Explorer
 * TimeSeriesClassification-master
 * - TimeSeriesClassification
 *   - src
 */
package weka.core.shapelet;

import java.util.ArrayList;
import java.util.TreeMap;
import java.util.Collections;
import weka.filters.timeseries.shapelet_transforms.FullShapeletTransform;


/**
 * Copyright: Anthony Bagnall
 * <p>
 * A class to store shapelet quality measure implementations. This includes an abstract quality measure class,
 * and implementations of each of the four shapelet quality measures used in:
 * <p>
 * Jason Lines, Anthony Bagnall, Alternative quality measures for time series shapelets, Proceedings of the 13th international conference on Intelligent Data Engineering and Automated Learning, August 29-31, 2012, Natal, Brazil
 * <p>
 * and
 * <p>
 * Jason Lines, Luke M. Davis, Jon Hills, Anthony Bagnall, A shapelet transform for time series classification, Proceedings of the 18th ACM SIGKDD international conference on Knowledge discovery and data mining, August 12-16, 2012, Beijing, China
 *
 * @author Jason Lines
 */
public class QualityMeasures {

    /**
     * An enum for selecting the quality measure that the filter uses to select the k best shapelets.
     * <p>
     * The choices are: Information Gain (KDD12), F-Stat (KDD12), Kruskal-Wallis (IDEAL12), and Mood's Median (IDEAL12).
     */
    public enum ShapeletQualityChoice{
        /**
         * Specifies that the filter will use Information Gain as the shapelet quality measure
         * (introduced in Ye and Keogh 2009). See the InformationGain class in this file.
         */
        INFORMATION_GAIN,
        /**
         * Specifies that the filter will use the F-Statistic as the shapelet quality measure
         * (introduced in Lines et al. 2012). See the FStat class in this file.
         */
        F_STAT,
        /**
         * Specifies that the filter will use Kruskal-Wallis as the shapelet quality measure
         * (introduced in Lines and Bagnall 2012). See the KruskalWallis class in this file.
         * <p>
         * NOTE(review): the constant name is spelled "KRUSKALL" (double L), unlike the KruskalWallis class.
         * It is a public name, so it is left as-is; renaming it would break any code that refers to it.
         */
        KRUSKALL_WALLIS,
        /**
         * Specifies that the filter will use Mood's Median as the shapelet quality measure
         * (introduced in Lines and Bagnall 2012). See the MoodsMedian class in this file.
         */
        MOODS_MEDIAN
    }

    /**
     * Common base type for the shapelet quality measures defined in this file
     * (InformationGain, FStat, MoodsMedian and KruskalWallis all extend it).
     */
    public abstract static class ShapeletQualityMeasure {

        /**
         * Scores a single shapelet from its orderline.
         * <p>
         * Every implementation in this file starts by calling {@code Collections.sort(orderline)}, so callers
         * should expect the list they pass in to be reordered in place.
         *
         * @param orderline one entry per dataset instance, each exposing a distance and a class value
         * @param classDistribution class value mapped to a count; the implementations in this file treat the
         *                          count as the number of orderline entries that carry that class value
         * @return the quality score of the shapelet under the implementing measure
         */
        public abstract double calculateQuality(ArrayList<OrderLineObj> orderline, TreeMap<Double, Integer> classDistribution);
    }


    /**
     * Quality measure that scores a shapelet by the best information gain obtainable from a single
     * distance threshold over the orderline.
     */
    public static class InformationGain extends ShapeletQualityMeasure{

        /**
         * Walks the sorted orderline, moving one entry at a time from the "greater than threshold" side to the
         * "less than threshold" side, and keeps the largest information gain seen at any evaluated split.
         * <p>
         * The orderline is sorted in place. The last entry is never moved, so the "greater" side is never empty.
         *
         * @param orderline the pre-computed set of distances for a dataset to a single shapelet
         * @param classDistribution the distribution of all possible class values in the orderline; every class
         *                          value that occurs in the orderline must be a key of this map
         * @return the best information gain found, or -1 if no split was evaluated
         *         (e.g. the orderline has fewer than two entries)
         */
        @Override
        public double calculateQuality(ArrayList<OrderLineObj> orderline, TreeMap<Double, Integer> classDistribution){
            Collections.sort(orderline);

            // entropy before splitting is the same for every candidate threshold
            final double parentEntropy = entropy(classDistribution);

            // per-class counts on each side of the threshold; everything starts on the "above" side
            TreeMap<Double, Integer> below = new TreeMap<Double, Integer>();
            TreeMap<Double, Integer> above = new TreeMap<Double, Integer>();
            int total = 0;
            for(double classKey : classDistribution.keySet()){
                below.put(classKey, 0);
                above.put(classKey, classDistribution.get(classKey));
                total += classDistribution.get(classKey);
            }

            int belowCount = 0;
            int aboveCount = total;

            double bestGain = -1;
            // -1 acts as a "no previous distance" marker for the first entry
            double previousDistance = -1;

            final int lastIndex = orderline.size() - 1;
            for(int idx = 0; idx < lastIndex; idx++){
                OrderLineObj current = orderline.get(idx);
                double distance = current.getDistance();
                double classValue = current.getClassVal();

                // shift this entry across the threshold
                below.put(classValue, below.get(classValue) + 1);
                above.put(classValue, above.get(classValue) - 1);
                belowCount++;
                aboveCount--;

                // Only evaluate a split when the distance differs from the previous entry's distance.
                // NOTE(review): this compares with the previous entry, not the next one, so the split that is
                // evaluated can still fall between two equal distances - confirm this is the intended rule.
                boolean thresholdMoved = distance != previousDistance;
                previousDistance = distance;
                if(!thresholdMoved){
                    continue;
                }

                double belowFraction = (double) belowCount / total;
                double aboveFraction = (double) aboveCount / total;
                double gain = parentEntropy - belowFraction * entropy(below) - aboveFraction * entropy(above);
                if(gain > bestGain){
                    bestGain = gain;
                }
            }
            return bestGain;
        }

        /**
         * Computes the base-2 entropy of a class distribution.
         * <p>
         * A term that evaluates to NaN (for example a class with a count of 0) contributes 0 to the sum.
         *
         * @param classDistributions class value mapped to the number of instances of that class
         * @return the entropy in bits; 0 when the map holds exactly one class
         */
        public static double entropy(TreeMap<Double, Integer> classDistributions){
            if(classDistributions.size() == 1){
                return 0;
            }

            int total = 0;
            for(int count : classDistributions.values()){
                total += count;
            }

            double result = 0;
            for(int count : classDistributions.values()){
                double proportion = (double) count / total;
                double term = -proportion * Math.log10(proportion) / Math.log10(2);
                result += Double.isNaN(term) ? 0 : term;
            }
            return result;
        }
    }

    /**
     * A class for calculating the F-Statistic of a shapelet, according to the set of distances from the shapelet to a dataset.
     */
    public static class FStat extends ShapeletQualityMeasure{
        /**
         * A method to calculate the quality of a FullShapeletTransform, given the orderline produced by computing the distance
 from the shapelet to each element of the dataset.
         *
         * @param orderline the pre-computed set of distances for a dataset to a single shapelet
         * @param classDistribution the distibution of all possible class values in the orderline
         * @return a measure of shapelet quality according to f-stat
         */
        public double calculateQuality(ArrayList<OrderLineObj> orderline, TreeMap<Double, Integer> classDistribution) {
            Collections.sort(orderline);
            int numClasses = classDistribution.size();
            int numInstances = orderline.size();

            double[] sums = new double[numClasses];
            double[] sumsSquared = new double[numClasses];
            double[] sumOfSquares = new double[numClasses];

            for (int i = 0; i < numClasses; i++) {
                sums[i] = 0;
                sumsSquared[i] = 0;
                sumOfSquares[i] = 0;
            }

            for (int i = 0; i < orderline.size(); i++) {
                int c = (int) orderline.get(i).getClassVal();
                double thisDist = orderline.get(i).getDistance();
                sums[c] += thisDist;
                sumOfSquares[c] += thisDist * thisDist;
            }

            for (int i = 0; i < numClasses; i++) {
                sumsSquared[i] = sums[i] * sums[i];
            }

            double ssTotal = 0;
            double part1 = 0;
            double part2 = 0;

            for (int i = 0; i < numClasses; i++) {
                part1 += sumOfSquares[i];
                part2 += sums[i];
            }

            part2 *= part2;
            part2 /= numInstances;
            ssTotal = part1 - part2;

            double ssAmoung = 0;
            part1 = 0;
            part2 = 0;
            for (int i = 0; i < numClasses; i++) {
                part1 += (double) sumsSquared[i] / classDistribution.get((double) i);//.data[i].size();
                part2 += sums[i];
            }
            ssAmoung = part1 - (part2 * part2) / numInstances;
            double ssWithin = ssTotal - ssAmoung;

            int dfAmoung = numClasses - 1;
            int dfWithin = numInstances - numClasses;

            double msAmoung = ssAmoung / dfAmoung;
            double msWithin = ssWithin / dfWithin;

            double f = msAmoung / msWithin;

            return Double.isNaN(f) ? 0.0 : f;
        }

        /**
         *
         * @param orderline
         * @param classDistribution
         * @return a va
         */
        public double calculateQualityNew(ArrayList<OrderLineObj> orderline, TreeMap<Double, Integer> classDistribution) {
            Collections.sort(orderline);
            int numClasses = classDistribution.size();
            int numInstances = orderline.size();

            double[] sums = new double[numClasses];
            double[] sumsSquared = new double[numClasses];
            double[] sumOfSquares = new double[numClasses];

            for (int i = 0; i < orderline.size(); i++) {
                int c = (int) orderline.get(i).getClassVal();
                double thisDist = orderline.get(i).getDistance();
                sums[c] += thisDist;
                sumOfSquares[c] += thisDist * thisDist;
            }

            double ssTotal = 0;
            double part1 = 0;
            double part2 = 0;

            for (int i = 0; i < numClasses; i++) {
                sumsSquared[i] = sums[i] * sums[i];
                part1 += sumOfSquares[i];
                part2 += sums[i];
            }

            part2 *= part2;
            part2 /= numInstances;
            ssTotal = part1 - part2;

            double ssAmoung = 0;
            part1 = 0;
            part2 = 0;
            for (int i = 0; i < numClasses; i++) {
                part1 += (double) sumsSquared[i] / classDistribution.get((double) i);//.data[i].size();
                part2 += sums[i];
            }
            ssAmoung = part1 - (part2 * part2) / numInstances;
            double ssWithin = ssTotal - ssAmoung;

            int dfAmoung = numClasses - 1;
            int dfWithin = numInstances - numClasses;

            double msAmoung = ssAmoung / dfAmoung;
            double msWithin = ssWithin / dfWithin;

            double f = msAmoung / msWithin;

            return f;
        }
    }

    /**
     * A class for calculating the Mood's Median statistic of a shapelet, according to the set of distances from the shapelet to a dataset.
     * <p>
     * Class values are used directly as array indices, so they must be the integers 0..numClasses-1, and each of
     * those values must be a key of the class distribution.
     */
    public static class MoodsMedian extends ShapeletQualityMeasure{

        /**
         * Calculates the quality of a shapelet as a chi-squared statistic comparing, per class, the observed
         * number of distances below / not below the overall median with the number expected from the class
         * distribution. The orderline is sorted in place.
         *
         * @param orderline the pre-computed set of distances for a dataset to a single shapelet
         * @param classDistributions the distribution of all possible class values in the orderline
         * @return a measure of shapelet quality according to Mood's Median; 0 when the orderline is empty or
         *         the statistic evaluates to NaN
         */
        @Override
        public double calculateQuality(ArrayList<OrderLineObj> orderline, TreeMap<Double, Integer> classDistributions){
            int totalCount = orderline.size();
            if(totalCount == 0){
                // no distances means no median; report "no quality" instead of indexing element -1
                return 0;
            }

            // sort-then-index is used to find the median
            Collections.sort(orderline);
            double median;
            if(totalCount%2==0){
                median = (orderline.get(totalCount/2-1).getDistance()+orderline.get(totalCount/2).getDistance())/2;
            }else{
                median = orderline.get(totalCount/2).getDistance();
            }

            int countBelow = 0;
            int countAbove = 0;
            int numClasses = classDistributions.size();
            int[] classCountsBelowMedian = new int[numClasses];
            int[] classCountsAboveMedian = new int[numClasses];

            // count class distributions above and below the median (ties with the median count as "above")
            for(OrderLineObj entry : orderline){
                double distance = entry.getDistance();
                int classIndex = (int) entry.getClassVal();
                if(distance < median){
                    countBelow++;
                    classCountsBelowMedian[classIndex]++;
                }else{
                    countAbove++;
                    classCountsAboveMedian[classIndex]++;
                }
            }

            double chi = 0;
            for(int i = 0; i < numClasses; i++){
                int classCount = classDistributions.get((double)i);

                // multiply in double: countBelow * classCount can exceed Integer.MAX_VALUE on large datasets
                double expectedBelow = (double) countBelow * classCount / totalCount;
                double diffBelow = classCountsBelowMedian[i] - expectedBelow;
                chi += (diffBelow * diffBelow) / expectedBelow;

                double expectedAbove = (double) countAbove * classCount / totalCount;
                double diffAbove = classCountsAboveMedian[i] - expectedAbove;
                chi += (diffAbove * diffAbove) / expectedAbove;
            }

            // an expected count of 0 (e.g. every distance equal, so nothing is below the median) yields 0/0 = NaN
            return Double.isNaN(chi) ? 0 : chi;
        }

    }
    
    /**
     * A class for calculating the Kruskal-Wallis statistic of a shapelet, according to the set of distances from the shapelet to a dataset.
     */
    public static class KruskalWallis extends ShapeletQualityMeasure{

        /**
         * A method to calculate the quality of a FullShapeletTransform, given the orderline produced by computing the distance
 from the shapelet to each element of the dataset.
         *
         * @param orderline the pre-computed set of distances for a dataset to a single shapelet
         * @param classDistribution the distibution of all possible class values in the orderline
         * @return a measure of shapelet quality according to Kruskal-Wallis
         */
        public double calculateQuality(ArrayList<OrderLineObj> orderline, TreeMap<Double, Integer> classDistribution){
            // sort
            Collections.sort(orderline);
            int numClasses = classDistribution.size();
            int[] classRankCounts = new int[numClasses];
            double[] classRankMeans = new double[numClasses];
            
            double lastDistance = orderline.get(0).getDistance();
            double thisDistance = lastDistance;
            double classVal = orderline.get(0).getClassVal();
            classRankCounts[(int)classVal]+=1;

            int duplicateCount = 0;


            for(int i=1; i< orderline.size(); i++){
                thisDistance = orderline.get(i).getDistance();
                if(duplicateCount == 0 && thisDistance!=lastDistance){ // standard entry
                    classRankCounts[(int)orderline.get(i).getClassVal()]+=i+1;
                    
                }else if(duplicateCount > 0 && thisDistance!=lastDistance){ // non-duplicate following duplicates
                    // set ranks for dupicates

                    double minRank = i-duplicateCount;
                    double maxRank = i;
                    double avgRank = (minRank+maxRank)/2;

                    for(int j = i-duplicateCount-1; j < i; j++){
                        classRankCounts[(int)orderline.get(j).getClassVal()]+=avgRank;
                    }


                    duplicateCount = 0;
                    // then set this rank
                    classRankCounts[(int)orderline.get(i).getClassVal()]+=i+1;
                } else{// thisDistance==lastDistance
                    if(i == orderline.size() - 1){ // last one so must do the avg ranks here (basically copied from above, BUT includes this element too now)

                        double minRank = i-duplicateCount;
                        double maxRank = i+1;
                        double avgRank = (minRank+maxRank)/2;

                        for(int j = i-duplicateCount-1; j <= i; j++){
                            classRankCounts[(int)orderline.get(j).getClassVal()]+=avgRank;
                        }
                    }
                    duplicateCount++;
                }
                lastDistance = thisDistance;
            }

            //3) overall mean rank
            double overallMeanRank = (1.0+orderline.size())/2;
          
            //4) sum of squared deviations from the overall mean rank
            double s = 0;
            for(int i = 0; i < numClasses; i++){
                classRankMeans[i] = (double)classRankCounts[i]/classDistribution.get((double)i);
                s+= classDistribution.get((double)i)*(classRankMeans[i]-overallMeanRank)*(classRankMeans[i]-overallMeanRank);
            }

            //5) weight s with the scale factor
            double h = 12.0/(orderline.size()*(orderline.size()+1))*s;

            return h;
        }
    }
}