package weka.core.shapelet; import java.util.ArrayList; import java.util.TreeMap; import java.util.Collections; import weka.filters.timeseries.shapelet_transforms.FullShapeletTransform; /** * * copyright: Anthony Bagnall * A class to store shapelet quality measure implementations. This includes an abstract quality measure class, * and implementations of each of the four shapelet quality measures used in: * <p> * Jason Lines , Anthony Bagnall, Alternative quality measures for time series shapelets, Proceedings of the 13th international conference on Intelligent Data Engineering and Automated Learning, August 29-31, 2012, Natal, Brazil * <p> * and * <p> * Jason Lines , Luke M. Davis , Jon Hills , Anthony Bagnall, A shapelet transform for time series classification, Proceedings of the 18th ACM SIGKDD international conference on Knowledge discovery and data mining, August 12-16, 2012, Beijing, China * @author Jason Lines */ public class QualityMeasures { /** * An enum for selecting the quality measure to use in the filter for selecting the k best shapelets. * <p> * The choices include: Information Gain (KDD12), F-Stat, (KDD12), Kruskal-Wallis (IDEAL12), and Mood's Median (IDEAL12) */ public enum ShapeletQualityChoice{ /** * Used to specify that the filter will use Information Gain as the shapelet quality measure (introduced in Ye & Keogh 2009) */ INFORMATION_GAIN, /** * Used to specify that the filter will use F-Stat as the shapelet quality measure (introduced in Lines et. al 2012) */ F_STAT, /** * Used to specify that the filter will use Kruskal-Wallis as the shapelet quality measure (introduced in Lines and Bagnall 2012) */ KRUSKALL_WALLIS, /** * Used to specify that the filter will use Mood's Median as the shapelet quality measure (introduced in Lines and Bagnall 2012) */ MOODS_MEDIAN } public abstract static class ShapeletQualityMeasure { public abstract double calculateQuality(ArrayList<OrderLineObj> orderline, TreeMap<Double, Integer> classDistribution); } /** * A class for calculating the information gain of a shapelet, according to the set of distances from the shapelet to a dataset. */ public static class InformationGain extends ShapeletQualityMeasure{ /** * A method to calculate the quality of a FullShapeletTransform, given the orderline produced by computing the distance from the shapelet to each element of the dataset. * * @param orderline the pre-computed set of distances for a dataset to a single shapelet * @param classDistribution the distibution of all possible class values in the orderline * @return a measure of shapelet quality according to information gain */ @Override public double calculateQuality(ArrayList<OrderLineObj> orderline, TreeMap<Double, Integer> classDistribution){ Collections.sort(orderline); // for each split point, starting between 0 and 1, ending between end-1 and end // addition: track the last threshold that was used, don't bother if it's the same as the last one double lastDist = -1;//orderline.get(0).getDistance(); // must be initialised as not visited(no point breaking before any data!) double thisDist = -1; double bsfGain = -1; // initialise class counts TreeMap<Double, Integer> lessClasses = new TreeMap<Double, Integer>(); TreeMap<Double, Integer> greaterClasses = new TreeMap<Double, Integer>(); // parent entropy will always be the same, so calculate just once double parentEntropy = entropy(classDistribution); int sumOfAllClasses = 0; for(double j : classDistribution.keySet()){ lessClasses.put(j, 0); greaterClasses.put(j, classDistribution.get(j)); sumOfAllClasses+=classDistribution.get(j); } int sumOfLessClasses = 0; int sumOfGreaterClasses = sumOfAllClasses; double thisClassVal; int oldCount; for(int i = 0; i < orderline.size()-1; i++){ thisDist = orderline.get(i).getDistance(); //move the threshold along one (effectively by adding this dist to lessClasses thisClassVal = orderline.get(i).getClassVal(); oldCount = lessClasses.get(thisClassVal)+1; lessClasses.put(thisClassVal,oldCount); oldCount = greaterClasses.get(thisClassVal)-1; greaterClasses.put(thisClassVal,oldCount); // adjust counts - maybe makes more sense if these are called counts, rather than sums! sumOfLessClasses++; sumOfGreaterClasses--; // check to see if the threshold has moved (ie if thisDist isn't the same as lastDist) // important, else gain calculations will be made 'in the middle' of a threshold, resulting in different info gain for // the split point, that won't actually be valid as it is 'on' a distances, rather than 'between' them/ if(thisDist != lastDist){ // calculate the info gain below the threshold double lessFrac =(double) sumOfLessClasses / sumOfAllClasses; double entropyLess = entropy(lessClasses); // calculate the info gain above the threshold double greaterFrac =(double) sumOfGreaterClasses / sumOfAllClasses; double entropyGreater = entropy(greaterClasses); double gain = parentEntropy - lessFrac * entropyLess - greaterFrac * entropyGreater; if(gain > bsfGain){ bsfGain = gain; } } lastDist = thisDist; } return bsfGain; } public static double entropy(TreeMap<Double, Integer> classDistributions){ if(classDistributions.size() == 1){ return 0; } double thisPart; double toAdd; int total = 0; for(Double d : classDistributions.keySet()){ total += classDistributions.get(d); } // to avoid NaN calculations, the individual parts of the entropy are calculated and summed. // i.e. if there is 0 of a class, then that part would calculate as NaN, but this can be caught and // set to 0. ArrayList<Double> entropyParts = new ArrayList<Double>(); for(Double d : classDistributions.keySet()){ thisPart =(double) classDistributions.get(d) / total; toAdd = -thisPart * Math.log10(thisPart) / Math.log10(2); if(Double.isNaN(toAdd)) toAdd=0; entropyParts.add(toAdd); } double entropy = 0; for(int i = 0; i < entropyParts.size(); i++){ entropy += entropyParts.get(i); } return entropy; } } /** * A class for calculating the F-Statistic of a shapelet, according to the set of distances from the shapelet to a dataset. */ public static class FStat extends ShapeletQualityMeasure{ /** * A method to calculate the quality of a FullShapeletTransform, given the orderline produced by computing the distance from the shapelet to each element of the dataset. * * @param orderline the pre-computed set of distances for a dataset to a single shapelet * @param classDistribution the distibution of all possible class values in the orderline * @return a measure of shapelet quality according to f-stat */ public double calculateQuality(ArrayList<OrderLineObj> orderline, TreeMap<Double, Integer> classDistribution) { Collections.sort(orderline); int numClasses = classDistribution.size(); int numInstances = orderline.size(); double[] sums = new double[numClasses]; double[] sumsSquared = new double[numClasses]; double[] sumOfSquares = new double[numClasses]; for (int i = 0; i < numClasses; i++) { sums[i] = 0; sumsSquared[i] = 0; sumOfSquares[i] = 0; } for (int i = 0; i < orderline.size(); i++) { int c = (int) orderline.get(i).getClassVal(); double thisDist = orderline.get(i).getDistance(); sums[c] += thisDist; sumOfSquares[c] += thisDist * thisDist; } for (int i = 0; i < numClasses; i++) { sumsSquared[i] = sums[i] * sums[i]; } double ssTotal = 0; double part1 = 0; double part2 = 0; for (int i = 0; i < numClasses; i++) { part1 += sumOfSquares[i]; part2 += sums[i]; } part2 *= part2; part2 /= numInstances; ssTotal = part1 - part2; double ssAmoung = 0; part1 = 0; part2 = 0; for (int i = 0; i < numClasses; i++) { part1 += (double) sumsSquared[i] / classDistribution.get((double) i);//.data[i].size(); part2 += sums[i]; } ssAmoung = part1 - (part2 * part2) / numInstances; double ssWithin = ssTotal - ssAmoung; int dfAmoung = numClasses - 1; int dfWithin = numInstances - numClasses; double msAmoung = ssAmoung / dfAmoung; double msWithin = ssWithin / dfWithin; double f = msAmoung / msWithin; return Double.isNaN(f) ? 0.0 : f; } /** * * @param orderline * @param classDistribution * @return a va */ public double calculateQualityNew(ArrayList<OrderLineObj> orderline, TreeMap<Double, Integer> classDistribution) { Collections.sort(orderline); int numClasses = classDistribution.size(); int numInstances = orderline.size(); double[] sums = new double[numClasses]; double[] sumsSquared = new double[numClasses]; double[] sumOfSquares = new double[numClasses]; for (int i = 0; i < orderline.size(); i++) { int c = (int) orderline.get(i).getClassVal(); double thisDist = orderline.get(i).getDistance(); sums[c] += thisDist; sumOfSquares[c] += thisDist * thisDist; } double ssTotal = 0; double part1 = 0; double part2 = 0; for (int i = 0; i < numClasses; i++) { sumsSquared[i] = sums[i] * sums[i]; part1 += sumOfSquares[i]; part2 += sums[i]; } part2 *= part2; part2 /= numInstances; ssTotal = part1 - part2; double ssAmoung = 0; part1 = 0; part2 = 0; for (int i = 0; i < numClasses; i++) { part1 += (double) sumsSquared[i] / classDistribution.get((double) i);//.data[i].size(); part2 += sums[i]; } ssAmoung = part1 - (part2 * part2) / numInstances; double ssWithin = ssTotal - ssAmoung; int dfAmoung = numClasses - 1; int dfWithin = numInstances - numClasses; double msAmoung = ssAmoung / dfAmoung; double msWithin = ssWithin / dfWithin; double f = msAmoung / msWithin; return f; } } /** * A class for calculating the Mood's Median statistic of a shapelet, according to the set of distances from the shapelet to a dataset. */ public static class MoodsMedian extends ShapeletQualityMeasure{ /** * A method to calculate the quality of a FullShapeletTransform, given the orderline produced by computing the distance from the shapelet to each element of the dataset. * * @param orderline the pre-computed set of distances for a dataset to a single shapelet * @param classDistributions the distibution of all possible class values in the orderline * @return a measure of shapelet quality according to Mood's Median */ @Override public double calculateQuality(ArrayList<OrderLineObj> orderline, TreeMap<Double, Integer> classDistributions){ //naive implementation as a benchmark for finding median - actually faster than manual quickSelect! Probably due to optimised java implementation Collections.sort(orderline); int lengthOfOrderline = orderline.size(); double median; if(lengthOfOrderline%2==0){ median = (orderline.get(lengthOfOrderline/2-1).getDistance()+orderline.get(lengthOfOrderline/2).getDistance())/2; }else{ median = orderline.get(lengthOfOrderline/2).getDistance(); } int totalCount = orderline.size(); int countBelow = 0; int countAbove = 0; int numClasses = classDistributions.size(); int[] classCountsBelowMedian = new int[numClasses]; int[] classCountsAboveMedian = new int[numClasses]; double distance; double classVal; int countSoFar; // count class distributions above and below the median for(int i = 0; i < orderline.size(); i++){ distance = orderline.get(i).getDistance(); classVal = orderline.get(i).getClassVal(); if(distance < median){ countBelow++; classCountsBelowMedian[(int)classVal]++; }else{ countAbove++; classCountsAboveMedian[(int)classVal]++; } } double chi = 0; double expectedAbove, expectedBelow; for(int i = 0; i < numClasses; i++){ expectedBelow = (double)(countBelow*classDistributions.get((double)i))/totalCount; chi += ((classCountsBelowMedian[i]-expectedBelow)*(classCountsBelowMedian[i]-expectedBelow))/expectedBelow; expectedAbove = (double)(countAbove*classDistributions.get((double)i))/totalCount; chi += ((classCountsAboveMedian[i]-expectedAbove))*(classCountsAboveMedian[i]-expectedAbove)/expectedAbove; } if(Double.isNaN(chi)){ chi = 0; // fix for cases where the shapelet is a straight line and chi is calc'd as NaN } return chi; } } /** * A class for calculating the Kruskal-Wallis statistic of a shapelet, according to the set of distances from the shapelet to a dataset. */ public static class KruskalWallis extends ShapeletQualityMeasure{ /** * A method to calculate the quality of a FullShapeletTransform, given the orderline produced by computing the distance from the shapelet to each element of the dataset. * * @param orderline the pre-computed set of distances for a dataset to a single shapelet * @param classDistribution the distibution of all possible class values in the orderline * @return a measure of shapelet quality according to Kruskal-Wallis */ public double calculateQuality(ArrayList<OrderLineObj> orderline, TreeMap<Double, Integer> classDistribution){ // sort Collections.sort(orderline); int numClasses = classDistribution.size(); int[] classRankCounts = new int[numClasses]; double[] classRankMeans = new double[numClasses]; double lastDistance = orderline.get(0).getDistance(); double thisDistance = lastDistance; double classVal = orderline.get(0).getClassVal(); classRankCounts[(int)classVal]+=1; int duplicateCount = 0; for(int i=1; i< orderline.size(); i++){ thisDistance = orderline.get(i).getDistance(); if(duplicateCount == 0 && thisDistance!=lastDistance){ // standard entry classRankCounts[(int)orderline.get(i).getClassVal()]+=i+1; }else if(duplicateCount > 0 && thisDistance!=lastDistance){ // non-duplicate following duplicates // set ranks for dupicates double minRank = i-duplicateCount; double maxRank = i; double avgRank = (minRank+maxRank)/2; for(int j = i-duplicateCount-1; j < i; j++){ classRankCounts[(int)orderline.get(j).getClassVal()]+=avgRank; } duplicateCount = 0; // then set this rank classRankCounts[(int)orderline.get(i).getClassVal()]+=i+1; } else{// thisDistance==lastDistance if(i == orderline.size() - 1){ // last one so must do the avg ranks here (basically copied from above, BUT includes this element too now) double minRank = i-duplicateCount; double maxRank = i+1; double avgRank = (minRank+maxRank)/2; for(int j = i-duplicateCount-1; j <= i; j++){ classRankCounts[(int)orderline.get(j).getClassVal()]+=avgRank; } } duplicateCount++; } lastDistance = thisDistance; } //3) overall mean rank double overallMeanRank = (1.0+orderline.size())/2; //4) sum of squared deviations from the overall mean rank double s = 0; for(int i = 0; i < numClasses; i++){ classRankMeans[i] = (double)classRankCounts[i]/classDistribution.get((double)i); s+= classDistribution.get((double)i)*(classRankMeans[i]-overallMeanRank)*(classRankMeans[i]-overallMeanRank); } //5) weight s with the scale factor double h = 12.0/(orderline.size()*(orderline.size()+1))*s; return h; } } }