package weka.filters.timeseries.shapelet_transforms; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.Random; import java.util.TreeMap; import java.util.logging.Level; import java.util.logging.Logger; import weka.core.DenseInstance; import weka.core.Instance; import weka.core.Instances; import weka.core.shapelet.OrderLineObj; import weka.core.shapelet.QualityBound; import weka.core.shapelet.QualityMeasures; import weka.core.shapelet.Shapelet; /** * An optimised filter to transform a dataset by k shapelets. * * This method uses the distance calculation early abandons described in the * "Trillions" KDD paper from Eamonn's group * * @author Edgaras Baranauskas */ public class ShapeletTransform2 extends FullShapeletTransform2 { private static long subseqDistOpCount; /** * Default constructor; Quality measure defaults to information gain. */ public ShapeletTransform2() { super(); } /** * Single param constructor: filter is unusable until min/max params are * initialised. Quality measure defaults to information gain. * * @param k the number of shapelets to be generated */ public ShapeletTransform2(int k) { super(k); } /** * Full constructor to create a usable filter. Quality measure defaults to * information gain. * * @param k the number of shapelets to be generated * @param minShapeletLength minimum length of shapelets * @param maxShapeletLength maximum length of shapelets */ public ShapeletTransform2(int k, int minShapeletLength, int maxShapeletLength) { super(k, minShapeletLength, maxShapeletLength); } /** * Full, exhaustive, constructor for a filter. Quality measure set via enum, * invalid selection defaults to information gain. * * @param k the number of shapelets to be generated * @param minShapeletLength minimum length of shapelets * @param maxShapeletLength maximum length of shapelets * @param qualityChoice the shapelet quality measure to be used with this * filter */ public ShapeletTransform2(int k, int minShapeletLength, int maxShapeletLength, QualityMeasures.ShapeletQualityChoice qualityChoice) { super(k, minShapeletLength, maxShapeletLength, qualityChoice); } @Override protected Instances buildTansformedDataset(Instances data) { //Reorder the training data and reset the shapelet indexes Instances output = determineOutputFormat(data); Shapelet s; double[][] sortedIndexes; // for each data, get distance to each shapelet and create new instance int size = shapelets.size(); int dataSize = data.numInstances(); //create our data instances for(int j = 0; j < dataSize; j++) { output.add(new DenseInstance(size + 1)); } double dist; for (int i = 0; i < size; i++) { s = shapelets.get(i); sortedIndexes = sortIndexes(s.content); for (int j = 0; j < dataSize; j++) { dist = onlineSubsequenceDistance(s.content, sortedIndexes, data.instance(j)); output.instance(j).setValue(i, dist); } } //do the classValues. for(int j=0; j < dataSize; j++) { output.instance(j).setValue(size, data.instance(j).classValue()); } return output; } @Override protected Shapelet checkCandidate(double[] candidate, Instances data, int seriesId, int startPos, QualityBound.ShapeletQualityBound qualityBound) { // create orderline by looping through data set and calculating the subsequence // distance from candidate to all data, inserting in order. ArrayList<OrderLineObj> orderline = new ArrayList<>(); boolean pruned = false; double[][] sortedIndexes = sortIndexes(candidate); int dataSize = data.numInstances(); for (int i = 0; i < dataSize; i++) { //Check if it is possible to prune the candidate if (qualityBound != null) { if (qualityBound.pruneCandidate()) { pruned = true; break; } } double distance = 0.0; if (i != seriesId) { distance = onlineSubsequenceDistance(candidate, sortedIndexes, data.instance(i)); } double classVal = data.instance(i).classValue(); // without early abandon, it is faster to just add and sort at the end orderline.add(new OrderLineObj(distance, classVal)); //Update qualityBound - presumably each bounding method for different quality measures will have a different update procedure. if (qualityBound != null) { qualityBound.updateOrderLine(orderline.get(orderline.size() - 1)); } } // note: early abandon entropy pruning would appear here, but has been ommitted // in favour of a clear multi-class information gain calculation. Could be added in // this method in the future for speed up, but distance early abandon is more important //If shapelet is pruned then it should no longer be considered in further processing if (!pruned) { // create a shapelet object to store all necessary info, i.e. Shapelet shapelet = new Shapelet(candidate, dataSourceIDs[seriesId], startPos, qualityMeasure); shapelet.calculateQuality(orderline, classDistributions); return shapelet; } return null; } @Override protected double[] zNorm(double[] input, boolean classValOn) { return optimizedZNormalise(input, classValOn); } /** * Calculate the distance between a candidate series and an Instance object * * @param candidate a double[] representation of a shapelet candidate * @param sortedIndices * @param timeSeriesIns an Instance object of a whole time series * @return the distance between a candidate and a time series */ public static double onlineSubsequenceDistance(double[] candidate, double[][] sortedIndices, Instance timeSeriesIns) { double[] timeSeries = timeSeriesIns.toDoubleArray(); return onlineSubsequenceDistance(candidate, sortedIndices, timeSeries); } /** * Calculate the distance between a shapelet candidate and a full time * series (both double[]). * * @param candidate a double[] representation of a shapelet candidate * @param sortedIndices * @param timeSeries a double[] representation of a whole time series (inc. * class value) * @return the distance between a candidate and a time series * * * NOTE: it seems that the reordering is repeated for each new time series. * This could be avoided, but not sure how to structure the code to do it */ public static double onlineSubsequenceDistance(double[] candidate, double[][] sortedIndices, double[] timeSeries) { DoubleWrapper sumPointer = new DoubleWrapper(); DoubleWrapper sum2Pointer = new DoubleWrapper(); //Generate initial subsequence double[] subseq = new double[candidate.length]; System.arraycopy(timeSeries, 0, subseq, 0, subseq.length); subseq = optimizedZNormalise(subseq, false, sumPointer, sum2Pointer); //Keep count of fundamental ops for experiment subseqDistOpCount += subseq.length; double sum = sumPointer.get(); double sum2 = sum2Pointer.get(); double bestDist = 0.0; double mean; double stdv; double temp; //Compute initial distance for (int i = 0; i < candidate.length; i++) { temp = candidate[i] - subseq[i]; bestDist += temp * temp; } //Keep count of fundamental ops for experiment subseqDistOpCount+= candidate.length; // Scan through all possible subsequences of two for (int i = 1; i < timeSeries.length - candidate.length; i++) { //Update the running sums sum = sum - timeSeries[i - 1] + timeSeries[i - 1 + candidate.length]; sum2 = sum2 - (timeSeries[i - 1] * timeSeries[i - 1]) + (timeSeries[i - 1 + candidate.length] * timeSeries[i - 1 + candidate.length]); //Compute the stats for new series mean = sum / candidate.length; //Get rid of rounding errors double stdv2 = (sum2 - (mean * mean * candidate.length)) / candidate.length; stdv = (stdv2 < ROUNDING_ERROR_CORRECTION) ? 0.0 : Math.sqrt(stdv2); int j = 0; double currentDist = 0.0; double toAdd; int reordedIndex; while (j < candidate.length && currentDist < bestDist) { reordedIndex = (int) sortedIndices[j][0]; toAdd = candidate[reordedIndex] - (stdv == 0.0 ? 0.0 : ((timeSeries[i + reordedIndex] - mean) / stdv)); currentDist += (toAdd * toAdd); j++; //Keep count of fundamental ops for experiment subseqDistOpCount++; } if (currentDist < bestDist) { bestDist = currentDist; } } return (bestDist == 0.0) ? 0.0 : (1.0 / candidate.length * bestDist); } /** * A method to sort the array indeces according to their corresponding * values * * @param series a time series, which indeces need to be sorted * @return */ public static double[][] sortIndexes(double[] series) { //Create an boxed array of values with corresponding indexes double[][] sortedSeries = new double[series.length][2]; for (int i = 0; i < series.length; i++) { sortedSeries[i][0] = i; sortedSeries[i][1] = Math.abs(series[i]); } Arrays.sort(sortedSeries, new Comparator<double[]>() { @Override public int compare(double[] o1, double[] o2) { return Double.compare(o1[1], o2[1]); } }); return sortedSeries; } /** * Z-Normalise a time series * * @param input the input time series to be z-normalised * @param classValOn specify whether the time series includes a class value * (e.g. an full instance might, a candidate shapelet wouldn't) * @return a z-normalised version of input */ public static double[] optimizedZNormalise(double[] input, boolean classValOn) { return optimizedZNormalise(input, classValOn, null, null); } /** * Z-Normalise a time series * * @param input the input time series to be z-normalised * @param classValOn specify whether the time series includes a class value * (e.g. an full instance might, a candidate shapelet wouldn't) * @param storeGlobally specify whether the sum and sum of squares should be * stored globally - this is used in subsequence distance method * @return a z-normalised version of input */ private static double[] optimizedZNormalise(double[] input, boolean classValOn, DoubleWrapper sum, DoubleWrapper sum2) { double mean; double stdv; double classValPenalty = classValOn ? 1:0; double[] output = new double[input.length]; double seriesTotal = 0; double seriesTotal2 = 0; for (int i = 0; i < input.length - classValPenalty; i++) { seriesTotal += input[i]; seriesTotal2 += (input[i] * input[i]); } if (sum != null && sum2 != null) { sum.set(seriesTotal); sum2.set(seriesTotal2); } mean = seriesTotal / (input.length - classValPenalty); double num = (seriesTotal2 - (mean * mean * (input.length - classValPenalty))) / (input.length - classValPenalty); stdv = (num <= ROUNDING_ERROR_CORRECTION) ? 0.0 : Math.sqrt(num); for (int i = 0; i < input.length - classValPenalty; i++) { output[i] = (stdv == 0.0) ? 0.0 : (input[i] - mean) / stdv; } if (classValOn) { output[output.length - 1] = input[input.length - 1]; } return output; } private static class DoubleWrapper { private double d; public DoubleWrapper() { d = 0.0; } public DoubleWrapper(double d) { this.d = d; } public void set(double d) { this.d = d; } public double get() { return d; } } @Override public long opCountForSingleShapelet(Instances data, int minShapeletLength, int maxShapeletLength) throws Exception { data = FullShapeletTransform2.roundRobinData(data, null); subseqDistOpCount = 0; findBestKShapeletsCache(1, data, minShapeletLength, maxShapeletLength); return subseqDistOpCount; } /** * * @param args */ public static void main(String[] args) { //################ Test 1 ################ System.out.println("1) Testing index sorter: "); double[] series = new double[10]; double[] subseq = new double[series.length / 2]; int min = -5; int max = 5; for (int i = 0; i < series.length; i++) { series[i] = min + (int) (Math.random() * ((max - min) + 1)); if (i < series.length / 2) { subseq[i] = min + (int) (Math.random() * ((max - min) + 1)); } } printSeries(series); double[][] indices = sortIndexes(series); for (int i = 0; i < series.length; i++) { System.out.print(series[(int) indices[i][0]] + ((i == series.length - 1) ? "\n" : ", ")); } //################ Test 2 ################ System.out.println("\n 2) Testing normalization: "); double[] normSeries; normSeries = FullShapeletTransform2.zNormalise(series, false); System.out.print("Original: "); printSeries(normSeries); normSeries = optimizedZNormalise(series, false); System.out.print("Optimized: "); printSeries(normSeries); //################ Test 3 ################ System.out.println("\n 2) Testing subsequence distance: "); System.out.println("Original dist: " + FullShapeletTransform2.subsequenceDistance(subseq, normSeries)); double[][] sortedIndexes = sortIndexes(subseq); System.out.println("Optimized dist: " + onlineSubsequenceDistance(subseq, sortedIndexes, normSeries)); } /* Method to estimate the range */ private static int[] estimateMinMax(Instances data, int runs, int sampleSize) { int[] minMax = new int[2]; ArrayList<Integer> lengths = new ArrayList<>(); // System.out.println("Performing length estimation"); for (int i = 0; i < runs; i++) { // System.out.println("Sample "+i); // jacknife: sample without replacement Instances copy = new Instances(data); Random ran = new Random(); copy.randomize(ran); Instances sample = new Instances(copy, 10); for (int j = 0; j < sampleSize; j++) { sample.add(copy.instance(j)); } ShapeletTransformDistCaching shp = new ShapeletTransformDistCaching(sampleSize, 3, data.numAttributes() - 1); shp.setCandidatePruning(false); shp.setUseSeparationGap(true); shp.setRoundRobin(true); shp.supressOutput(); try { shp.process(sample); } catch (Exception ex) { Logger.getLogger(ShapeletTransform.class.getName()).log(Level.SEVERE, null, ex); } ArrayList<Integer> sampleLengths = shp.getShapeletLengths(); // print(sampleLengths); lengths.addAll(sampleLengths); System.out.println("Completed sample " + i); } Collections.sort(lengths); int numShapelets = sampleSize * runs; int lowerQuartile = numShapelets / 4 - 1; int upperQuartile = (numShapelets / 4) * 3 - 1; minMax[0] = lengths.get(lowerQuartile); minMax[1] = lengths.get(upperQuartile); // System.out.println("Minimum shapelet length = "+minMax[0]+" maximum" // + " shapelet length = "+minMax[1]); return minMax; } /** * * @param series */ public static void printSeries(double[] series) { for (int i = 0; i < series.length; i++) { System.out.print(series[i] + ((i == series.length - 1) ? "\n" : ", ")); } } }