package weka.filters.timeseries.shapelet_transforms; import java.io.IOException; import java.util.ArrayList; import java.util.TreeMap; import java.util.logging.Level; import java.util.logging.Logger; import weka.core.Attribute; import weka.core.DenseInstance; import weka.core.FastVector; import weka.core.Instance; import weka.core.Instances; import weka.core.shapelet.QualityMeasures; import weka.core.shapelet.Shapelet; import weka.filters.unsupervised.instance.Resample; /** * An approximate filter to transform a dataset by k shapelets. The approximation * is achieved by means of sampling the dataset according to supplied percentages * * @author Edgaras Baranauskas */ public class ApproximateShapeletTransform extends ShapeletTransformDistCaching{ /** * Size of the subsample, as a percentage of the original set */ protected int seriesSampleLevel; /** * Size of approximated series, as a percentage of the original series */ protected int dataPointsSize; private ArrayList<Integer> sampledIDs; /** * Default constructor; Quality measure defaults to information gain. */ public ApproximateShapeletTransform(){ super(); seriesSampleLevel = 50; dataPointsSize = 50; } /** * Single param constructor: filter is unusable until min/max params are initialised. * Quality measure defaults to information gain. * @param k the number of shapelets to be generated */ public ApproximateShapeletTransform(int k){ super(k); seriesSampleLevel = 50; dataPointsSize = 50; } /** * Full constructor to create a usable filter. Quality measure defaults to information gain. * * @param k the number of shapelets to be generated * @param minShapeletLength minimum length of shapelets * @param maxShapeletLength maximum length of shapelets */ public ApproximateShapeletTransform(int k, int minShapeletLength, int maxShapeletLength){ super(k, minShapeletLength, maxShapeletLength); seriesSampleLevel = 50; dataPointsSize = 50; } /** * Full, exhaustive, constructor for a filter. Quality measure set via enum, invalid * selection defaults to information gain. * * @param k the number of shapelets to be generated * @param minShapeletLength minimum length of shapelets * @param maxShapeletLength maximum length of shapelets * @param qualityChoice the shapelet quality measure to be used with this filter */ public ApproximateShapeletTransform(int k, int minShapeletLength, int maxShapeletLength, QualityMeasures.ShapeletQualityChoice qualityChoice){ super(k, minShapeletLength, maxShapeletLength, qualityChoice); seriesSampleLevel = 50; dataPointsSize = 50; } /** * Method to set the sampling levels for series and data points. The default * percentages are 50, 50. * * @param series the percentage of series to be sampled * @param dataPoints the percentage of data points to be used in PAA series */ public void setSampleLevels(int series, int dataPoints) throws IOException{ if(series < 1 || series > 100){ throw new IOException ("Series sample level must be in range [1, 100]"); } if(dataPoints < 1 || dataPoints > 100){ throw new IOException ("Piece aggregate approximation must be in range [1, 100]"); } seriesSampleLevel = series; dataPointsSize = dataPoints; } @Override public Instances process(Instances dataInst) throws Exception{ if(this.numShapelets < 1){ throw new Exception("Number of shapelets initialised incorrectly - please select value of k (Usage: setNumberOfShapelets"); } int maxPossibleLength; if(dataInst.classIndex() < 0) { maxPossibleLength = dataInst.instance(0).numAttributes(); }else{ maxPossibleLength = dataInst.instance(0).numAttributes() - 1; } if(this.minShapeletLength < 1 || this.maxShapeletLength < 1 || this.maxShapeletLength < this.minShapeletLength || this.maxShapeletLength > maxPossibleLength){ throw new Exception("Shapelet length parameters initialised incorrectly"); } //Approximate data Instances orderedInst = null; if(this.shapeletsTrained == false){ sampledIDs = new ArrayList<Integer>(); dataInst = approximateInstanes(dataInst); //Sort data in round robin order dataSourceIDs = new int[dataInst.numInstances()]; int[] roundRobidIDs = new int[dataInst.numInstances()]; orderedInst = roundRobinData(dataInst, roundRobidIDs); //Generate ID of the orignal source dataSourceIDs = new int[dataInst.numInstances()]; for(int i = 0; i < dataSourceIDs.length; i++){ dataSourceIDs[i] = sampledIDs.get(roundRobidIDs[i]); } }else{ dataInst = performPAA(dataInst); } if(this.shapeletsTrained == false){ // shapelets discovery has not yet been caried out, so do so this.shapelets = findBestKShapeletsCache(this.numShapelets, orderedInst, this.minShapeletLength, this.maxShapeletLength); // get k shapelets ATTENTION this.shapeletsTrained = true; if(!supressOutput){ System.out.println(shapelets.size()+" Shapelets have been generated"); } }else{ stats = null; data = null; } Instances output = determineOutputFormat(dataInst); if(data != null){ stats = new Stats(); //Normalise all time series for furhter processing data = new double[dataInst.numInstances()][]; for(int i = 0; i < dataInst.numInstances(); i++){ data[i] = FullShapeletTransform.zNormalise(dataInst.instance(i).toDoubleArray(), true); } } for(int i = 0; i < shapelets.size() + 1; i++){ Shapelet s = null; if(i < shapelets.size()){ s = shapelets.get(i); if(data != null && stats != null){ stats.computeStats(s.getSeriesId(), data); } } for(int j = 0; j < dataInst.numInstances(); j++){ if(i < shapelets.size()){ double dist; if(data != null && stats != null){ stats.setCurrentY(j); dist = cachedSubsequenceDistance(s.getStartPos(), s.getContent().length, data[j].length, stats); }else{ dist = subseqDistance(s.getContent(), dataInst.instance(j)); } if(i == 0){ output.add(new DenseInstance(this.shapelets.size() + 1)); output.instance(j).setValue(i, dist); }else{ output.instance(j).setValue(i, dist); } }else{ output.instance(j).setValue(i, dataInst.instance(j).classValue()); } } } return output; } //Method to apprimiate the training data private Instances approximateInstanes(Instances data){ Instances output = sampleInstances(data); output = performPAA(output); //Make shapelet length relative to that of the original minShapeletLength = (output.numAttributes() - 1) * minShapeletLength / (data.numAttributes()-1); maxShapeletLength = (output.numAttributes() - 1) * maxShapeletLength / (data.numAttributes()-1); return output; } //Method to sample instances private Instances sampleInstances(Instances data){ if(seriesSampleLevel == 100){ return data; }else{ Resample sampler = new Resample(); //Set up sampler try { sampler.setInputFormat(data); } catch (Exception ex) { Logger.getLogger(ApproximateShapeletTransform.class.getName()).log(Level.SEVERE, null, ex); } sampler.setNoReplacement(true); sampler.setSampleSizePercent(seriesSampleLevel); //Queue data for processing for(int i = 0; i < data.numInstances(); i++){ sampler.input(data.instance(i)); } sampler.batchFinished(); //Retrieve output Instances sampledData = new Instances(data, data.numInstances() * seriesSampleLevel / 100); boolean isFinished = false; while(!isFinished){ Instance toAdd = sampler.output(); if(toAdd == null){ isFinished = true; }else{ sampledData.add(toAdd); //Find source id for(int sIndex = 0; sIndex < data.numInstances(); sIndex++){ for(int attIndex = 0; attIndex < data.numAttributes(); attIndex++){ if(data.instance(sIndex).value(attIndex) != toAdd.value(attIndex)){ break; }else if(attIndex == data.numAttributes()-1){ sampledIDs.add(sIndex); } } } } } /* Used for testing TreeMap<Double, Integer> dist = FullShapeletTransform.getClassDistributions(data); TreeMap<Double, Integer> dist2 = FullShapeletTransform.getClassDistributions(sampledData); printTreeMap(dist); printTreeMap(dist2); System.out.println("Original size: " + data.numInstances()); System.out.println("Percentage: " + seriesSampleLevel); System.out.println("Sampled size: " + sampledData.numInstances()); */ return sampledData; } } //Method to perform Piecewise Aggregate Approximation for a given data private Instances performPAA(Instances data){ if(dataPointsSize == 100){ return data; }else{ int paaSize = (data.numAttributes()-1) * dataPointsSize / 100; //Determine output format Instances output = null; try { output = determinePAAOutputFormat(data, paaSize); } catch (Exception ex) { Logger.getLogger(ApproximateShapeletTransform.class.getName()).log(Level.SEVERE, null, ex); } double portionLength = ((double)(data.numAttributes() - 1)) / paaSize; //For each data, compute PAA components for(int i = 0; i < data.numInstances(); i++){ Instance currentInstance = data.instance(i); Instance toAdd = new DenseInstance(paaSize + 1); //Normalise series double[] series = currentInstance.toDoubleArray(); series = FullShapeletTransform.zNormalise(series, true); double[] paaSublists = new double[paaSize]; int[] paaSublistsSizes = new int[paaSize]; double currentPortion = portionLength; int seriesIndex = 0; int subListIndex = 0; boolean advance = false; while(!advance){ if(currentPortion >= 0.999999999999){//Get rid of accumulated error paaSublistsSizes[subListIndex]++; paaSublists[subListIndex] += series[seriesIndex++]; currentPortion -= 1.0; if(currentPortion < 0.0){ currentPortion = 0.0; } }else{ if(seriesIndex < series.length-1){ //Required portion paaSublistsSizes[subListIndex]++; paaSublists[subListIndex++] += currentPortion * series[seriesIndex]; //Remaining portion currentPortion = 1.0 - currentPortion; paaSublistsSizes[subListIndex]++; paaSublists[subListIndex] += currentPortion * series[seriesIndex]; currentPortion = portionLength - currentPortion; }else{ advance = true; } seriesIndex++; } } for(int j = 0; j < paaSublists.length; j++){ toAdd.setValue(j, paaSublists[j]/paaSublistsSizes[j]); } toAdd.setValue(paaSize, currentInstance.classValue()); output.add(toAdd); } return output; } } //Method to determine output format of Piecewise Aggregate Approximation of the time series private Instances determinePAAOutputFormat(Instances inputFormat, int length) throws Exception{ FastVector atts = new FastVector(); String name; for(int i = 0; i < length; i++){ name = "PAA" + i; atts.addElement(new Attribute(name)); } if(inputFormat.classIndex() >= 0){ //Classification set, set class //Get the class values as a fast vector Attribute target = inputFormat.attribute(inputFormat.classIndex()); FastVector vals = new FastVector(target.numValues()); for(int i = 0; i < target.numValues(); i++){ vals.addElement(target.value(i)); } atts.addElement(new Attribute(inputFormat.attribute(inputFormat.classIndex()).name(), vals)); } Instances result = new Instances("PAA" + inputFormat.relationName(), atts, inputFormat.numInstances()); if(inputFormat.classIndex() >= 0){ result.setClassIndex(result.numAttributes() - 1); } return result; } @Override public double timingForSingleShapelet(Instances data, int minShapeletLength, int maxShapeletLength) throws Exception { Instances output = approximateInstanes(data); minShapeletLength = (output.numAttributes() - 1) * minShapeletLength / (data.numAttributes()-1); maxShapeletLength = (output.numAttributes() - 1) * maxShapeletLength / (data.numAttributes()-1); output = roundRobinData(output, null); long startTime = System.nanoTime(); findBestKShapeletsCache(1, output, minShapeletLength, maxShapeletLength); long finishTime = System.nanoTime(); return (double)(finishTime - startTime) / 1000000000.0; } //Method used for testing private void printTreeMap(TreeMap<Double, Integer> dist){ System.out.println("\nTREEMAP"); for(Double d: dist.keySet()){ System.out.println(d + ": " +dist.get(d)); } } //Method used for testing private double[] testPAA(double[] data) throws IOException{ FastVector atts = new FastVector(); String name; for(int i = 0; i < data.length-1; i++){ name = "Attribute" + i; atts.addElement(new Attribute(name)); } FastVector classValues = new FastVector(); classValues.addElement("0"); classValues.addElement("1"); Attribute classAtt = new Attribute("Binary", classValues); atts.addElement(classAtt); //Create dataset Instances instances = new Instances("Test", atts, 1); instances.setClassIndex(data.length-1); //Create instance Instance inst = new DenseInstance(1, data); instances.add(inst); Instances output = performPAA(instances); return output.instance(0).toDoubleArray(); } /** * * @param args */ public static void main(String[] args){ //Create some time series for testing System.out.println("\n1.) Create series for testing: "); int seriesLength = 11; double[] dataEven = new double[seriesLength]; int min = -5; int max = 5; for(int j = 0; j < seriesLength; j++){ if(j == seriesLength-1){ dataEven[j] = 0; }else{ dataEven[j] = min + (int)(Math.random() * ((max - min) + 1)); } } seriesLength = 10; double[] dataUneven = new double[seriesLength]; for(int j = 0; j < seriesLength; j++){ if(j == seriesLength-1){ dataUneven[j] = 0; }else{ dataUneven[j] = min + (int)(Math.random() * ((max - min) + 1)); } } ApproximateShapeletTransform ast = new ApproximateShapeletTransform(); double[] out = null; try { ast.setSampleLevels(100, 50); out = ast.testPAA(dataEven); } catch (IOException ex) { Logger.getLogger(ApproximateShapeletTransform.class.getName()).log(Level.SEVERE, null, ex); } System.out.println("Even Test: "); ShapeletTransform.printSeries(dataEven); ShapeletTransform.printSeries(out); } }