/*
 * copyright: Anthony Bagnall
 *
 * NOTE: As shapelet extraction can be time consuming, there is an option to output shapelets
 * to a text file (default location is the root dir of the project, file name "defaultShapeletOutput.txt").
 *
 * By default recordShapelets is true, so a log file IS produced; unless the file name is changed,
 * each successive filter will overwrite the output (see "setLogOutputFile(String fileName)" to change
 * the file dir and name, or "turnOffLog()" to disable logging).
 *
 * To reconstruct a filter from this output, please see the method "createFilterFromFile(String fileName)".
 */
package weka.filters.timeseries.shapelet_transforms;

import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Scanner;
import java.util.TreeMap;

import weka.core.*;
import weka.core.shapelet.*;
import weka.filters.SimpleBatchFilter;

/**
 * A filter to transform a dataset by k shapelets. Once built on a training set, the
 * filter can be used to transform subsequent datasets using the extracted shapelets.
 * <p>
 * See <a href="http://delivery.acm.org/10.1145/2340000/2339579/p289-lines.pdf?ip=139.222.14.198&acc=ACTIVE%20SERVICE&CFID=221649628&CFTOKEN=31860141&__acm__=1354814450_3dacfa9c5af84445ea2bfd7cc48180c8">
 * Lines, J., Davis, L., Hills, J., Bagnall, A.: A shapelet transform for time series classification.
 * In: Proc. 18th ACM SIGKDD (2012)</a>
 *
 * @author Jason Lines
 */
public class FullShapeletTransform extends SimpleBatchFilter{

    @Override
    public String globalInfo() {
        throw new UnsupportedOperationException("Not supported yet.");
    }

    protected boolean supressOutput = false; // defaults to print to System.out AS WELL as the file; set to true to stop printing to the console
    protected int minShapeletLength;
    protected int maxShapeletLength;
    protected int numShapelets;
    protected boolean shapeletsTrained;
    protected ArrayList<Shapelet> shapelets;
    protected String ouputFileLocation = "defaultShapeletOutput.txt"; // default store location
    protected boolean recordShapelets = true; // default action is to write an output file
    protected boolean roundRobin = false;

    public static int DEFAULT_NUMSHAPELETS = 100;
    public static int DEFAULT_MINSHAPELETLENGTH = 3;
    public static int DEFAULT_MAXSHAPELETLENGTH = 23;

    protected QualityMeasures.ShapeletQualityMeasure qualityMeasure;
    protected QualityMeasures.ShapeletQualityChoice qualityChoice;
    protected boolean useCandidatePruning;
    protected boolean useSeparationGap = false;
    protected boolean useRoundRobin = false;

    public void setUseSeparationGap(boolean b){ useSeparationGap = b; }
    public void setUseRoundRobin(boolean b){ useRoundRobin = b; }

    protected int candidatePruningStartPercentage;
    protected static final double ROUNDING_ERROR_CORRECTION = 0.000000000000001;
    protected int[] dataSourceIDs;

    //Variables for experiments
    private static long subseqDistOpCount;

    /**
     * Default constructor; quality measure defaults to information gain.
     */
    public FullShapeletTransform(){
        this(DEFAULT_NUMSHAPELETS, DEFAULT_MINSHAPELETLENGTH, DEFAULT_MAXSHAPELETLENGTH, QualityMeasures.ShapeletQualityChoice.INFORMATION_GAIN);
    }

    /**
     * Constructor for generating a shapelet transform from an ArrayList of Shapelets.
     *
     * @param shapes the shapelets to use for the transform
     */
    public FullShapeletTransform(ArrayList<Shapelet> shapes) {
        this();
        this.shapelets = shapes;
        this.shapeletsTrained = true;
        this.numShapelets = shapelets.size();
    }

    /**
     * Single parameter constructor; quality measure defaults to information gain.
     *
     * @param k the number of shapelets to be generated
     */
    public FullShapeletTransform(int k){
        this(k, DEFAULT_MINSHAPELETLENGTH, DEFAULT_MAXSHAPELETLENGTH, QualityMeasures.ShapeletQualityChoice.INFORMATION_GAIN);
    }

    /**
     * Full constructor to create a usable filter. Quality measure defaults to information gain.
     *
     * @param k the number of shapelets to be generated
     * @param minShapeletLength minimum length of shapelets
     * @param maxShapeletLength maximum length of shapelets
     */
    public FullShapeletTransform(int k, int minShapeletLength, int maxShapeletLength){
        this(k, minShapeletLength, maxShapeletLength, QualityMeasures.ShapeletQualityChoice.INFORMATION_GAIN);
    }

    /**
     * Full, exhaustive, constructor for a filter. Quality measure is set via enum; an invalid
     * selection defaults to information gain.
     *
     * @param k the number of shapelets to be generated
     * @param minShapeletLength minimum length of shapelets
     * @param maxShapeletLength maximum length of shapelets
     * @param qualityChoice the shapelet quality measure to be used with this filter
     */
    public FullShapeletTransform(int k, int minShapeletLength, int maxShapeletLength, weka.core.shapelet.QualityMeasures.ShapeletQualityChoice qualityChoice){
        this.minShapeletLength = minShapeletLength;
        this.maxShapeletLength = maxShapeletLength;
        this.numShapelets = k;
        this.shapelets = new ArrayList<Shapelet>();
        this.shapeletsTrained = false;
        this.useCandidatePruning = false;
        this.qualityChoice = qualityChoice;

        switch(qualityChoice){
            case F_STAT:
                this.qualityMeasure = new QualityMeasures.FStat();
                break;
            case KRUSKALL_WALLIS:
                this.qualityMeasure = new QualityMeasures.KruskalWallis();
                break;
            case MOODS_MEDIAN:
                this.qualityMeasure = new QualityMeasures.MoodsMedian();
                break;
            default:
                this.qualityMeasure = new QualityMeasures.InformationGain();
        }
    }

    /**
     * Returns the set of shapelets for this transform as an ArrayList.
     *
     * @return an ArrayList of Shapelets representing the shapelets found for this shapelet transform
     */
    public ArrayList<Shapelet> getShapelets() {
        return this.shapelets;
    }

    /**
     * Set the transform to round robin the data or not. This transform defaults round robin to
     * false to keep the instances in the same order as the original data. If round robin is set
     * to true, the transformed data will be reordered, which can make it more difficult to use
     * in an ensemble.
     *
     * @param val true to process the data in round robin order
     */
    public void setRoundRobin(boolean val) {
        this.roundRobin = val;
    }

    /**
     * Suppresses filter output to the console; useful when running timing experiments.
     */
    public void supressOutput(){
        this.supressOutput = true;
    }

    /**
     * Use the candidate pruning technique when checking candidate quality. This speeds up
     * the transform processing time.
     */
    public void useCandidatePruning(){
        this.useCandidatePruning = true;
        this.candidatePruningStartPercentage = 10;
    }

    /**
     * Enable or disable candidate pruning.
     *
     * @param f true to enable candidate pruning
     */
    public void setCandidatePruning(boolean f){
        this.useCandidatePruning = f;
        if(f)
            this.candidatePruningStartPercentage = 10;
        else //Not necessary
            this.candidatePruningStartPercentage = 100;
    }

    /**
     * Use the candidate pruning technique when checking candidate quality. This speeds up
     * the transform processing time.
     *
     * @param percentage the percentage of data to be processed before pruning is initiated.
     * In most cases the higher the percentage the less effective pruning becomes
     */
    public void useCandidatePruning(int percentage){
        this.useCandidatePruning = true;
        this.candidatePruningStartPercentage = percentage;
    }
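    /*
     * Illustrative sketch (not part of the filter itself; the parameter values are assumptions
     * chosen only for the example): constructing the filter with a non-default quality measure,
     * enabling candidate pruning and silencing console output for a timing run.
     *
     *   FullShapeletTransform st =
     *           new FullShapeletTransform(50, 10, 30, QualityMeasures.ShapeletQualityChoice.F_STAT);
     *   st.useCandidatePruning();  // start pruning once 10% of the data has been processed
     *   st.supressOutput();        // print nothing to the console
     */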
    /**
     * Mutator method to set the number of shapelets to be stored by the filter.
     *
     * @param k the number of shapelets to be generated
     */
    public void setNumberOfShapelets(int k){
        this.numShapelets = k;
    }

    /**
     * @return the number of shapelets extracted by the filter
     */
    public int getNumberOfShapelets(){
        return numShapelets;
    }

    /**
     * Mutator method to set the minimum and maximum shapelet lengths for the filter.
     *
     * @param minShapeletLength minimum length of shapelets
     * @param maxShapeletLength maximum length of shapelets
     */
    public void setShapeletMinAndMax(int minShapeletLength, int maxShapeletLength){
        this.minShapeletLength = minShapeletLength;
        this.maxShapeletLength = maxShapeletLength;
    }

    /**
     * Accessor method to return the quality measure used by the filter. As with the constructors,
     * the default selection is information gain unless another valid selection is specified.
     *
     * @return the quality measure choice used by this filter
     */
    public QualityMeasures.ShapeletQualityChoice getQualityMeasure(){
        return qualityChoice;
    }

    /**
     * Mutator method to set the quality measure used by the filter.
     *
     * @param qualityChoice the quality measure to use; an invalid selection defaults to information gain
     */
    public void setQualityMeasure(QualityMeasures.ShapeletQualityChoice qualityChoice){
        this.qualityChoice = qualityChoice;
        switch(qualityChoice){
            case F_STAT:
                this.qualityMeasure = new QualityMeasures.FStat();
                break;
            case KRUSKALL_WALLIS:
                this.qualityMeasure = new QualityMeasures.KruskalWallis();
                break;
            case MOODS_MEDIAN:
                this.qualityMeasure = new QualityMeasures.MoodsMedian();
                break;
            default:
                this.qualityMeasure = new QualityMeasures.InformationGain();
        }
    }

    /**
     * Sets the format of the filtered instances that are output, i.e. one attribute for the
     * distance to each shapelet plus a class value.
     *
     * @param inputFormat the format of the input data
     * @return a new Instances object in the desired output format
     * @throws Exception if all required parameters of the filter are not initialised correctly
     */
    @Override
    protected Instances determineOutputFormat(Instances inputFormat) throws Exception{
        if(this.numShapelets < 1){
            throw new Exception("ShapeletFilter not initialised correctly - please specify a value of k that is greater than or equal to 1");
        }

        //Set up instances size and format.
        //int length = this.numShapelets;
        int length = this.shapelets.size();
        FastVector atts = new FastVector();
        String name;
        for(int i = 0; i < length; i++){
            name = "Shapelet_" + i;
            atts.addElement(new Attribute(name));
        }

        if(inputFormat.classIndex() >= 0){ //Classification set, set class
            //Get the class values as a fast vector
            Attribute target = inputFormat.attribute(inputFormat.classIndex());
            FastVector vals = new FastVector(target.numValues());
            for(int i = 0; i < target.numValues(); i++){
                vals.addElement(target.value(i));
            }
            atts.addElement(new Attribute(inputFormat.attribute(inputFormat.classIndex()).name(), vals));
        }

        Instances result = new Instances("Shapelets" + inputFormat.relationName(), atts, inputFormat.numInstances());
        if(inputFormat.classIndex() >= 0){
            result.setClassIndex(result.numAttributes() - 1);
        }
        return result;
    }

    /**
     * The main logic of the filter; when called for the first time, k shapelets are extracted from
     * the input Instances 'data'. The input 'data' is transformed by the k shapelets, and the
     * filtered data is returned as an output.
     * <p>
     * If called multiple times, shapelet extraction DOES NOT take place again; once k shapelets are
     * established from the initial call to process(), the k shapelets are used to transform
     * subsequent Instances.
     * <p>
     * Intended use:
     * <p>
     * 1. Extract k shapelets from raw training data to build the filter;
     * <p>
     * 2. Use the filter to transform the raw training data into transformed training data;
     * <p>
     * 3. Use the filter to transform the raw testing data into transformed testing data (i.e. the
     * filter never extracts shapelets from the testing data, therefore avoiding bias);
     * <p>
     * 4. Build a classifier using the transformed training data, and perform classification on the
     * transformed test data.
     *
     * @param data the input data to be transformed (and to find the shapelets if this is the first run)
     * @return the transformed representation of data, according to the distances from each instance to each of the k shapelets
     * @throws Exception if the number of shapelets or the length parameters specified are incorrect
     */
    @Override
    public Instances process(Instances data) throws Exception{
        if(this.numShapelets < 1){
            throw new Exception("Number of shapelets initialised incorrectly - please select a value of k greater than or equal to 1 (Usage: setNumberOfShapelets)");
        }

        int maxPossibleLength = data.instance(0).numAttributes() - 1;
        if(data.classIndex() < 0) {
            throw new Exception("Require that the class be set for the ShapeletTransform");
        }

        if(this.minShapeletLength < 1 || this.maxShapeletLength < 1 || this.maxShapeletLength < this.minShapeletLength || this.maxShapeletLength > maxPossibleLength){
            throw new Exception("Shapelet length parameters initialised incorrectly");
        }

        if(this.shapeletsTrained == false){ // shapelet discovery has not yet been carried out, so this must be training data
            dataSourceIDs = new int[data.numInstances()];
            if(roundRobin){
                //Reorder the data in round robin order
                data = roundRobinData(data, dataSourceIDs);
            }else{
                for(int i = 0; i < data.numInstances(); i++)
                    dataSourceIDs[i] = i;
            }

            this.shapelets = findBestKShapeletsCache(this.numShapelets, data, this.minShapeletLength, this.maxShapeletLength); // get k shapelets ATTENTION
            this.shapeletsTrained = true;
            if(!supressOutput){
                System.out.println(shapelets.size() + " Shapelets have been generated");
            }

            //Reorder the training data and reset the shapelet indexes
            if(roundRobin){
                resetDataOrder(data, dataSourceIDs);
                resetShapeletIndices(shapelets, dataSourceIDs);
            }
        }

        Instances output = determineOutputFormat(data);

        // for each data instance, get the distance to each shapelet and create a new instance
        for(int i = 0; i < data.numInstances(); i++){ // for each data
            Instance toAdd = new DenseInstance(this.shapelets.size() + 1);
            int shapeletNum = 0;
            for(Shapelet s : this.shapelets){
                double dist = subseqDistance(s.content, data.instance(i));
                toAdd.setValue(shapeletNum++, dist);
            }
            toAdd.setValue(this.shapelets.size(), data.instance(i).classValue());
            output.add(toAdd);
        }
        return output;
    }

    /**
     * Set the file path for the filter log. The filter log includes the shapelet quality, seriesId,
     * startPosition, and content for each shapelet.
     *
     * @param fileName the updated file path of the filter log
     */
    public void setLogOutputFile(String fileName){
        this.recordShapelets = true;
        this.ouputFileLocation = fileName;
    }

    /**
     * Turns off log saving; useful for timing experiments where speed is essential.
     */
    public void turnOffLog(){
        this.recordShapelets = false;
    }
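    /*
     * Illustrative sketch of the intended workflow described in the process() Javadoc above
     * (not part of the filter; rawTrain and rawTest are assumed to be Instances loaded elsewhere,
     * e.g. via loadData()):
     *
     *   FullShapeletTransform st = new FullShapeletTransform(100, 3, 23);
     *   Instances shapeletTrain = st.process(rawTrain); // first call: extracts shapelets, then transforms
     *   Instances shapeletTest  = st.process(rawTest);  // later calls: reuse the shapelets found on rawTrain
     *   // build a classifier on shapeletTrain, evaluate on shapeletTest
     */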
    /**
     * Protected method for extracting k shapelets.
     *
     * @param numShapelets the target number of shapelets to generate
     * @param data the data that the shapelets will be taken from
     * @param minShapeletLength the minimum length of possible shapelets
     * @param maxShapeletLength the maximum length of possible shapelets
     * @return an ArrayList of Shapelet objects in order of their fitness (by infoGain, separationGap, then shortest length)
     * @throws Exception
     */
    public ArrayList<Shapelet> findBestKShapeletsCache(int numShapelets, Instances data, int minShapeletLength, int maxShapeletLength) throws Exception{
        ArrayList<Shapelet> kShapelets = new ArrayList<Shapelet>(); // store (up to) the best k shapelets overall
        ArrayList<Shapelet> seriesShapelets; // temp store of all shapelets for each time series

        /* new version to allow caching:
         * - for all time series, calculate the gain of all candidates of all possible lengths
         * - insert into a structure in order of fitness - arraylist with comparable implementation of shapelets
         * - once all candidates for a series are established, integrate into store of k best
         */
        TreeMap<Double, Integer> classDistributions = getClassDistributions(data); // used to calc info gain

        //for all time series
        if(!supressOutput){
            System.out.println("Processing data: ");
        }

        int numInstances = data.numInstances();
        for(int i = 0; i < numInstances; i++){
            if(!supressOutput){
                System.out.println("Currently processing instance " + (i + 1) + " of " + numInstances);
            }

            //get our time series as a double array.
            double[] wholeCandidate = data.instance(i).toDoubleArray();
            seriesShapelets = new ArrayList<Shapelet>();

            for(int length = minShapeletLength; length <= maxShapeletLength; length++){
                //for all possible starting positions of that length
                for(int start = 0; start <= wholeCandidate.length - length - 1; start++){ //-1 = avoid classVal - handle later for series with no class val
                    // CANDIDATE ESTABLISHED - got original series, length and starting position
                    // extract relevant part into a double[] for processing
                    double[] candidate = new double[length];
                    for(int m = start; m < start + length; m++){
                        candidate[m - start] = wholeCandidate[m];
                    }

                    // znorm candidate here so it's only done once, rather than in each distance calculation
                    candidate = zNorm(candidate, false);

                    //Initialize bounding algorithm for current candidate
                    QualityBound.ShapeletQualityBound qualityBound = initializeQualityBound(classDistributions);

                    //Set bound of the bounding algorithm
                    if(qualityBound != null && kShapelets.size() == numShapelets){
                        qualityBound.setBsfQuality(kShapelets.get(numShapelets - 1).qualityValue);
                    }

                    Shapelet candidateShapelet = checkCandidate(candidate, data, i, start, classDistributions, qualityBound);

                    //If the shapelet was pruned then null will be returned, so check for that
                    if(candidateShapelet != null){
                        seriesShapelets.add(candidateShapelet);
                    }
                }
            }

            // now that we have all shapelets, self similarity can be fairly assessed without fear of
            // removing potentially good shapelets
            if(useSeparationGap)
                Collections.sort(seriesShapelets, new Shapelet.ReverseSeparationGap());
            else
                Collections.sort(seriesShapelets, new Shapelet.ReverseOrder());
            seriesShapelets = removeSelfSimilar(seriesShapelets);
            kShapelets = combine(numShapelets, kShapelets, seriesShapelets);
        }

        this.numShapelets = kShapelets.size();

        if(this.recordShapelets){
            //just in case the file, or its directories, do not exist.
            File file = new File(this.ouputFileLocation);
            file.getParentFile().mkdirs();

            FileWriter out = new FileWriter(file);
            for(int i = 0; i < kShapelets.size(); i++){
                out.append(kShapelets.get(i).qualityValue + "," + kShapelets.get(i).seriesId + "," + kShapelets.get(i).startPos + "\n");
                double[] shapeletContent = kShapelets.get(i).content;
                for(int j = 0; j < shapeletContent.length; j++){
                    out.append(shapeletContent[j] + ",");
                }
                out.append("\n");
            }
            out.close();
        }

        if(!supressOutput){
            System.out.println();
            System.out.println("Output Shapelets:");
            System.out.println("-------------------");
            System.out.println("informationGain,seriesId,startPos");
            System.out.println("<shapelet>");
            System.out.println("-------------------");
            System.out.println();
            for(int i = 0; i < kShapelets.size(); i++){
                System.out.println(kShapelets.get(i).qualityValue + "," + kShapelets.get(i).seriesId + "," + kShapelets.get(i).startPos);
                double[] shapeletContent = kShapelets.get(i).content;
                for(int j = 0; j < shapeletContent.length; j++){
                    System.out.print(shapeletContent[j] + ",");
                }
                System.out.println();
            }
        }

        return kShapelets;
    }

    /**
     * Initialise the quality bound used for candidate pruning, based on the quality measure in use.
     *
     * @param classDist the class distribution of the full data set
     * @return a quality bound for the current quality measure, or null if candidate pruning is disabled
     */
    protected QualityBound.ShapeletQualityBound initializeQualityBound(TreeMap<Double, Integer> classDist){
        if(useCandidatePruning){
            if(qualityMeasure instanceof QualityMeasures.InformationGain){
                return new QualityBound.InformationGainBound(classDist, candidatePruningStartPercentage);
            }else if(qualityMeasure instanceof QualityMeasures.MoodsMedian){
                return new QualityBound.MoodsMedianBound(classDist, candidatePruningStartPercentage);
            }else if(qualityMeasure instanceof QualityMeasures.FStat){
                return new QualityBound.FStatBound(classDist, candidatePruningStartPercentage);
            }else if(qualityMeasure instanceof QualityMeasures.KruskalWallis){
                return new QualityBound.KruskalWallisBound(classDist, candidatePruningStartPercentage);
            }
        }
        return null;
    }

    /**
     * Protected method to remove self-similar shapelets from an ArrayList (i.e. shapelets that come
     * from the same series and have overlapping indices).
     *
     * @param shapelets the input Shapelets to remove self-similar Shapelet objects from
     * @return a copy of the input ArrayList with self-similar shapelets removed
     */
    protected static ArrayList<Shapelet> removeSelfSimilar(ArrayList<Shapelet> shapelets){
        // return a new pruned array list - more efficient than removing
        // self-similar entries on the fly and constantly reindexing
        ArrayList<Shapelet> outputShapelets = new ArrayList<Shapelet>();
        boolean[] selfSimilar = new boolean[shapelets.size()]; // to keep track of self similarity - assume nothing is similar to begin with

        //TODO remove. Aaron: all values in a boolean array default to false. Don't need this loop.
        for(int i = 0; i < shapelets.size(); i++){
            selfSimilar[i] = false;
        }

        //TODO: Aaron: tidy up logic.
        for(int i = 0; i < shapelets.size(); i++){
            if(selfSimilar[i] == false){
                outputShapelets.add(shapelets.get(i));
                for(int j = i + 1; j < shapelets.size(); j++){
                    if(selfSimilar[j] == false && selfSimilarity(shapelets.get(i), shapelets.get(j))){ // no point recalc'ing if already self similar to something
                        selfSimilar[j] = true;
                    }
                }
            }
        }
        return outputShapelets;
    }

    /**
     * Protected method to combine two ArrayList collections of Shapelet objects.
     *
     * @param k the maximum number of shapelets to be returned after combining the two lists
     * @param kBestSoFar the (up to) k best shapelets that have been observed so far, passed in to combine with shapelets from a new series
     * @param timeSeriesShapelets the shapelets taken from a new series that are to be merged in descending order of fitness with the kBestSoFar
     * @return an ordered ArrayList of the best k (or fewer) Shapelet objects from the union of the input ArrayLists
     */
    //NOTE: could be more efficient here
    protected ArrayList<Shapelet> combine(int k, ArrayList<Shapelet> kBestSoFar, ArrayList<Shapelet> timeSeriesShapelets){
        ArrayList<Shapelet> newBestSoFar = new ArrayList<Shapelet>();
        for(int i = 0; i < timeSeriesShapelets.size(); i++){
            kBestSoFar.add(timeSeriesShapelets.get(i));
        }
        if(useSeparationGap)
            Collections.sort(kBestSoFar, new Shapelet.ReverseSeparationGap());
        else
            Collections.sort(kBestSoFar, new Shapelet.ReverseOrder());

        if(kBestSoFar.size() < k) { // no need to return up to k, as there are not k shapelets yet
            return kBestSoFar;
        }
        for(int i = 0; i < k; i++){
            newBestSoFar.add(kBestSoFar.get(i));
        }
        return newBestSoFar;
    }

    /**
     * Method to calculate the class distributions of a dataset. Its main purpose is for computing
     * shapelet qualities.
     *
     * @param data the input data set that the class distributions are to be derived from
     * @return a TreeMap<Double, Integer> in the form of <Class Value, Frequency>
     */
    public static TreeMap<Double, Integer> getClassDistributions(Instances data){
        TreeMap<Double, Integer> classDistribution = new TreeMap<Double, Integer>();
        double classValue;
        for(int i = 0; i < data.numInstances(); i++){
            classValue = data.instance(i).classValue();
            boolean classExists = false;
            for(Double d : classDistribution.keySet()){
                if(d == classValue){
                    int temp = classDistribution.get(d);
                    temp++;
                    classDistribution.put(classValue, temp);
                    classExists = true;
                }
            }
            if(classExists == false){
                classDistribution.put(classValue, 1);
            }
        }
        return classDistribution;
    }
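    /*
     * For illustration (the values are assumptions, not from a real data set): for a two-class
     * problem with 30 instances of class 0.0 and 37 of class 1.0, getClassDistributions would
     * return a TreeMap equivalent to:
     *
     *   {0.0=30, 1.0=37}
     *
     * This map is passed to the quality measures (and quality bounds) when scoring each candidate.
     */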
    /**
     * Protected method to check a candidate shapelet. Functions by passing in the raw data, and
     * returning an assessed Shapelet object.
     *
     * @param candidate the data of the candidate shapelet
     * @param data the entire data set to compare the candidate to
     * @param seriesId series id from the dataset that the candidate came from
     * @param startPos start position in the series where the candidate came from
     * @param classDistribution a TreeMap<Double, Integer> in the form of <Class Value, Frequency> to describe the dataset composition
     * @param qualityBound the quality bound used for candidate pruning (may be null)
     * @return a fully-computed Shapelet, including the quality of this candidate, or null if the candidate was pruned
     */
    protected Shapelet checkCandidate(double[] candidate, Instances data, int seriesId, int startPos, TreeMap classDistribution, QualityBound.ShapeletQualityBound qualityBound){
        // create the orderline by looping through the data set and calculating the subsequence
        // distance from the candidate to all data, inserting in order.
        ArrayList<OrderLineObj> orderline = new ArrayList<OrderLineObj>();
        boolean pruned = false;

        for(int i = 0; i < data.numInstances(); i++){
            //Check if it is possible to prune the candidate
            if(qualityBound != null){
                if(qualityBound.pruneCandidate()){
                    pruned = true;
                    break;
                }
            }

            double distance = 0.0;
            if(i != seriesId){
                distance = subseqDistance(candidate, data.instance(i));
            }
            double classVal = data.instance(i).classValue();

            // without early abandon, it is faster to just add and sort at the end
            orderline.add(new OrderLineObj(distance, classVal));

            //Update qualityBound - presumably each bounding method for different quality measures will have a different update procedure.
            if(qualityBound != null){
                qualityBound.updateOrderLine(orderline.get(orderline.size() - 1));
            }
        }

        // note: early abandon entropy pruning would appear here, but has been omitted
        // in favour of a clear multi-class information gain calculation. Could be added in
        // this method in the future for speed up, but distance early abandon is more important

        //If the shapelet was pruned then it should no longer be considered in further processing
        if(pruned){
            return null;
        }else{
            // create a shapelet object to store all necessary info, i.e. content, source series, start position and quality
            Shapelet shapelet = new Shapelet(candidate, dataSourceIDs[seriesId], startPos, this.qualityMeasure);
            shapelet.calculateQuality(orderline, classDistribution);
            return shapelet;
        }
    }

    /**
     * Compute the information gain of each shapelet attribute in an already-transformed data set.
     *
     * @param trans a transformed set of Instances (one attribute per shapelet plus the class value)
     * @return an array of information gain values, one per shapelet attribute
     */
    public static double[] getInfoGain(Instances trans) {
        double[] quals = new double[trans.numAttributes() - 1];
        TreeMap map = getClassDistributions(trans);
        for(int i = 0; i < quals.length; i++) {
            ArrayList<OrderLineObj> orderline = new ArrayList<OrderLineObj>();
            double[] dists = trans.attributeToDoubleArray(i);
            for(int j = 0; j < dists.length; j++) {
                double distance = dists[j];
                double classVal = trans.instance(j).classValue();
                orderline.add(new OrderLineObj(distance, classVal));
            }
            QualityMeasures.InformationGain ig = new QualityMeasures.InformationGain();
            double qual = ig.calculateQuality(orderline, map);
            quals[i] = qual;
        }
        return quals;
    }

    /**
     * Calculate the distance between a candidate series and an Instance object.
     *
     * @param candidate a double[] representation of a shapelet candidate
     * @param timeSeriesIns an Instance object of a whole time series
     * @return the distance between the candidate and the time series
     */
    protected double subseqDistance(double[] candidate, Instance timeSeriesIns){
        return subsequenceDistance(candidate, timeSeriesIns);
    }

    /**
     * Static version of the subsequence distance between a candidate and an Instance.
     *
     * @param candidate a double[] representation of a shapelet candidate
     * @param timeSeriesIns an Instance object of a whole time series
     * @return the distance between the candidate and the time series
     */
    public static double subsequenceDistance(double[] candidate, Instance timeSeriesIns){
        double[] timeSeries = timeSeriesIns.toDoubleArray();
        return subsequenceDistance(candidate, timeSeries);
    }

    /**
     * Calculate the distance between a shapelet candidate and a full time series (both double[]).
     *
     * @param candidate a double[] representation of a shapelet candidate
     * @param timeSeries a double[] representation of a whole time series (inc. the
     * class value)
     * @return the distance between the candidate and the time series
     */
    public static double subsequenceDistance(double[] candidate, double[] timeSeries){
        double bestSum = Double.MAX_VALUE;
        double sum;
        double[] subseq;

        // for all possible start positions of the candidate within the series
        for(int i = 0; i <= timeSeries.length - candidate.length - 1; i++){
            sum = 0;
            // get the subsequence of the series that is the same length as the candidate
            subseq = new double[candidate.length];
            for(int j = i; j < i + candidate.length; j++){
                subseq[j - i] = timeSeries[j];
                //Keep count of fundamental ops for experiment
                subseqDistOpCount++;
            }
            subseq = zNormalise(subseq, false); // Z-NORM HERE
            //Keep count of fundamental ops for experiment
            subseqDistOpCount += 3 * subseq.length;
            for(int j = 0; j < candidate.length; j++){
                sum += (candidate[j] - subseq[j]) * (candidate[j] - subseq[j]);
                //Keep count of fundamental ops for experiment
                subseqDistOpCount++;
            }
            if(sum < bestSum){
                bestSum = sum;
            }
        }
        return (bestSum == 0.0) ? 0.0 : (1.0 / candidate.length * bestSum);
    }

    /**
     * Instance-level wrapper around the static zNormalise method.
     *
     * @param input the input time series to be z-normalised
     * @param classValOn specify whether the time series includes a class value
     * @return a z-normalised version of input
     */
    protected double[] zNorm(double[] input, boolean classValOn){
        return FullShapeletTransform.zNormalise(input, classValOn);
    }

    /**
     * Z-normalise a time series.
     *
     * @param input the input time series to be z-normalised
     * @param classValOn specify whether the time series includes a class value (e.g. a full instance might, a candidate shapelet wouldn't)
     * @return a z-normalised version of input
     */
    public static double[] zNormalise(double[] input, boolean classValOn){
        double mean;
        double stdv;

        double classValPenalty = 0;
        if(classValOn){
            classValPenalty = 1;
        }
        double[] output = new double[input.length];
        double seriesTotal = 0;

        for(int i = 0; i < input.length - classValPenalty; i++){
            seriesTotal += input[i];
        }

        mean = seriesTotal / (input.length - classValPenalty);
        stdv = 0;
        for(int i = 0; i < input.length - classValPenalty; i++){
            stdv += (input[i] - mean) * (input[i] - mean);
        }

        stdv = stdv / (input.length - classValPenalty);
        if(stdv < ROUNDING_ERROR_CORRECTION){
            stdv = 0.0;
        }else{
            stdv = Math.sqrt(stdv);
        }

        for(int i = 0; i < input.length - classValPenalty; i++){
            if(stdv == 0.0){
                output[i] = 0.0;
            }else{
                output[i] = (input[i] - mean) / stdv;
            }
        }

        if(classValOn == true){
            output[output.length - 1] = input[input.length - 1];
        }

        return output;
    }
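    /*
     * For reference, a sketch of the distance computed by subsequenceDistance above (the notation
     * is informal and only for illustration): with a z-normalised candidate c of length L and a
     * series t (class value excluded),
     *
     *   dist(c, t) = min over valid start positions i of (1/L) * sum_{j=0..L-1} (c[j] - z(t_i)[j])^2
     *
     * where t_i is the length-L subsequence of t starting at position i and z(.) denotes
     * zNormalise with classValOn = false.
     */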
    /**
     * Load a set of Instances from an ARFF.
     *
     * @param fileName the file name of the ARFF
     * @return a set of Instances from the ARFF
     */
    public static Instances loadData(String fileName){
        Instances data = null;
        try{
            FileReader r;
            r = new FileReader(fileName);
            data = new Instances(r);
            data.setClassIndex(data.numAttributes() - 1);
        } catch(Exception e){
            System.out.println(" Error =" + e + " in method loadData");
            e.printStackTrace();
        }
        return data;
    }

    /**
     * A private method to assess the self similarity of two Shapelet objects (i.e. whether they
     * have overlapping indices and are taken from the same time series).
     *
     * @param shapelet the first Shapelet object (in practice, this will be the dominant shapelet with quality >= candidate)
     * @param candidate the second Shapelet object
     * @return true if the two shapelets overlap within the same series
     */
    private static boolean selfSimilarity(Shapelet shapelet, Shapelet candidate){
        if(candidate.seriesId == shapelet.seriesId){
            if(candidate.startPos >= shapelet.startPos && candidate.startPos < shapelet.startPos + shapelet.content.length){ //candidate starts within existing shapelet
                return true;
            }
            if(shapelet.startPos >= candidate.startPos && shapelet.startPos < candidate.startPos + candidate.content.length){
                return true;
            }
        }
        return false;
    }

    /**
     * A method to read in a FullShapeletTransform log file to reproduce a FullShapeletTransform.
     * <p>
     * NOTE: assumes shapelets from the log are Z-NORMALISED
     *
     * @param fileName the name and path of the log file
     * @return a duplicate FullShapeletTransform to the object that created the original log file
     * @throws Exception
     */
    public static FullShapeletTransform createFilterFromFile(String fileName) throws Exception{
        return createFilterFromFile(fileName, Integer.MAX_VALUE);
    }

    /**
     * Returns a list of the lengths of the shapelets found by this transform.
     *
     * @return an ArrayList of Integers representing the lengths of the shapelets
     */
    public ArrayList<Integer> getShapeletLengths() {
        ArrayList<Integer> shapeletLengths = new ArrayList<>();
        if(this.shapeletsTrained) {
            for(Shapelet s : this.shapelets) {
                shapeletLengths.add(s.content.length);
            }
        }
        return shapeletLengths;
    }

    /**
     * A method to read in a FullShapeletTransform log file to reproduce a FullShapeletTransform,
     * reading at most maxShapelets shapelets from the log.
     * <p>
     * NOTE: assumes shapelets from the log are Z-NORMALISED
     *
     * @param fileName the name and path of the log file
     * @param maxShapelets the maximum number of shapelets to read from the log
     * @return a duplicate FullShapeletTransform to the object that created the original log file
     * @throws Exception
     */
    public static FullShapeletTransform createFilterFromFile(String fileName, int maxShapelets) throws Exception{
        File input = new File(fileName);
        Scanner scan = new Scanner(input);
        scan.useDelimiter("\n");

        FullShapeletTransform sf = new FullShapeletTransform();
        ArrayList<Shapelet> shapelets = new ArrayList<Shapelet>();

        String shapeletContentString;
        String shapeletStatsString;
        ArrayList<Double> content;
        double[] contentArray;
        Scanner lineScan;
        Scanner statScan;
        double qualVal;
        int serID;
        int starPos;

        int shapeletCount = 0;
        while(shapeletCount < maxShapelets && scan.hasNext()){
            shapeletStatsString = scan.next();
            shapeletContentString = scan.next();

            //Get the shapelet stats
            statScan = new Scanner(shapeletStatsString);
            statScan.useDelimiter(",");
            qualVal = Double.parseDouble(statScan.next().trim());
            serID = Integer.parseInt(statScan.next().trim());
            starPos = Integer.parseInt(statScan.next().trim());
            //End of shapelet stats

            lineScan = new Scanner(shapeletContentString);
            // System.out.println(shapeletContentString);
            lineScan.useDelimiter(",");
            content = new ArrayList<Double>();
            while(lineScan.hasNext()){
                String next = lineScan.next().trim();
                if(!next.isEmpty()){
                    content.add(Double.parseDouble(next));
                }
            }

            contentArray = new double[content.size()];
            for(int i = 0; i < content.size(); i++){
                contentArray[i] = content.get(i);
            }
            contentArray = zNormalise(contentArray, false);

            Shapelet s = new Shapelet(contentArray, qualVal, serID, starPos);
            shapelets.add(s);
            shapeletCount++;
        }
        sf.shapelets = shapelets;
        sf.shapeletsTrained = true;
        sf.numShapelets = shapelets.size();
        sf.setShapeletMinAndMax(1, 1);

        return sf;
    }
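    /*
     * Illustrative sketch of the log round trip described in the header comment (not part of the
     * filter; "shapelets.txt" and rawTrain/rawTest are assumptions for the example):
     *
     *   FullShapeletTransform st = new FullShapeletTransform(100, 3, 23);
     *   st.setLogOutputFile("shapelets.txt");           // write the shapelet log during the first transform
     *   Instances shapeletTrain = st.process(rawTrain);
     *
     *   // later, rebuild an equivalent filter without repeating the shapelet search
     *   FullShapeletTransform rebuilt = FullShapeletTransform.createFilterFromFile("shapelets.txt");
     *   Instances shapeletTest = rebuilt.process(rawTest);
     */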
    /**
     * Replaces the shapelets used by this filter and marks the filter as trained.
     *
     * @param list the shapelets to use
     */
    public void setShapelets(ArrayList<Shapelet> list) {
        this.shapelets = list;
        this.shapeletsTrained = true;
        this.numShapelets = list.size();
        this.minShapeletLength = 1;
        this.maxShapeletLength = 1;
    }

    /**
     * @return true if shapelets have been found (either trained or loaded)
     */
    public boolean foundShapelets(){
        return shapeletsTrained;
    }

    /**
     * A method to obtain the time taken to find a single best shapelet in the data set.
     *
     * @param data the data set to be processed
     * @param minShapeletLength minimum shapelet length
     * @param maxShapeletLength maximum shapelet length
     * @return time in seconds to find the best shapelet
     * @throws Exception
     */
    public double timingForSingleShapelet(Instances data, int minShapeletLength, int maxShapeletLength) throws Exception {
        data = roundRobinData(data, null);
        long startTime = System.nanoTime();
        findBestKShapeletsCache(1, data, minShapeletLength, maxShapeletLength);
        long finishTime = System.nanoTime();
        return (double)(finishTime - startTime) / 1000000000.0;
    }

    /**
     * A method to obtain the number of fundamental subsequence-distance operations performed while
     * finding a single best shapelet in the data set.
     *
     * @param data the data set to be processed
     * @param minShapeletLength minimum shapelet length
     * @param maxShapeletLength maximum shapelet length
     * @return the operation count for finding the best shapelet
     * @throws Exception
     */
    public long opCountForSingleShapelet(Instances data, int minShapeletLength, int maxShapeletLength) throws Exception {
        data = roundRobinData(data, null);
        subseqDistOpCount = 0;
        findBestKShapeletsCache(1, data, minShapeletLength, maxShapeletLength);
        return subseqDistOpCount;
    }

    /**
     * Outputs the log file to the appropriate location.
     *
     * @throws Exception
     */
    public void outputLog() throws Exception {
        //just in case the file, or its directories, do not exist.
        File file = new File(this.ouputFileLocation);
        file.getParentFile().mkdirs();

        FileWriter out = new FileWriter(this.ouputFileLocation, file.exists());
        for(int i = 0; i < this.shapelets.size(); i++){
            out.append(this.shapelets.get(i).qualityValue + "," + this.shapelets.get(i).seriesId + "," + this.shapelets.get(i).startPos + "\n");
            double[] shapeletContent = this.shapelets.get(i).content;
            for(int j = 0; j < shapeletContent.length; j++){
                out.append(shapeletContent[j] + ",");
            }
            out.append("\n");
        }
        out.close();
    }

    /**
     * Method to reset the shapelet series indices to the values given in sourcePos.
     *
     * @param shapelets the shapelets whose series IDs are to be reset
     * @param sourcePos array of ints giving the original position of each instance
     */
    public static void resetShapeletIndices(ArrayList<Shapelet> shapelets, int[] sourcePos){
        for(Shapelet s : shapelets){
            int pos = s.getSeriesId();
            s.setSeriesID(sourcePos[pos]);
        }
    }

    /**
     * Method to reorder the given Instances (in place) into the order given in sourcePos.
     *
     * @param data Instances to be reordered
     * @param sourcePos array of ints giving the original position of each instance
     */
    public static void resetDataOrder(Instances data, int[] sourcePos){
        if(data.numInstances() != sourcePos.length){ //ERROR
            System.out.println(" ERROR, cannot reorder: the number of instances does not match the length of sourcePos");
            return;
        }
        Instance[] newOrder = new Instance[sourcePos.length];
        for(int i = 0; i < sourcePos.length; i++)
            newOrder[sourcePos[i]] = data.instance(i);
        for(int i = 0; i < data.numInstances(); i++)
            data.set(i, newOrder[i]);
    }

    /**
     * Method to reorder the given Instances in round robin order.
     *
     * @param data Instances to be reordered
     * @param sourcePos array of ints in which the old positions of the instances are recorded
     * @return Instances in round robin order
     */
    public static Instances roundRobinData(Instances data, int[] sourcePos){
        //Count the number of classes
        TreeMap<Double, ArrayList<Instance>> instancesByClass = new TreeMap<Double, ArrayList<Instance>>();
        TreeMap<Double, ArrayList<Integer>> positionsByClass = new TreeMap<Double, ArrayList<Integer>>();

        //Get class distributions
        TreeMap<Double, Integer> classDistribution = FullShapeletTransform.getClassDistributions(data);

        //Allocate arrays for instances of every class
        for(Double key : classDistribution.keySet()){
            int frequency = classDistribution.get(key);
            instancesByClass.put(key, new ArrayList<Instance>(frequency));
            positionsByClass.put(key, new ArrayList<Integer>(frequency));
        }

        //Split data according to their class membership
        for(int i = 0; i < data.numInstances(); i++){
            Instance inst = data.instance(i);
            instancesByClass.get(inst.classValue()).add(inst);
            positionsByClass.get(inst.classValue()).add(i);
        }

        //Merge data into a single list in round robin order
        Instances roundRobinData = new Instances(data, data.numInstances());
        for(int i = 0; i < data.numInstances();){
            //Take one instance of each class in turn
            for(Double key : classDistribution.keySet()){
                ArrayList<Instance> currentList = instancesByClass.get(key);
                ArrayList<Integer> currentPositions = positionsByClass.get(key);

                if(!currentList.isEmpty()){
                    roundRobinData.add(currentList.remove(currentList.size() - 1));
                    if(sourcePos != null && sourcePos.length == data.numInstances()){
                        sourcePos[i] = currentPositions.remove(currentPositions.size() - 1);
                    }
                    i++;
                }
            }
        }

        return roundRobinData;
    }

    //print out all the shapelets
    @Override
    public String toString(){
        String str = "Shapelets: ";
        for(Shapelet s : shapelets)
            str += s.toString() + "\n";
        return str;
    }

    /**
     * An example use of a FullShapeletTransform.
     *
     * @param args command line args; args[0] should specify a set of training instances to transform
     */
    public static void main(String[] args){
        try{
            // mandatory requirements: numShapelets (k), min shapelet length, max shapelet length, input data
            // additional information: log output dir
            // example filter: k = 10, minLength = 10, maxLength = 20, output = exampleOutput.txt
            int k = 10;
            int minLength = 10;
            int maxLength = 20;

            // Instances data = FullShapeletTransform.loadData("ItalyPowerDemand_TRAIN.arff"); // for example
            Instances data = FullShapeletTransform.loadData(args[0]);

            FullShapeletTransform sf = new FullShapeletTransform(k, minLength, maxLength);
            sf.setQualityMeasure(QualityMeasures.ShapeletQualityChoice.INFORMATION_GAIN);
            sf.setLogOutputFile("exampleOutput.txt"); // log file stores shapelet output

            // Note: sf.process returns a transformed set of Instances. The first time that
            // sf.process(data) is called, shapelet extraction occurs. Subsequent calls to process
            // use the previously extracted shapelets to transform the data. For example:
            //
            // Instances transformedTrain = sf.process(trainingData); -> extracts shapelets and transforms the training data
            // Instances transformedTest = sf.process(testData); -> uses shapelets extracted from trainingData to transform testData
            Instances transformed = sf.process(data);
        }catch(Exception e){
            e.printStackTrace();
        }
    }
}
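// Example invocation of the main method above (a sketch; the ARFF path is an assumption taken from
// the commented example in main - any univariate time-series ARFF with the class value as the last
// attribute should work):
//
//   java weka.filters.timeseries.shapelet_transforms.FullShapeletTransform ItalyPowerDemand_TRAIN.arff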