/*
 * copyright: Anthony Bagnall
 *
 * NOTE: As shapelet extraction can be time consuming, there is an option to output shapelets
 * to a text file (default location is the root dir of the project, file name "defaultShapeletOutput.txt").
 *
 * By default recordShapelets is true, so a log file IS produced; unless the file name is changed,
 * each successive filter will overwrite the output (see "setLogOutputFile(String fileName)" to change
 * the file dir and name, or "turnOffLog()" to disable logging).
 *
 * To reconstruct a filter from this output, please see the method "createFilterFromFile(String fileName)".
 */
package weka.filters.timeseries.shapelet_transforms;

import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Scanner;
import java.util.TreeMap;

import weka.core.*;
import weka.core.shapelet.*;
import weka.filters.SimpleBatchFilter;

/**
 * A filter to transform a dataset by k shapelets. Once built on a training set, the
 * filter can be used to transform subsequent datasets using the extracted shapelets.
 * <p>
 * See <a href="http://delivery.acm.org/10.1145/2340000/2339579/p289-lines.pdf?ip=139.222.14.198&acc=ACTIVE%20SERVICE&CFID=221649628&CFTOKEN=31860141&__acm__=1354814450_3dacfa9c5af84445ea2bfd7cc48180c8">
 * Lines, J., Davis, L., Hills, J., Bagnall, A.: A shapelet transform for time series classification.
 * In: Proc. 18th ACM SIGKDD (2012)</a>
 *
 * @author Jason Lines
 */
public class FullShapeletTransform extends SimpleBatchFilter{

    @Override
    public String globalInfo() {
        throw new UnsupportedOperationException("Not supported yet.");
    }

    protected boolean supressOutput = false; // defaults to print to System.out AS WELL as the file; set to true to stop printing to the console
    protected int minShapeletLength;
    protected int maxShapeletLength;
    protected int numShapelets;
    protected boolean shapeletsTrained;
    protected ArrayList<Shapelet> shapelets;
    protected String ouputFileLocation = "defaultShapeletOutput.txt"; // default store location
    protected boolean recordShapelets = true; // default action is to write an output file
    protected boolean roundRobin = false;

    public static int DEFAULT_NUMSHAPELETS = 100;
    public static int DEFAULT_MINSHAPELETLENGTH = 3;
    public static int DEFAULT_MAXSHAPELETLENGTH = 23;

    protected QualityMeasures.ShapeletQualityMeasure qualityMeasure;
    protected QualityMeasures.ShapeletQualityChoice qualityChoice;
    protected boolean useCandidatePruning;
    protected boolean useSeparationGap = false;
    protected boolean useRoundRobin = false;

    public void setUseSeparationGap(boolean b){ useSeparationGap = b; }
    public void setUseRoundRobin(boolean b){ useRoundRobin = b; }

    protected int candidatePruningStartPercentage;
    protected static final double ROUNDING_ERROR_CORRECTION = 0.000000000000001;
    protected int[] dataSourceIDs;

    //Variables for experiments
    private static long subseqDistOpCount;

    /**
     * Default constructor; quality measure defaults to information gain.
     */
    public FullShapeletTransform(){
        this(DEFAULT_NUMSHAPELETS, DEFAULT_MINSHAPELETLENGTH, DEFAULT_MAXSHAPELETLENGTH, QualityMeasures.ShapeletQualityChoice.INFORMATION_GAIN);
    }

    /**
     * Constructor for generating a shapelet transform from an ArrayList of Shapelets.
     *
     * @param shapes the shapelets to use for the transform
     */
    public FullShapeletTransform(ArrayList<Shapelet> shapes) {
        this();
        this.shapelets = shapes;
        this.shapeletsTrained = true;
        this.numShapelets = shapelets.size();
    }

    /**
     * Single parameter constructor; quality measure defaults to information gain.
     *
     * @param k the number of shapelets to be generated
     */
    public FullShapeletTransform(int k){
        this(k, DEFAULT_MINSHAPELETLENGTH, DEFAULT_MAXSHAPELETLENGTH, QualityMeasures.ShapeletQualityChoice.INFORMATION_GAIN);
    }

    /**
     * Full constructor to create a usable filter. Quality measure defaults to information gain.
     *
     * @param k the number of shapelets to be generated
     * @param minShapeletLength minimum length of shapelets
     * @param maxShapeletLength maximum length of shapelets
     */
    public FullShapeletTransform(int k, int minShapeletLength, int maxShapeletLength){
        this(k, minShapeletLength, maxShapeletLength, QualityMeasures.ShapeletQualityChoice.INFORMATION_GAIN);
    }

    /**
     * Full, exhaustive, constructor for a filter. Quality measure is set via enum; an invalid
     * selection defaults to information gain.
     *
     * @param k the number of shapelets to be generated
     * @param minShapeletLength minimum length of shapelets
     * @param maxShapeletLength maximum length of shapelets
     * @param qualityChoice the shapelet quality measure to be used with this filter
     */
    public FullShapeletTransform(int k, int minShapeletLength, int maxShapeletLength, weka.core.shapelet.QualityMeasures.ShapeletQualityChoice qualityChoice){
        this.minShapeletLength = minShapeletLength;
        this.maxShapeletLength = maxShapeletLength;
        this.numShapelets = k;
        this.shapelets = new ArrayList<Shapelet>();
        this.shapeletsTrained = false;
        this.useCandidatePruning = false;
        this.qualityChoice = qualityChoice;

        switch(qualityChoice){
            case F_STAT:
                this.qualityMeasure = new QualityMeasures.FStat();
                break;
            case KRUSKALL_WALLIS:
                this.qualityMeasure = new QualityMeasures.KruskalWallis();
                break;
            case MOODS_MEDIAN:
                this.qualityMeasure = new QualityMeasures.MoodsMedian();
                break;
            default:
                this.qualityMeasure = new QualityMeasures.InformationGain();
        }
    }

    /**
     * Returns the set of shapelets for this transform as an ArrayList.
     *
     * @return an ArrayList of Shapelets representing the shapelets found for this shapelet transform
     */
    public ArrayList<Shapelet> getShapelets() {
        return this.shapelets;
    }

    /**
     * Set the transform to round robin the data or not. This transform defaults round robin to
     * false to keep the instances in the same order as the original data. If round robin is set
     * to true, the transformed data will be reordered, which can make it more difficult to use
     * in an ensemble.
     *
     * @param val true to process the data in round robin order
     */
    public void setRoundRobin(boolean val) {
        this.roundRobin = val;
    }

    /**
     * Suppresses filter output to the console; useful when running timing experiments.
     */
    public void supressOutput(){
        this.supressOutput = true;
    }

    /**
     * Use the candidate pruning technique when checking candidate quality. This speeds up
     * the transform processing time.
     */
    public void useCandidatePruning(){
        this.useCandidatePruning = true;
        this.candidatePruningStartPercentage = 10;
    }

    /**
     * Enable or disable candidate pruning.
     *
     * @param f true to enable candidate pruning
     */
    public void setCandidatePruning(boolean f){
        this.useCandidatePruning = f;
        if(f)
            this.candidatePruningStartPercentage = 10;
        else //Not necessary
            this.candidatePruningStartPercentage = 100;
    }

    /**
     * Use the candidate pruning technique when checking candidate quality. This speeds up
     * the transform processing time.
     *
     * @param percentage the percentage of data to be processed before pruning is initiated.
     * In most cases the higher the percentage the less effective pruning becomes
     */
    public void useCandidatePruning(int percentage){
        this.useCandidatePruning = true;
        this.candidatePruningStartPercentage = percentage;
    }
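    /*
     * Illustrative sketch (not part of the filter itself; the parameter values are assumptions
     * chosen only for the example): constructing the filter with a non-default quality measure,
     * enabling candidate pruning and silencing console output for a timing run.
     *
     *   FullShapeletTransform st =
     *           new FullShapeletTransform(50, 10, 30, QualityMeasures.ShapeletQualityChoice.F_STAT);
     *   st.useCandidatePruning();  // start pruning once 10% of the data has been processed
     *   st.supressOutput();        // print nothing to the console
     */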
    /**
     * Mutator method to set the number of shapelets to be stored by the filter.
     *
     * @param k the number of shapelets to be generated
     */
    public void setNumberOfShapelets(int k){
        this.numShapelets = k;
    }

    /**
     * @return the number of shapelets extracted by the filter
     */
    public int getNumberOfShapelets(){
        return numShapelets;
    }

    /**
     * Mutator method to set the minimum and maximum shapelet lengths for the filter.
     *
     * @param minShapeletLength minimum length of shapelets
     * @param maxShapeletLength maximum length of shapelets
     */
    public void setShapeletMinAndMax(int minShapeletLength, int maxShapeletLength){
        this.minShapeletLength = minShapeletLength;
        this.maxShapeletLength = maxShapeletLength;
    }

    /**
     * Accessor method to return the quality measure used by the filter. As with the constructors,
     * the default selection is information gain unless another valid selection is specified.
     *
     * @return the quality measure choice used by this filter
     */
    public QualityMeasures.ShapeletQualityChoice getQualityMeasure(){
        return qualityChoice;
    }

    /**
     * Mutator method to set the quality measure used by the filter.
     *
     * @param qualityChoice the quality measure to use; an invalid selection defaults to information gain
     */
    public void setQualityMeasure(QualityMeasures.ShapeletQualityChoice qualityChoice){
        this.qualityChoice = qualityChoice;
        switch(qualityChoice){
            case F_STAT:
                this.qualityMeasure = new QualityMeasures.FStat();
                break;
            case KRUSKALL_WALLIS:
                this.qualityMeasure = new QualityMeasures.KruskalWallis();
                break;
            case MOODS_MEDIAN:
                this.qualityMeasure = new QualityMeasures.MoodsMedian();
                break;
            default:
                this.qualityMeasure = new QualityMeasures.InformationGain();
        }
    }

    /**
     * Sets the format of the filtered instances that are output, i.e. one attribute for the
     * distance to each shapelet plus a class value.
     *
     * @param inputFormat the format of the input data
     * @return a new Instances object in the desired output format
     * @throws Exception if all required parameters of the filter are not initialised correctly
     */
    @Override
    protected Instances determineOutputFormat(Instances inputFormat) throws Exception{
        if(this.numShapelets < 1){
            throw new Exception("ShapeletFilter not initialised correctly - please specify a value of k that is greater than or equal to 1");
        }

        //Set up instances size and format.
        //int length = this.numShapelets;
        int length = this.shapelets.size();
        FastVector atts = new FastVector();
        String name;
        for(int i = 0; i < length; i++){
            name = "Shapelet_" + i;
            atts.addElement(new Attribute(name));
        }

        if(inputFormat.classIndex() >= 0){ //Classification set, set class
            //Get the class values as a fast vector
            Attribute target = inputFormat.attribute(inputFormat.classIndex());
            FastVector vals = new FastVector(target.numValues());
            for(int i = 0; i < target.numValues(); i++){
                vals.addElement(target.value(i));
            }
            atts.addElement(new Attribute(inputFormat.attribute(inputFormat.classIndex()).name(), vals));
        }

        Instances result = new Instances("Shapelets" + inputFormat.relationName(), atts, inputFormat.numInstances());
        if(inputFormat.classIndex() >= 0){
            result.setClassIndex(result.numAttributes() - 1);
        }
        return result;
    }

    /**
     * The main logic of the filter; when called for the first time, k shapelets are extracted from
     * the input Instances 'data'. The input 'data' is transformed by the k shapelets, and the
     * filtered data is returned as an output.
     * <p>
     * If called multiple times, shapelet extraction DOES NOT take place again; once k shapelets are
     * established from the initial call to process(), the k shapelets are used to transform
     * subsequent Instances.
     * <p>
     * Intended use:
     * <p>
     * 1. Extract k shapelets from raw training data to build the filter;
     * <p>
     * 2. Use the filter to transform the raw training data into transformed training data;
     * <p>
     * 3. Use the filter to transform the raw testing data into transformed testing data (i.e. the
     * filter never extracts shapelets from the testing data, therefore avoiding bias);
     * <p>
     * 4. Build a classifier using the transformed training data, and perform classification on the
     * transformed test data.
     *
     * @param data the input data to be transformed (and to find the shapelets if this is the first run)
     * @return the transformed representation of data, according to the distances from each instance to each of the k shapelets
     * @throws Exception if the number of shapelets or the length parameters specified are incorrect
     */
    @Override
    public Instances process(Instances data) throws Exception{
        if(this.numShapelets < 1){
            throw new Exception("Number of shapelets initialised incorrectly - please select a value of k greater than or equal to 1 (Usage: setNumberOfShapelets)");
        }

        int maxPossibleLength = data.instance(0).numAttributes() - 1;
        if(data.classIndex() < 0) {
            throw new Exception("Require that the class be set for the ShapeletTransform");
        }

        if(this.minShapeletLength < 1 || this.maxShapeletLength < 1 || this.maxShapeletLength < this.minShapeletLength || this.maxShapeletLength > maxPossibleLength){
            throw new Exception("Shapelet length parameters initialised incorrectly");
        }

        if(this.shapeletsTrained == false){ // shapelet discovery has not yet been carried out, so this must be training data
            dataSourceIDs = new int[data.numInstances()];
            if(roundRobin){
                //Reorder the data in round robin order
                data = roundRobinData(data, dataSourceIDs);
            }else{
                for(int i = 0; i < data.numInstances(); i++)
                    dataSourceIDs[i] = i;
            }

            this.shapelets = findBestKShapeletsCache(this.numShapelets, data, this.minShapeletLength, this.maxShapeletLength); // get k shapelets ATTENTION
            this.shapeletsTrained = true;
            if(!supressOutput){
                System.out.println(shapelets.size() + " Shapelets have been generated");
            }

            //Reorder the training data and reset the shapelet indexes
            if(roundRobin){
                resetDataOrder(data, dataSourceIDs);
                resetShapeletIndices(shapelets, dataSourceIDs);
            }
        }

        Instances output = determineOutputFormat(data);

        // for each data instance, get the distance to each shapelet and create a new instance
        for(int i = 0; i < data.numInstances(); i++){ // for each data
            Instance toAdd = new DenseInstance(this.shapelets.size() + 1);
            int shapeletNum = 0;
            for(Shapelet s : this.shapelets){
                double dist = subseqDistance(s.content, data.instance(i));
                toAdd.setValue(shapeletNum++, dist);
            }
            toAdd.setValue(this.shapelets.size(), data.instance(i).classValue());
            output.add(toAdd);
        }
        return output;
    }

    /**
     * Set the file path for the filter log. The filter log includes the shapelet quality, seriesId,
     * startPosition, and content for each shapelet.
     *
     * @param fileName the updated file path of the filter log
     */
    public void setLogOutputFile(String fileName){
        this.recordShapelets = true;
        this.ouputFileLocation = fileName;
    }

    /**
     * Turns off log saving; useful for timing experiments where speed is essential.
     */
    public void turnOffLog(){
        this.recordShapelets = false;
    }
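    /*
     * Illustrative sketch of the intended workflow described in the process() Javadoc above
     * (not part of the filter; rawTrain and rawTest are assumed to be Instances loaded elsewhere,
     * e.g. via loadData()):
     *
     *   FullShapeletTransform st = new FullShapeletTransform(100, 3, 23);
     *   Instances shapeletTrain = st.process(rawTrain); // first call: extracts shapelets, then transforms
     *   Instances shapeletTest  = st.process(rawTest);  // later calls: reuse the shapelets found on rawTrain
     *   // build a classifier on shapeletTrain, evaluate on shapeletTest
     */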
    /**
     * Protected method for extracting k shapelets.
     *
     * @param numShapelets the target number of shapelets to generate
     * @param data the data that the shapelets will be taken from
     * @param minShapeletLength the minimum length of possible shapelets
     * @param maxShapeletLength the maximum length of possible shapelets
     * @return an ArrayList of Shapelet objects in order of their fitness (by infoGain, separationGap, then shortest length)
     * @throws Exception
     */
    public ArrayList<Shapelet> findBestKShapeletsCache(int numShapelets, Instances data, int minShapeletLength, int maxShapeletLength) throws Exception{
        ArrayList<Shapelet> kShapelets = new ArrayList<Shapelet>(); // store (up to) the best k shapelets overall
        ArrayList<Shapelet> seriesShapelets; // temp store of all shapelets for each time series

        /* new version to allow caching:
         * - for all time series, calculate the gain of all candidates of all possible lengths
         * - insert into a structure in order of fitness - arraylist with comparable implementation of shapelets
         * - once all candidates for a series are established, integrate into store of k best
         */
        TreeMap<Double, Integer> classDistributions = getClassDistributions(data); // used to calc info gain

        //for all time series
        if(!supressOutput){
            System.out.println("Processing data: ");
        }

        int numInstances = data.numInstances();
        for(int i = 0; i < numInstances; i++){
            if(!supressOutput){
                System.out.println("Currently processing instance " + (i + 1) + " of " + numInstances);
            }

            //get our time series as a double array.
            double[] wholeCandidate = data.instance(i).toDoubleArray();
            seriesShapelets = new ArrayList<Shapelet>();

            for(int length = minShapeletLength; length <= maxShapeletLength; length++){
                //for all possible starting positions of that length
                for(int start = 0; start <= wholeCandidate.length - length - 1; start++){ //-1 = avoid classVal - handle later for series with no class val
                    // CANDIDATE ESTABLISHED - got original series, length and starting position
                    // extract relevant part into a double[] for processing
                    double[] candidate = new double[length];
                    for(int m = start; m < start + length; m++){
                        candidate[m - start] = wholeCandidate[m];
                    }

                    // znorm candidate here so it's only done once, rather than in each distance calculation
                    candidate = zNorm(candidate, false);

                    //Initialize bounding algorithm for current candidate
                    QualityBound.ShapeletQualityBound qualityBound = initializeQualityBound(classDistributions);

                    //Set bound of the bounding algorithm
                    if(qualityBound != null && kShapelets.size() == numShapelets){
                        qualityBound.setBsfQuality(kShapelets.get(numShapelets - 1).qualityValue);
                    }

                    Shapelet candidateShapelet = checkCandidate(candidate, data, i, start, classDistributions, qualityBound);

                    //If the shapelet was pruned then null will be returned, so check for that
                    if(candidateShapelet != null){
                        seriesShapelets.add(candidateShapelet);
                    }
                }
            }

            // now that we have all shapelets, self similarity can be fairly assessed without fear of
            // removing potentially good shapelets
            if(useSeparationGap)
                Collections.sort(seriesShapelets, new Shapelet.ReverseSeparationGap());
            else
                Collections.sort(seriesShapelets, new Shapelet.ReverseOrder());
            seriesShapelets = removeSelfSimilar(seriesShapelets);
            kShapelets = combine(numShapelets, kShapelets, seriesShapelets);
        }

        this.numShapelets = kShapelets.size();

        if(this.recordShapelets){
            //just in case the file, or its directories, do not exist.
            File file = new File(this.ouputFileLocation);
            file.getParentFile().mkdirs();

            FileWriter out = new FileWriter(file);
            for(int i = 0; i < kShapelets.size(); i++){
                out.append(kShapelets.get(i).qualityValue + "," + kShapelets.get(i).seriesId + "," + kShapelets.get(i).startPos + "\n");
                double[] shapeletContent = kShapelets.get(i).content;
                for(int j = 0; j < shapeletContent.length; j++){
                    out.append(shapeletContent[j] + ",");
                }
                out.append("\n");
            }
            out.close();
        }

        if(!supressOutput){
            System.out.println();
            System.out.println("Output Shapelets:");
            System.out.println("-------------------");
            System.out.println("informationGain,seriesId,startPos");
            System.out.println("<shapelet>");
            System.out.println("-------------------");
            System.out.println();
            for(int i = 0; i < kShapelets.size(); i++){
                System.out.println(kShapelets.get(i).qualityValue + "," + kShapelets.get(i).seriesId + "," + kShapelets.get(i).startPos);
                double[] shapeletContent = kShapelets.get(i).content;
                for(int j = 0; j < shapeletContent.length; j++){
                    System.out.print(shapeletContent[j] + ",");
                }
                System.out.println();
            }
        }

        return kShapelets;
    }

    /**
     * Initialise the quality bound used for candidate pruning, based on the quality measure in use.
     *
     * @param classDist the class distribution of the full data set
     * @return a quality bound for the current quality measure, or null if candidate pruning is disabled
     */
    protected QualityBound.ShapeletQualityBound initializeQualityBound(TreeMap<Double, Integer> classDist){
        if(useCandidatePruning){
            if(qualityMeasure instanceof QualityMeasures.InformationGain){
                return new QualityBound.InformationGainBound(classDist, candidatePruningStartPercentage);
            }else if(qualityMeasure instanceof QualityMeasures.MoodsMedian){
                return new QualityBound.MoodsMedianBound(classDist, candidatePruningStartPercentage);
            }else if(qualityMeasure instanceof QualityMeasures.FStat){
                return new QualityBound.FStatBound(classDist, candidatePruningStartPercentage);
            }else if(qualityMeasure instanceof QualityMeasures.KruskalWallis){
                return new QualityBound.KruskalWallisBound(classDist, candidatePruningStartPercentage);
            }
        }
        return null;
    }

    /**
     * Protected method to remove self-similar shapelets from an ArrayList (i.e. shapelets that come
     * from the same series and have overlapping indices).
     *
     * @param shapelets the input Shapelets to remove self-similar Shapelet objects from
     * @return a copy of the input ArrayList with self-similar shapelets removed
     */
    protected static ArrayList<Shapelet> removeSelfSimilar(ArrayList<Shapelet> shapelets){
        // return a new pruned array list - more efficient than removing
        // self-similar entries on the fly and constantly reindexing
        ArrayList<Shapelet> outputShapelets = new ArrayList<Shapelet>();
        boolean[] selfSimilar = new boolean[shapelets.size()]; // to keep track of self similarity - assume nothing is similar to begin with

        //TODO remove. Aaron: all values in a boolean array default to false. Don't need this loop.
        for(int i = 0; i < shapelets.size(); i++){
            selfSimilar[i] = false;
        }

        //TODO: Aaron: tidy up logic.
        for(int i = 0; i < shapelets.size(); i++){
            if(selfSimilar[i] == false){
                outputShapelets.add(shapelets.get(i));
                for(int j = i + 1; j < shapelets.size(); j++){
                    if(selfSimilar[j] == false && selfSimilarity(shapelets.get(i), shapelets.get(j))){ // no point recalc'ing if already self similar to something
                        selfSimilar[j] = true;
                    }
                }
            }
        }
        return outputShapelets;
    }

    /**
     * Protected method to combine two ArrayList collections of Shapelet objects.
     *
     * @param k the maximum number of shapelets to be returned after combining the two lists
     * @param kBestSoFar the (up to) k best shapelets that have been observed so far, passed in to combine with shapelets from a new series
     * @param timeSeriesShapelets the shapelets taken from a new series that are to be merged in descending order of fitness with the kBestSoFar
     * @return an ordered ArrayList of the best k (or fewer) Shapelet objects from the union of the input ArrayLists
     */
    //NOTE: could be more efficient here
    protected ArrayList<Shapelet> combine(int k, ArrayList<Shapelet> kBestSoFar, ArrayList<Shapelet> timeSeriesShapelets){
        ArrayList<Shapelet> newBestSoFar = new ArrayList<Shapelet>();
        for(int i = 0; i < timeSeriesShapelets.size(); i++){
            kBestSoFar.add(timeSeriesShapelets.get(i));
        }
        if(useSeparationGap)
            Collections.sort(kBestSoFar, new Shapelet.ReverseSeparationGap());
        else
            Collections.sort(kBestSoFar, new Shapelet.ReverseOrder());

        if(kBestSoFar.size() < k) { // no need to return up to k, as there are not k shapelets yet
            return kBestSoFar;
        }
        for(int i = 0; i < k; i++){
            newBestSoFar.add(kBestSoFar.get(i));
        }
        return newBestSoFar;
    }

    /**
     * Method to calculate the class distributions of a dataset. Its main purpose is for computing
     * shapelet qualities.
     *
     * @param data the input data set that the class distributions are to be derived from
     * @return a TreeMap<Double, Integer> in the form of <Class Value, Frequency>
     */
    public static TreeMap<Double, Integer> getClassDistributions(Instances data){
        TreeMap<Double, Integer> classDistribution = new TreeMap<Double, Integer>();
        double classValue;
        for(int i = 0; i < data.numInstances(); i++){
            classValue = data.instance(i).classValue();
            boolean classExists = false;
            for(Double d : classDistribution.keySet()){
                if(d == classValue){
                    int temp = classDistribution.get(d);
                    temp++;
                    classDistribution.put(classValue, temp);
                    classExists = true;
                }
            }
            if(classExists == false){
                classDistribution.put(classValue, 1);
            }
        }
        return classDistribution;
    }
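    /*
     * For illustration (the values are assumptions, not from a real data set): for a two-class
     * problem with 30 instances of class 0.0 and 37 of class 1.0, getClassDistributions would
     * return a TreeMap equivalent to:
     *
     *   {0.0=30, 1.0=37}
     *
     * This map is passed to the quality measures (and quality bounds) when scoring each candidate.
     */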
    /**
     * Protected method to check a candidate shapelet. Functions by passing in the raw data, and
     * returning an assessed Shapelet object.
     *
     * @param candidate the data of the candidate shapelet
     * @param data the entire data set to compare the candidate to
     * @param seriesId series id from the dataset that the candidate came from
     * @param startPos start position in the series where the candidate came from
     * @param classDistribution a TreeMap<Double, Integer> in the form of <Class Value, Frequency> to describe the dataset composition
     * @param qualityBound the quality bound used for candidate pruning (may be null)
     * @return a fully-computed Shapelet, including the quality of this candidate, or null if the candidate was pruned
     */
    protected Shapelet checkCandidate(double[] candidate, Instances data, int seriesId, int startPos, TreeMap classDistribution, QualityBound.ShapeletQualityBound qualityBound){
        // create the orderline by looping through the data set and calculating the subsequence
        // distance from the candidate to all data, inserting in order.
        ArrayList<OrderLineObj> orderline = new ArrayList<OrderLineObj>();
        boolean pruned = false;

        for(int i = 0; i < data.numInstances(); i++){
            //Check if it is possible to prune the candidate
            if(qualityBound != null){
                if(qualityBound.pruneCandidate()){
                    pruned = true;
                    break;
                }
            }

            double distance = 0.0;
            if(i != seriesId){
                distance = subseqDistance(candidate, data.instance(i));
            }
            double classVal = data.instance(i).classValue();

            // without early abandon, it is faster to just add and sort at the end
            orderline.add(new OrderLineObj(distance, classVal));

            //Update qualityBound - presumably each bounding method for different quality measures will have a different update procedure.
            if(qualityBound != null){
                qualityBound.updateOrderLine(orderline.get(orderline.size() - 1));
            }
        }

        // note: early abandon entropy pruning would appear here, but has been omitted
        // in favour of a clear multi-class information gain calculation. Could be added in
        // this method in the future for speed up, but distance early abandon is more important

        //If the shapelet was pruned then it should no longer be considered in further processing
        if(pruned){
            return null;
        }else{
            // create a shapelet object to store all necessary info, i.e. content, source series, start position and quality
            Shapelet shapelet = new Shapelet(candidate, dataSourceIDs[seriesId], startPos, this.qualityMeasure);
            shapelet.calculateQuality(orderline, classDistribution);
            return shapelet;
        }
    }

    /**
     * Compute the information gain of each shapelet attribute in an already-transformed data set.
     *
     * @param trans a transformed set of Instances (one attribute per shapelet plus the class value)
     * @return an array of information gain values, one per shapelet attribute
     */
    public static double[] getInfoGain(Instances trans) {
        double[] quals = new double[trans.numAttributes() - 1];
        TreeMap map = getClassDistributions(trans);
        for(int i = 0; i < quals.length; i++) {
            ArrayList<OrderLineObj> orderline = new ArrayList<OrderLineObj>();
            double[] dists = trans.attributeToDoubleArray(i);
            for(int j = 0; j < dists.length; j++) {
                double distance = dists[j];
                double classVal = trans.instance(j).classValue();
                orderline.add(new OrderLineObj(distance, classVal));
            }
            QualityMeasures.InformationGain ig = new QualityMeasures.InformationGain();
            double qual = ig.calculateQuality(orderline, map);
            quals[i] = qual;
        }
        return quals;
    }

    /**
     * Calculate the distance between a candidate series and an Instance object.
     *
     * @param candidate a double[] representation of a shapelet candidate
     * @param timeSeriesIns an Instance object of a whole time series
     * @return the distance between the candidate and the time series
     */
    protected double subseqDistance(double[] candidate, Instance timeSeriesIns){
        return subsequenceDistance(candidate, timeSeriesIns);
    }

    /**
     * Static version of the subsequence distance between a candidate and an Instance.
     *
     * @param candidate a double[] representation of a shapelet candidate
     * @param timeSeriesIns an Instance object of a whole time series
     * @return the distance between the candidate and the time series
     */
    public static double subsequenceDistance(double[] candidate, Instance timeSeriesIns){
        double[] timeSeries = timeSeriesIns.toDoubleArray();
        return subsequenceDistance(candidate, timeSeries);
    }

    /**
     * Calculate the distance between a shapelet candidate and a full time series (both double[]).
     *
     * @param candidate a double[] representation of a shapelet candidate
     * @param timeSeries a double[] representation of a whole time series (inc. the
     * class value)
     * @return the distance between the candidate and the time series
     */
    public static double subsequenceDistance(double[] candidate, double[] timeSeries){
        double bestSum = Double.MAX_VALUE;
        double sum;
        double[] subseq;

        // for all possible start positions of the candidate within the series
        for(int i = 0; i <= timeSeries.length - candidate.length - 1; i++){
            sum = 0;
            // get the subsequence of the series that is the same length as the candidate
            subseq = new double[candidate.length];
            for(int j = i; j < i + candidate.length; j++){
                subseq[j - i] = timeSeries[j];
                //Keep count of fundamental ops for experiment
                subseqDistOpCount++;
            }
            subseq = zNormalise(subseq, false); // Z-NORM HERE
            //Keep count of fundamental ops for experiment
            subseqDistOpCount += 3 * subseq.length;
            for(int j = 0; j < candidate.length; j++){
                sum += (candidate[j] - subseq[j]) * (candidate[j] - subseq[j]);
                //Keep count of fundamental ops for experiment
                subseqDistOpCount++;
            }
            if(sum < bestSum){
                bestSum = sum;
            }
        }
        return (bestSum == 0.0) ? 0.0 : (1.0 / candidate.length * bestSum);
    }

    /**
     * Instance-level wrapper around the static zNormalise method.
     *
     * @param input the input time series to be z-normalised
     * @param classValOn specify whether the time series includes a class value
     * @return a z-normalised version of input
     */
    protected double[] zNorm(double[] input, boolean classValOn){
        return FullShapeletTransform.zNormalise(input, classValOn);
    }

    /**
     * Z-normalise a time series.
     *
     * @param input the input time series to be z-normalised
     * @param classValOn specify whether the time series includes a class value (e.g. a full instance might, a candidate shapelet wouldn't)
     * @return a z-normalised version of input
     */
    public static double[] zNormalise(double[] input, boolean classValOn){
        double mean;
        double stdv;

        double classValPenalty = 0;
        if(classValOn){
            classValPenalty = 1;
        }
        double[] output = new double[input.length];
        double seriesTotal = 0;

        for(int i = 0; i < input.length - classValPenalty; i++){
            seriesTotal += input[i];
        }

        mean = seriesTotal / (input.length - classValPenalty);
        stdv = 0;
        for(int i = 0; i < input.length - classValPenalty; i++){
            stdv += (input[i] - mean) * (input[i] - mean);
        }

        stdv = stdv / (input.length - classValPenalty);
        if(stdv < ROUNDING_ERROR_CORRECTION){
            stdv = 0.0;
        }else{
            stdv = Math.sqrt(stdv);
        }

        for(int i = 0; i < input.length - classValPenalty; i++){
            if(stdv == 0.0){
                output[i] = 0.0;
            }else{
                output[i] = (input[i] - mean) / stdv;
            }
        }

        if(classValOn == true){
            output[output.length - 1] = input[input.length - 1];
        }

        return output;
    }
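    /*
     * For reference, a sketch of the distance computed by subsequenceDistance above (the notation
     * is informal and only for illustration): with a z-normalised candidate c of length L and a
     * series t (class value excluded),
     *
     *   dist(c, t) = min over valid start positions i of (1/L) * sum_{j=0..L-1} (c[j] - z(t_i)[j])^2
     *
     * where t_i is the length-L subsequence of t starting at position i and z(.) denotes
     * zNormalise with classValOn = false.
     */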
    /**
     * Load a set of Instances from an ARFF.
     *
     * @param fileName the file name of the ARFF
     * @return a set of Instances from the ARFF
     */
    public static Instances loadData(String fileName){
        Instances data = null;
        try{
            FileReader r;
            r = new FileReader(fileName);
            data = new Instances(r);
            data.setClassIndex(data.numAttributes() - 1);
        } catch(Exception e){
            System.out.println(" Error =" + e + " in method loadData");
            e.printStackTrace();
        }
        return data;
    }

    /**
     * A private method to assess the self similarity of two Shapelet objects (i.e. whether they
     * have overlapping indices and are taken from the same time series).
     *
     * @param shapelet the first Shapelet object (in practice, this will be the dominant shapelet with quality >= candidate)
     * @param candidate the second Shapelet object
     * @return true if the two shapelets overlap within the same series
     */
    private static boolean selfSimilarity(Shapelet shapelet, Shapelet candidate){
        if(candidate.seriesId == shapelet.seriesId){
            if(candidate.startPos >= shapelet.startPos && candidate.startPos < shapelet.startPos + shapelet.content.length){ //candidate starts within existing shapelet
                return true;
            }
            if(shapelet.startPos >= candidate.startPos && shapelet.startPos < candidate.startPos + candidate.content.length){
                return true;
            }
        }
        return false;
    }

    /**
     * A method to read in a FullShapeletTransform log file to reproduce a FullShapeletTransform.
     * <p>
     * NOTE: assumes shapelets from the log are Z-NORMALISED
     *
     * @param fileName the name and path of the log file
     * @return a duplicate FullShapeletTransform to the object that created the original log file
     * @throws Exception
     */
    public static FullShapeletTransform createFilterFromFile(String fileName) throws Exception{
        return createFilterFromFile(fileName, Integer.MAX_VALUE);
    }

    /**
     * Returns a list of the lengths of the shapelets found by this transform.
     *
     * @return an ArrayList of Integers representing the lengths of the shapelets
     */
    public ArrayList<Integer> getShapeletLengths() {
        ArrayList<Integer> shapeletLengths = new ArrayList<>();
        if(this.shapeletsTrained) {
            for(Shapelet s : this.shapelets) {
                shapeletLengths.add(s.content.length);
            }
        }
        return shapeletLengths;
    }

    /**
     * A method to read in a FullShapeletTransform log file to reproduce a FullShapeletTransform,
     * reading at most maxShapelets shapelets from the log.
     * <p>
     * NOTE: assumes shapelets from the log are Z-NORMALISED
     *
     * @param fileName the name and path of the log file
     * @param maxShapelets the maximum number of shapelets to read from the log
     * @return a duplicate FullShapeletTransform to the object that created the original log file
     * @throws Exception
     */
    public static FullShapeletTransform createFilterFromFile(String fileName, int maxShapelets) throws Exception{
        File input = new File(fileName);
        Scanner scan = new Scanner(input);
        scan.useDelimiter("\n");

        FullShapeletTransform sf = new FullShapeletTransform();
        ArrayList<Shapelet> shapelets = new ArrayList<Shapelet>();

        String shapeletContentString;
        String shapeletStatsString;
        ArrayList<Double> content;
        double[] contentArray;
        Scanner lineScan;
        Scanner statScan;
        double qualVal;
        int serID;
        int starPos;

        int shapeletCount = 0;
        while(shapeletCount < maxShapelets && scan.hasNext()){
            shapeletStatsString = scan.next();
            shapeletContentString = scan.next();

            //Get the shapelet stats
            statScan = new Scanner(shapeletStatsString);
            statScan.useDelimiter(",");
            qualVal = Double.parseDouble(statScan.next().trim());
            serID = Integer.parseInt(statScan.next().trim());
            starPos = Integer.parseInt(statScan.next().trim());
            //End of shapelet stats

            lineScan = new Scanner(shapeletContentString);
            // System.out.println(shapeletContentString);
            lineScan.useDelimiter(",");
            content = new ArrayList<Double>();
            while(lineScan.hasNext()){
                String next = lineScan.next().trim();
                if(!next.isEmpty()){
                    content.add(Double.parseDouble(next));
                }
            }

            contentArray = new double[content.size()];
            for(int i = 0; i < content.size(); i++){
                contentArray[i] = content.get(i);
            }
            contentArray = zNormalise(contentArray, false);

            Shapelet s = new Shapelet(contentArray, qualVal, serID, starPos);
            shapelets.add(s);
            shapeletCount++;
        }
        sf.shapelets = shapelets;
        sf.shapeletsTrained = true;
        sf.numShapelets = shapelets.size();
        sf.setShapeletMinAndMax(1, 1);

        return sf;
    }
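    /*
     * Illustrative sketch of the log round trip described in the header comment (not part of the
     * filter; "shapelets.txt" and rawTrain/rawTest are assumptions for the example):
     *
     *   FullShapeletTransform st = new FullShapeletTransform(100, 3, 23);
     *   st.setLogOutputFile("shapelets.txt");           // write the shapelet log during the first transform
     *   Instances shapeletTrain = st.process(rawTrain);
     *
     *   // later, rebuild an equivalent filter without repeating the shapelet search
     *   FullShapeletTransform rebuilt = FullShapeletTransform.createFilterFromFile("shapelets.txt");
     *   Instances shapeletTest = rebuilt.process(rawTest);
     */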
    /**
     * Replaces the shapelets used by this filter and marks the filter as trained.
     *
     * @param list the shapelets to use
     */
    public void setShapelets(ArrayList<Shapelet> list) {
        this.shapelets = list;
        this.shapeletsTrained = true;
        this.numShapelets = list.size();
        this.minShapeletLength = 1;
        this.maxShapeletLength = 1;
    }

    /**
     * @return true if shapelets have been found (either trained or loaded)
     */
    public boolean foundShapelets(){
        return shapeletsTrained;
    }

    /**
     * A method to obtain the time taken to find a single best shapelet in the data set.
     *
     * @param data the data set to be processed
     * @param minShapeletLength minimum shapelet length
     * @param maxShapeletLength maximum shapelet length
     * @return time in seconds to find the best shapelet
     * @throws Exception
     */
    public double timingForSingleShapelet(Instances data, int minShapeletLength, int maxShapeletLength) throws Exception {
        data = roundRobinData(data, null);
        long startTime = System.nanoTime();
        findBestKShapeletsCache(1, data, minShapeletLength, maxShapeletLength);
        long finishTime = System.nanoTime();
        return (double)(finishTime - startTime) / 1000000000.0;
    }

    /**
     * A method to obtain the number of fundamental subsequence-distance operations performed while
     * finding a single best shapelet in the data set.
     *
     * @param data the data set to be processed
     * @param minShapeletLength minimum shapelet length
     * @param maxShapeletLength maximum shapelet length
     * @return the operation count for finding the best shapelet
     * @throws Exception
     */
    public long opCountForSingleShapelet(Instances data, int minShapeletLength, int maxShapeletLength) throws Exception {
        data = roundRobinData(data, null);
        subseqDistOpCount = 0;
        findBestKShapeletsCache(1, data, minShapeletLength, maxShapeletLength);
        return subseqDistOpCount;
    }

    /**
     * Outputs the log file to the appropriate location.
     *
     * @throws Exception
     */
    public void outputLog() throws Exception {
        //just in case the file, or its directories, do not exist.
        File file = new File(this.ouputFileLocation);
        file.getParentFile().mkdirs();

        FileWriter out = new FileWriter(this.ouputFileLocation, file.exists());
        for(int i = 0; i < this.shapelets.size(); i++){
            out.append(this.shapelets.get(i).qualityValue + "," + this.shapelets.get(i).seriesId + "," + this.shapelets.get(i).startPos + "\n");
            double[] shapeletContent = this.shapelets.get(i).content;
            for(int j = 0; j < shapeletContent.length; j++){
                out.append(shapeletContent[j] + ",");
            }
            out.append("\n");
        }
        out.close();
    }

    /**
     * Method to reset the shapelet series indices to the values given in sourcePos.
     *
     * @param shapelets the shapelets whose series IDs are to be reset
     * @param sourcePos array of ints giving the original position of each instance
     */
    public static void resetShapeletIndices(ArrayList<Shapelet> shapelets, int[] sourcePos){
        for(Shapelet s : shapelets){
            int pos = s.getSeriesId();
            s.setSeriesID(sourcePos[pos]);
        }
    }

    /**
     * Method to reorder the given Instances (in place) into the order given in sourcePos.
     *
     * @param data Instances to be reordered
     * @param sourcePos array of ints giving the original position of each instance
     */
    public static void resetDataOrder(Instances data, int[] sourcePos){
        if(data.numInstances() != sourcePos.length){ //ERROR
            System.out.println(" ERROR, cannot reorder: the number of instances does not match the length of sourcePos");
            return;
        }
        Instance[] newOrder = new Instance[sourcePos.length];
        for(int i = 0; i < sourcePos.length; i++)
            newOrder[sourcePos[i]] = data.instance(i);
        for(int i = 0; i < data.numInstances(); i++)
            data.set(i, newOrder[i]);
    }

    /**
     * Method to reorder the given Instances in round robin order.
     *
     * @param data Instances to be reordered
     * @param sourcePos array of ints in which the old positions of the instances are recorded
     * @return Instances in round robin order
     */
    public static Instances roundRobinData(Instances data, int[] sourcePos){
        //Count the number of classes
        TreeMap<Double, ArrayList<Instance>> instancesByClass = new TreeMap<Double, ArrayList<Instance>>();
        TreeMap<Double, ArrayList<Integer>> positionsByClass = new TreeMap<Double, ArrayList<Integer>>();

        //Get class distributions
        TreeMap<Double, Integer> classDistribution = FullShapeletTransform.getClassDistributions(data);

        //Allocate arrays for instances of every class
        for(Double key : classDistribution.keySet()){
            int frequency = classDistribution.get(key);
            instancesByClass.put(key, new ArrayList<Instance>(frequency));
            positionsByClass.put(key, new ArrayList<Integer>(frequency));
        }

        //Split data according to their class membership
        for(int i = 0; i < data.numInstances(); i++){
            Instance inst = data.instance(i);
            instancesByClass.get(inst.classValue()).add(inst);
            positionsByClass.get(inst.classValue()).add(i);
        }

        //Merge data into a single list in round robin order
        Instances roundRobinData = new Instances(data, data.numInstances());
        for(int i = 0; i < data.numInstances();){
            //Take one instance of each class in turn
            for(Double key : classDistribution.keySet()){
                ArrayList<Instance> currentList = instancesByClass.get(key);
                ArrayList<Integer> currentPositions = positionsByClass.get(key);

                if(!currentList.isEmpty()){
                    roundRobinData.add(currentList.remove(currentList.size() - 1));
                    if(sourcePos != null && sourcePos.length == data.numInstances()){
                        sourcePos[i] = currentPositions.remove(currentPositions.size() - 1);
                    }
                    i++;
                }
            }
        }

        return roundRobinData;
    }

    //print out all the shapelets
    @Override
    public String toString(){
        String str = "Shapelets: ";
        for(Shapelet s : shapelets)
            str += s.toString() + "\n";
        return str;
    }

    /**
     * An example use of a FullShapeletTransform.
     *
     * @param args command line args; args[0] should specify a set of training instances to transform
     */
    public static void main(String[] args){
        try{
            // mandatory requirements: numShapelets (k), min shapelet length, max shapelet length, input data
            // additional information: log output dir
            // example filter: k = 10, minLength = 10, maxLength = 20, output = exampleOutput.txt
            int k = 10;
            int minLength = 10;
            int maxLength = 20;

            // Instances data = FullShapeletTransform.loadData("ItalyPowerDemand_TRAIN.arff"); // for example
            Instances data = FullShapeletTransform.loadData(args[0]);

            FullShapeletTransform sf = new FullShapeletTransform(k, minLength, maxLength);
            sf.setQualityMeasure(QualityMeasures.ShapeletQualityChoice.INFORMATION_GAIN);
            sf.setLogOutputFile("exampleOutput.txt"); // log file stores shapelet output

            // Note: sf.process returns a transformed set of Instances. The first time that
            // sf.process(data) is called, shapelet extraction occurs. Subsequent calls to process
            // use the previously extracted shapelets to transform the data. For example:
            //
            // Instances transformedTrain = sf.process(trainingData); -> extracts shapelets and transforms the training data
            // Instances transformedTest = sf.process(testData); -> uses shapelets extracted from trainingData to transform testData
            Instances transformed = sf.process(data);
        }catch(Exception e){
            e.printStackTrace();
        }
    }
}
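// Example invocation of the main method above (a sketch; the ARFF path is an assumption taken from
// the commented example in main - any univariate time-series ARFF with the class value as the last
// attribute should work):
//
//   java weka.filters.timeseries.shapelet_transforms.FullShapeletTransform ItalyPowerDemand_TRAIN.arff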