package weka.filters.timeseries.shapelet_transforms;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Random;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.shapelet.OrderLineObj;
import weka.core.shapelet.QualityBound;
import weka.core.shapelet.QualityMeasures;
import weka.core.shapelet.Shapelet;
/**
* An optimised filter to transform a dataset by k shapelets.
*
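* This filter uses the early-abandoning distance calculation described in the
* "Trillions" KDD 2012 paper by Rakthanmanon et al. (Eamonn Keogh's group),
* "Searching and Mining Trillions of Time Series Subsequences under Dynamic
* Time Warping".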
*
* @author Edgaras Baranauskas
*/
public class ShapeletTransform2 extends FullShapeletTransform2
{
private static long subseqDistOpCount;
/**
* Default constructor. Delegates entirely to the superclass default
* constructor; according to the original documentation the quality measure
* defaults to information gain (superclass behaviour, not visible in this
* file).
*/
public ShapeletTransform2()
{
super();
}
/**
* Single-parameter constructor. Delegates to the matching superclass
* constructor. Per the original documentation, the filter is unusable until
* the min/max shapelet length parameters are initialised, and the quality
* measure defaults to information gain (superclass behaviour, not visible in
* this file).
*
* @param k the number of shapelets to be generated
*/
public ShapeletTransform2(int k)
{
super(k);
}
/**
* Full constructor to create a usable filter. Delegates to the matching
* superclass constructor; per the original documentation the quality measure
* defaults to information gain (superclass behaviour, not visible in this
* file).
*
* @param k the number of shapelets to be generated
* @param minShapeletLength minimum length of shapelets
* @param maxShapeletLength maximum length of shapelets
*/
public ShapeletTransform2(int k, int minShapeletLength, int maxShapeletLength)
{
super(k, minShapeletLength, maxShapeletLength);
}
/**
* Full, exhaustive constructor for a filter. Delegates to the matching
* superclass constructor. The quality measure is selected via an enum; per the
* original documentation an invalid selection defaults to information gain
* (superclass behaviour, not visible in this file).
*
* @param k the number of shapelets to be generated
* @param minShapeletLength minimum length of shapelets
* @param maxShapeletLength maximum length of shapelets
* @param qualityChoice the shapelet quality measure to be used with this
* filter
*/
public ShapeletTransform2(int k, int minShapeletLength, int maxShapeletLength, QualityMeasures.ShapeletQualityChoice qualityChoice)
{
super(k, minShapeletLength, maxShapeletLength, qualityChoice);
}
/**
 * Builds the transformed dataset: one row per input instance holding the
 * distance from that instance to each stored shapelet, followed by the
 * instance's class value in the last position.
 *
 * @param data the instances to transform
 * @return the transformed instances, created from
 * {@code determineOutputFormat(data)}
 */
@Override
protected Instances buildTansformedDataset(Instances data)
{
    final Instances transformed = determineOutputFormat(data);
    final int numShapelets = shapelets.size();
    final int numInstances = data.numInstances();

    // Allocate one row per input instance: numShapelets distances + class value.
    for (int row = 0; row < numInstances; row++)
    {
        transformed.add(new DenseInstance(numShapelets + 1));
    }

    // Fill column by column so the index ordering is computed once per shapelet.
    for (int col = 0; col < numShapelets; col++)
    {
        final Shapelet shapelet = shapelets.get(col);
        final double[][] visitOrder = sortIndexes(shapelet.content);
        for (int row = 0; row < numInstances; row++)
        {
            final double distance = onlineSubsequenceDistance(shapelet.content, visitOrder, data.instance(row));
            transformed.instance(row).setValue(col, distance);
        }
    }

    // The final attribute of every row carries the original class value.
    for (int row = 0; row < numInstances; row++)
    {
        transformed.instance(row).setValue(numShapelets, data.instance(row).classValue());
    }
    return transformed;
}
/**
 * Evaluates a shapelet candidate against every instance in {@code data}.
 *
 * An order line of (distance, class value) pairs is built, one entry per
 * instance; the instance the candidate was taken from ({@code seriesId})
 * gets distance 0.0 without any computation. When a quality bound is
 * supplied it is consulted before each instance and updated after each
 * entry; as soon as it reports that the candidate can be pruned, evaluation
 * stops.
 *
 * @param candidate the candidate subsequence values
 * @param data the instances to measure the candidate against
 * @param seriesId index in {@code data} of the series the candidate came from
 * @param startPos start position of the candidate within its source series
 * @param qualityBound optional pruning bound; may be {@code null}
 * @return the scored shapelet, or {@code null} if the candidate was pruned
 */
@Override
protected Shapelet checkCandidate(double[] candidate, Instances data, int seriesId, int startPos, QualityBound.ShapeletQualityBound qualityBound)
{
    final ArrayList<OrderLineObj> orderline = new ArrayList<>();
    final double[][] visitOrder = sortIndexes(candidate);
    final int numInstances = data.numInstances();
    final boolean bounded = (qualityBound != null);

    for (int idx = 0; idx < numInstances; idx++)
    {
        // A pruned candidate takes no further part in processing.
        if (bounded && qualityBound.pruneCandidate())
        {
            return null;
        }

        final Instance series = data.instance(idx);
        final double distance = (idx == seriesId)
                ? 0.0
                : onlineSubsequenceDistance(candidate, visitOrder, series);

        // Entries are appended unsorted; no ordering is maintained here.
        final OrderLineObj entry = new OrderLineObj(distance, series.classValue());
        orderline.add(entry);

        // Let the bound see the newest entry so it can tighten its estimate.
        if (bounded)
        {
            qualityBound.updateOrderLine(entry);
        }
    }

    // Note: entropy-based early abandon is deliberately not implemented here;
    // only the distance early abandon is used.
    final Shapelet shapelet = new Shapelet(candidate, dataSourceIDs[seriesId], startPos, qualityMeasure);
    shapelet.calculateQuality(orderline, classDistributions);
    return shapelet;
}
/**
 * Z-normalises a series by delegating to
 * {@link #optimizedZNormalise(double[], boolean)}.
 *
 * @param input the series to normalise
 * @param classValOn whether the last element of {@code input} is a class value
 * @return the z-normalised copy of {@code input}
 */
@Override
protected double[] zNorm(double[] input, boolean classValOn)
{
    final double[] normalised = optimizedZNormalise(input, classValOn);
    return normalised;
}
/**
 * Calculates the distance between a shapelet candidate and a whole time
 * series held in an {@code Instance}. The instance is converted with
 * {@code toDoubleArray()} and passed to the {@code double[]} overload.
 *
 * @param candidate a double[] representation of a shapelet candidate
 * @param sortedIndices visiting order for the candidate positions, as
 * produced by {@link #sortIndexes(double[])}
 * @param timeSeriesIns an Instance object of a whole time series
 * @return the distance between the candidate and the time series
 */
public static double onlineSubsequenceDistance(double[] candidate, double[][] sortedIndices, Instance timeSeriesIns)
{
    return onlineSubsequenceDistance(candidate, sortedIndices, timeSeriesIns.toDoubleArray());
}
/**
 * Calculates the distance between a shapelet candidate and a full time
 * series (both double[]).
 *
 * Every window of {@code candidate.length} consecutive values is
 * z-normalised and compared with the candidate (which is used as given);
 * the smallest sum of squared differences, divided by the candidate length,
 * is returned. The first window is normalised explicitly; for later windows
 * the sum and sum of squares are updated incrementally and values are
 * normalised on the fly. A window's accumulation stops as soon as its
 * partial sum reaches the best distance found so far.
 *
 * The scan stops one start position short of the end of the array, which
 * matches a series whose last element is the class value.
 *
 * Side effect: the static operation counter of this class is increased.
 *
 * @param candidate a double[] representation of a shapelet candidate
 * @param sortedIndices visiting order for the candidate positions; element
 * {@code [j][0]} is the position examined j-th (see
 * {@link #sortIndexes(double[])})
 * @param timeSeries a double[] representation of a whole time series (inc.
 * class value)
 * @return the distance between the candidate and the time series
 */
public static double onlineSubsequenceDistance(double[] candidate, double[][] sortedIndices, double[] timeSeries)
{
    final int length = candidate.length;
    final DoubleWrapper sumHolder = new DoubleWrapper();
    final DoubleWrapper sumSqHolder = new DoubleWrapper();

    // First window: copy it out, normalise it, and capture its running sums.
    double[] window = new double[length];
    System.arraycopy(timeSeries, 0, window, 0, length);
    window = optimizedZNormalise(window, false, sumHolder, sumSqHolder);
    // Keep count of fundamental ops for experiment
    subseqDistOpCount += length;

    double runningSum = sumHolder.get();
    double runningSumSq = sumSqHolder.get();

    // Distance to the first window is the initial best.
    double best = 0.0;
    for (int k = 0; k < length; k++)
    {
        final double diff = candidate[k] - window[k];
        best += diff * diff;
    }
    // Keep count of fundamental ops for experiment
    subseqDistOpCount += length;

    // Slide over the remaining start positions.
    final int startLimit = timeSeries.length - length;
    for (int start = 1; start < startLimit; start++)
    {
        // Drop the value that left the window, add the one that entered.
        final double leaving = timeSeries[start - 1];
        final double entering = timeSeries[start - 1 + length];
        runningSum = runningSum - leaving + entering;
        runningSumSq = runningSumSq - (leaving * leaving) + (entering * entering);

        final double mean = runningSum / length;
        // Tiny (or negative, through rounding) variances are treated as zero.
        final double variance = (runningSumSq - (mean * mean * length)) / length;
        final double stdv = (variance < ROUNDING_ERROR_CORRECTION) ? 0.0 : Math.sqrt(variance);

        // Accumulate in the supplied order, abandoning once best is reached.
        double current = 0.0;
        for (int k = 0; k < length && current < best; k++)
        {
            final int pos = (int) sortedIndices[k][0];
            final double diff = candidate[pos] - (stdv == 0.0 ? 0.0 : ((timeSeries[start + pos] - mean) / stdv));
            current += (diff * diff);
            // Keep count of fundamental ops for experiment
            subseqDistOpCount++;
        }

        if (current < best)
        {
            best = current;
        }
    }

    if (best == 0.0)
    {
        return 0.0;
    }
    return 1.0 / length * best;
}
/**
 * Orders the positions of a series by the magnitude of their values, largest
 * first.
 *
 * The result drives the early-abandoning loop of the subsequence distance:
 * visiting the positions with the largest absolute values first makes the
 * partial distance grow as quickly as possible, so a window that cannot beat
 * the best distance is abandoned after as few operations as possible.
 * Positions with equal magnitude keep their original relative order.
 *
 * @param series a time series whose indices need to be sorted
 * @return an array with one row per element of {@code series}; row
 * {@code [j][0]} is the original index (stored as a double) and row
 * {@code [j][1]} is the absolute value at that index, rows being ordered by
 * descending absolute value
 */
public static double[][] sortIndexes(double[] series)
{
    // Pair each index with the magnitude of its value.
    double[][] indexed = new double[series.length][2];
    for (int i = 0; i < series.length; i++)
    {
        indexed[i][0] = i;
        indexed[i][1] = Math.abs(series[i]);
    }

    // Descending by magnitude: largest contributions are visited first.
    Arrays.sort(indexed, new Comparator<double[]>()
    {
        @Override
        public int compare(double[] left, double[] right)
        {
            return Double.compare(right[1], left[1]);
        }
    });
    return indexed;
}
/**
 * Z-normalises a time series without exposing the intermediate sums.
 *
 * @param input the input time series to be z-normalised
 * @param classValOn specify whether the time series includes a class value
 * (e.g. a full instance might, a candidate shapelet wouldn't)
 * @return a z-normalised version of input
 */
public static double[] optimizedZNormalise(double[] input, boolean classValOn)
{
    // No accumulators supplied: the sum and sum of squares are not reported.
    final DoubleWrapper noSum = null;
    final DoubleWrapper noSumOfSquares = null;
    return optimizedZNormalise(input, classValOn, noSum, noSumOfSquares);
}
/**
 * Z-normalises a time series, optionally reporting the sum and sum of
 * squares of the normalised portion to the caller.
 *
 * When {@code classValOn} is set, the last element is treated as a class
 * value: it is excluded from the statistics and copied unchanged to the
 * output. A series whose variance does not exceed
 * {@code ROUNDING_ERROR_CORRECTION} is normalised to all zeros.
 *
 * @param input the input time series to be z-normalised
 * @param classValOn specify whether the time series includes a class value
 * (e.g. a full instance might, a candidate shapelet wouldn't)
 * @param sum receives the sum of the series values; may be {@code null}
 * @param sum2 receives the sum of the squared series values; may be
 * {@code null}. The two sums are only stored when both holders are non-null
 * @return a z-normalised version of input
 * @throws IllegalArgumentException if {@code classValOn} is set but
 * {@code input} is empty, so there is no class value to carry over
 */
private static double[] optimizedZNormalise(double[] input, boolean classValOn, DoubleWrapper sum, DoubleWrapper sum2)
{
    if (classValOn && input.length == 0)
    {
        throw new IllegalArgumentException("input is empty but classValOn requires a trailing class value");
    }

    // Number of actual series values (the class value, if present, is excluded).
    final int count = classValOn ? input.length - 1 : input.length;

    double seriesTotal = 0;
    double seriesTotal2 = 0;
    for (int i = 0; i < count; i++)
    {
        seriesTotal += input[i];
        seriesTotal2 += (input[i] * input[i]);
    }

    // Hand the raw sums back so the caller can update them incrementally.
    if (sum != null && sum2 != null)
    {
        sum.set(seriesTotal);
        sum2.set(seriesTotal2);
    }

    final double mean = seriesTotal / count;
    final double variance = (seriesTotal2 - (mean * mean * count)) / count;
    // Get rid of rounding errors: near-zero variance counts as zero.
    final double stdv = (variance <= ROUNDING_ERROR_CORRECTION) ? 0.0 : Math.sqrt(variance);

    final double[] output = new double[input.length];
    for (int i = 0; i < count; i++)
    {
        output[i] = (stdv == 0.0) ? 0.0 : (input[i] - mean) / stdv;
    }

    if (classValOn)
    {
        output[count] = input[count];
    }
    return output;
}
/**
 * Minimal mutable holder for a single double, used so that a method can
 * hand an extra numeric result back to its caller through a parameter.
 */
private static class DoubleWrapper
{
    private double value;

    /** Creates a holder initialised to 0.0. */
    public DoubleWrapper()
    {
        this(0.0);
    }

    /**
     * Creates a holder with the given initial value.
     *
     * @param d the initial value
     */
    public DoubleWrapper(double d)
    {
        value = d;
    }

    /**
     * Replaces the held value.
     *
     * @param d the new value
     */
    public void set(double d)
    {
        value = d;
    }

    /**
     * @return the value currently held
     */
    public double get()
    {
        return value;
    }
}
/**
 * Counts the fundamental distance operations performed while running
 * {@code findBestKShapeletsCache} with a first argument of 1 on the given
 * data.
 *
 * The data is first passed through
 * {@code FullShapeletTransform2.roundRobinData}, the static operation
 * counter is reset to zero, the search is run, and the counter is returned.
 * The counter is a static field shared by all instances of this class.
 *
 * @param data the instances to search
 * @param minShapeletLength minimum length of shapelets
 * @param maxShapeletLength maximum length of shapelets
 * @return the number of operations counted during the search
 * @throws Exception if the underlying search fails
 */
@Override
public long opCountForSingleShapelet(Instances data, int minShapeletLength, int maxShapeletLength) throws Exception
{
    final Instances reordered = FullShapeletTransform2.roundRobinData(data, null);
    subseqDistOpCount = 0;
    findBestKShapeletsCache(1, reordered, minShapeletLength, maxShapeletLength);
    return subseqDistOpCount;
}
/**
 * Ad-hoc manual check that prints, for a small random series, (1) the
 * result of the index sorter, (2) the original versus optimised
 * z-normalisation, and (3) the original versus optimised subsequence
 * distance, so the outputs can be compared by eye.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args)
{
    //################ Test 1 ################
    System.out.println("1) Testing index sorter: ");
    double[] series = new double[10];
    double[] subseq = new double[series.length / 2];
    int min = -5;
    int max = 5;
    // Fill both arrays with random integers in [min, max].
    for (int i = 0; i < series.length; i++)
    {
        series[i] = min + (int) (Math.random() * ((max - min) + 1));
        if (i < series.length / 2)
        {
            subseq[i] = min + (int) (Math.random() * ((max - min) + 1));
        }
    }
    printSeries(series);
    // Print the series values in the order given by the sorted indices.
    double[][] indices = sortIndexes(series);
    for (int i = 0; i < series.length; i++)
    {
        System.out.print(series[(int) indices[i][0]] + ((i == series.length - 1) ? "\n" : ", "));
    }
    //################ Test 2 ################
    System.out.println("\n 2) Testing normalization: ");
    double[] normSeries;
    normSeries = FullShapeletTransform2.zNormalise(series, false);
    System.out.print("Original: ");
    printSeries(normSeries);
    normSeries = optimizedZNormalise(series, false);
    System.out.print("Optimized: ");
    printSeries(normSeries);
    //################ Test 3 ################
    // NOTE(review): normSeries carries no class value, while
    // onlineSubsequenceDistance skips the final start position as if the
    // last element were one - confirm the original method does the same.
    System.out.println("\n 3) Testing subsequence distance: ");
    System.out.println("Original dist: " + FullShapeletTransform2.subsequenceDistance(subseq, normSeries));
    double[][] sortedIndexes = sortIndexes(subseq);
    System.out.println("Optimized dist: " + onlineSubsequenceDistance(subseq, sortedIndexes, normSeries));
}
/**
 * Estimates a range of shapelet lengths by repeated sampling.
 *
 * For each run the data is shuffled, the first {@code sampleSize} instances
 * are taken as a sample, and a {@code ShapeletTransformDistCaching} filter
 * (lengths 3 to {@code data.numAttributes() - 1}) is run on it. The lengths
 * of the shapelets found across all runs are pooled and sorted; the values
 * at the lower and upper quartile positions are returned.
 *
 * A failure while processing a sample is logged and the run continues.
 *
 * @param data the full dataset to sample from
 * @param runs the number of samples to draw
 * @param sampleSize the number of instances per sample (also the number of
 * shapelets requested per run)
 * @return a two-element array: {@code [0]} the lower-quartile length,
 * {@code [1]} the upper-quartile length
 * @throws IllegalStateException if no shapelet lengths were collected
 */
private static int[] estimateMinMax(Instances data, int runs, int sampleSize)
{
    ArrayList<Integer> lengths = new ArrayList<>();
    Random ran = new Random();
    for (int i = 0; i < runs; i++)
    {
        // Sample without replacement: shuffle a copy and take the first sampleSize.
        Instances copy = new Instances(data);
        copy.randomize(ran);
        Instances sample = new Instances(copy, sampleSize);
        for (int j = 0; j < sampleSize; j++)
        {
            sample.add(copy.instance(j));
        }

        ShapeletTransformDistCaching shp = new ShapeletTransformDistCaching(sampleSize, 3, data.numAttributes() - 1);
        shp.setCandidatePruning(false);
        shp.setUseSeparationGap(true);
        shp.setRoundRobin(true);
        shp.supressOutput();
        try
        {
            shp.process(sample);
        }
        catch (Exception ex)
        {
            // Best effort: record the failure against this class and carry on.
            Logger.getLogger(ShapeletTransform2.class.getName()).log(Level.SEVERE, "Shapelet search failed for sample " + i, ex);
        }
        lengths.addAll(shp.getShapeletLengths());
        System.out.println("Completed sample " + i);
    }

    if (lengths.isEmpty())
    {
        throw new IllegalStateException("No shapelet lengths were collected; cannot estimate min/max");
    }
    Collections.sort(lengths);

    // Use the number of lengths actually collected (a run may yield fewer
    // than sampleSize) and keep both indices inside the list.
    int numShapelets = lengths.size();
    int lowerQuartile = Math.max(0, numShapelets / 4 - 1);
    int upperQuartile = Math.max(0, (numShapelets / 4) * 3 - 1);

    int[] minMax = new int[2];
    minMax[0] = lengths.get(lowerQuartile);
    minMax[1] = lengths.get(upperQuartile);
    return minMax;
}
/**
 * Prints a series to standard output on one line: values separated by
 * ", " and terminated by a newline. An empty series prints nothing.
 *
 * @param series the values to print
 */
public static void printSeries(double[] series)
{
    final StringBuilder line = new StringBuilder();
    final int last = series.length - 1;
    for (int pos = 0; pos <= last; pos++)
    {
        line.append(series[pos]);
        if (pos == last)
        {
            line.append("\n");
        }
        else
        {
            line.append(", ");
        }
    }
    System.out.print(line);
}
}