package water.rapids.ast.prims.timeseries;
import org.apache.commons.math3.distribution.NormalDistribution;
import water.MRTask;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import water.rapids.Env;
import water.rapids.Val;
import water.rapids.ast.AstPrimitive;
import water.rapids.ast.AstRoot;
import water.rapids.vals.ValFrame;
import water.util.ArrayUtils;
import java.util.ArrayList;
import java.util.Arrays;
/**
* The iSAX algorithm is a time series indexing strategy that reduces the dimensionality of a time series along the time axis.
* For example, a time series with 1000 unique values spread across 500 points in time can be reduced to a time series that
* uses 100 unique values across 10 buckets along the time span.
*
* References:
* http://www.cs.ucr.edu/~eamonn/SAX.pdf
* http://www.cs.ucr.edu/~eamonn/iSAX_2.0.pdf
*
* Note: This approach assumes the frame has the form TS x T, where each row TS-i is a single time series and each column T-j is a point in time:
*
* T-1, T-2, T-3, T-4, ... , T-N
* TS-1 ...
* TS-2 ...
* TS-3 ...
* .
* .
* .
* TS-N ...
*
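* A minimal usage sketch through the Rapids interpreter (the frame key "ts" and the
* argument values here are illustrative):
* <pre>{@code
* Val v = Rapids.exec("(isax ts 10 4 0)"); // 10 words, max cardinality 4, no cardinality optimization
* Frame result = v.getFrame(); // columns: iSax_index, c0, c1, ..., c9
* }</pre>
*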
* @author markchan & navdeepgill
*/
public class AstIsax extends AstPrimitive {
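// Per-word sorted symbol domains discovered when optimize_card is set; NaN marks unused slots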
protected double[][] _domain_hm = null;
@Override
public String[] args() { return new String[]{"ary", "numWords", "maxCardinality", "optimize_card"}; }
@Override
public int nargs() { return 1 + 4; } // (isax ary numWords maxCardinality optimize_card)
@Override
public String str() { return "isax"; }
@Override
public Val apply(Env env, Env.StackHelp stk, AstRoot[] asts) {
Frame fr = stk.track(asts[1].exec(env)).getFrame();
AstRoot n = asts[2];
AstRoot mc = asts[3];
boolean optm_card = asts[4].exec(env).getNum() == 1;
//Check vecs are numeric
for(Vec v : fr.vecs()){
if(!v.isNumeric()){
throw new IllegalArgumentException("iSax only applies to numeric columns!");
}
}
int numWords = (int) n.exec(env).getNum();
int maxCardinality = (int) mc.exec(env).getNum();
//Check numWords and maxCardinality are > 0
if(numWords <= 0){
throw new IllegalArgumentException("numWords must be greater than 0!");
}
if(maxCardinality <= 0){
throw new IllegalArgumentException("maxCardinality must be greater than 0!");
}
ArrayList<String> columns = new ArrayList<>();
for (int i = 0; i < numWords; i++) {
columns.add("c"+i);
}
Frame fr2 = new AstIsax.IsaxTask(numWords, maxCardinality)
.doAll(numWords, Vec.T_NUM, fr).outputFrame(null, columns.toArray(new String[numWords]), null);
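// fr2 holds the raw SAX encoding: one column per word, each cell a symbol in [0, maxCardinality)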
int[] maxCards = new int[numWords];
if(optm_card) {
_domain_hm = new double[numWords][maxCardinality];
for (double[] r : _domain_hm) Arrays.fill(r,Double.NaN);
// see if we can reduce the cardinality by checking all unique tokens in all series in a word
for (int i = 0; i < fr2.numCols(); i++) {
Vec catVec = fr2.vec(i).toCategoricalVec();
String[] domains = catVec.domain();
for (int j = 0; j < domains.length; j++) {
_domain_hm[i][j] = Double.parseDouble(domains[j]);
}
catVec.remove(); // drop the temporary categorical copy
}
// get the cardinalities of each word
for (int i = 0; i < numWords; i++) {
int cnt = 0;
for (double d : _domain_hm[i]) {
if (Double.isNaN(d)) break;
else cnt++;
}
maxCards[i] = cnt;
}
Frame fr2_reduced = new AstIsax.IsaxReduceCard(_domain_hm, maxCardinality).doAll(numWords, Vec.T_NUM, fr2)
.outputFrame(null, columns.toArray(new String[numWords]), null);
Frame fr3 = new AstIsax.IsaxStringTask(maxCards).doAll(1, Vec.T_STR, fr2_reduced)
.outputFrame(null, new String[]{"iSax_index"}, null);
fr2.delete(); //Not needed anymore
fr3.add(fr2_reduced);
return new ValFrame(fr3);
}
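// Without cardinality optimization, every word keeps the full cardinality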
for(int i = 0; i < numWords; ++i){
maxCards[i] = maxCardinality;
}
Frame fr3 = new AstIsax.IsaxStringTask(maxCards).doAll(1, Vec.T_STR, fr2)
.outputFrame(null, new String[]{"iSax_index"}, null);
fr3.add(fr2);
return new ValFrame(fr3);
}
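/**
* Remaps each word's symbols onto the reduced per-word domain discovered in apply(),
* so that symbols are again contiguous integers starting at 0.
*/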
public static class IsaxReduceCard extends MRTask<AstIsax.IsaxReduceCard> {
double[][] _domain_hm;
int maxCardinality;
IsaxReduceCard(double[][] dm, int mc) {
_domain_hm = dm;
maxCardinality = mc;
}
@Override
public void map(Chunk[] cs, NewChunk[] nc) {
for (int i = 0; i < cs.length; i++) {
// A NaN in the summed domain means this word's cardinality stayed below maxCardinality,
// so each raw symbol is remapped to its index in the sorted domain. Trailing NaNs sort
// greater than any real value, so the binary search stays within the filled prefix.
boolean ltMaxCardFlag = Double.isNaN(ArrayUtils.sum(_domain_hm[i]));
for (int j = 0; j < cs[i].len(); j++) {
int idxOf;
if (ltMaxCardFlag) {
idxOf = Arrays.binarySearch(_domain_hm[i], (int) cs[i].at8(j));
} else {
idxOf = (int) cs[i].at8(j);
}
nc[i].addNum(idxOf);
}
}
}
}
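/**
* Builds the combined iSAX string for each row by joining symbol^cardinality tokens with '_',
* e.g. symbols {3, 0} at cardinality 8 become "3^8_0^8".
*/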
public static class IsaxStringTask extends MRTask<AstIsax.IsaxStringTask> {
int[] maxCards;
IsaxStringTask(int[] mc) { maxCards = mc; }
@Override
public void map(Chunk[] cs, NewChunk[] nc) {
int csize = cs[0].len();
for (int c_i = 0; c_i < csize; c_i++) {
StringBuilder sb = new StringBuilder();
for (int cs_i = 0; cs_i < cs.length; cs_i++) {
sb.append(cs[cs_i].at8(c_i)).append('^').append(maxCards[cs_i]).append('_');
}
nc[0].addStr(sb.substring(0, sb.length() - 1)); // drop the trailing '_' separator
}
}
}
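/**
* Computes the SAX words: each word's PAA (piecewise aggregate approximation) mean is
* z-scored against the full series and binned by standard-normal quantile boundaries.
*/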
public static class IsaxTask extends MRTask<AstIsax.IsaxTask> {
private int nw; // number of words
private int mc; // maximum cardinality
private static final NormalDistribution nd = new NormalDistribution();
private ArrayList<Double> probBoundaries; // for tokenizing Sax
IsaxTask(int numWords, int maxCardinality) {
nw = numWords;
mc = maxCardinality;
// Precompute the standard-normal breakpoints: the quantiles at i/mc for i in [0, mc).
// The first boundary (quantile 0) is -Infinity, so every z-score lands in some bucket;
// e.g. mc = 4 gives boundaries {-Inf, -0.674, 0.0, 0.674}.
double step = 1.0 / mc;
probBoundaries = new ArrayList<>(); // cumulative distribution function boundaries in [0,1)
for (int i = 0; i < mc; i++) {
probBoundaries.add(nd.inverseCumulativeProbability(i * step));
}
}
@Override
public void map(Chunk[] cs, NewChunk[] nc) {
int step = cs.length / nw; // columns per word (integer division; leftover trailing columns are ignored)
int chunkSize = cs[0].len();
int w_i = 0; //word iterator
double[] seriesSums = new double[chunkSize];
double[] seriesCounts = new double[chunkSize];
double[] seriesSSE = new double[chunkSize];
double[][] chunkMeans = new double[chunkSize][nw];
// Loop by words in the time series
for (int i = 0; i < cs.length; i+=step) {
// Loop by each series in the chunk
for (int j = 0; j < chunkSize; j++) {
double mySum = 0.0;
double myCount = 0.0;
// Scan all columns in this word, accumulating the word sum and, via a Welford-style
// one-pass update, the series-level sum of squared errors
for (Chunk c : ArrayUtils.subarray(cs, i, i + step)) {
if (c != null) {
double x = c.atd(j);
// Running series-level mean before adding x
double oldMean = seriesCounts[j] < 1 ? 0.0 : seriesSums[j] / seriesCounts[j];
mySum += x;
seriesSums[j] += x;
myCount++;
seriesCounts[j] += 1;
seriesSSE[j] += (x - oldMean) * (x - seriesSums[j] / seriesCounts[j]);
}
}
chunkMeans[j][w_i] = mySum / myCount; // PAA mean of this word for series j
}
w_i++;
if (w_i>= nw) break;
}
// Convert each word's PAA mean into a SAX symbol: z-score it against the whole
// series and locate its bucket among the normal-distribution boundaries
for (int w = 0; w < nw; w++) {
for (int i = 0; i < chunkSize; i++) {
double seriesMean = seriesSums[i] / seriesCounts[i];
double seriesStd = Math.sqrt(seriesSSE[i] / (seriesCounts[i] - 1));
double zscore = (chunkMeans[i][w] - seriesMean) / seriesStd;
int p_i = 0; // SAX symbol: index of the z-score's bucket
while (p_i < mc - 1 && probBoundaries.get(p_i + 1) < zscore) {
p_i++;
}
nc[w].addNum(p_i,0);
}
}
}
}
}