package water.rapids.ast.prims.timeseries;

import org.apache.commons.math3.distribution.NormalDistribution;
import water.MRTask;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import water.rapids.Env;
import water.rapids.Val;
import water.rapids.ast.AstPrimitive;
import water.rapids.ast.AstRoot;
import water.rapids.vals.ValFrame;
import water.util.ArrayUtils;

import java.util.ArrayList;
import java.util.Arrays;

/**
 * The iSAX algorithm is a time series indexing strategy that reduces the dimensionality of a time series along the
 * time axis. For example, a time series that had 1000 unique values with data across 500 rows can be reduced to a
 * time series that uses 100 unique values across 10 buckets along the time span.
 *
 * References:
 * http://www.cs.ucr.edu/~eamonn/SAX.pdf
 * http://www.cs.ucr.edu/~eamonn/iSAX_2.0.pdf
 *
 * Note: This approach assumes the frame has the form TS-i x T, where each row TS-i is a single time series and each
 * column T is a point in time:
 *
 *        T-1, T-2, T-3, T-4, ... , T-N
 *   TS-1 ...
 *   TS-2 ...
 *   TS-3 ...
 *   .
 *   .
 *   .
 *   TS-N ...
 *
 * @author markchan & navdeepgill
 */
public class AstIsax extends AstPrimitive {
  protected double[][] _domain_hm = null;

  @Override public String[] args() { return new String[]{"ary", "numWords", "maxCardinality", "optimize_card"}; }

  @Override public int nargs() { return 1 + 4; } // (isax ary numWords maxCardinality optimize_card)

  @Override public String str() { return "isax"; }

  @Override public Val apply(Env env, Env.StackHelp stk, AstRoot asts[]) {
    Frame fr = stk.track(asts[1].exec(env)).getFrame();
    AstRoot n = asts[2];
    AstRoot mc = asts[3];
    boolean optm_card = asts[4].exec(env).getNum() == 1;

    // Check that all vecs are numeric
    for (Vec v : fr.vecs()) {
      if (!v.isNumeric()) {
        throw new IllegalArgumentException("iSax only applies to numeric columns!");
      }
    }

    int numWords = (int) n.exec(env).getNum();
    int maxCardinality = (int) mc.exec(env).getNum();

    // Check that numWords and maxCardinality are > 0
    if (numWords <= 0) {
      throw new IllegalArgumentException("numWords must be greater than 0!");
    }
    if (maxCardinality <= 0) {
      throw new IllegalArgumentException("maxCardinality must be greater than 0!");
    }

    // One output column per word: c0, c1, ..., c(numWords-1)
    ArrayList<String> columns = new ArrayList<>();
    for (int i = 0; i < numWords; i++) {
      columns.add("c" + i);
    }

    // Tokenize each series into numWords SAX symbols drawn from maxCardinality breakpoints
    Frame fr2 = new AstIsax.IsaxTask(numWords, maxCardinality)
            .doAll(numWords, Vec.T_NUM, fr)
            .outputFrame(null, columns.toArray(new String[numWords]), null);

    int[] maxCards = new int[numWords];
    if (optm_card) {
      _domain_hm = new double[numWords][maxCardinality];
      for (double[] r : _domain_hm) Arrays.fill(r, Double.NaN);

      // See if we can reduce the cardinality by checking all unique tokens in all series in a word
      for (int i = 0; i < fr2.numCols(); i++) {
        String[] domains = fr2.vec(i).toCategoricalVec().domain();
        for (int j = 0; j < domains.length; j++) {
          _domain_hm[i][j] = Double.valueOf(domains[j]);
        }
      }

      // Get the cardinality of each word (number of distinct tokens actually observed)
      for (int i = 0; i < numWords; i++) {
        int cnt = 0;
        for (double d : _domain_hm[i]) {
          if (Double.isNaN(d)) break;
          else cnt++;
        }
        maxCards[i] = cnt;
      }

      // Remap tokens onto the reduced per-word cardinalities, then build the iSAX index string
      Frame fr2_reduced = new AstIsax.IsaxReduceCard(_domain_hm, maxCardinality)
              .doAll(numWords, Vec.T_NUM, fr2)
              .outputFrame(null, columns.toArray(new String[numWords]), null);
      Frame fr3 = new AstIsax.IsaxStringTask(maxCards)
              .doAll(1, Vec.T_STR, fr2_reduced)
              .outputFrame(null, new String[]{"iSax_index"}, null);
      fr2.delete(); // Not needed anymore
      fr3.add(fr2_reduced);
      return new ValFrame(fr3);
    }

    // No cardinality optimization: every word keeps the requested maximum cardinality
    for (int i = 0; i < numWords; ++i) {
      maxCards[i] = maxCardinality;
    }
    Frame fr3 = new AstIsax.IsaxStringTask(maxCards)
            .doAll(1, Vec.T_STR, fr2)
            .outputFrame(null, new String[]{"iSax_index"}, null);
    fr3.add(fr2);
    return new ValFrame(fr3);
  }
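  // -----------------------------------------------------------------------------------------------
  // Usage sketch (illustrative only, not part of the original implementation): this primitive is
  // normally reached through a Rapids expression of the form
  //
  //   (isax ary numWords maxCardinality optimize_card)
  //
  // Assuming a numeric frame has already been installed in the DKV under the hypothetical key
  // "ts_frame", something along these lines should yield the indexed frame, whose first column is
  // the string column "iSax_index" followed by one numeric token column per word:
  //
  //   Val val = water.rapids.Rapids.exec("(isax ts_frame 10 10 0)");
  //   Frame indexed = val.getFrame();
  //
  // Rapids.exec is the generic h2o-3 entry point for evaluating AST expressions; the frame key and
  // the word/cardinality values above are assumptions made for the example.
  // -----------------------------------------------------------------------------------------------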
  // Remaps each word's tokens onto a reduced cardinality: if a word uses fewer than maxCardinality
  // distinct tokens, every token is replaced by its index within the observed domain of that word;
  // otherwise the token is kept as-is.
  public static class IsaxReduceCard extends MRTask<AstIsax.IsaxReduceCard> {
    double[][] _domain_hm;
    int maxCardinality;

    IsaxReduceCard(double[][] dm, int mc) {
      _domain_hm = dm;
      maxCardinality = mc;
    }

    @Override public void map(Chunk cs[], NewChunk nc[]) {
      for (int i = 0; i < cs.length; i++) {
        // The domain row still contains NaN padding iff this word has fewer than maxCardinality tokens
        boolean ltMaxCardFlag = Double.isNaN(ArrayUtils.sum(_domain_hm[i]));
        for (int j = 0; j < cs[i].len(); j++) {
          int idxOf;
          if (ltMaxCardFlag) {
            idxOf = Arrays.binarySearch(_domain_hm[i], (int) cs[i].at8(j));
          } else {
            idxOf = (int) cs[i].at8(j);
          }
          nc[i].addNum(idxOf);
        }
      }
    }
  }

  // Builds the iSAX index string for each series: "token^cardinality" per word, joined by '_',
  // e.g. "3^10_0^10_7^10" for three words with cardinality 10.
  public static class IsaxStringTask extends MRTask<AstIsax.IsaxStringTask> {
    int[] maxCards;

    IsaxStringTask(int[] mc) { maxCards = mc; }

    @Override public void map(Chunk cs[], NewChunk nc[]) {
      int csize = cs[0].len();
      for (int c_i = 0; c_i < csize; c_i++) {
        StringBuffer sb = new StringBuffer();
        for (int cs_i = 0; cs_i < cs.length; cs_i++) {
          sb.append(cs[cs_i].at8(c_i) + "^" + maxCards[cs_i] + "_");
        }
        // Drop the trailing '_'
        nc[0].addStr(sb.toString().substring(0, sb.length() - 1));
      }
    }
  }

  // Performs the SAX transform: piecewise aggregate approximation (one mean per word) followed by
  // quantization of each word's z-scored mean against N(0,1) breakpoints.
  public static class IsaxTask extends MRTask<AstIsax.IsaxTask> {
    private int nw;
    private int mc;
    private static NormalDistribution nd = new NormalDistribution();
    private ArrayList<Double> probBoundaries; // for tokenizing Sax

    IsaxTask(int numWords, int maxCardinality) {
      nw = numWords;
      mc = maxCardinality;
      // Come up with the NormalDist boundaries: mc equiprobable buckets under N(0,1).
      // For example, mc = 4 gives breakpoints (-inf, -0.674, 0, 0.674); a z-score of 0.3 falls into
      // the third bucket and is emitted as token 2.
      double step = 1.0 / mc;
      probBoundaries = new ArrayList<Double>(); // cumulative dist function boundaries R{0-1}
      for (int i = 0; i < mc; i++) {
        probBoundaries.add(nd.inverseCumulativeProbability(i * step));
      }
    }

    @Override public void map(Chunk cs[], NewChunk[] nc) {
      int step = cs.length / nw; // number of time points per word
      int chunkSize = cs[0].len();
      int w_i = 0; // word iterator
      double[] seriesSums = new double[chunkSize];
      double[] seriesCounts = new double[chunkSize];
      double[] seriesSSE = new double[chunkSize];
      double[][] chunkMeans = new double[chunkSize][nw];
      // Loop by words in the time series
      for (int i = 0; i < cs.length; i += step) {
        // Loop by each series in the chunk
        for (int j = 0; j < chunkSize; j++) {
          double mySum = 0.0;
          double myCount = 0.0;
          // Loop through all the data in the chunk for the given series in the given subset (word)
          for (Chunk c : ArrayUtils.subarray(cs, i, i + step)) {
            if (c != null) {
              // Calculate the whole-series mean and sigma in one (Welford-style) pass
              double x = c.atd(j);
              double oldMean = seriesCounts[j] < 1 ? 0.0 : seriesSums[j] / seriesCounts[j];
              mySum += x;
              myCount++;
              seriesSums[j] += x;
              seriesCounts[j] += 1;
              seriesSSE[j] += (x - oldMean) * (x - seriesSums[j] / seriesCounts[j]);
            }
          }
          chunkMeans[j][w_i] = mySum / myCount; // piecewise aggregate approximation for this word
        }
        w_i++;
        if (w_i >= nw) break;
      }
      // Quantize each word's mean: z-score it against the series mean/std and locate its bucket
      // among the Normal CDF breakpoints.
      for (int w = 0; w < nw; w++) {
        for (int i = 0; i < chunkSize; i++) {
          double seriesMean = seriesSums[i] / seriesCounts[i];
          double seriesStd = Math.sqrt(seriesSSE[i] / (seriesCounts[i] - 1));
          double zscore = (chunkMeans[i][w] - seriesMean) / seriesStd;
          int p_i = 0;
          while (probBoundaries.get(p_i + 1) < zscore) {
            p_i++;
            if (p_i == mc - 1) break;
          }
          nc[w].addNum(p_i, 0);
        }
      }
    }
  }
}