package net.seninp.grammarviz;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.beust.jcommander.JCommander;
import net.seninp.gi.GIAlgorithm;
import net.seninp.gi.logic.GrammarRuleRecord;
import net.seninp.gi.logic.GrammarRules;
import net.seninp.gi.logic.RuleInterval;
import net.seninp.gi.repair.RePairFactory;
import net.seninp.gi.repair.RePairGrammar;
import net.seninp.gi.rulepruner.ReducedGrammarSizeSorter;
import net.seninp.gi.rulepruner.ReductionSorter;
import net.seninp.gi.rulepruner.RulePruner;
import net.seninp.gi.rulepruner.RulePrunerFactory;
import net.seninp.gi.rulepruner.SampledPoint;
import net.seninp.gi.sequitur.SequiturFactory;
import net.seninp.grammarviz.anomaly.AnomalyAlgorithm;
import net.seninp.grammarviz.anomaly.RRAImplementation;
import net.seninp.jmotif.distance.EuclideanDistance;
import net.seninp.jmotif.sax.NumerosityReductionStrategy;
import net.seninp.jmotif.sax.SAXProcessor;
import net.seninp.jmotif.sax.TSProcessor;
import net.seninp.jmotif.sax.datastructure.SAXRecords;
import net.seninp.jmotif.sax.discord.BruteForceDiscordImplementation;
import net.seninp.jmotif.sax.discord.DiscordRecords;
import net.seninp.jmotif.sax.discord.HOTSAXImplementation;
import net.seninp.jmotif.sax.parallel.ParallelSAXImplementation;
import net.seninp.jmotif.sax.registry.LargeWindowAlgorithm;
/**
* Main executable wrapping all the discord discovery methods.
*
* @author psenin
*
*/
public class GrammarVizAnomaly {

  // locale, charset, etc
  //
  // NOTE(review): DEFAULT_CHARSET is not referenced anywhere in this file -- confirm it is
  // used by callers before removing
  final static Charset DEFAULT_CHARSET = StandardCharsets.UTF_8;
  // line break used when assembling console output
  private static final String CR = "\n";

  // workers
  //
  // shared time-series processor (file reading, subseries extraction)
  private static TSProcessor tp = new TSProcessor();
  // Euclidean distance used when building the nearest-neighbor distance curve
  private static EuclideanDistance ed = new EuclideanDistance();

  // static block - we instantiate the logger
  //
  private static final Logger LOGGER = LoggerFactory.getLogger(GrammarVizAnomaly.class);
/**
* The main executable.
*
* @param args The command-line params.
* @throws Exception If error occurs.
*/
public static void main(String[] args) throws Exception {
GrammarVizAnomalyParameters params = new GrammarVizAnomalyParameters();
JCommander jct = new JCommander(params, args);
if (0 == args.length) {
jct.usage();
}
else {
// get params printed
//
StringBuffer sb = new StringBuffer(1024);
sb.append(CR).append("GrammarViz2 CLI anomaly discovery").append(CR);
sb.append("parameters:").append(CR);
sb.append(" input file: ").append(GrammarVizAnomalyParameters.IN_FILE)
.append(CR);
sb.append(" output files prefix: ").append(GrammarVizAnomalyParameters.OUT_FILE)
.append(CR);
sb.append(" Algorithm implementation: ").append(GrammarVizAnomalyParameters.ALGORITHM)
.append(CR);
sb.append(" Num. of discords to report: ").append(GrammarVizAnomalyParameters.DISCORDS_NUM)
.append(CR);
if (!(AnomalyAlgorithm.RRASAMPLED.equals(GrammarVizAnomalyParameters.ALGORITHM)
|| AnomalyAlgorithm.EXPERIMENT.equals(GrammarVizAnomalyParameters.ALGORITHM))) {
sb.append(" SAX sliding window size: ")
.append(GrammarVizAnomalyParameters.SAX_WINDOW_SIZE).append(CR);
}
if (!(AnomalyAlgorithm.BRUTEFORCE.equals(GrammarVizAnomalyParameters.ALGORITHM))) {
if (!(AnomalyAlgorithm.RRASAMPLED.equals(GrammarVizAnomalyParameters.ALGORITHM)
|| AnomalyAlgorithm.EXPERIMENT.equals(GrammarVizAnomalyParameters.ALGORITHM))) {
sb.append(" SAX PAA size: ")
.append(GrammarVizAnomalyParameters.SAX_PAA_SIZE).append(CR);
sb.append(" SAX alphabet size: ")
.append(GrammarVizAnomalyParameters.SAX_ALPHABET_SIZE).append(CR);
}
sb.append(" SAX numerosity reduction: ")
.append(GrammarVizAnomalyParameters.SAX_NR_STRATEGY).append(CR);
sb.append(" SAX normalization threshold: ")
.append(GrammarVizAnomalyParameters.SAX_NORM_THRESHOLD).append(CR);
}
if (AnomalyAlgorithm.RRASAMPLED.equals(GrammarVizAnomalyParameters.ALGORITHM)
|| AnomalyAlgorithm.RRA.equals(GrammarVizAnomalyParameters.ALGORITHM)
|| AnomalyAlgorithm.RRAPRUNED.equals(GrammarVizAnomalyParameters.ALGORITHM)
|| AnomalyAlgorithm.EXPERIMENT.equals(GrammarVizAnomalyParameters.ALGORITHM)) {
sb.append(" GI Algorithm: ")
.append(GrammarVizAnomalyParameters.GI_ALGORITHM_IMPLEMENTATION).append(CR);
}
if (AnomalyAlgorithm.RRASAMPLED.equals(GrammarVizAnomalyParameters.ALGORITHM)
|| AnomalyAlgorithm.EXPERIMENT.equals(GrammarVizAnomalyParameters.ALGORITHM)) {
sb.append(" Grid boundaries: ")
.append(GrammarVizAnomalyParameters.GRID_BOUNDARIES).append(CR);
}
if ((AnomalyAlgorithm.RRASAMPLED.equals(GrammarVizAnomalyParameters.ALGORITHM)
|| AnomalyAlgorithm.EXPERIMENT.equals(GrammarVizAnomalyParameters.ALGORITHM))
&& !(Double.isNaN(GrammarVizAnomalyParameters.SUBSAMPLING_FRACTION))) {
sb.append(" Subsampling fraction: ")
.append(GrammarVizAnomalyParameters.SUBSAMPLING_FRACTION).append(CR);
}
System.out.println(sb.toString());
// read the file
//
LOGGER.info("Reading data ...");
double[] series = tp.readTS(GrammarVizAnomalyParameters.IN_FILE, 0);
LOGGER.info("read " + series.length + " points from " + GrammarVizAnomalyParameters.IN_FILE);
// switch logic according to the algorithm selection
//
if (AnomalyAlgorithm.BRUTEFORCE.equals(GrammarVizAnomalyParameters.ALGORITHM)) {
findBruteForce(series, GrammarVizAnomalyParameters.SAX_WINDOW_SIZE,
GrammarVizAnomalyParameters.DISCORDS_NUM,
GrammarVizAnomalyParameters.SAX_NORM_THRESHOLD);
}
else if (AnomalyAlgorithm.HOTSAX.equals(GrammarVizAnomalyParameters.ALGORITHM)) {
findHotSax(series, GrammarVizAnomalyParameters.DISCORDS_NUM,
GrammarVizAnomalyParameters.SAX_WINDOW_SIZE, GrammarVizAnomalyParameters.SAX_PAA_SIZE,
GrammarVizAnomalyParameters.SAX_ALPHABET_SIZE,
GrammarVizAnomalyParameters.SAX_NR_STRATEGY,
GrammarVizAnomalyParameters.SAX_NORM_THRESHOLD);
}
else if (AnomalyAlgorithm.RRA.equals(GrammarVizAnomalyParameters.ALGORITHM)) {
findRRA(series, GrammarVizAnomalyParameters.SAX_WINDOW_SIZE,
GrammarVizAnomalyParameters.SAX_PAA_SIZE, GrammarVizAnomalyParameters.SAX_ALPHABET_SIZE,
GrammarVizAnomalyParameters.SAX_NR_STRATEGY, GrammarVizAnomalyParameters.DISCORDS_NUM,
GrammarVizAnomalyParameters.GI_ALGORITHM_IMPLEMENTATION,
GrammarVizAnomalyParameters.OUT_FILE, GrammarVizAnomalyParameters.SAX_NORM_THRESHOLD);
}
else if (AnomalyAlgorithm.RRAPRUNED.equals(GrammarVizAnomalyParameters.ALGORITHM)) {
findRRAPruned(series, GrammarVizAnomalyParameters.SAX_WINDOW_SIZE,
GrammarVizAnomalyParameters.SAX_PAA_SIZE, GrammarVizAnomalyParameters.SAX_ALPHABET_SIZE,
GrammarVizAnomalyParameters.SAX_NR_STRATEGY, GrammarVizAnomalyParameters.DISCORDS_NUM,
GrammarVizAnomalyParameters.GI_ALGORITHM_IMPLEMENTATION,
GrammarVizAnomalyParameters.OUT_FILE, GrammarVizAnomalyParameters.SAX_NORM_THRESHOLD);
}
else if (AnomalyAlgorithm.RRASAMPLED.equals(GrammarVizAnomalyParameters.ALGORITHM)) {
findRRASampled(series, GrammarVizAnomalyParameters.GRID_BOUNDARIES,
GrammarVizAnomalyParameters.SAX_NR_STRATEGY, GrammarVizAnomalyParameters.DISCORDS_NUM,
GrammarVizAnomalyParameters.GI_ALGORITHM_IMPLEMENTATION,
GrammarVizAnomalyParameters.OUT_FILE, GrammarVizAnomalyParameters.SAX_NORM_THRESHOLD);
}
else if (AnomalyAlgorithm.EXPERIMENT.equals(GrammarVizAnomalyParameters.ALGORITHM)) {
findRRAExperiment(series, GrammarVizAnomalyParameters.GRID_BOUNDARIES,
GrammarVizAnomalyParameters.SAX_NR_STRATEGY, GrammarVizAnomalyParameters.DISCORDS_NUM,
GrammarVizAnomalyParameters.GI_ALGORITHM_IMPLEMENTATION,
GrammarVizAnomalyParameters.OUT_FILE, GrammarVizAnomalyParameters.SAX_NORM_THRESHOLD);
}
}
}
private static void findRRAExperiment(double[] ts, String boundaries,
NumerosityReductionStrategy saxNRStrategy, int discordsToReport, GIAlgorithm giImplementation,
String outputPrefix, double normalizationThreshold) throws Exception {
LOGGER.info("running RRA with experiment sampling algorithm...");
// Date start = new Date();
// parse the boundaries params
int[] bounds = toBoundaries(GrammarVizAnomalyParameters.GRID_BOUNDARIES);
ArrayList<SampledPoint> res = new ArrayList<SampledPoint>();
// we need to use this in the loop
RulePruner rp;
if (GrammarVizAnomalyParameters.SUBSAMPLING_FRACTION.isNaN()) {
LOGGER.info("sampling on full time series length");
rp = new RulePruner(ts);
}
else {
int sampleIntervalStart = 0;
int sampleIntervalEnd = (int) Math
.round(ts.length * GrammarVizAnomalyParameters.SUBSAMPLING_FRACTION);
LOGGER.info("sampling parameters on interval [" + sampleIntervalStart + ", "
+ sampleIntervalEnd + "]");
rp = new RulePruner(Arrays.copyOfRange(ts, sampleIntervalStart, sampleIntervalEnd));
}
// iterate over the grid evaluating the grammar
//
for (int WINDOW_SIZE = bounds[0]; WINDOW_SIZE < bounds[1]; WINDOW_SIZE += bounds[2]) {
for (int PAA_SIZE = bounds[3]; PAA_SIZE < bounds[4]; PAA_SIZE += bounds[5]) {
// check for invalid cases
if (PAA_SIZE > WINDOW_SIZE) {
continue;
}
for (int ALPHABET_SIZE = bounds[6]; ALPHABET_SIZE < bounds[7]; ALPHABET_SIZE += bounds[8]) {
SampledPoint p = rp.sample(WINDOW_SIZE, PAA_SIZE, ALPHABET_SIZE, GIAlgorithm.REPAIR,
GrammarVizAnomalyParameters.SAX_NR_STRATEGY,
GrammarVizAnomalyParameters.SAX_NORM_THRESHOLD);
res.add(p);
///
///
///
///
// GrammarRules rules;
// if (GIAlgorithm.SEQUITUR.equals(giImplementation)) {
// rules = SequiturFactory.series2SequiturRules(ts, WINDOW_SIZE, PAA_SIZE, ALPHABET_SIZE,
// GrammarVizAnomalyParameters.SAX_NR_STRATEGY,
// GrammarVizAnomalyParameters.SAX_NORM_THRESHOLD);
// }
// else {
// ParallelSAXImplementation ps = new ParallelSAXImplementation();
// SAXRecords parallelRes = ps.process(ts, 2, WINDOW_SIZE, PAA_SIZE, ALPHABET_SIZE,
// GrammarVizAnomalyParameters.SAX_NR_STRATEGY,
// GrammarVizAnomalyParameters.SAX_NORM_THRESHOLD);
// RePairGrammar rePairGrammar = RePairFactory.buildGrammar(parallelRes);
// rePairGrammar.expandRules();
// rePairGrammar.buildIntervals(parallelRes, ts, WINDOW_SIZE);
// rules = rePairGrammar.toGrammarRulesData();
// }
//
// // prune grammar' rules
// GrammarRules prunedRulesSet = RulePrunerFactory.performPruning(ts, rules);
//
// // pruned intervals
// ArrayList<RuleInterval> prunedIntervals = new ArrayList<RuleInterval>();
//
// // coverage intervals
// int[] coverageArray = new int[ts.length];
//
// // populate all intervals with their frequency
// for (GrammarRuleRecord rule : prunedRulesSet) {
// if (0 == rule.ruleNumber()) {
// continue;
// }
// for (RuleInterval ri : rule.getRuleIntervals()) {
// ri.setCoverage(rule.getRuleIntervals().size());
// ri.setId(rule.ruleNumber());
// prunedIntervals.add(ri);
// //
// int startPos = ri.getStartPos();
// int endPos = ri.getEndPos();
// for (int j = startPos; j < endPos; j++) {
// coverageArray[j] = coverageArray[j] + 1;
// }
// }
// }
//
// // look for zero-covered intervals and add those to the list
// List<RuleInterval> zeros = getZeroIntervals(coverageArray);
// if (zeros.size() > 0) {
// prunedIntervals.addAll(zeros);
// }
// // run HOTSAX with this intervals set
// DiscordRecords discords = RRAImplementation.series2RRAAnomalies(ts, 1,
// prunedIntervals);
//
// if (discords.getSize() > 0) {
// // if the discord(s) found
LOGGER.info("# " + WINDOW_SIZE + "," + PAA_SIZE + "," + ALPHABET_SIZE + ","
+ p.getApproxDist() + "," + p.getGrammarSize() + "," + p.getCompressedGrammarSize()
+ "," + p.getGrammarRules() + "," + p.getPrunedRules() + "," + p.getCoverage() + ","
+ p.getMaxFrequency());
// }
// else {
// // no discords were discovered
// // need to increase the granularity of discretization
// LOGGER.info("# " + WINDOW_SIZE + "," + PAA_SIZE + "," + ALPHABET_SIZE + ","
// + p.getApproxDist() + "," + p.getGrammarSize() + "," + p.getCompressedGrammarSize()
// + "," + p.getCoverage() + ",-1,-1");
// }
///
///
}
}
}
// Collections.sort(res, new ReductionSorter());
Collections.sort(res, new GrammarSizeSorter());
System.out.println(CR + "# GLOBALLY MIN GRAMMAR size is " + res.get(0).toString() + CR
+ "Running RRAPruned ..." + CR);
int windowSize = res.get(0).getWindow();
int paaSize = res.get(0).getPAA();
int alphabetSize = res.get(0).getAlphabet();
findRRAPruned(ts, windowSize, alphabetSize, paaSize, saxNRStrategy, discordsToReport,
giImplementation, outputPrefix, normalizationThreshold);
Collections.sort(res, new ReducedGrammarSizeSorter());
System.out.println(CR + "# GLOBALLY MIN PRUNED grammar size: " + res.get(0).toString() + CR
+ "Running RRAPruned ..." + CR);
windowSize = res.get(0).getWindow();
paaSize = res.get(0).getPAA();
alphabetSize = res.get(0).getAlphabet();
findRRAPruned(ts, windowSize, alphabetSize, paaSize, saxNRStrategy, discordsToReport,
giImplementation, outputPrefix, normalizationThreshold);
double threshold = 0.99;
ArrayList<SampledPoint> resCovered = new ArrayList<SampledPoint>();
for (SampledPoint p : res) {
if (p.getCoverage() >= threshold) {
resCovered.add(p);
}
}
// Collections.sort(resCovered, new ReductionSorter());
Collections.sort(resCovered, new GrammarSizeSorter());
System.out.println(CR + "# COVERED ABOVE THRESHOLD MIN GRAMMAR parameters are "
+ resCovered.get(0).toString() + CR + "Running RRAPruned ..." + CR);
windowSize = resCovered.get(0).getWindow();
paaSize = resCovered.get(0).getPAA();
alphabetSize = resCovered.get(0).getAlphabet();
findRRAPruned(ts, windowSize, alphabetSize, paaSize, saxNRStrategy, discordsToReport,
giImplementation, outputPrefix, normalizationThreshold);
Collections.sort(resCovered, new ReducedGrammarSizeSorter());
System.out.println(CR + "# COVERED ABOVE THRESHOLD MIN PRUNED GRAMMAR : "
+ resCovered.get(0).toString() + CR + "Running RRAPruned ..." + CR);
windowSize = resCovered.get(0).getWindow();
paaSize = resCovered.get(0).getPAA();
alphabetSize = resCovered.get(0).getAlphabet();
findRRAPruned(ts, windowSize, alphabetSize, paaSize, saxNRStrategy, discordsToReport,
giImplementation, outputPrefix, normalizationThreshold);
}
/**
* Finds discords in classic manner (i.e., using a trie).
*
* @param ts the dataset.
* @param boundaries the sampling boundaries.
* @param saxNRStrategy the NR strategy to use.
* @param discordsToReport SAX sliding window size.
* @param giImplementation the GI algorithm to use.
* @param outputPrefix the output prefix.
* @param normalizationThreshold SAX normalization threshold.
* @throws Exception if error occurs.
*/
private static void findRRASampled(double[] ts, String boundaries,
NumerosityReductionStrategy saxNRStrategy, int discordsToReport, GIAlgorithm giImplementation,
String outputPrefix, double normalizationThreshold) throws Exception {
LOGGER.info("running RRA with sampling algorithm...");
// Date start = new Date();
// parse the boundaries params
int[] bounds = toBoundaries(GrammarVizAnomalyParameters.GRID_BOUNDARIES);
// create the output file
// BufferedWriter bw = new BufferedWriter(
// new FileWriter(new File(GrammarVizAnomalyParameters.OUT_FILE)));
// bw.write(OUTPUT_HEADER);
ArrayList<SampledPoint> res = new ArrayList<SampledPoint>();
// we need to use this in the loop
RulePruner rp;
if (GrammarVizAnomalyParameters.SUBSAMPLING_FRACTION.isNaN()) {
LOGGER.info("sampling on full time series length");
rp = new RulePruner(ts);
}
else {
int sampleIntervalStart = 0;
int sampleIntervalEnd = (int) Math
.round(ts.length * GrammarVizAnomalyParameters.SUBSAMPLING_FRACTION);
LOGGER.info("sampling parameters on interval [" + sampleIntervalStart + ", "
+ sampleIntervalEnd + "]");
rp = new RulePruner(Arrays.copyOfRange(ts, sampleIntervalStart, sampleIntervalEnd));
}
// iterate over the grid evaluating the grammar
//
for (int WINDOW_SIZE = bounds[0]; WINDOW_SIZE < bounds[1]; WINDOW_SIZE += bounds[2]) {
for (int PAA_SIZE = bounds[3]; PAA_SIZE < bounds[4]; PAA_SIZE += bounds[5]) {
// check for invalid cases
if (PAA_SIZE > WINDOW_SIZE) {
continue;
}
for (int ALPHABET_SIZE = bounds[6]; ALPHABET_SIZE < bounds[7]; ALPHABET_SIZE += bounds[8]) {
SampledPoint p = rp.sample(WINDOW_SIZE, PAA_SIZE, ALPHABET_SIZE, GIAlgorithm.REPAIR,
GrammarVizAnomalyParameters.SAX_NR_STRATEGY,
GrammarVizAnomalyParameters.SAX_NORM_THRESHOLD);
res.add(p);
}
}
}
Collections.sort(res, new ReductionSorter());
System.out.println(CR + "Apparently, the best parameters are " + res.get(0).toString() + CR
+ "Running RRAPRUNED..." + CR);
int windowSize = res.get(0).getWindow();
int paaSize = res.get(0).getPAA();
int alphabetSize = res.get(0).getAlphabet();
findRRAPruned(ts, windowSize, alphabetSize, paaSize, saxNRStrategy, discordsToReport,
giImplementation, outputPrefix, normalizationThreshold);
}
  /**
   * Finds discords with the RRA algorithm over a pruned grammar: infers a grammar (Sequitur or
   * RePair), prunes its rules, extracts the rule intervals plus any zero-covered gaps, and runs
   * the RRA discord discovery on that interval set. When an output prefix is given, the coverage
   * curve and the nearest-neighbor distance curve are written to files.
   *
   * @param ts the dataset.
   * @param windowSize SAX sliding window size.
   * @param paaSize SAX PAA size.
   * @param alphabetSize SAX alphabet size.
   * @param saxNRStrategy the NR strategy to use.
   * @param discordsToReport the number of discords to report.
   * @param giImplementation the GI algorithm to use.
   * @param outputPrefix the output prefix; when empty no files are written.
   * @param normalizationThreshold SAX normalization threshold.
   * @throws Exception if error occurs.
   */
  private static void findRRAPruned(double[] ts, int windowSize, int paaSize, int alphabetSize,
      NumerosityReductionStrategy saxNRStrategy, int discordsToReport, GIAlgorithm giImplementation,
      String outputPrefix, double normalizationThreshold) throws Exception {

    LOGGER.info("running RRA with pruning algorithm, building the grammar ...");
    Date start = new Date();

    // infer the grammar with the selected GI algorithm
    GrammarRules rules;
    if (GIAlgorithm.SEQUITUR.equals(giImplementation)) {
      rules = SequiturFactory.series2SequiturRules(ts, windowSize, paaSize, alphabetSize,
          saxNRStrategy, normalizationThreshold);
    }
    else {
      // RePair path: discretize in parallel first, then build and expand the grammar
      ParallelSAXImplementation ps = new ParallelSAXImplementation();
      SAXRecords parallelRes = ps.process(ts, 2, windowSize, paaSize, alphabetSize, saxNRStrategy,
          normalizationThreshold);
      RePairGrammar rePairGrammar = RePairFactory.buildGrammar(parallelRes);
      rePairGrammar.expandRules();
      rePairGrammar.buildIntervals(parallelRes, ts, windowSize);
      rules = rePairGrammar.toGrammarRulesData();
    }

    LOGGER.info(rules.size() + " rules inferred in "
        + SAXProcessor.timeToString(start.getTime(), new Date().getTime()) + ", pruning ...");

    // prune grammar' rules
    //
    GrammarRules prunedRulesSet = RulePrunerFactory.performPruning(ts, rules);
    LOGGER.info(
        "finished pruning in " + SAXProcessor.timeToString(start.getTime(), new Date().getTime())
            + ", keeping " + prunedRulesSet.size() + " rules for anomaly discovery ...");

    ArrayList<RuleInterval> intervals = new ArrayList<RuleInterval>();

    // populate all intervals with their frequency
    //
    for (GrammarRuleRecord rule : prunedRulesSet) {
      //
      // TODO: do we care about long rules?
      // if (0 == rule.ruleNumber() || rule.getRuleYield() > 2) {
      // rule #0 is skipped -- it spans the entire series
      if (0 == rule.ruleNumber()) {
        continue;
      }
      for (RuleInterval ri : rule.getRuleIntervals()) {
        // "coverage" here carries the rule's occurrence count, and id carries the rule number
        ri.setCoverage(rule.getRuleIntervals().size());
        ri.setId(rule.ruleNumber());
        intervals.add(ri);
      }
    }

    // get the coverage array
    //
    // coverageArray[i] counts how many pruned-rule intervals include point i
    int[] coverageArray = new int[ts.length];
    for (GrammarRuleRecord rule : prunedRulesSet) {
      if (0 == rule.ruleNumber()) {
        continue;
      }
      ArrayList<RuleInterval> arrPos = rule.getRuleIntervals();
      for (RuleInterval saxPos : arrPos) {
        int startPos = saxPos.getStart();
        int endPos = saxPos.getEnd();
        for (int j = startPos; j < endPos; j++) {
          coverageArray[j] = coverageArray[j] + 1;
        }
      }
    }

    // look for zero-covered intervals and add those to the list
    //
    List<RuleInterval> zeros = getZeroIntervals(coverageArray);
    if (zeros.size() > 0) {
      LOGGER.info(
          "found " + zeros.size() + " intervals not covered by rules: " + intervalsToString(zeros));
      intervals.addAll(zeros);
    }
    else {
      LOGGER.info("the whole timeseries is covered by rule intervals ...");
    }

    // run the RRA discord discovery with this intervals set
    //
    DiscordRecords discords = RRAImplementation.series2RRAAnomalies(ts, discordsToReport, intervals,
        normalizationThreshold);
    Date end = new Date();
    System.out.println(discords.toString() + CR + "Discords found in "
        + SAXProcessor.timeToString(start.getTime(), end.getTime()) + CR);

    // THE DISCORD SEARCH IS DONE RIGHT HERE
    // BELOW IS THE CODE WHICH WRITES THE CURVE AND THE DISTANCE FILE ON FILESYSTEM
    //
    if (!(outputPrefix.isEmpty())) {

      // write the coverage array
      //
      // NOTE(review): the writers below are not closed if a write throws -- consider
      // try-with-resources
      String currentPath = new File(".").getCanonicalPath();
      BufferedWriter bw = new BufferedWriter(
          new FileWriter(new File(currentPath + File.separator + outputPrefix + "_coverage.txt")));
      for (int i : coverageArray) {
        bw.write(i + "\n");
      }
      bw.close();

      // order intervals by their start position before building the distance curve
      Collections.sort(intervals, new Comparator<RuleInterval>() {
        public int compare(RuleInterval c1, RuleInterval c2) {
          if (c1.getStart() > c2.getStart()) {
            return 1;
          }
          else if (c1.getStart() < c2.getStart()) {
            return -1;
          }
          return 0;
        }
      });

      // now lets find all the distances to non-self match
      //
      double[] distances = new double[ts.length];
      double[] widths = new double[ts.length];

      for (RuleInterval ri : intervals) {

        int ruleStart = ri.getStart();
        int ruleEnd = ruleStart + ri.getLength();
        int window = ruleEnd - ruleStart;

        double[] cw = tp.subseriesByCopy(ts, ruleStart, ruleStart + window);

        double cwNNDist = Double.MAX_VALUE;

        // scan all same-length subsequences which do not overlap the interval, keeping the
        // smallest (i.e., nearest-neighbor) distance
        for (int j = 0; j < ts.length - window - 1; j++) {
          if (Math.abs(ruleStart - j) > window) {
            double[] currentSubsequence = tp.subseriesByCopy(ts, j, j + window);
            double dist = ed.distance(cw, currentSubsequence);
            if (dist < cwNNDist) {
              cwNNDist = dist;
            }
          }
        }

        distances[ruleStart] = cwNNDist;
        widths[ruleStart] = ri.getLength();
      }

      bw = new BufferedWriter(
          new FileWriter(new File(currentPath + File.separator + outputPrefix + "_distances.txt")));
      for (int i = 0; i < distances.length; i++) {
        bw.write(i + "," + distances[i] + "," + widths[i] + "\n");
      }
      bw.close();
    }
  }
/**
* Finds discords in classic manner (i.e., using a trie).
*
* @param ts the dataset.
* @param windowSize SAX sliding window size.
* @param paaSize SAX PAA size.
* @param alphabetSize SAX alphabet size.
* @param saxNRStrategy the NR strategy to use.
* @param discordsToReport SAX sliding window size.
* @param giImplementation the GI algorithm to use.
* @param outputPrefix the output prefix.
* @param normalizationThreshold SAX normalization threshold.
* @throws Exception if error occurs.
*/
private static void findRRA(double[] ts, int windowSize, int paaSize, int alphabetSize,
NumerosityReductionStrategy saxNRStrategy, int discordsToReport, GIAlgorithm giImplementation,
String outputPrefix, double normalizationThreshold) throws Exception {
LOGGER.info("running RRA algorithm...");
Date start = new Date();
// [1] get the grammar induced
//
GrammarRules rules;
if (GIAlgorithm.SEQUITUR.equals(giImplementation)) {
rules = SequiturFactory.series2SequiturRules(ts, windowSize, paaSize, alphabetSize,
saxNRStrategy, normalizationThreshold);
Date end = new Date();
LOGGER.info(rules.size() + " Sequitur rules inferred in "
+ SAXProcessor.timeToString(start.getTime(), end.getTime()));
}
else {
ParallelSAXImplementation ps = new ParallelSAXImplementation();
SAXRecords parallelRes = ps.process(ts, 2, windowSize, paaSize, alphabetSize, saxNRStrategy,
normalizationThreshold);
RePairGrammar rePairGrammar = RePairFactory.buildGrammar(parallelRes);
rePairGrammar.expandRules();
rePairGrammar.buildIntervals(parallelRes, ts, windowSize);
rules = rePairGrammar.toGrammarRulesData();
Date end = new Date();
LOGGER.info(rules.size() + " RePair rules inferred in "
+ SAXProcessor.timeToString(start.getTime(), end.getTime()));
}
// [2] extract all the intervals
//
ArrayList<RuleInterval> intervals = new ArrayList<RuleInterval>(rules.size() * 2);
// populate all intervals with their frequency
//
for (GrammarRuleRecord rule : rules) {
if (0 == rule.ruleNumber()) {
continue;
}
for (RuleInterval ri : rule.getRuleIntervals()) {
RuleInterval i = (RuleInterval) ri.clone();
i.setCoverage(rule.getRuleIntervals().size()); // not a coverage used here but a rule
// frequency, will override later
i.setId(rule.ruleNumber());
intervals.add(i);
}
}
// get the coverage array
//
int[] coverageArray = new int[ts.length];
for (GrammarRuleRecord rule : rules) {
if (0 == rule.ruleNumber()) {
continue;
}
ArrayList<RuleInterval> arrPos = rule.getRuleIntervals();
for (RuleInterval saxPos : arrPos) {
int startPos = saxPos.getStart();
int endPos = saxPos.getEnd();
for (int j = startPos; j < endPos; j++) {
coverageArray[j] = coverageArray[j] + 1;
}
}
}
// look for zero-covered intervals and add those to the list
//
List<RuleInterval> zeros = getZeroIntervals(coverageArray);
if (zeros.size() > 0) {
LOGGER.info(
"found " + zeros.size() + " intervals not covered by rules: " + intervalsToString(zeros));
intervals.addAll(zeros);
}
else {
LOGGER.info("the whole timeseries is covered by rule intervals ...");
}
// run HOTSAX with this intervals set
//
DiscordRecords discords = RRAImplementation.series2RRAAnomalies(ts, discordsToReport, intervals,
normalizationThreshold);
Date end = new Date();
System.out.println(discords.toString() + CR + discords.getSize() + " discords found in "
+ SAXProcessor.timeToString(start.getTime(), end.getTime()) + CR);
// THE DISCORD SEARCH IS DONE RIGHT HERE
// BELOW IS THE CODE WHICH WRITES THE CURVE AND THE DISTANCE FILE ON FILESYSTEM
//
if (!(outputPrefix.isEmpty())) {
// write the coverage array
//
String currentPath = new File(".").getCanonicalPath();
BufferedWriter bw = new BufferedWriter(
new FileWriter(new File(currentPath + File.separator + outputPrefix + "_coverage.txt")));
for (int i : coverageArray) {
bw.write(i + "\n");
}
bw.close();
Collections.sort(intervals, new Comparator<RuleInterval>() {
public int compare(RuleInterval c1, RuleInterval c2) {
if (c1.getStart() > c2.getStart()) {
return 1;
}
else if (c1.getStart() < c2.getStart()) {
return -1;
}
return 0;
}
});
// now lets find all the distances to non-self match
//
double[] distances = new double[ts.length];
double[] widths = new double[ts.length];
for (RuleInterval ri : intervals) {
int ruleStart = ri.getStart();
int ruleEnd = ruleStart + ri.getLength();
int window = ruleEnd - ruleStart;
double[] cw = tp.subseriesByCopy(ts, ruleStart, ruleStart + window);
double cwNNDist = Double.MAX_VALUE;
// this effectively finds the furthest hit
//
for (int j = 0; j < ts.length - window - 1; j++) {
if (Math.abs(ruleStart - j) > window) {
double[] currentSubsequence = tp.subseriesByCopy(ts, j, j + window);
double dist = ed.distance(cw, currentSubsequence);
if (dist < cwNNDist) {
cwNNDist = dist;
}
}
}
distances[ruleStart] = cwNNDist;
widths[ruleStart] = ri.getLength();
}
bw = new BufferedWriter(
new FileWriter(new File(currentPath + File.separator + outputPrefix + "_distances.txt")));
for (int i = 0; i < distances.length; i++) {
bw.write(i + "," + distances[i] + "," + widths[i] + "\n");
}
bw.close();
}
}
/**
* Procedure of finding brute-force discords.
*
* @param ts timeseries to use
* @param windowSize the sliding window size.
* @param discordsToReport num of discords to report.
* @param nThreshold the z-Normlization threshold value.
* @throws Exception if error occurs.
*/
private static void findBruteForce(double[] ts, int windowSize, int discordsToReport,
double nThreshold) throws Exception {
LOGGER.info("running brute force algorithm...");
Date start = new Date();
DiscordRecords discords = BruteForceDiscordImplementation.series2BruteForceDiscords(ts,
windowSize, discordsToReport, new LargeWindowAlgorithm(), nThreshold);
Date end = new Date();
System.out.println(CR + discords.toString() + CR + discords.getSize() + " discords found in "
+ SAXProcessor.timeToString(start.getTime(), end.getTime()) + CR);
}
/**
* Finds discords using a hash-backed magic array.
*
* @param ts the dataset.
* @param discordsToReport SAX sliding window size.
* @param windowSize SAX sliding window size.
* @param paaSize SAX PAA size.
* @param alphabetSize SAX alphabet size. * @param saxNRStrategy the NR strategy to use.
* @param normalizationThreshold SAX normalization threshold.
* @throws Exception if error occurs.
*/
private static void findHotSax(double[] ts, int discordsToReport, int windowSize, int paaSize,
int alphabetSize, NumerosityReductionStrategy saxNRStrategy, double normalizationThreshold)
throws Exception {
LOGGER.info("running HOT SAX hashtable-based algorithm...");
Date start = new Date();
DiscordRecords discords = HOTSAXImplementation.series2Discords(ts, discordsToReport, windowSize,
paaSize, alphabetSize, saxNRStrategy, normalizationThreshold);
Date end = new Date();
System.out.println(CR + discords.toString() + CR + discords.getSize() + " discords found in "
+ SAXProcessor.timeToString(start.getTime(), end.getTime()) + CR);
}
/**
* Makes a zeroed interval to appear nicely in output.
*
* @param zeros the list of zeros.
* @return the intervals list as a string.
*/
private static String intervalsToString(List<RuleInterval> zeros) {
StringBuilder sb = new StringBuilder();
for (RuleInterval i : zeros) {
sb.append(i.toString()).append(",");
}
return sb.toString();
}
/**
* Run a quick scan along the timeseries coverage to find a zeroed intervals.
*
* @param coverageArray the coverage to analyze.
* @return set of zeroed intervals (if found).
*/
public static List<RuleInterval> getZeroIntervals(int[] coverageArray) {
ArrayList<RuleInterval> res = new ArrayList<RuleInterval>();
int start = -1;
boolean inInterval = false;
int intervalsCounter = -1;
for (int i = 0; i < coverageArray.length; i++) {
if (0 == coverageArray[i] && !inInterval) {
start = i;
inInterval = true;
}
if (coverageArray[i] > 0 && inInterval) {
res.add(new RuleInterval(intervalsCounter, start, i, 0));
inInterval = false;
intervalsCounter--;
}
}
return res;
}
/**
* Converts a param string to boundaries array.
*
* @param str
* @return
*/
private static int[] toBoundaries(String str) {
int[] res = new int[9];
String[] split = str.split("\\s+");
for (int i = 0; i < 9; i++) {
res[i] = Integer.valueOf(split[i]);
}
return res;
}
}