package edu.hawaii.jmotif.sax;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.logging.ConsoleHandler;
import java.util.logging.Formatter;
import java.util.logging.Handler;
import java.util.logging.Logger;
import org.hackystat.utilities.logger.HackystatLogger;
import org.hackystat.utilities.stacktrace.StackTrace;
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import edu.hawaii.jmotif.distance.EuclideanDistance;
import edu.hawaii.jmotif.sax.alphabet.Alphabet;
import edu.hawaii.jmotif.sax.alphabet.NormalAlphabet;
import edu.hawaii.jmotif.sax.datastructures.DiscordRecord;
import edu.hawaii.jmotif.sax.datastructures.DiscordRecords;
import edu.hawaii.jmotif.sax.datastructures.DiscordsAndMotifs;
import edu.hawaii.jmotif.sax.datastructures.MotifRecord;
import edu.hawaii.jmotif.sax.datastructures.MotifRecords;
import edu.hawaii.jmotif.sax.datastructures.SAXFrequencyData;
import edu.hawaii.jmotif.sax.trie.SAXTrie;
import edu.hawaii.jmotif.sax.trie.SAXTrieHitEntry;
import edu.hawaii.jmotif.sax.trie.TrieException;
import edu.hawaii.jmotif.sax.trie.VisitRegistry;
import edu.hawaii.jmotif.timeseries.TSException;
import edu.hawaii.jmotif.timeseries.TSUtils;
import edu.hawaii.jmotif.timeseries.Timeseries;
import edu.hawaii.jmotif.util.BriefFormatter;
/**
* Implements SAX algorithms.
*
* @author Pavel Senin
*
*/
public final class SAXFactory {
/** Number of discords collected when the caller gives no usable collection size. */
public static final int DEFAULT_COLLECTION_SIZE = 50;
/** Console logger used by the routines of this class. */
private static Logger consoleLogger;
/** Name of the logging level applied to the console logger. */
private static final String LOGGING_LEVEL = "SEVERE";
static {
  // obtain the logger and detach it from the parent handlers
  consoleLogger = HackystatLogger.getLogger("jmotif.debug.console", "jmotif");
  consoleLogger.setUseParentHandlers(false);
  // drop whatever handlers are installed already
  Handler[] installed = consoleLogger.getHandlers();
  for (int i = 0; i < installed.length; i++) {
    consoleLogger.removeHandler(installed[i]);
  }
  // install a single console handler using the brief formatter
  ConsoleHandler console = new ConsoleHandler();
  console.setFormatter(new BriefFormatter());
  consoleLogger.addHandler(console);
  HackystatLogger.setLoggingLevel(consoleLogger, LOGGING_LEVEL);
}
/**
* Constructor.
*/
private SAXFactory() {
super();
}
/**
 * Slides a window along the timeseries; every window is z-normalized, reduced with PAA and
 * converted into a SAX word using the cuts provided. NOSKIP means that the word of EVERY window
 * position is recorded, even if it repeats the previous one.
 *
 * @param ts The timeseries given.
 * @param windowSize The sliding window size used.
 * @param paaSize The number of the points used in the PAA reduction of the time series.
 * @param cuts The alphabet cuts to use.
 * @return The SAX words keyed with the window start positions.
 * @throws TSException If error occurs, including a failed clone within the PAA step.
 * @throws CloneNotSupportedException Declared by the signature; the PAA clone failure itself is
 * re-thrown as TSException.
 */
public static SAXFrequencyData ts2saxZnormByCutsNoSkip(Timeseries ts, int windowSize,
    int paaSize, double[] cuts) throws TSException, CloneNotSupportedException {
  SAXFrequencyData words = new SAXFrequencyData();
  int start = 0;
  while (start < ts.size() - (windowSize - 1)) {
    // cut the window out and z-normalize it
    Timeseries window = ts.subsection(start, start + windowSize - 1);
    window = TSUtils.zNormalize(window);
    // reduce the window with PAA
    Timeseries reduced;
    try {
      reduced = TSUtils.paa(window, paaSize);
    }
    catch (CloneNotSupportedException e) {
      throw new TSException("Unable to clone: " + StackTrace.toString(e));
    }
    // translate into letters and record the word at this position
    char[] letters = TSUtils.ts2StringWithNaNByCuts(reduced, cuts);
    words.put(new String(letters), start);
    start++;
  }
  return words;
}
/**
 * Slides a window along the timeseries; every window is z-normalized, reduced with PAA and
 * converted into a SAX word using the cuts provided. A word which equals (ignoring case) to the
 * last recorded word is not recorded again, so runs of identical consecutive words collapse
 * into their first occurrence.
 *
 * @param ts The timeseries given.
 * @param windowSize The sliding window size used.
 * @param paaSize The number of the points used in the PAA reduction of the time series.
 * @param cuts The alphabet cuts to use.
 * @return The SAX words keyed with the window start positions.
 * @throws TSException If error occurs, including a failed clone within the PAA step.
 * @throws CloneNotSupportedException Declared by the signature; the PAA clone failure itself is
 * re-thrown as TSException.
 */
public static SAXFrequencyData ts2saxZnormByCuts(Timeseries ts, int windowSize, int paaSize,
    double[] cuts) throws TSException, CloneNotSupportedException {
  SAXFrequencyData words = new SAXFrequencyData();
  String lastWord = "";
  for (int start = 0; start < ts.size() - (windowSize - 1); start++) {
    // cut the window out and z-normalize it
    Timeseries window = ts.subsection(start, start + windowSize - 1);
    window = TSUtils.zNormalize(window);
    // reduce the window with PAA
    Timeseries reduced;
    try {
      reduced = TSUtils.paa(window, paaSize);
    }
    catch (CloneNotSupportedException e) {
      throw new TSException("Unable to clone: " + StackTrace.toString(e));
    }
    // translate into letters
    String word = new String(TSUtils.ts2StringWithNaNByCuts(reduced, cuts));
    // the window has moved by a single point only, so the same word is likely to repeat;
    // such repetitions are dropped
    boolean repeated = !lastWord.isEmpty() && lastWord.equalsIgnoreCase(word);
    if (!repeated) {
      lastWord = word;
      words.put(word, start);
    }
  }
  return words;
}
/**
 * Array-based variant of the NOSKIP conversion: wraps the values into a Timeseries whose
 * timestamps are the array indexes (0, 1, 2, ...) and delegates to the Timeseries-based method,
 * so ALL SAX words are reported.
 *
 * @param s The timeseries values given.
 * @param windowSize The sliding window size used.
 * @param paaSize The number of the points used in the PAA reduction of the time series.
 * @param cuts The alphabet cuts to use.
 * @return The SAX representation of the timeseries.
 * @throws TSException If error occurs.
 * @throws CloneNotSupportedException If thrown by the underlying conversion.
 */
public static SAXFrequencyData ts2saxZnormByCutsNoSkip(double[] s, int windowSize, int paaSize,
    double[] cuts) throws TSException, CloneNotSupportedException {
  // index-based timestamps
  long[] stamps = new long[s.length];
  for (int idx = 0; idx < stamps.length; idx++) {
    stamps[idx] = idx;
  }
  return ts2saxZnormByCutsNoSkip(new Timeseries(s, stamps), windowSize, paaSize, cuts);
}
/**
 * Array-based variant of the conversion which drops repeated consecutive words: wraps the values
 * into a Timeseries whose timestamps are the array indexes (0, 1, 2, ...) and delegates to the
 * Timeseries-based method.
 *
 * @param s The timeseries values given.
 * @param windowSize The sliding window size used.
 * @param paaSize The number of the points used in the PAA reduction of the time series.
 * @param cuts The alphabet cuts to use.
 * @return The SAX representation of the timeseries.
 * @throws TSException If error occurs.
 * @throws CloneNotSupportedException If thrown by the underlying conversion.
 */
public static SAXFrequencyData ts2saxZnormByCuts(double[] s, int windowSize, int paaSize,
    double[] cuts) throws TSException, CloneNotSupportedException {
  // index-based timestamps
  long[] stamps = new long[s.length];
  for (int idx = 0; idx < stamps.length; idx++) {
    stamps[idx] = idx;
  }
  return ts2saxZnormByCuts(new Timeseries(s, stamps), windowSize, paaSize, cuts);
}
/**
 * Slides a window along the timeseries; every window is reduced with PAA and converted into a
 * SAX word using the cuts provided. No normalization is applied. A word which equals (ignoring
 * case) to the word of the previous window is not recorded.
 *
 * @param ts The timeseries given.
 * @param windowSize The sliding window size used.
 * @param paaSize The number of the points used in the PAA reduction of the time series.
 * @param cuts The alphabet cuts to use.
 * @return The SAX words keyed with the window start positions.
 * @throws TSException If error occurs, including a failed clone within the PAA step.
 */
public static SAXFrequencyData ts2saxNoZnormByCuts(Timeseries ts, int windowSize, int paaSize,
    double[] cuts) throws TSException {
  SAXFrequencyData words = new SAXFrequencyData();
  String lastWord = "";
  for (int start = 0; start < ts.size() - (windowSize - 1); start++) {
    // cut the window out, it is used as is
    Timeseries window = ts.subsection(start, start + windowSize - 1);
    // reduce the window with PAA
    Timeseries reduced;
    try {
      reduced = TSUtils.paa(window, paaSize);
    }
    catch (CloneNotSupportedException e) {
      throw new TSException("Unable to clone: " + StackTrace.toString(e));
    }
    // translate into letters
    String word = new String(TSUtils.ts2StringWithNaNByCuts(reduced, cuts));
    // the window has moved by a single point only, so the same word is likely to repeat;
    // the last seen word is tracked in any case, but repetitions are not recorded
    boolean repeated = !lastWord.isEmpty() && lastWord.equalsIgnoreCase(word);
    lastWord = word;
    if (repeated) {
      continue;
    }
    words.put(word, start);
  }
  return words;
}
/**
 * Converts the timeseries into SAX representation using the cuts of the alphabet given; the
 * work is delegated to {@code ts2saxZnormByCuts}.
 *
 * @param ts The timeseries given.
 * @param windowSize The sliding window size used.
 * @param paaSize The number of the points used in the PAA reduction of the time series.
 * @param alphabet The alphabet to use.
 * @param alphabetSize The alphabet size used.
 * @return The SAX representation of the timeseries.
 * @throws TSException If the alphabet size requested exceeds the maximal size of the alphabet,
 * or if the conversion fails.
 * @throws CloneNotSupportedException If thrown by the underlying conversion.
 */
public static SAXFrequencyData ts2saxZNorm(Timeseries ts, int windowSize, int paaSize,
    Alphabet alphabet, int alphabetSize) throws TSException, CloneNotSupportedException {
  // the alphabet can not provide more letters than its maximal size
  if (alphabetSize > alphabet.getMaxSize()) {
    String message = "Unable to set the alphabet size greater than " + alphabet.getMaxSize();
    throw new TSException(message);
  }
  double[] cuts = alphabet.getCuts(alphabetSize);
  return ts2saxZnormByCuts(ts, windowSize, paaSize, cuts);
}
/**
 * Converts the timeseries into SAX representation using the cuts of the alphabet given; the
 * work is delegated to {@code ts2saxNoZnormByCuts}, i.e. no normalization is applied.
 *
 * @param ts The timeseries given.
 * @param windowSize The sliding window size used.
 * @param paaSize The number of the points used in the PAA reduction of the time series.
 * @param alphabet The alphabet to use.
 * @param alphabetSize The alphabet size used.
 * @return The SAX representation of the timeseries.
 * @throws TSException If the alphabet size requested exceeds the maximal size of the alphabet,
 * or if the conversion fails.
 */
public static SAXFrequencyData ts2saxNoZnorm(Timeseries ts, int windowSize, int paaSize,
    Alphabet alphabet, int alphabetSize) throws TSException {
  // the alphabet can not provide more letters than its maximal size
  if (alphabetSize > alphabet.getMaxSize()) {
    String message = "Unable to set the alphabet size greater than " + alphabet.getMaxSize();
    throw new TSException(message);
  }
  double[] cuts = alphabet.getCuts(alphabetSize);
  return ts2saxNoZnormByCuts(ts, windowSize, paaSize, cuts);
}
/**
 * Converts the whole timeseries into a single SAX word. The series is z-normalized first; the
 * PAA reduction is applied only if the series length differs from the PAA size requested.
 *
 * @param ts The timeseries given.
 * @param paaSize The number of the points used in the PAA reduction of the time series.
 * @param alphabet The alphabet to use.
 * @param alphabetSize The alphabet size used.
 * @return The SAX representation of the timeseries.
 * @throws TSException If the alphabet size requested exceeds the maximal size of the alphabet,
 * or if a clone fails within the PAA branch.
 * @throws CloneNotSupportedException Declared by the signature; within the PAA branch it is
 * re-thrown as TSException.
 */
public static String ts2string(Timeseries ts, int paaSize, Alphabet alphabet, int alphabetSize)
    throws TSException, CloneNotSupportedException {
  if (alphabetSize > alphabet.getMaxSize()) {
    throw new TSException("Unable to set the alphabet size greater than " + alphabet.getMaxSize());
  }
  int length = ts.size();
  if (length != paaSize) {
    // the length differs from the word size, reduce the normalized series with PAA
    Timeseries reduced;
    try {
      reduced = TSUtils.paa(TSUtils.zNormalize(ts), paaSize);
    }
    catch (CloneNotSupportedException e) {
      throw new TSException("Unable to clone: " + StackTrace.toString(e));
    }
    return new String(TSUtils.ts2String(reduced, alphabet, alphabetSize));
  }
  // one point per letter already, only the normalization is needed
  return new String(TSUtils.ts2String(TSUtils.zNormalize(ts), alphabet, alphabetSize));
}
/**
 * Indexes the series with a SAX trie and runs the discords search over it. The search collects
 * up to DEFAULT_COLLECTION_SIZE discords and uses LargeWindowAlgorithm for marking the visited
 * locations.
 *
 * @param tsData The timeseries.
 * @param windowSize The sliding window size; it is also passed to getSaxVals as its size
 * argument.
 * @param alphabetSize The SAX alphabet size.
 * @return Discords found within the series.
 * @throws TrieException if error occurs.
 * @throws TSException if error occurs.
 */
public static DiscordRecords ts2Discords(double[] tsData, int windowSize, int alphabetSize)
    throws TrieException, TSException {
  NormalAlphabet normalA = new NormalAlphabet();
  SAXTrie trie = new SAXTrie(tsData.length - windowSize, alphabetSize);
  // slide the window over the series and index the SAX word of each position
  for (int pos = 0; (pos + windowSize) < tsData.length; pos++) {
    double[] window = getSubSeries(tsData, pos, pos + windowSize);
    char[] word = getSaxVals(window, windowSize, normalA.getCuts(alphabetSize));
    trie.put(String.valueOf(word), pos);
  }
  // the rest is the job of the discords extraction engine
  return getDiscordsAlgorithm(tsData, windowSize, trie, DEFAULT_COLLECTION_SIZE,
      new LargeWindowAlgorithm());
}
/**
 * Computes the distance between two strings of equal length. For every position the absolute
 * difference of the {@link Character#getNumericValue(char)} values of the two symbols is taken;
 * only differences greater than 1 are added to the total. Therefore equal and neighboring
 * letters contribute nothing, i.e. the distance between "a" and "b" is 0 and between "a" and "c"
 * is 2.
 *
 * @param a The first string.
 * @param b The second string.
 * @return The sum of the per-position differences which are greater than 1.
 * @throws TSException if lengths differ.
 */
public static int strDistance(char[] a, char[] b) throws TSException {
  if (a.length != b.length) {
    throw new TSException("Unable to compute SAX distance, string lengths are not equal");
  }
  int total = 0;
  for (int pos = 0; pos < a.length; pos++) {
    int gap = Math.abs(Character.getNumericValue(a[pos]) - Character.getNumericValue(b[pos]));
    // a gap of 0 or 1 is not counted
    total += (gap > 1) ? gap : 0;
  }
  return total;
}
/**
 * Computes the distance between two chars as the absolute difference of their
 * {@link Character#getNumericValue(char)} values. For letters this value does not depend on the
 * case, so the distance between 'a' and 'A' is 0, and between 'a' and 'b' is 1.
 *
 * @param a The first char.
 * @param b The second char.
 * @return The distance.
 */
public static int strDistance(char a, char b) {
  int first = Character.getNumericValue(a);
  int second = Character.getNumericValue(b);
  return Math.abs(first - second);
}
/**
 * This function implements SAX MINDIST function which uses alphabet based distance matrix. Each
 * pair of symbols is translated into a pair of matrix indexes ('a' is 0, 'b' is 1, and so on,
 * regardless of the case) and the corresponding matrix cells are summed up.
 *
 * @param a The SAX string.
 * @param b The SAX string.
 * @param distanceMatrix The distance matrix to use; it must cover every letter which occurs in
 * the strings.
 * @return distance between strings.
 * @throws TSException If the strings differ in length, if a non-letter symbol is met, or if a
 * letter index is outside of the 0..19 range or outside of the distance matrix.
 */
public static double saxMinDist(char[] a, char[] b, double[][] distanceMatrix) throws TSException {
  if (a.length != b.length) {
    throw new TSException("Data arrays lengths are not equal!");
  }
  double dist = 0.0D;
  for (int i = 0; i < a.length; i++) {
    if (!(Character.isLetter(a[i]) && Character.isLetter(b[i]))) {
      throw new TSException("Non-literal character found!");
    }
    // getNumericValue maps the letters a..z (of any case) onto 10..35
    int numA = Character.getNumericValue(a[i]) - 10;
    int numB = Character.getNumericValue(b[i]) - 10;
    if (numA > 19 || numA < 0 || numB > 19 || numB < 0) {
      throw new TSException("The character index greater than 19 or less than 0!");
    }
    // a matrix built for a smaller alphabet must not end up with an unchecked
    // ArrayIndexOutOfBoundsException, report the problem as TSException
    if (numA >= distanceMatrix.length || numB >= distanceMatrix[numA].length) {
      throw new TSException("The character index exceeds the distance matrix dimensions!");
    }
    dist += distanceMatrix[numA][numB];
  }
  return dist;
}
/**
 * Builds the SAX trie out of the series and reports the motifs found in it.
 *
 * The method is static like the other routines of this class: the class has only a private
 * constructor, so an instance method would not be reachable from outside.
 *
 * @param series The data series.
 * @param windowSize The sliding window size.
 * @param alphabetSize The SAX alphabet size.
 * @param motifsNumToReport The number of motifs to report.
 * @param markerAlgorithm Not used by this method; kept in the signature for the callers.
 * @return The motifs collection.
 * @throws TrieException if error occurs.
 * @throws TSException if error occurs.
 */
public static MotifRecords series2Motifs(double[] series, int windowSize, int alphabetSize,
    int motifsNumToReport, SlidingWindowMarkerAlgorithm markerAlgorithm) throws TrieException,
    TSException {
  // init the SAX structures
  //
  SAXTrie trie = new SAXTrie(series.length - windowSize, alphabetSize);
  // log the summary of the data
  //
  StringBuilder sb = new StringBuilder();
  sb.append("data size: ").append(series.length);
  double max = TSUtils.max(series);
  sb.append("; max: ").append(max);
  double min = TSUtils.min(series);
  sb.append("; min: ").append(min);
  double mean = TSUtils.mean(series);
  sb.append("; mean: ").append(mean);
  int nans = TSUtils.countNaN(series);
  sb.append("; NaNs: ").append(nans);
  consoleLogger.fine(sb.toString());
  consoleLogger.fine("window size: " + windowSize + ", alphabet size: " + alphabetSize
      + ", SAX Trie size: " + (series.length - windowSize));
  Alphabet normalA = new NormalAlphabet();
  Date start = new Date();
  // build the trie: slide the window and index the SAX word of each position
  //
  int currPosition = 0;
  while ((currPosition + windowSize) < series.length) {
    // get the window SAX representation
    double[] subSeries = getSubSeries(series, currPosition, currPosition + windowSize);
    char[] saxVals = getSaxVals(subSeries, windowSize, normalA.getCuts(alphabetSize));
    // add result to the structure
    trie.put(String.valueOf(saxVals), currPosition);
    // increment the position
    currPosition++;
  }
  Date end = new Date();
  consoleLogger.fine("trie built in: " + timeToString(start.getTime(), end.getTime()));
  // extract the motifs from the trie
  //
  start = new Date();
  MotifRecords motifs = getMotifs(trie, motifsNumToReport);
  end = new Date();
  consoleLogger.fine("motifs retrieved in: " + timeToString(start.getTime(), end.getTime()));
  return motifs;
}
/**
 * Build the SAX trie out of Instances reporting discords. LargeWindowAlgorithm is used for
 * marking the visited locations.
 *
 * @param tsData The timeseries.
 * @param dataAttributeName The WEKA attribute - essentially points on the instance attribute
 * which bears the data value in this case.
 * @param windowSize The sliding window size to use.
 * @param alphabetSize The SAX alphabet size.
 * @param discordsNumToReport how many discords to report; a value which is not positive or not
 * less than DEFAULT_COLLECTION_SIZE is replaced with DEFAULT_COLLECTION_SIZE.
 * @return Discords found within the series.
 * @throws TrieException if error occurs.
 * @throws TSException if error occurs.
 */
public static DiscordRecords instances2Discords(Instances tsData, String dataAttributeName,
    int windowSize, int alphabetSize, int discordsNumToReport) throws TrieException, TSException {
  // extract the data attribute values
  //
  Attribute dataAttribute = tsData.attribute(dataAttributeName);
  double[] series = toRealSeries(tsData, dataAttribute);
  NormalAlphabet normalA = new NormalAlphabet();
  SAXTrie trie = new SAXTrie(series.length - windowSize, alphabetSize);
  // log the summary of the data
  //
  StringBuilder sb = new StringBuilder();
  sb.append("data size: ").append(series.length);
  double max = TSUtils.max(series);
  sb.append("; max: ").append(max);
  double min = TSUtils.min(series);
  sb.append("; min: ").append(min);
  double mean = TSUtils.mean(series);
  sb.append("; mean: ").append(mean);
  int nans = TSUtils.countNaN(series);
  sb.append("; NaNs: ").append(nans);
  consoleLogger.fine(sb.toString());
  consoleLogger.fine("window size: " + windowSize + ", alphabet size: " + alphabetSize
      + ", SAX Trie size: " + (series.length - windowSize));
  // build the trie
  //
  int currPosition = 0;
  while ((currPosition + windowSize) < series.length) {
    // get the window SAX representation
    double[] subSeries = getSubSeries(series, currPosition, currPosition + windowSize);
    char[] saxVals = getSaxVals(subSeries, windowSize, normalA.getCuts(alphabetSize));
    // add result to the structure
    trie.put(String.valueOf(saxVals), currPosition);
    // increment the position
    currPosition++;
  }
  Date start = new Date();
  // the size of the report is capped by the default collection size
  int reportNum = DEFAULT_COLLECTION_SIZE;
  if (discordsNumToReport > 0 && discordsNumToReport < DEFAULT_COLLECTION_SIZE) {
    reportNum = discordsNumToReport;
  }
  // the series extracted above is reused, there is no need to convert the instances again
  DiscordRecords discords = getDiscordsAlgorithm(series, windowSize, trie, reportNum,
      new LargeWindowAlgorithm());
  Date end = new Date();
  consoleLogger.fine("discords search finished in : "
      + timeToString(start.getTime(), end.getTime()));
  return discords;
}
/**
 * Indexes the series with a SAX trie and reports the discords found.
 *
 * @param series The timeseries.
 * @param windowSize The sliding window size to use.
 * @param alphabetSize The SAX alphabet size.
 * @param discordsNumToReport how many discords to report; a value which is not positive or not
 * less than DEFAULT_COLLECTION_SIZE is replaced with DEFAULT_COLLECTION_SIZE.
 * @param markerAlgorithm The algorithm used for marking visited locations during the search.
 * @return Discords found within the series.
 * @throws TrieException if error occurs.
 * @throws TSException if error occurs.
 */
public static DiscordRecords series2Discords(double[] series, int windowSize, int alphabetSize,
    int discordsNumToReport, SlidingWindowMarkerAlgorithm markerAlgorithm) throws TrieException,
    TSException {
  NormalAlphabet normalA = new NormalAlphabet();
  SAXTrie trie = new SAXTrie(series.length - windowSize, alphabetSize);
  // log the summary of the data
  double max = TSUtils.max(series);
  double min = TSUtils.min(series);
  double mean = TSUtils.mean(series);
  int nans = TSUtils.countNaN(series);
  consoleLogger.fine("data size: " + series.length + "; max: " + max + "; min: " + min
      + "; mean: " + mean + "; NaNs: " + nans);
  consoleLogger.fine("window size: " + windowSize + ", alphabet size: " + alphabetSize
      + ", SAX Trie size: " + (series.length - windowSize));
  // slide the window over the series and index the SAX word of each position
  for (int pos = 0; (pos + windowSize) < series.length; pos++) {
    double[] window = getSubSeries(series, pos, pos + windowSize);
    char[] word = getSaxVals(window, windowSize, normalA.getCuts(alphabetSize));
    trie.put(String.valueOf(word), pos);
  }
  long startedAt = System.currentTimeMillis();
  // the size of the report is capped by the default collection size
  boolean usableRequest = discordsNumToReport > 0
      && discordsNumToReport < DEFAULT_COLLECTION_SIZE;
  int reportNum = usableRequest ? discordsNumToReport : DEFAULT_COLLECTION_SIZE;
  DiscordRecords discords = getDiscordsAlgorithm(series, windowSize, trie, reportNum,
      markerAlgorithm);
  long finishedAt = System.currentTimeMillis();
  consoleLogger.fine("discords search finished in : " + timeToString(startedAt, finishedAt));
  return discords;
}
/**
 * Builds two collections over the same SAX trie: the collection of "discords" - the surprise or
 * unique patterns, and the collection of motifs - the most frequent patterns.
 *
 * @param series The data series.
 * @param windowSize The sliding window size.
 * @param alphabetSize The alphabet size.
 * @param discordCollectionSize The size of the discord collection - how many top discords we want
 * to keep.
 * @param motifsCollectionSize The size of the motif collection - how many top motifs we want to
 * keep.
 * @param markerAlgorithm The algorithm used for marking visited locations during the discords
 * search.
 * @return The container holding both the discords and the motifs.
 *
 * @throws TrieException if error occurs.
 * @throws TSException if error occurs.
 */
public static DiscordsAndMotifs series2DiscordsAndMotifs(double[] series, int windowSize,
    int alphabetSize, int discordCollectionSize, int motifsCollectionSize,
    SlidingWindowMarkerAlgorithm markerAlgorithm) throws TrieException, TSException {
  DiscordsAndMotifs result = new DiscordsAndMotifs(discordCollectionSize, motifsCollectionSize);
  SAXTrie trie = new SAXTrie(series.length - windowSize, alphabetSize);
  // log the summary of the data
  double max = TSUtils.max(series);
  double min = TSUtils.min(series);
  double mean = TSUtils.mean(series);
  int nans = TSUtils.countNaN(series);
  consoleLogger.fine("data size: " + series.length + "; max: " + max + "; min: " + min
      + "; mean: " + mean + "; NaNs: " + nans);
  consoleLogger.fine("window size: " + windowSize + ", alphabet size: " + alphabetSize
      + ", SAX Trie size: " + (series.length - windowSize));
  Alphabet normalA = new NormalAlphabet();
  // step one: slide the window over the series and index the SAX word of each position
  long startedAt = System.currentTimeMillis();
  for (int pos = 0; (pos + windowSize) < series.length; pos++) {
    double[] window = getSubSeries(series, pos, pos + windowSize);
    char[] word = getSaxVals(window, windowSize, normalA.getCuts(alphabetSize));
    trie.put(String.valueOf(word), pos);
  }
  long finishedAt = System.currentTimeMillis();
  consoleLogger.fine("trie built in: " + timeToString(startedAt, finishedAt));
  // step two: the motifs
  startedAt = System.currentTimeMillis();
  MotifRecords motifs = getMotifs(trie, motifsCollectionSize);
  finishedAt = System.currentTimeMillis();
  consoleLogger.fine("motifs retrieved in: " + timeToString(startedAt, finishedAt));
  // step three: the discords
  startedAt = System.currentTimeMillis();
  DiscordRecords discords = getDiscordsAlgorithm(series, windowSize, trie, discordCollectionSize,
      markerAlgorithm);
  finishedAt = System.currentTimeMillis();
  consoleLogger.fine("discords collected in: " + timeToString(startedAt, finishedAt));
  result.addDiscords(discords);
  result.addMotifs(motifs);
  return result;
}
/**
 * The discords extraction method. It calls findBestDiscord repeatedly until the collection is
 * full or the search reports nothing new.
 *
 * The state is shared between the calls: the registry of the locations taken by the discords
 * already found, the best distances known for the words seen, and the set of the words which
 * are discords already. Thus the expensive search is not repeated from scratch for each next
 * discord.
 *
 * @param series The series we work with.
 * @param windowSize The series window size.
 * @param trie The trie indexing the SAX words of the series.
 * @param discordCollectionSize The number of discords to collect.
 * @param marker The algorithm for marking visited locations.
 * @return The discords collected, possibly fewer than requested.
 * @throws TSException if error occurs.
 * @throws TrieException if error occurs.
 */
private static DiscordRecords getDiscordsAlgorithm(double[] series, int windowSize, SAXTrie trie,
    int discordCollectionSize, SlidingWindowMarkerAlgorithm marker) throws TSException,
    TrieException {
  consoleLogger.fine("starting discords finding routines");
  DiscordRecords discords = new DiscordRecords(discordCollectionSize);
  // locations covered by the discords found so far, shared by all the searches
  VisitRegistry discordsVisitRegistry = new VisitRegistry(series.length - windowSize);
  // words seen so far along with their best known distances; each entry also tells
  // whether the search for the word was complete or abandoned
  TreeMap<String, DistanceEntry> knownDistances = new TreeMap<String, DistanceEntry>();
  // words of the discords reported already, these are not considered again
  TreeSet<String> completeWords = new TreeSet<String>();
  while (discords.getSize() < discordCollectionSize) {
    consoleLogger.fine("currently known discords: " + discords.getSize() + " out of "
        + discordCollectionSize);
    long startedAt = System.currentTimeMillis();
    DiscordRecord candidate = findBestDiscord(series, windowSize, trie, completeWords,
        knownDistances, discordsVisitRegistry, marker);
    long finishedAt = System.currentTimeMillis();
    // zero distance or position of -1 means that nothing was found, the search is over
    boolean nothingFound = candidate.getDistance() == 0.0D || candidate.getPosition() == -1;
    if (nothingFound) {
      consoleLogger.fine("breaking the outer search loop, discords found: " + discords.getSize()
          + " last seen discord: " + candidate.toString());
      break;
    }
    consoleLogger.fine("new discord: " + candidate.getPayload() + ", position "
        + candidate.getPosition() + ", distance " + candidate.getDistance()
        + ", elapsed time: " + timeToString(startedAt, finishedAt));
    // keep the discord, mark its location as visited and remember its word
    discords.add(candidate);
    marker.markVisited(discordsVisitRegistry, candidate.getPosition(), windowSize);
    completeWords.add(String.valueOf(candidate.getPayload()));
  }
  return discords;
}
/**
* This method reports the best found discord. Note, that this discord is approximately the best.
* Due to the fuzzy-logic search with randomization and aggressive labeling of the magic array
* locations.
*
* For each distinct word of the trie the method estimates the distance from the word's first
* listed position to its nearest non-overlapping neighbor: at first among the other occurrences
* of the same word, then among randomly picked unvisited positions. The position with the
* largest such distance wins. A search for a word is abandoned as soon as a distance smaller
* than the best one seen so far is met.
*
* @param series The series we are looking for discord in.
* @param windowSize The sliding window size.
* @param trie The trie (index of the series).
* @param foundDiscordsWords Already found discords; their words are skipped.
* @param knownWordsAndTheirDistances The best known distances for certain word. I use the early
* search abandoning optimization in order to reduce complexity. The map is updated for every word
* processed.
* @param visitedLocations The magic array: the locations which are excluded from the search. It is
* copied for each word, the original is not modified here.
* @param marker The algorithm used for marking locations as visited.
* @return The best discord instance; if no candidate was accepted its position is -1, the
* distance is 0.0 and the payload is an empty string.
* @throws TSException If error occurs.
* @throws TrieException If error occurs.
*/
private static DiscordRecord findBestDiscord(double[] series, int windowSize, SAXTrie trie,
TreeSet<String> foundDiscordsWords,
TreeMap<String, DistanceEntry> knownWordsAndTheirDistances, VisitRegistry visitedLocations,
SlidingWindowMarkerAlgorithm marker) throws TSException, TrieException {
// we extract all seen words from the trie and sort them using the natural order of
// SAXTrieHitEntry
// NOTE(review): the comments in this method call the order both "frequency decrease" and
// "from rarest to frequent" - confirm the actual order against SAXTrieHitEntry.compareTo
//
ArrayList<SAXTrieHitEntry> frequencies = trie.getFrequencies();
Collections.sort(frequencies);
// StringBuilder sb = new StringBuilder();
// for (int i = 0; i < 10; i++) {
// sb.append("top frequencies: ").append(frequencies.get(i).getStr()).append(",")
// .append(frequencies.get(i).getPosition()).append(" ; ");
// }
// consoleLogger.finer(sb.toString());
// init variables; these values are returned as is when no candidate is accepted
int bestSoFarPosition = -1;
double bestSoFarDistance = 0.0D;
String bestSoFarString = "";
// we will iterate over words from rarest to frequent ones
//
int idx = 0;
int limit = frequencies.size();
while (idx < limit) {
SAXTrieHitEntry currentEntry = frequencies.get(idx);
String currentWord = String.valueOf(currentEntry.getStr());
int currentPosition = currentEntry.getPosition();
// take care about this entry: drop all the later entries bearing the same word, so each
// word is processed once
cleanUpFrequencies(frequencies, currentWord, idx);
// and update the length, since the list may have shrunk
limit = frequencies.size();
// words of the known discords and positions marked as visited are not considered
if (foundDiscordsWords.contains(currentWord) || visitedLocations.isVisited(currentPosition)) {
consoleLogger.finer("skipping the search for " + currentWord);
idx++;
continue;
}
else {
consoleLogger.finer("conducting search for " + currentWord + " iteration " + idx + " from "
+ frequencies.size());
}
// so, lets search begin
//
double nearestNeighborDist = Double.MAX_VALUE;
boolean doRandomSearch = true;
// get a copy of visited locations, so the marks made for this word do not leak into the
// shared registry
VisitRegistry registry = new VisitRegistry(series.length - windowSize);
registry.transferVisited(visitedLocations);
// & mark visited current substring
double[] currentLocations = getSubSeries(series, currentPosition, currentPosition
+ windowSize);
marker.markVisited(registry, currentPosition, windowSize);
// WE ARE GOING TO ITERATE OVER THE CURRENT WORD OCCURRENCES HERE
// unless a distance from a previous, non-abandoned search is known for this word
//
DistanceEntry bestKnownDistance = knownWordsAndTheirDistances.get(String.valueOf(currentEntry
.getStr()));
if (null != bestKnownDistance && !(bestKnownDistance.isAbandoned())) {
consoleLogger.finer("skipping iterations over " + currentWord
+ " retrieved result from known distances ");
nearestNeighborDist = bestKnownDistance.getDistance();
if (bestKnownDistance.getDistance() < bestSoFarDistance) {
consoleLogger
.finer("breaking the inner loop flow, bestKnownDistance is less than bestSoFarDistance ");
// NOTE(review): this assignment has no effect, the flag is re-initialized at the next
// iteration which starts right after the continue below
doRandomSearch = false;
idx++;
continue;
}
}
else {
List<Integer> currentOccurences = trie.getOccurences(currentEntry.getStr());
consoleLogger.finer(currentWord + " has " + currentOccurences.size()
+ " occurrences, iterating...");
for (Integer nextOccurrence : currentOccurences) {
// skip the location we standing at, along with everything which starts closer than
// a window size to it
if (Math.abs(nextOccurrence.intValue() - currentPosition) < windowSize) {
continue;
}
// mark current next visited
marker.markVisited(registry, nextOccurrence, windowSize);
// get the piece of the timeseries
double[] occurrenceValues = getSubSeries(series, nextOccurrence, nextOccurrence
+ windowSize);
double dist = EuclideanDistance.distance(currentLocations, occurrenceValues);
// keep track of best so far distance
if (dist < nearestNeighborDist) {
nearestNeighborDist = dist;
// a neighbor closer than the best discord distance means this word can not win
if (dist < bestSoFarDistance) {
consoleLogger.finer(" ** abandoning the occurrences iterations");
doRandomSearch = false;
break;
}
}
}
}
if (!(Double.MAX_VALUE == nearestNeighborDist)) {
consoleLogger.finer("for " + currentWord
+ " occurrences, smallest nearest neighbor distance: " + nearestNeighborDist);
}
else {
consoleLogger.finer("nothing changed after iterations...");
}
// NOTE(review): the flag stays true when the occurrences loop above was abandoned -
// confirm this is intended, since it is what gets stored with the distance below
boolean completeSearch = true;
// check if we must continue with random neighbors
if (doRandomSearch) {
// it is heuristics here: visit the remaining unvisited positions in a random order
//
int nextRandomVisitTarget = -1;
int visitCounter = 0;
while ((nextRandomVisitTarget = registry.getNextRandomUnvisitedPosition()) != -1) {
consoleLogger.finer(" random position pick step " + visitCounter + " visited: "
+ registry.getVisited().size() + ", unvisited: " + registry.getUnvisited().size()
+ "; nearest neighbor: " + nearestNeighborDist);
// registry.markVisited(nextRandomVisitTarget);
marker.markVisited(registry, nextRandomVisitTarget, windowSize);
double[] randomTargetValues = getSubSeries(series, nextRandomVisitTarget,
nextRandomVisitTarget + windowSize);
double randomTargetDistance = EuclideanDistance.distance(currentLocations,
randomTargetValues);
// early abandoning of the search, the current word is not
// discord, we seen better
if (randomTargetDistance < bestSoFarDistance) {
nearestNeighborDist = randomTargetDistance;
consoleLogger.finer(" ** abandoning random visits loop, seen distance "
+ nearestNeighborDist + " at iteration " + visitCounter);
completeSearch = false;
break;
}
// keep track
if (randomTargetDistance < nearestNeighborDist) {
nearestNeighborDist = randomTargetDistance;
}
visitCounter = visitCounter + 1;
} // while inner loop
consoleLogger.finer("random visits loop finished, total positions considered: "
+ visitCounter);
} // if break loop
// the word whose nearest neighbor is the farthest is the best discord candidate
if (nearestNeighborDist > bestSoFarDistance) {
bestSoFarDistance = nearestNeighborDist;
bestSoFarPosition = currentPosition;
bestSoFarString = String.valueOf(currentEntry.getStr());
}
// remember the distance found for this word
// NOTE(review): both branches below do exactly the same, the condition has no effect
if (knownWordsAndTheirDistances.containsKey(currentWord)
&& knownWordsAndTheirDistances.get(currentWord).isAbandoned()) {
knownWordsAndTheirDistances.put(String.valueOf(currentWord), new DistanceEntry(
nearestNeighborDist, completeSearch));
}
else {
knownWordsAndTheirDistances.put(String.valueOf(currentWord), new DistanceEntry(
nearestNeighborDist, completeSearch));
}
consoleLogger.finer(" . . iterated " + idx + " times, best distance: " + bestSoFarDistance
+ " for a string " + bestSoFarString);
idx++;
} // outer loop
return new DiscordRecord(bestSoFarPosition, bestSoFarDistance, bestSoFarString);
}
/**
 * Removes from the list every entry placed after the given position whose word equals,
 * ignoring case, the word provided. Entries at or before the start position are not touched.
 *
 * @param frequencies The list of trie hit entries, modified in place.
 * @param currentWord The word whose later occurrences in the list are dropped.
 * @param startPosition The index after which the scan begins.
 */
private static void cleanUpFrequencies(ArrayList<SAXTrieHitEntry> frequencies,
    String currentWord, int startPosition) {
  // the index moves forward only when nothing was removed: a removal shifts the
  // remaining entries one slot to the left, so the same index must be re-examined
  for (int idx = startPosition + 1; idx < frequencies.size();) {
    String candidate = String.valueOf(frequencies.get(idx).getStr());
    if (currentWord.equalsIgnoreCase(candidate)) {
      frequencies.remove(idx);
      continue;
    }
    idx++;
  }
}
/**
 * Collects the most frequent non-trivial words of the trie as motifs.
 *
 * The trie frequency entries are sorted by their natural order and walked from the last
 * entry to the first. An entry becomes a motif when it is not trivial (as reported by
 * <code>isTrivial(2)</code>), was not collected before, and occurs at least twice.
 *
 * @param trie The trie.
 * @param maxMotifsNum The number of motifs to report.
 * @return The motifs collection.
 * @throws TrieException If error occurs.
 */
private static MotifRecords getMotifs(SAXTrie trie, int maxMotifsNum) throws TrieException {
  MotifRecords res = new MotifRecords(maxMotifsNum);
  ArrayList<SAXTrieHitEntry> frequencies = trie.getFrequencies();
  Collections.sort(frequencies);
  // entries already reported, membership is decided by the entry's own ordering
  Set<SAXTrieHitEntry> seen = new TreeSet<SAXTrieHitEntry>();
  int counter = 0;
  // presumably the sort puts the most frequent entries at the end of the list, so the
  // walk goes backward - verify against SAXTrieHitEntry.compareTo
  int i = frequencies.size();
  while (--i >= 0) {
    SAXTrieHitEntry entry = frequencies.get(i);
    boolean trivialOrSeen = entry.isTrivial(2) || seen.contains(entry);
    // an entry occurring less than twice ends the whole walk
    if (entry.getFrequency() < 2) {
      break;
    }
    if (trivialOrSeen) {
      continue;
    }
    res.add(new MotifRecord(entry.getStr(), trie.getOccurences(entry.getStr())));
    seen.add(entry);
    counter++;
    // NOTE(review): the strict comparison lets maxMotifsNum + 1 records be added before
    // the walk stops - confirm whether MotifRecords caps its own size
    if (counter > maxMotifsNum) {
      break;
    }
  }
  return res;
}
/**
 * Converts a real-valued series into its symbolic representation.
 *
 * When the window size equals the number of cuts plus one, the series is z-normalized and
 * converted as is. Otherwise it is first reduced with PAA to that many points, and the
 * reduced series is z-normalized and converted.
 *
 * @param vals Real valued timeseries.
 * @param windowSize The window size, compared with the number of cuts plus one.
 * @param cuts The cut values array used for SAX transform.
 * @return The symbolic representation of the given real time-series.
 * @throws TSException If error occurs.
 */
public static char[] getSaxVals(double[] vals, int windowSize, double[] cuts) throws TSException {
  final int targetSize = cuts.length + 1;
  // PAA is skipped only when the window already has the target number of points
  double[] reduced = (windowSize == targetSize) ? vals : TSUtils.paa(vals, targetSize);
  return TSUtils.ts2String(TSUtils.zNormalize(reduced), cuts);
}
/**
 * Extracts the values of a single attribute from a range of WEKA instances.
 *
 * @param data The series.
 * @param attribute The data-bearing attribute.
 * @param start The index of the first instance to take.
 * @param end The index following the last instance to take.
 * @return The attribute values of the instances from start up to, not including, end.
 */
private static double[] getSubSeries(Instances data, Attribute attribute, int start, int end) {
  List<Instance> window = data.subList(start, end);
  double[] vals = new double[end - start];
  int pos = 0;
  for (Instance instance : window) {
    vals[pos] = instance.value(attribute.index());
    pos++;
  }
  return vals;
}
/**
 * Reads the values of a single attribute out of all instances, in their order.
 *
 * @param tsData The instances data.
 * @param dataAttribute The attribute to use in conversion.
 * @return The array holding one value per instance.
 */
private static double[] toRealSeries(Instances tsData, Attribute dataAttribute) {
  final int length = tsData.size();
  double[] vals = new double[length];
  int pos = 0;
  while (pos < length) {
    vals[pos] = tsData.get(pos).value(dataAttribute.index());
    pos++;
  }
  return vals;
}
/**
 * Copies a range of the series into a new array.
 *
 * @param data The series.
 * @param start The index of the first value to copy.
 * @param end The index following the last value to copy.
 * @return The new array holding the values from start up to, not including, end.
 */
public static double[] getSubSeries(double[] data, int start, int end) {
  final int length = end - start;
  double[] vals = new double[length];
  // an empty range never touches the source array, so the copy is skipped entirely
  if (length > 0) {
    System.arraycopy(data, start, vals, 0, length);
  }
  return vals;
}
/**
 * Brute force calculation of the distances.
 *
 * For each control position the method slides a window over the series and tracks the
 * smallest Euclidean distance between the window at the control position and any window
 * that starts at least one window length away from it, along with the position of that
 * closest window.
 *
 * NOTE(review): the distances and positions found are kept in local arrays only, the method
 * neither returns nor reports them.
 *
 * @param tsData timeseries.
 * @param dataAttributeName The name of the data-bearing attribute.
 * @param controls The control positions.
 * @param window Window size.
 * @throws TSException if error occurs.
 */
public static void maxDistances(Instances tsData, String dataAttributeName, int[] controls,
    int window) throws TSException {
  final Attribute dataAttribute = tsData.attribute(dataAttributeName);
  final int controlsCount = controls.length;
  double[] distances = new double[controlsCount];
  int[] maxPos = new int[controlsCount];
  for (int c = 0; c < controlsCount; c++) {
    distances[c] = Double.MAX_VALUE;
    maxPos[c] = -1;
  }
  // slide the window over the series, a window whose end reaches the series size is not taken
  for (int currPosition = 0; currPosition + window < tsData.size(); currPosition++) {
    double[] vals = getSubSeries(tsData, dataAttribute, currPosition, currPosition + window);
    for (int c = 0; c < controlsCount; c++) {
      // only windows starting at least a window length away from the control are compared
      if (Math.abs(controls[c] - currPosition) >= window) {
        double[] oVals = getSubSeries(tsData, dataAttribute, controls[c], controls[c] + window);
        double dist = EuclideanDistance.distance(vals, oVals);
        if (dist < distances[c]) {
          distances[c] = dist;
          maxPos[c] = currPosition;
        }
      }
    }
  }
}
// /**
// *
// * "We are given n, the length of the discords in advance, and we must
// choose two parameters,
// the
// * cardinality of the SAX alphabet size a, and the SAX word size w. We
// defer a discussion of how
// * to set these parameters until"
// *
// *
// * @param tsData timeseries.
// * @param windowLength window length.
// * @param paaSize The PAA window size.
// * @param alphabetSize The SAX alphabet size.
// * @param timeAttributeName Time-stamp attribute.
// * @param dataAttributeName Value attribute.
// * @return top discords for the time-series given
// * @throws TSException if error occurs.
// */
// public static DiscordRecords getBruteForceDiscords(Instances tsData, int
// windowLength,
// int paaSize, int alphabetSize, String timeAttributeName, String
// dataAttributeName)
// throws TSException {
//
// double[] cuts = normalAlphabet.getCuts(alphabetSize);
//
// // get the timestamps and data attributes
// //
// Attribute dataAttribute = tsData.attribute(dataAttributeName);
// double[] theRawData =
// TSUtils.zNormalize(tsData.attributeToDoubleArray(dataAttribute.index()));
//
// // Init variables
// //
// DiscordRecords discords = new DiscordRecords(10);
// DiscordRecord discord = new DiscordRecord();
//
// XMLGregorianCalendar cTstamp =
// Tstamp.makeTimestamp(System.currentTimeMillis());
//
// // run the search loop
// //
// for (int i = 0; i < tsData.size() - windowLength; i++) {
//
// if (i % 100 == 0) {
// XMLGregorianCalendar nTstamp =
// Tstamp.makeTimestamp(System.currentTimeMillis());
// System.out.println("i: " + i + ", at: " + nTstamp + ", diff: "
// + Tstamp.diff(cTstamp, nTstamp) + ", discord at: " +
// discord.getPosition()
// + ", distance: " + discord.getDistance());
// cTstamp = nTstamp;
// }
//
// // fix the i-s string
// //
// // char[] ssA = TSUtils.ts2String(TSUtils.paa(TSUtils.normalize(Arrays
// // .copyOfRange(theRawData, i, i + windowLength)), paaSize), cuts);
// //
// char[] ssA = TSUtils.ts2String(
// TSUtils.paa(Arrays.copyOfRange(theRawData, i, i + windowLength),
// paaSize), cuts);
//
// Integer nearestNeighborDist = Integer.MAX_VALUE;
//
// // the inner loop
// //
// for (int j = 0; j < tsData.size() - windowLength; j++) {
//
// // check for the trivial match
// //
// if (Math.abs(i - j) >= windowLength) {
//
// // get the SAX approximations of both series here
// //
// // char[] ssB = TSUtils.ts2String(TSUtils.paa(TSUtils.normalize(Arrays
// // .copyOfRange(theRawData, j, j + windowLength)), paaSize), cuts);
//
// char[] ssB = TSUtils.ts2String(
// TSUtils.paa(Arrays.copyOfRange(theRawData, j, j + windowLength),
// paaSize), cuts);
//
// // get the distance here and early terminate if it's less than the
// // largest
// //
// Integer tmpDist = strDistance(ssA, ssB);
// // System.out.println(String.valueOf(ssA) + " VS " +
// // String.valueOf(ssB)
// // + " : " + tmpDist);
// if (tmpDist == 0 || tmpDist < discords.getMinDistance()) {
// break;
// }
// if (tmpDist < nearestNeighborDist) {
// nearestNeighborDist = tmpDist;
// }
// }
//
// }
// if ((nearestNeighborDist != Integer.MAX_VALUE)
// && (nearestNeighborDist > discords.getMinDistance())) {
// discord.setDistance(nearestNeighborDist);
// discord.setIndex(i);
// discords.add(discord);
// discord = new DiscordRecord();
// }
// } // i loop - outer
// return discords;
// }
/**
 * Brute force discord search.
 *
 * For every window start position the method finds the smallest Euclidean distance to any
 * other window that starts at least one window length away, and records that position
 * together with the distance found. The records are gathered in a collection created with
 * the size of 100.
 *
 * NOTE(review): a position with no window far enough from it is recorded with the distance
 * of Double.MAX_VALUE - confirm that callers expect this.
 *
 * NOTE(review): both loops stop before the position (length - windowLength), so the very
 * last window of the series is never examined - confirm this is intended.
 *
 * @param tsData timeseries.
 * @param windowLength window length.
 * @return The discord records for the time-series given.
 * @throws TSException if error occurs.
 */
public static DiscordRecords getBruteForceDiscords(double[] tsData, int windowLength)
    throws TSException {
  DiscordRecords discords = new DiscordRecords(100);
  final int positions = tsData.length - windowLength;
  for (int i = 0; i < positions; i++) {
    double[] seriesA = getSubSeries(tsData, i, i + windowLength);
    // kept as a primitive: a boxed accumulator would be re-boxed on every improvement
    double nearestNeighborDist = Double.MAX_VALUE;
    for (int j = 0; j < positions; j++) {
      // windows closer than a window length are skipped
      if (Math.abs(i - j) < windowLength) {
        continue;
      }
      double[] seriesB = getSubSeries(tsData, j, j + windowLength);
      double dist = EuclideanDistance.distance(seriesA, seriesB);
      if (dist < nearestNeighborDist) {
        nearestNeighborDist = dist;
      }
    }
    discords.add(new DiscordRecord(i, nearestNeighborDist));
  }
  return discords;
}
/**
 * Generic method to convert the milliseconds into the elapsed time string.
 *
 * The string has the form "Xm Ys Zms". The minutes are not wrapped at the hour boundary, so
 * an interval of an hour and five minutes yields "65m 0s 0ms" and no part of the elapsed
 * time is lost. For intervals shorter than one hour the seconds stay below 60 and the
 * milliseconds below 1000.
 *
 * @param start Start timestamp, in milliseconds.
 * @param finish End timestamp, in milliseconds.
 * @return String representation of the elapsed time.
 */
private static String timeToString(long start, long finish) {
  final long secondInMillis = 1000L;
  final long minuteInMillis = 60L * secondInMillis;
  long diff = finish - start;
  // whole minutes of the full interval: hours, days and years are folded in here instead
  // of being computed and thrown away
  long elapsedMinutes = diff / minuteInMillis;
  diff = diff % minuteInMillis;
  long elapsedSeconds = diff / secondInMillis;
  long elapsedMilliseconds = diff % secondInMillis;
  return elapsedMinutes + "m " + elapsedSeconds + "s " + elapsedMilliseconds + "ms";
}
}