package edu.hawaii.jmotif.sax; import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.List; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import java.util.logging.ConsoleHandler; import java.util.logging.Formatter; import java.util.logging.Handler; import java.util.logging.Logger; import org.hackystat.utilities.logger.HackystatLogger; import org.hackystat.utilities.stacktrace.StackTrace; import weka.core.Attribute; import weka.core.Instance; import weka.core.Instances; import edu.hawaii.jmotif.distance.EuclideanDistance; import edu.hawaii.jmotif.sax.alphabet.Alphabet; import edu.hawaii.jmotif.sax.alphabet.NormalAlphabet; import edu.hawaii.jmotif.sax.datastructures.DiscordRecord; import edu.hawaii.jmotif.sax.datastructures.DiscordRecords; import edu.hawaii.jmotif.sax.datastructures.DiscordsAndMotifs; import edu.hawaii.jmotif.sax.datastructures.MotifRecord; import edu.hawaii.jmotif.sax.datastructures.MotifRecords; import edu.hawaii.jmotif.sax.datastructures.SAXFrequencyData; import edu.hawaii.jmotif.sax.trie.SAXTrie; import edu.hawaii.jmotif.sax.trie.SAXTrieHitEntry; import edu.hawaii.jmotif.sax.trie.TrieException; import edu.hawaii.jmotif.sax.trie.VisitRegistry; import edu.hawaii.jmotif.timeseries.TSException; import edu.hawaii.jmotif.timeseries.TSUtils; import edu.hawaii.jmotif.timeseries.Timeseries; import edu.hawaii.jmotif.util.BriefFormatter; /** * Implements SAX algorithms. 
* * @author Pavel Senin * */ public final class SAXFactory { public static final int DEFAULT_COLLECTION_SIZE = 50; private static Logger consoleLogger; private static final String LOGGING_LEVEL = "SEVERE"; static { consoleLogger = HackystatLogger.getLogger("jmotif.debug.console", "jmotif"); consoleLogger.setUseParentHandlers(false); for (Handler handler : consoleLogger.getHandlers()) { consoleLogger.removeHandler(handler); } ConsoleHandler handler = new ConsoleHandler(); Formatter formatter = new BriefFormatter(); handler.setFormatter(formatter); consoleLogger.addHandler(handler); HackystatLogger.setLoggingLevel(consoleLogger, LOGGING_LEVEL); } /** * Constructor. */ private SAXFactory() { super(); } /** * Convert the timeseries into SAX string representation, normalizes each of the pieces before SAX * conversion. NOSKIP means that ALL SAX words reported. * * @param ts The timeseries given. * @param windowSize The sliding window size used. * @param paaSize The number of the points used in the PAA reduction of the time series. * @param cuts The alphabet cuts to use. * @return The SAX representation of the timeseries. * @throws TSException If error occurs. * @throws CloneNotSupportedException */ public static SAXFrequencyData ts2saxZnormByCutsNoSkip(Timeseries ts, int windowSize, int paaSize, double[] cuts) throws TSException, CloneNotSupportedException { // Initialize symbolic result data SAXFrequencyData res = new SAXFrequencyData(); // scan across the time series extract sub sequences, and converting // them to strings for (int i = 0; i < ts.size() - (windowSize - 1); i++) { // fix the current subsection Timeseries subSection = ts.subsection(i, i + windowSize - 1); // Z normalize it subSection = TSUtils.zNormalize(subSection); // perform PAA conversion if needed Timeseries paa; try { paa = TSUtils.paa(subSection, paaSize); } catch (CloneNotSupportedException e) { throw new TSException("Unable to clone: " + StackTrace.toString(e)); } // Convert the PAA to a string. 
char[] currentString = TSUtils.ts2StringWithNaNByCuts(paa, cuts); res.put(new String(currentString), i); } return res; } /** * Convert the timeseries into SAX string representation, normalizes each of the pieces before SAX * conversion. Not all SAX words reported, if the new SAX word is the same as current it will not * be reported. * * @param ts The timeseries given. * @param windowSize The sliding window size used. * @param paaSize The number of the points used in the PAA reduction of the time series. * @param cuts The alphabet cuts to use. * @return The SAX representation of the timeseries. * @throws TSException If error occurs. * @throws CloneNotSupportedException */ public static SAXFrequencyData ts2saxZnormByCuts(Timeseries ts, int windowSize, int paaSize, double[] cuts) throws TSException, CloneNotSupportedException { // Initialize symbolic result data SAXFrequencyData res = new SAXFrequencyData(); String previousString = ""; // scan across the time series extract sub sequences, and converting // them to strings for (int i = 0; i < ts.size() - (windowSize - 1); i++) { // fix the current subsection Timeseries subSection = ts.subsection(i, i + windowSize - 1); // Z normalize it subSection = TSUtils.zNormalize(subSection); // perform PAA conversion if needed Timeseries paa; try { paa = TSUtils.paa(subSection, paaSize); } catch (CloneNotSupportedException e) { throw new TSException("Unable to clone: " + StackTrace.toString(e)); } // Convert the PAA to a string. 
char[] currentString = TSUtils.ts2StringWithNaNByCuts(paa, cuts); // check if previous one was the same, if so, ignore that (don't // know why though, but guess // cause we didn't advance much on the timeseries itself) if (!previousString.isEmpty() && previousString.equalsIgnoreCase(new String(currentString))) { continue; } previousString = new String(currentString); res.put(new String(currentString), i); } return res; } /** * Convert the timeseries into SAX string representation, normalizes each of the pieces before SAX * conversion. NOSKIP means that ALL SAX words reported. * * @param s The timeseries given. * @param windowSize The sliding window size used. * @param paaSize The number of the points used in the PAA reduction of the time series. * @param cuts The alphabet cuts to use. * @return The SAX representation of the timeseries. * @throws TSException If error occurs. * @throws CloneNotSupportedException */ public static SAXFrequencyData ts2saxZnormByCutsNoSkip(double[] s, int windowSize, int paaSize, double[] cuts) throws TSException, CloneNotSupportedException { long[] ticks = new long[s.length]; for (int i = 0; i < s.length; i++) { ticks[i] = i; } Timeseries ts = new Timeseries(s, ticks); return ts2saxZnormByCutsNoSkip(ts, windowSize, paaSize, cuts); } /** * Convert the timeseries into SAX string representation, normalizes each of the pieces before SAX * conversion. Not all SAX words reported, if the new SAX word is the same as current it will not * be reported. * * @param s The timeseries given. * @param windowSize The sliding window size used. * @param paaSize The number of the points used in the PAA reduction of the time series. * @param cuts The alphabet cuts to use. * @return The SAX representation of the timeseries. * @throws TSException If error occurs. 
* @throws CloneNotSupportedException */ public static SAXFrequencyData ts2saxZnormByCuts(double[] s, int windowSize, int paaSize, double[] cuts) throws TSException, CloneNotSupportedException { long[] ticks = new long[s.length]; for (int i = 0; i < s.length; i++) { ticks[i] = i; } Timeseries ts = new Timeseries(s, ticks); return ts2saxZnormByCuts(ts, windowSize, paaSize, cuts); } /** * Convert the timeseries into SAX string representation. It doesn't normalize anything. * * @param ts The timeseries given. * @param windowSize The sliding window size used. * @param paaSize The number of the points used in the PAA reduction of the time series. * @param cuts The alphabet cuts to use. * @return The SAX representation of the timeseries. * @throws TSException If error occurs. */ public static SAXFrequencyData ts2saxNoZnormByCuts(Timeseries ts, int windowSize, int paaSize, double[] cuts) throws TSException { // Initialize symbolic result data SAXFrequencyData res = new SAXFrequencyData(); String previousString = ""; // scan across the time series extract sub sequences, and converting // them to strings for (int i = 0; i < ts.size() - (windowSize - 1); i++) { // fix the current subsection Timeseries subSection = ts.subsection(i, i + windowSize - 1); // Z normalize it // subSection = TSUtils.normalize(subSection); // perform PAA conversion if needed Timeseries paa; try { paa = TSUtils.paa(subSection, paaSize); } catch (CloneNotSupportedException e) { throw new TSException("Unable to clone: " + StackTrace.toString(e)); } // Convert the PAA to a string. 
char[] currentString = TSUtils.ts2StringWithNaNByCuts(paa, cuts); // check if previous one was the same, if so, ignore that (don't // know why though, but guess // cause we didn't advance much on the timeseries itself) if (!previousString.isEmpty() && previousString.equalsIgnoreCase(new String(currentString))) { previousString = new String(currentString); continue; } previousString = new String(currentString); res.put(new String(currentString), i); } return res; } /** * Convert the timeseries into SAX string representation. * * @param ts The timeseries given. * @param windowSize The sliding window size used. * @param paaSize The number of the points used in the PAA reduction of the time series. * @param alphabet The alphabet to use. * @param alphabetSize The alphabet size used. * @return The SAX representation of the timeseries. * @throws TSException If error occurs. * @throws CloneNotSupportedException */ public static SAXFrequencyData ts2saxZNorm(Timeseries ts, int windowSize, int paaSize, Alphabet alphabet, int alphabetSize) throws TSException, CloneNotSupportedException { if (alphabetSize > alphabet.getMaxSize()) { throw new TSException("Unable to set the alphabet size greater than " + alphabet.getMaxSize()); } return ts2saxZnormByCuts(ts, windowSize, paaSize, alphabet.getCuts(alphabetSize)); } /** * Convert the timeseries into SAX string representation. * * @param ts The timeseries given. * @param windowSize The sliding window size used. * @param paaSize The number of the points used in the PAA reduction of the time series. * @param alphabet The alphabet to use. * @param alphabetSize The alphabet size used. * @return The SAX representation of the timeseries. * @throws TSException If error occurs. 
*/ public static SAXFrequencyData ts2saxNoZnorm(Timeseries ts, int windowSize, int paaSize, Alphabet alphabet, int alphabetSize) throws TSException { if (alphabetSize > alphabet.getMaxSize()) { throw new TSException("Unable to set the alphabet size greater than " + alphabet.getMaxSize()); } return ts2saxNoZnormByCuts(ts, windowSize, paaSize, alphabet.getCuts(alphabetSize)); } /** * Convert the timeseries into SAX string representation. * * @param ts The timeseries given. * @param paaSize The number of the points used in the PAA reduction of the time series. * @param alphabet The alphabet to use. * @param alphabetSize The alphabet size used. * @return The SAX representation of the timeseries. * @throws TSException If error occurs. * @throws CloneNotSupportedException */ public static String ts2string(Timeseries ts, int paaSize, Alphabet alphabet, int alphabetSize) throws TSException, CloneNotSupportedException { if (alphabetSize > alphabet.getMaxSize()) { throw new TSException("Unable to set the alphabet size greater than " + alphabet.getMaxSize()); } int tsLength = ts.size(); if (tsLength == paaSize) { return new String(TSUtils.ts2String(TSUtils.zNormalize(ts), alphabet, alphabetSize)); } else { // perform PAA conversion Timeseries PAA; try { PAA = TSUtils.paa(TSUtils.zNormalize(ts), paaSize); } catch (CloneNotSupportedException e) { throw new TSException("Unable to clone: " + StackTrace.toString(e)); } return new String(TSUtils.ts2String(PAA, alphabet, alphabetSize)); } } /** * Build the SAX trie out of the series. * * @param tsData The timeseries. * @param windowSize PAA window size to use. * @param alphabetSize The SAX alphabet size. * @return Discords found within the series. * @throws TrieException if error occurs. * @throws TSException if error occurs. 
*/ public static DiscordRecords ts2Discords(double[] tsData, int windowSize, int alphabetSize) throws TrieException, TSException { // make alphabet available NormalAlphabet normalA = new NormalAlphabet(); // get a trie instance SAXTrie trie = new SAXTrie(tsData.length - windowSize, alphabetSize); // build the trie sliding over the series // int currPosition = 0; while ((currPosition + windowSize) < tsData.length) { // get the window SAX representation double[] subSeries = getSubSeries(tsData, currPosition, currPosition + windowSize); char[] saxVals = getSaxVals(subSeries, windowSize, normalA.getCuts(alphabetSize)); // add result to the structure trie.put(String.valueOf(saxVals), currPosition); // increment the position currPosition++; } // delegate the job to discords extraction engine DiscordRecords discords = getDiscordsAlgorithm(tsData, windowSize, trie, DEFAULT_COLLECTION_SIZE, new LargeWindowAlgorithm()); return discords; } /** * Compute the distance between the two strings, this function use the numbers associated with * ASCII codes, i.e. distance between a and b would be 1. * * @param a The first string. * @param b The second string. * @return The pairwise distance. * @throws TSException if length are differ. */ public static int strDistance(char[] a, char[] b) throws TSException { if (a.length == b.length) { int distance = 0; for (int i = 0; i < a.length; i++) { int tDist = Math.abs(Character.getNumericValue(a[i]) - Character.getNumericValue(b[i])); if (tDist > 1) { distance += tDist; } } return distance; } else { throw new TSException("Unable to compute SAX distance, string lengths are not equal"); } } /** * Compute the distance between the two chars based on the ASCII symbol codes. * * @param a The first char. * @param b The second char. * @return The distance. 
*/ public static int strDistance(char a, char b) { return Math.abs(Character.getNumericValue(a) - Character.getNumericValue(b)); } /** * This function implements SAX MINDIST function which uses alphabet based distance matrix. * * @param a The SAX string. * @param b The SAX string. * @param distanceMatrix The distance matrix to use. * @return distance between strings. * @throws TSException If error occurs. */ public static double saxMinDist(char[] a, char[] b, double[][] distanceMatrix) throws TSException { if (a.length == b.length) { double dist = 0.0D; for (int i = 0; i < a.length; i++) { if (Character.isLetter(a[i]) && Character.isLetter(b[i])) { int numA = Character.getNumericValue(a[i]) - 10; int numB = Character.getNumericValue(b[i]) - 10; if (numA > 19 || numA < 0 || numB > 19 || numB < 0) { throw new TSException("The character index greater than 19 or less than 0!"); } double localDist = distanceMatrix[numA][numB]; dist += localDist; } else { throw new TSException("Non-literal character found!"); } } return dist; } else { throw new TSException("Data arrays lengths are not equal!"); } } public MotifRecords series2Motifs(double[] series, int windowSize, int alphabetSize, int motifsNumToReport, SlidingWindowMarkerAlgorithm markerAlgorithm) throws TrieException, TSException { // init the SAX structures // SAXTrie trie = new SAXTrie(series.length - windowSize, alphabetSize); StringBuilder sb = new StringBuilder(); sb.append("data size: ").append(series.length); double max = TSUtils.max(series); sb.append("; max: ").append(max); double min = TSUtils.min(series); sb.append("; min: ").append(min); double mean = TSUtils.mean(series); sb.append("; mean: ").append(mean); int nans = TSUtils.countNaN(series); sb.append("; NaNs: ").append(nans); consoleLogger.fine(sb.toString()); consoleLogger.fine("window size: " + windowSize + ", alphabet size: " + alphabetSize + ", SAX Trie size: " + (series.length - windowSize)); Alphabet normalA = new NormalAlphabet(); Date start = 
new Date(); // build the trie // int currPosition = 0; while ((currPosition + windowSize) < series.length) { // get the window SAX representation double[] subSeries = getSubSeries(series, currPosition, currPosition + windowSize); char[] saxVals = getSaxVals(subSeries, windowSize, normalA.getCuts(alphabetSize)); // add result to the structure trie.put(String.valueOf(saxVals), currPosition); // increment the position currPosition++; } Date end = new Date(); consoleLogger.fine("trie built in: " + timeToString(start.getTime(), end.getTime())); start = new Date(); MotifRecords motifs = getMotifs(trie, motifsNumToReport); end = new Date(); consoleLogger.fine("motifs retrieved in: " + timeToString(start.getTime(), end.getTime())); return motifs; } /** * Build the SAX trie out of Instances reporting discords. * * @param tsData The timeseries. * @param windowSize PAA window size to use. * @param alphabetSize The SAX alphabet size. * @param dataAttributeName The WEKA attribute - essentially points on the instance attribute * which bears the data value in this case. * @param discordsNumToReport how many discords to report. * @return Discords found within the series. * @throws TrieException if error occurs. * @throws TSException if error occurs. 
*/ public static DiscordRecords instances2Discords(Instances tsData, String dataAttributeName, int windowSize, int alphabetSize, int discordsNumToReport) throws TrieException, TSException { // get the timestamps and data attributes // Attribute dataAttribute = tsData.attribute(dataAttributeName); double[] series = toRealSeries(tsData, dataAttribute); NormalAlphabet normalA = new NormalAlphabet(); SAXTrie trie = new SAXTrie(series.length - windowSize, alphabetSize); StringBuilder sb = new StringBuilder(); sb.append("data size: ").append(series.length); double max = TSUtils.max(series); sb.append("; max: ").append(max); double min = TSUtils.min(series); sb.append("; min: ").append(min); double mean = TSUtils.mean(series); sb.append("; mean: ").append(mean); int nans = TSUtils.countNaN(series); sb.append("; NaNs: ").append(nans); consoleLogger.fine(sb.toString()); consoleLogger.fine("window size: " + windowSize + ", alphabet size: " + alphabetSize + ", SAX Trie size: " + (series.length - windowSize)); // build the trie // int currPosition = 0; while ((currPosition + windowSize) < series.length) { // get the window SAX representation double[] subSeries = getSubSeries(series, currPosition, currPosition + windowSize); char[] saxVals = getSaxVals(subSeries, windowSize, normalA.getCuts(alphabetSize)); // add result to the structure trie.put(String.valueOf(saxVals), currPosition); // increment the position currPosition++; } Date start = new Date(); int reportNum = DEFAULT_COLLECTION_SIZE; if (discordsNumToReport > 0 && discordsNumToReport < 50) { reportNum = discordsNumToReport; } DiscordRecords discords = getDiscordsAlgorithm(toRealSeries(tsData, dataAttribute), windowSize, trie, reportNum, new LargeWindowAlgorithm()); Date end = new Date(); consoleLogger.fine("discords search finished in : " + timeToString(start.getTime(), end.getTime())); return discords; } /** * Build the SAX trie out of the series and reports discords. * * @param series The timeseries. 
* @param windowSize PAA window size to use. * @param alphabetSize The SAX alphabet size. * @param discordsNumToReport how many discords to report. * @return Discords found within the series. * @throws TrieException if error occurs. * @throws TSException if error occurs. */ public static DiscordRecords series2Discords(double[] series, int windowSize, int alphabetSize, int discordsNumToReport, SlidingWindowMarkerAlgorithm markerAlgorithm) throws TrieException, TSException { // get the timestamps and data attributes // NormalAlphabet normalA = new NormalAlphabet(); SAXTrie trie = new SAXTrie(series.length - windowSize, alphabetSize); StringBuilder sb = new StringBuilder(); sb.append("data size: ").append(series.length); double max = TSUtils.max(series); sb.append("; max: ").append(max); double min = TSUtils.min(series); sb.append("; min: ").append(min); double mean = TSUtils.mean(series); sb.append("; mean: ").append(mean); int nans = TSUtils.countNaN(series); sb.append("; NaNs: ").append(nans); consoleLogger.fine(sb.toString()); consoleLogger.fine("window size: " + windowSize + ", alphabet size: " + alphabetSize + ", SAX Trie size: " + (series.length - windowSize)); // build the trie // int currPosition = 0; while ((currPosition + windowSize) < series.length) { // get the window SAX representation double[] subSeries = getSubSeries(series, currPosition, currPosition + windowSize); char[] saxVals = getSaxVals(subSeries, windowSize, normalA.getCuts(alphabetSize)); // add result to the structure trie.put(String.valueOf(saxVals), currPosition); // increment the position currPosition++; } Date start = new Date(); int reportNum = DEFAULT_COLLECTION_SIZE; if (discordsNumToReport > 0 && discordsNumToReport < 50) { reportNum = discordsNumToReport; } DiscordRecords discords = getDiscordsAlgorithm(series, windowSize, trie, reportNum, markerAlgorithm); Date end = new Date(); consoleLogger.fine("discords search finished in : " + timeToString(start.getTime(), end.getTime())); 
return discords; } /** * Builds two collections - collection of "discords" - the surprise or unique patterns and the * collection of the motifs - most frequent patterns. This method leveraging the Trie structure - * so the sliding window size will be translated into the alphabet size by using PAA. * * @param series The data series. * @param windowSize The sliding window size. * @param alphabetSize The alphabet size. * @param discordCollectionSize The size of the discord collection - how many top discords we want * to keep. * @param motifsCollectionSize The size of the motif collection - how many top motifs we want to * keep. * @return All what was promised if finishes. * * @throws TrieException if error occurs. * @throws TSException if error occurs. */ public static DiscordsAndMotifs series2DiscordsAndMotifs(double[] series, int windowSize, int alphabetSize, int discordCollectionSize, int motifsCollectionSize, SlidingWindowMarkerAlgorithm markerAlgorithm) throws TrieException, TSException { // init the SAX structures // DiscordsAndMotifs res = new DiscordsAndMotifs(discordCollectionSize, motifsCollectionSize); SAXTrie trie = new SAXTrie(series.length - windowSize, alphabetSize); StringBuilder sb = new StringBuilder(); sb.append("data size: ").append(series.length); double max = TSUtils.max(series); sb.append("; max: ").append(max); double min = TSUtils.min(series); sb.append("; min: ").append(min); double mean = TSUtils.mean(series); sb.append("; mean: ").append(mean); int nans = TSUtils.countNaN(series); sb.append("; NaNs: ").append(nans); consoleLogger.fine(sb.toString()); consoleLogger.fine("window size: " + windowSize + ", alphabet size: " + alphabetSize + ", SAX Trie size: " + (series.length - windowSize)); Alphabet normalA = new NormalAlphabet(); Date start = new Date(); // build the trie // int currPosition = 0; while ((currPosition + windowSize) < series.length) { // get the window SAX representation double[] subSeries = getSubSeries(series, currPosition, 
currPosition + windowSize); char[] saxVals = getSaxVals(subSeries, windowSize, normalA.getCuts(alphabetSize)); // add result to the structure trie.put(String.valueOf(saxVals), currPosition); // increment the position currPosition++; } Date end = new Date(); consoleLogger.fine("trie built in: " + timeToString(start.getTime(), end.getTime())); start = new Date(); MotifRecords motifs = getMotifs(trie, motifsCollectionSize); end = new Date(); consoleLogger.fine("motifs retrieved in: " + timeToString(start.getTime(), end.getTime())); start = new Date(); DiscordRecords discords = getDiscordsAlgorithm(series, windowSize, trie, discordCollectionSize, markerAlgorithm); end = new Date(); consoleLogger.fine("discords collected in: " + timeToString(start.getTime(), end.getTime())); res.addDiscords(discords); res.addMotifs(motifs); return res; } /** * The discords extraction method. * * Here I need to keep a continuous stack of knowledge with information not only about distance, * but about abandoning or conducting a full search for words. Thus, I will not be doing the same * expensive search on the rarest word all over again. * * @param series The series we work with. * @param windowSize The series window size. * @param marker The algorithm for marking visited locations. * @param trie * @param discordCollectionSize * @return * @throws TSException * @throws TrieException */ private static DiscordRecords getDiscordsAlgorithm(double[] series, int windowSize, SAXTrie trie, int discordCollectionSize, SlidingWindowMarkerAlgorithm marker) throws TSException, TrieException { consoleLogger.fine("starting discords finding routines"); // resulting discords collection DiscordRecords discords = new DiscordRecords(discordCollectionSize); // visit registry. 
the idea is to mark as visited all the discord // locations for all searches VisitRegistry discordsVisitRegistry = new VisitRegistry(series.length - windowSize); // the collection of seen words and their best so far distances // in the collection, in addition to pairs <word, distance> I store a // semaphore // which indicates whether the full search was conducted with this word, // or it was // abandoned at some point, so we do not know the final distance // TreeMap<String, DistanceEntry> knownWordsAndTheirCurrentDistances = new TreeMap<String, DistanceEntry>(); // the words already in the discords collection, so we do not have to // re-consider them // TreeSet<String> completeWords = new TreeSet<String>(); // we conduct the search until the number of discords is less than // desired // while (discords.getSize() < discordCollectionSize) { consoleLogger.fine("currently known discords: " + discords.getSize() + " out of " + discordCollectionSize); Date start = new Date(); DiscordRecord bestDiscord = findBestDiscord(series, windowSize, trie, completeWords, knownWordsAndTheirCurrentDistances, discordsVisitRegistry, marker); Date end = new Date(); // if the discord is null we getting out of the search if (bestDiscord.getDistance() == 0.0D || bestDiscord.getPosition() == -1) { consoleLogger.fine("breaking the outer search loop, discords found: " + discords.getSize() + " last seen discord: " + bestDiscord.toString()); break; } consoleLogger.fine("new discord: " + bestDiscord.getPayload() + ", position " + bestDiscord.getPosition() + ", distance " + bestDiscord.getDistance() + ", elapsed time: " + timeToString(start.getTime(), end.getTime())); // collect the result // discords.add(bestDiscord); // and maintain data structures // marker.markVisited(discordsVisitRegistry, bestDiscord.getPosition(), windowSize); completeWords.add(String.valueOf(bestDiscord.getPayload())); } // done deal // return discords; } /** * This method reports the best found discord. 
Note, that this discord is approximately the best. * Due to the fuzzy-logic search with randomization and aggressive labeling of the magic array * locations. * * @param series The series we are looking for discord in. * @param windowSize The sliding window size. * @param trie The trie (index of the series). * @param foundDiscordsWords Already found discords. * @param knownWordsAndTheirDistances The best known distances for certain word. I use the early * search abandoning optimization in oder to reduce complexity. * @param visitedLocations The magic array. * @return The best discord instance. * @throws TSException If error occurs. * @throws TrieException If error occurs. */ private static DiscordRecord findBestDiscord(double[] series, int windowSize, SAXTrie trie, TreeSet<String> foundDiscordsWords, TreeMap<String, DistanceEntry> knownWordsAndTheirDistances, VisitRegistry visitedLocations, SlidingWindowMarkerAlgorithm marker) throws TSException, TrieException { // we extract all seen words from the trie // and sort them by the frequency decrease // ArrayList<SAXTrieHitEntry> frequencies = trie.getFrequencies(); Collections.sort(frequencies); // StringBuilder sb = new StringBuilder(); // for (int i = 0; i < 10; i++) { // sb.append("top frequencies: ").append(frequencies.get(i).getStr()).append(",") // .append(frequencies.get(i).getPosition()).append(" ; "); // } // consoleLogger.finer(sb.toString()); // init variables int bestSoFarPosition = -1; double bestSoFarDistance = 0.0D; String bestSoFarString = ""; // we will iterate over words from rarest to frequent ones // int idx = 0; int limit = frequencies.size(); while (idx < limit) { SAXTrieHitEntry currentEntry = frequencies.get(idx); String currentWord = String.valueOf(currentEntry.getStr()); int currentPosition = currentEntry.getPosition(); // take care about this entry cleanUpFrequencies(frequencies, currentWord, idx); // and update the length limit = frequencies.size(); if 
(foundDiscordsWords.contains(currentWord) || visitedLocations.isVisited(currentPosition)) { consoleLogger.finer("skipping the search for " + currentWord); idx++; continue; } else { consoleLogger.finer("conducting search for " + currentWord + " iteration " + idx + " from " + frequencies.size()); } // so, lets search begin // double nearestNeighborDist = Double.MAX_VALUE; boolean doRandomSearch = true; // get a copy of visited locations VisitRegistry registry = new VisitRegistry(series.length - windowSize); registry.transferVisited(visitedLocations); // & mark visited current substring double[] currentLocations = getSubSeries(series, currentPosition, currentPosition + windowSize); marker.markVisited(registry, currentPosition, windowSize); // WE QRE GOING TO ITERATE OVER THE CURRENT WORD OCCURENCES HERE // DistanceEntry bestKnownDistance = knownWordsAndTheirDistances.get(String.valueOf(currentEntry .getStr())); if (null != bestKnownDistance && !(bestKnownDistance.isAbandoned())) { consoleLogger.finer("skipping iterations over " + currentWord + " retrieved result from known distances "); nearestNeighborDist = bestKnownDistance.getDistance(); if (bestKnownDistance.getDistance() < bestSoFarDistance) { consoleLogger .finer("breaking the inner loop flow, bestKnownDistance is less than bestSoFarDistance "); doRandomSearch = false; idx++; continue; } } else { List<Integer> currentOccurences = trie.getOccurences(currentEntry.getStr()); consoleLogger.finer(currentWord + " has " + currentOccurences.size() + " occurrences, iterating..."); for (Integer nextOccurrence : currentOccurences) { // skip the location we standing at if (Math.abs(nextOccurrence.intValue() - currentPosition) < windowSize) { continue; } // mark current next visited marker.markVisited(registry, nextOccurrence, windowSize); // get the piece of the timeseries double[] occurrenceValues = getSubSeries(series, nextOccurrence, nextOccurrence + windowSize); double dist = EuclideanDistance.distance(currentLocations, 
occurrenceValues); // keep track of best so far distance if (dist < nearestNeighborDist) { nearestNeighborDist = dist; if (dist < bestSoFarDistance) { consoleLogger.finer(" ** abandoning the occurrences iterations"); doRandomSearch = false; break; } } } } if (!(Double.MAX_VALUE == nearestNeighborDist)) { consoleLogger.finer("for " + currentWord + " occurrences, smallest nearest neighbor distance: " + nearestNeighborDist); } else { consoleLogger.finer("nothing changed after iterations..."); } boolean completeSearch = true; // check if we must continue with random neighbors if (doRandomSearch) { // it is heuristics here // int nextRandomVisitTarget = -1; int visitCounter = 0; while ((nextRandomVisitTarget = registry.getNextRandomUnvisitedPosition()) != -1) { consoleLogger.finer(" random position pick step " + visitCounter + " visited: " + registry.getVisited().size() + ", unvisited: " + registry.getUnvisited().size() + "; nearest neighbor: " + nearestNeighborDist); // registry.markVisited(nextRandomVisitTarget); marker.markVisited(registry, nextRandomVisitTarget, windowSize); double[] randomTargetValues = getSubSeries(series, nextRandomVisitTarget, nextRandomVisitTarget + windowSize); double randomTargetDistance = EuclideanDistance.distance(currentLocations, randomTargetValues); // early abandoning of the search, the current word is not // discord, we seen better if (randomTargetDistance < bestSoFarDistance) { nearestNeighborDist = randomTargetDistance; consoleLogger.finer(" ** abandoning random visits loop, seen distance " + nearestNeighborDist + " at iteration " + visitCounter); completeSearch = false; break; } // keep track if (randomTargetDistance < nearestNeighborDist) { nearestNeighborDist = randomTargetDistance; } visitCounter = visitCounter + 1; } // while inner loop consoleLogger.finer("random visits loop finished, total positions considered: " + visitCounter); } // if break loop if (nearestNeighborDist > bestSoFarDistance) { bestSoFarDistance = 
nearestNeighborDist; bestSoFarPosition = currentPosition; bestSoFarString = String.valueOf(currentEntry.getStr()); } if (knownWordsAndTheirDistances.containsKey(currentWord) && knownWordsAndTheirDistances.get(currentWord).isAbandoned()) { knownWordsAndTheirDistances.put(String.valueOf(currentWord), new DistanceEntry( nearestNeighborDist, completeSearch)); } else { knownWordsAndTheirDistances.put(String.valueOf(currentWord), new DistanceEntry( nearestNeighborDist, completeSearch)); } consoleLogger.finer(" . . iterated " + idx + " times, best distance: " + bestSoFarDistance + " for a string " + bestSoFarString); idx++; } // outer loop return new DiscordRecord(bestSoFarPosition, bestSoFarDistance, bestSoFarString); } private static void cleanUpFrequencies(ArrayList<SAXTrieHitEntry> frequencies, String currentWord, int startPosition) { int i = startPosition + 1; while (i < frequencies.size()) { if (currentWord.equalsIgnoreCase(String.valueOf(frequencies.get(i).getStr()))) { frequencies.remove(i); } else { i++; } } } /** * Get N top motifs from trie. * * @param trie The trie. * @param maxMotifsNum The number of motifs to report. * @return The motifs collection. * @throws TrieException If error occurs. 
*/ private static MotifRecords getMotifs(SAXTrie trie, int maxMotifsNum) throws TrieException { MotifRecords res = new MotifRecords(maxMotifsNum); ArrayList<SAXTrieHitEntry> frequencies = trie.getFrequencies(); Collections.sort(frequencies); // all sorted - from one end we have unique words - those discords // from the other end - we have motifs - the most frequent entries // // what I'll do here - is to populate non-trivial frequent entries into // the resulting container // // picking those non-trivial patterns this method job // non-trivial here means the one which are not the same letters // Set<SAXTrieHitEntry> seen = new TreeSet<SAXTrieHitEntry>(); int counter = 0; // iterating backward - collection is sorted for (int i = frequencies.size() - 1; i >= 0; i--) { SAXTrieHitEntry entry = frequencies.get(i); if (entry.isTrivial(2) || seen.contains(entry) || (2 > entry.getFrequency())) { if ((2 > entry.getFrequency())) { break; } continue; } else { counter += 1; res.add(new MotifRecord(entry.getStr(), trie.getOccurences(entry.getStr()))); seen.add(entry); if (counter > maxMotifsNum) { break; } } } return res; } /** * Convert real-valued series into symbolic representation. * * @param vals Real valued timeseries. * @param windowSize The PAA window size. * @param cuts The cut values array used for SAX transform. * @return The symbolic representation of the given real time-series. * @throws TSException If error occurs. */ public static char[] getSaxVals(double[] vals, int windowSize, double[] cuts) throws TSException { char[] saxVals; if (windowSize == cuts.length + 1) { saxVals = TSUtils.ts2String(TSUtils.zNormalize(vals), cuts); } else { saxVals = TSUtils.ts2String(TSUtils.zNormalize(TSUtils.paa(vals, cuts.length + 1)), cuts); } return saxVals; } /** * Extracts sub-series from the WEKA-style series. * * @param data The series. * @param attribute The data-bearing attribute. * @param start The start timestamp. 
* @param end The end timestamp * @return sub-series from start to end. */ private static double[] getSubSeries(Instances data, Attribute attribute, int start, int end) { List<Instance> tmpList = data.subList(start, end); double[] vals = new double[end - start]; for (int i = 0; i < end - start; i++) { vals[i] = tmpList.get(i).value(attribute.index()); } return vals; } /** * Converts Instances into double array. * * @param tsData The instances data. * @param dataAttribute The attribute to use in conversion. * @return real-valued array. */ private static double[] toRealSeries(Instances tsData, Attribute dataAttribute) { double[] vals = new double[tsData.size()]; for (int i = 0; i < tsData.size(); i++) { vals[i] = tsData.get(i).value(dataAttribute.index()); } return vals; } /** * Extracts sub-series from series. * * @param data The series. * @param start The start position. * @param end The end position * @return sub-series from start to end. */ public static double[] getSubSeries(double[] data, int start, int end) { double[] vals = new double[end - start]; for (int i = 0; i < end - start; i++) { vals[i] = data[start + i]; } return vals; } /** * Brute force calculation of the distances. * * @param tsData timeseries. * @param dataAttributeName The pointer onto data-bearing attribute. * @param controls The control values. * @param window Window size. * @throws TSException if error occurs. 
*/
  public static void maxDistances(Instances tsData, String dataAttributeName, int[] controls,
      int window) throws TSException {

    // the attribute which bears the data values
    Attribute dataAttribute = tsData.attribute(dataAttributeName);

    // per control position: the smallest distance seen so far and the position it was seen at
    double[] distances = new double[controls.length];
    int[] maxPos = new int[controls.length];
    for (int c = 0; c < controls.length; c++) {
      maxPos[c] = -1;
      distances[c] = Double.MAX_VALUE;
    }

    // Slide the window over the series and compare every window with the window anchored at each
    // control position. Note that the two arrays above are only filled in: nothing is returned
    // and the reporting code below is commented out.
    for (int position = 0; (position + window) < tsData.size(); position++) {

      double[] vals = getSubSeries(tsData, dataAttribute, position, position + window);

      for (int c = 0; c < controls.length; c++) {
        // windows overlapping the control window are not compared
        if (Math.abs(controls[c] - position) >= window) {
          double[] oVals = getSubSeries(tsData, dataAttribute, controls[c], controls[c] + window);
          double dist = EuclideanDistance.distance(vals, oVals);
          if (dist < distances[c]) {
            distances[c] = dist;
            maxPos[c] = position;
          }
        }
      }
    }

    // for (int i = 0; i < controls.length; i++) {
    // System.out.println(controls[i] + " - " + distances[i] + ", at " +
    // maxPos[i]);
    // }

    // for (int i = 0; i < controls.length; i++) {
    // double[] is = getSubSeries(tsData, dataAttribute, controls[i],
    // controls[i] + window);
    // double[] os = getSubSeries(tsData, dataAttribute, maxPos[i],
    // maxPos[i] + window);
    // System.out.println(Arrays.toString(is) + "\n" + Arrays.toString(os));
    // }
  }

  // /**
  // *
  // * "We are given n, the length of the discords in advance, and we must
  // choose two parameters,
  // the
  // * cardinality of the SAX alphabet size a, and the SAX word size w. We
  // defer a discussion of how
  // * to set these parameters until"
  // *
  // *
  // * @param tsData timeseries.
  // * @param windowLength window length.
  // * @param paaSize The PAA window size.
  // * @param alphabetSize The SAX alphabet size.
  // * @param timeAttributeName Time-stamp attribute.
  // * @param dataAttributeName Value attribute.
// * @return top discords for the time-series given // * @throws TSException if error occurs. // */ // public static DiscordRecords getBruteForceDiscords(Instances tsData, int // windowLength, // int paaSize, int alphabetSize, String timeAttributeName, String // dataAttributeName) // throws TSException { // // double[] cuts = normalAlphabet.getCuts(alphabetSize); // // // get the timestamps and data attributes // // // Attribute dataAttribute = tsData.attribute(dataAttributeName); // double[] theRawData = // TSUtils.zNormalize(tsData.attributeToDoubleArray(dataAttribute.index())); // // // Init variables // // // DiscordRecords discords = new DiscordRecords(10); // DiscordRecord discord = new DiscordRecord(); // // XMLGregorianCalendar cTstamp = // Tstamp.makeTimestamp(System.currentTimeMillis()); // // // run the search loop // // // for (int i = 0; i < tsData.size() - windowLength; i++) { // // if (i % 100 == 0) { // XMLGregorianCalendar nTstamp = // Tstamp.makeTimestamp(System.currentTimeMillis()); // System.out.println("i: " + i + ", at: " + nTstamp + ", diff: " // + Tstamp.diff(cTstamp, nTstamp) + ", discord at: " + // discord.getPosition() // + ", distance: " + discord.getDistance()); // cTstamp = nTstamp; // } // // // fix the i-s string // // // // char[] ssA = TSUtils.ts2String(TSUtils.paa(TSUtils.normalize(Arrays // // .copyOfRange(theRawData, i, i + windowLength)), paaSize), cuts); // // // char[] ssA = TSUtils.ts2String( // TSUtils.paa(Arrays.copyOfRange(theRawData, i, i + windowLength), // paaSize), cuts); // // Integer nearestNeighborDist = Integer.MAX_VALUE; // // // the inner loop // // // for (int j = 0; j < tsData.size() - windowLength; j++) { // // // check for the trivial match // // // if (Math.abs(i - j) >= windowLength) { // // // get the SAX approximations of both series here // // // // char[] ssB = TSUtils.ts2String(TSUtils.paa(TSUtils.normalize(Arrays // // .copyOfRange(theRawData, j, j + windowLength)), paaSize), cuts); // // char[] ssB 
= TSUtils.ts2String( // TSUtils.paa(Arrays.copyOfRange(theRawData, j, j + windowLength), // paaSize), cuts); // // // get the distance here and early terminate if it's less than the // // largest // // // Integer tmpDist = strDistance(ssA, ssB); // // System.out.println(String.valueOf(ssA) + " VS " + // // String.valueOf(ssB) // // + " : " + tmpDist); // if (tmpDist == 0 || tmpDist < discords.getMinDistance()) { // break; // } // if (tmpDist < nearestNeighborDist) { // nearestNeighborDist = tmpDist; // } // } // // } // if ((nearestNeighborDist != Integer.MAX_VALUE) // && (nearestNeighborDist > discords.getMinDistance())) { // discord.setDistance(nearestNeighborDist); // discord.setIndex(i); // discords.add(discord); // discord = new DiscordRecord(); // } // } // i loop - outer // return discords; // } /** * * "We are given n, the length of the discords in advance, and we must choose two parameters, the * cardinality of the SAX alphabet size a, and the SAX word size w. We defer a discussion of how * to set these parameters until" * * * @param tsData timeseries. * @param windowLength window length. * @return top discords for the time-series given * @throws TSException if error occurs. 
*/
  public static DiscordRecords getBruteForceDiscords(double[] tsData, int windowLength)
      throws TSException {

    DiscordRecords discords = new DiscordRecords(100);

    // NOTE(review): the last window start (tsData.length - windowLength) is excluded by this
    // bound in both loops - confirm this is intended.
    final int limit = tsData.length - windowLength;

    // outer loop: every window is a discord candidate
    for (int i = 0; i < limit; i++) {

      double[] seriesA = getSubSeries(tsData, i, i + windowLength);

      // a primitive accumulator: the value is compared and updated in the inner loop, there is
      // no reason to box it on every update
      double nearestNeighborDist = Double.MAX_VALUE;

      // inner loop: find the closest window which does not overlap the candidate
      for (int j = 0; j < limit; j++) {
        if (Math.abs(i - j) < windowLength) {
          continue;
        }
        double[] seriesB = getSubSeries(tsData, j, j + windowLength);
        double dist = EuclideanDistance.distance(seriesA, seriesB);
        if (dist < nearestNeighborDist) {
          nearestNeighborDist = dist;
        }
      }

      // Every position is offered to the collection together with its nearest neighbor distance.
      // NOTE(review): when no non-overlapping window exists for a position, the distance offered
      // here is still Double.MAX_VALUE - confirm how DiscordRecords is expected to treat it.
      discords.add(new DiscordRecord(i, nearestNeighborDist));
    }
    return discords;
  }

  /**
   * Formats the time elapsed between two millisecond timestamps as
   * "&lt;minutes&gt;m &lt;seconds&gt;s &lt;milliseconds&gt;ms". The minutes field carries the
   * total number of whole minutes, so intervals of an hour and longer are reported in full
   * rather than being truncated to the minutes-within-the-hour part.
   *
   * @param start Start timestamp, milliseconds.
   * @param finish End timestamp, milliseconds.
   * @return String representation of the elapsed time.
   */
  private static String timeToString(long start, long finish) {
    final long secondInMillis = 1000L;
    final long minuteInMillis = secondInMillis * 60L;

    long elapsed = finish - start;

    long elapsedMinutes = elapsed / minuteInMillis;
    long remainder = elapsed % minuteInMillis;
    long elapsedSeconds = remainder / secondInMillis;
    long elapsedMilliseconds = remainder % secondInMillis;

    return elapsedMinutes + "m " + elapsedSeconds + "s " + elapsedMilliseconds + "ms";
  }

}