package net.seninp.jmotif.sax.discord;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import net.seninp.jmotif.distance.EuclideanDistance;
import net.seninp.jmotif.sax.NumerosityReductionStrategy;
import net.seninp.jmotif.sax.SAXProcessor;
import net.seninp.jmotif.sax.TSProcessor;
import net.seninp.jmotif.sax.alphabet.NormalAlphabet;
import net.seninp.jmotif.sax.datastructure.FrequencyTableEntry;
import net.seninp.jmotif.sax.datastructure.SAXRecord;
import net.seninp.jmotif.sax.datastructure.SAXRecords;
import net.seninp.jmotif.sax.registry.MagicArrayEntry;
import net.seninp.jmotif.sax.registry.SlidingWindowMarkerAlgorithm;
import net.seninp.jmotif.sax.registry.VisitRegistry;

/**
 * Implements HOTSAX discord discovery algorithm.
 *
 * @author psenin
 */
public class HOTSAXImplementation {

  private static final TSProcessor tp = new TSProcessor();
  private static final SAXProcessor sp = new SAXProcessor();
  private static final EuclideanDistance ed = new EuclideanDistance();

  // the class logger
  private static final Logger LOGGER = LoggerFactory.getLogger(HOTSAXImplementation.class);

  /**
   * Hash-table backed implementation (in contrast to trie). Time series is converted into a
   * SAXRecords data structure first, Hash-table backed magic array created second. HOTSAX applied
   * third. Nearest neighbors are searched only among the subsequences which were produced by SAX
   * with specified numerosity reduction. Thus, if the strategy is EXACT or MINDIST, discords do not
   * match those produced by BruteForce or NONE.
   *
   * @param series The timeseries.
   * @param discordsNumToReport The number of discords to report.
   * @param windowSize SAX sliding window size.
   * @param paaSize SAX PAA value.
   * @param alphabetSize SAX alphabet size.
   * @param strategy the numerosity reduction strategy.
   * @param nThreshold the normalization threshold value.
   * @return The set of discords found within the time series, it may return less than asked for --
   * in this case, there are no more discords.
   * @throws Exception if error occurs.
   */
  public static DiscordRecords series2Discords(double[] series, int discordsNumToReport,
      int windowSize, int paaSize, int alphabetSize, NumerosityReductionStrategy strategy,
      double nThreshold) throws Exception {

    // fix the start time
    Date start = new Date();

    // get the SAX transform done
    NormalAlphabet normalA = new NormalAlphabet();
    SAXRecords sax = sp.ts2saxViaWindow(series, windowSize, paaSize,
        normalA.getCuts(alphabetSize), strategy, nThreshold);
    Date saxEnd = new Date();
    LOGGER.debug("discretized in {}, words: {}, indexes: {}",
        SAXProcessor.timeToString(start.getTime(), saxEnd.getTime()), sax.getRecords().size(),
        sax.getIndexes().size());

    // fill the array for the outer loop: one entry per SAX word along with its occurrence count
    ArrayList<MagicArrayEntry> magicArray = new ArrayList<MagicArrayEntry>(
        sax.getRecords().size());
    for (SAXRecord sr : sax.getRecords()) {
      magicArray.add(new MagicArrayEntry(String.valueOf(sr.getPayload()), sr.getIndexes().size()));
    }
    Date hashEnd = new Date();
    LOGGER.debug("Magic array filled in : {}",
        SAXProcessor.timeToString(saxEnd.getTime(), hashEnd.getTime()));

    DiscordRecords discords = getDiscordsWithMagic(series, sax, windowSize, magicArray,
        discordsNumToReport, nThreshold);
    Date end = new Date();
    LOGGER.debug("{} discords found in {}", discords.getSize(),
        SAXProcessor.timeToString(start.getTime(), end.getTime()));

    return discords;
  }

  /**
   * Repeatedly searches for the next best discord until the requested number of discords is
   * collected or the search yields nothing more. The span around each found discord is recorded in
   * a registry so subsequent searches skip candidates within it.
   *
   * @param series The timeseries.
   * @param sax The SAX data structure built for the series.
   * @param windowSize The sliding window size.
   * @param magicArray The magic heuristics array, it is sorted in place by this method.
   * @param discordCollectionSize The number of discords to collect.
   * @param nThreshold The z-normalization threshold.
   * @return The discords found, possibly fewer than asked for.
   * @throws Exception if error occurs.
   */
  private static DiscordRecords getDiscordsWithMagic(double[] series, SAXRecords sax,
      int windowSize, ArrayList<MagicArrayEntry> magicArray, int discordCollectionSize,
      double nThreshold) throws Exception {

    // sort the candidates using the natural ordering of MagicArrayEntry
    // NOTE(review): presumably this puts the rarest words first -- confirm in MagicArrayEntry
    Collections.sort(magicArray);

    // resulting discords collection
    DiscordRecords discords = new DiscordRecords();

    // visit registry
    HashSet<Integer> visitRegistry = new HashSet<Integer>(windowSize * discordCollectionSize);

    // we conduct the search until the number of discords is less than desired
    //
    while (discords.getSize() < discordCollectionSize) {

      LOGGER.trace("currently known discords: {} out of {}", discords.getSize(),
          discordCollectionSize);

      Date start = new Date();
      DiscordRecord bestDiscord = findBestDiscordWithMagic(series, windowSize, sax, magicArray,
          visitRegistry, nThreshold);
      Date end = new Date();

      // a zero distance or the -1 position means that nothing was found, get out of the search
      if (bestDiscord.getNNDistance() == 0.0D || bestDiscord.getPosition() == -1) {
        LOGGER.trace("breaking the outer search loop, discords found: {} last seen discord: {}",
            discords.getSize(), bestDiscord);
        break;
      }

      bestDiscord.setInfo(
          "position " + bestDiscord.getPosition() + ", NN distance " + bestDiscord.getNNDistance()
              + ", elapsed time: " + SAXProcessor.timeToString(start.getTime(), end.getTime())
              + ", " + bestDiscord.getInfo());
      LOGGER.debug("{}", bestDiscord.getInfo());

      // collect the result
      //
      discords.add(bestDiscord);

      // and maintain data structures; the interval is not clamped to the series bounds because
      // the registry is a plain set of integers, so out-of-range values in it are harmless
      //
      int markStart = bestDiscord.getPosition() - windowSize;
      int markEnd = bestDiscord.getPosition() + windowSize;
      LOGGER.debug("marking as globally visited [{}, {}]", markStart, markEnd);
      for (int i = markStart; i < markEnd; i++) {
        visitRegistry.add(i);
      }
    }

    // done deal
    //
    return discords;
  }

  /**
   * This method reports the best found discord. Note, that this discord is approximately the best.
   * Due to the fuzzy-logic search with randomization and aggressive labeling of the magic array
   * locations.
   *
   * @param series The series we are looking for discord in.
   * @param windowSize The sliding window size.
   * @param sax The SAX data structure for the reference.
   * @param allWords The magic heuristics array.
   * @param discordRegistry The global visit array.
   * @param nThreshold The z-normalization threshold.
   * @return The best discord instance; its position is -1 and its distance is 0 if none was found.
   * @throws Exception If error occurs.
   */
  private static DiscordRecord findBestDiscordWithMagic(double[] series, int windowSize,
      SAXRecords sax, ArrayList<MagicArrayEntry> allWords, HashSet<Integer> discordRegistry,
      double nThreshold) throws Exception {

    // prepare the visits array, it is reused by every candidate's random search and is large
    // enough since there can't be more points to visit than there are points in the series
    int[] visitArray = new int[series.length];

    // a single generator serves all the shuffles of this search
    Random rnd = new Random();

    // init tracking variables
    int bestSoFarPosition = -1;
    double bestSoFarDistance = 0.0D;
    String bestSoFarWord = "";

    // discord search stats
    int iterationCounter = 0;
    int distanceCalls = 0;

    LOGGER.debug("iterating over {} entries", allWords.size());

    for (MagicArrayEntry currentEntry : allWords) {

      // look into that entry
      String currentWord = currentEntry.getStr();
      Set<Integer> occurrences = sax.getByWord(currentWord).getIndexes();

      // we shall iterate over these candidate positions first
      for (int currentPos : occurrences) {

        iterationCounter++;

        // make sure it is not a previously found discord passed through the parameters array
        //
        // note, that the discordRegistry contains the whole span of previously found discord,
        // not just its position
        //
        if (discordRegistry.contains(currentPos)) {
          continue;
        }

        LOGGER.trace("conducting search for {} at {}, iteration {}", currentWord, currentPos,
            iterationCounter);

        // the exclusion zone around the candidate; not clamped to the series bounds because it
        // only feeds a set of integers
        int markStart = currentPos - windowSize;
        int markEnd = currentPos + windowSize;

        // all the candidates we are not going to try
        HashSet<Integer> alreadyVisited = new HashSet<Integer>(
            occurrences.size() + (markEnd - markStart));
        for (int i = markStart; i < markEnd; i++) {
          alreadyVisited.add(i);
        }

        // fix the current subsequence trace
        double[] currentCandidateSeq = tp
            .znorm(tp.subseriesByCopy(series, currentPos, currentPos + windowSize), nThreshold);

        // let the search begin ..
        double nearestNeighborDist = Double.MAX_VALUE;
        boolean doRandomSearch = true;

        // the occurrences of the same word go first
        for (Integer nextOccurrence : occurrences) {

          // skip the overlapping and already seen positions, register the new ones
          if (!alreadyVisited.add(nextOccurrence)) {
            continue;
          }

          double dist = distance(currentCandidateSeq, series, nextOccurrence,
              nextOccurrence + windowSize, nThreshold);
          distanceCalls++;

          // keep track of best so far distance
          if (dist < nearestNeighborDist) {
            nearestNeighborDist = dist;
            LOGGER.trace(" ** current NN at {}, distance: {}, pos {}", nextOccurrence,
                nearestNeighborDist, currentPos);
          }
          // a neighbor closer than the best discord's one disqualifies this candidate
          if (dist < bestSoFarDistance) {
            LOGGER.trace(
                " ** abandoning the occurrences loop, distance {} is less than the best so far {}",
                dist, bestSoFarDistance);
            doRandomSearch = false;
            break;
          }
        }

        // check if we must continue with random neighbors
        if (doRandomSearch) {
          LOGGER.trace("starting random search");

          // init the visit array with all the positions which were not visited yet
          //
          int visitCounter = 0;
          int cIndex = 0;
          for (int i = 0; i < series.length - windowSize; i++) {
            if (!alreadyVisited.contains(i)) {
              visitArray[cIndex] = i;
              cIndex++;
            }
          }
          // from here on cIndex is the index of the last filled cell, -1 if there is none
          cIndex--;

          // shuffle the filled part of the visit array
          //
          for (int i = cIndex; i > 0; i--) {
            int index = rnd.nextInt(i + 1);
            int a = visitArray[index];
            visitArray[index] = visitArray[i];
            visitArray[i] = a;
          }

          // while there are unvisited locations
          while (cIndex >= 0) {

            int randomPos = visitArray[cIndex];
            cIndex--;

            double dist = distance(currentCandidateSeq, series, randomPos,
                randomPos + windowSize, nThreshold);
            distanceCalls++;

            // keep track
            if (dist < nearestNeighborDist) {
              LOGGER.trace(" ** current NN at {}, distance: {}", randomPos, dist);
              nearestNeighborDist = dist;
            }

            // early abandoning of the search:
            // the current word is not discord, we have seen better
            if (dist < bestSoFarDistance) {
              nearestNeighborDist = dist;
              LOGGER.trace(" ** abandoning random visits loop, seen distance {} at iteration {}",
                  nearestNeighborDist, visitCounter);
              break;
            }

            visitCounter = visitCounter + 1;
          } // while inner loop
        } // end of random search loop

        // MAX_VALUE means that there was not a single neighbor to compare with
        if (nearestNeighborDist > bestSoFarDistance && nearestNeighborDist < Double.MAX_VALUE) {
          bestSoFarDistance = nearestNeighborDist;
          bestSoFarPosition = currentPos;
          bestSoFarWord = currentWord;
          LOGGER.debug("discord updated: pos {}, dist {}", bestSoFarPosition, bestSoFarDistance);
        }

        LOGGER.trace(" . . iterated {} times, best distance: {} for a string {} at {}",
            iterationCounter, bestSoFarDistance, bestSoFarWord, bestSoFarPosition);
      } // outer loop inner part
    } // outer loop

    LOGGER.trace("Distance calls: {}", distanceCalls);

    DiscordRecord res = new DiscordRecord(bestSoFarPosition, bestSoFarDistance, bestSoFarWord);
    res.setLength(windowSize);
    res.setInfo("distance calls: " + distanceCalls);
    return res;
  }

  /**
   * Old implementation. Nearest neighbor for a candidate subsequence is searched among all other
   * time-series subsequences, regardless of their exclusion by EXACT or MINDIST. This also accepts
   * a marker implementation which is responsible for marking a segment as visited.
   *
   * @param series The timeseries.
   * @param discordsNumToReport The number of discords to report.
   * @param windowSize SAX sliding window size.
   * @param paaSize SAX PAA value.
   * @param alphabetSize SAX alphabet size.
   * @param markerAlgorithm marker algorithm.
   * @param strategy the numerosity reduction strategy.
   * @param nThreshold the normalization threshold value.
   * @return Discords found within the series.
   * @throws Exception if error occurs.
   * @deprecated this is the old implementation.
   */
  @Deprecated
  public static DiscordRecords series2DiscordsDeprecated(double[] series,
      int discordsNumToReport, int windowSize, int paaSize, int alphabetSize,
      SlidingWindowMarkerAlgorithm markerAlgorithm, NumerosityReductionStrategy strategy,
      double nThreshold) throws Exception {

    Date start = new Date();

    // get the SAX transform; the PAA size goes third, the same way series2Discords passes it
    NormalAlphabet normalA = new NormalAlphabet();
    SAXRecords sax = sp.ts2saxViaWindow(series, windowSize, paaSize,
        normalA.getCuts(alphabetSize), strategy, nThreshold);
    Date saxEnd = new Date();
    LOGGER.debug("discretized in {}, words: {}, indexes: {}",
        SAXProcessor.timeToString(start.getTime(), saxEnd.getTime()), sax.getRecords().size(),
        sax.getIndexes().size());

    // instantiate the hash
    HashMap<String, ArrayList<Integer>> hash = new HashMap<String, ArrayList<Integer>>();

    // fill the hash: a word maps to the list of positions it was seen at
    for (SAXRecord sr : sax.getRecords()) {
      String word = String.valueOf(sr.getPayload());
      for (Integer pos : sr.getIndexes()) {
        ArrayList<Integer> positions = hash.get(word);
        if (positions == null) {
          positions = new ArrayList<Integer>();
          hash.put(word, positions);
        }
        positions.add(pos);
      }
    }
    Date hashEnd = new Date();
    LOGGER.debug("Hash filled in : {}",
        SAXProcessor.timeToString(saxEnd.getTime(), hashEnd.getTime()));

    DiscordRecords discords = getDiscordsWithHash(series, windowSize, hash, discordsNumToReport,
        markerAlgorithm, nThreshold);
    Date end = new Date();
    LOGGER.info("{} discords found in {}", discords.getSize(),
        SAXProcessor.timeToString(start.getTime(), end.getTime()));

    return discords;
  }

  /**
   * Repeatedly searches for the next best discord until the requested number of discords is
   * collected or the search yields nothing more. Each found discord is handed over to the marker
   * algorithm which labels locations of the global registry as visited.
   *
   * @param series The timeseries.
   * @param windowSize The sliding window size.
   * @param hash The hash-based magic array: SAX words mapped to their positions.
   * @param discordCollectionSize The number of discords to collect.
   * @param markerAlgorithm The marker algorithm.
   * @param nThreshold The z-normalization threshold.
   * @return The discords found, possibly fewer than asked for.
   * @throws Exception if error occurs.
   */
  @Deprecated
  private static DiscordRecords getDiscordsWithHash(double[] series, int windowSize,
      HashMap<String, ArrayList<Integer>> hash, int discordCollectionSize,
      SlidingWindowMarkerAlgorithm markerAlgorithm, double nThreshold) throws Exception {

    // resulting discords collection
    DiscordRecords discords = new DiscordRecords();

    // visit registry. the idea is to mark as visited all the discord
    // locations for all searches. in other words, if the discord was found, its location is marked
    // as visited and there will be no search. IT CANT SPAN BEYOND series.length - windowSize
    //
    // NOTE(review): the second argument here is windowSize while findBestDiscordWithHash calls
    // the same two-argument method with series.length as the second argument -- confirm against
    // VisitRegistry which of the two calls matches its contract
    VisitRegistry globalTrackVisitRegistry = new VisitRegistry(series.length);
    globalTrackVisitRegistry.markVisited(series.length - windowSize, windowSize);

    // we conduct the search until the number of discords is less than desired
    //
    while (discords.getSize() < discordCollectionSize) {

      LOGGER.trace("currently known discords: {} out of {}", discords.getSize(),
          discordCollectionSize);

      Date start = new Date();
      DiscordRecord bestDiscord = findBestDiscordWithHash(series, windowSize, hash,
          globalTrackVisitRegistry, nThreshold);
      Date end = new Date();

      // a zero distance or the -1 position means that nothing was found, get out of the search
      if (bestDiscord.getNNDistance() == 0.0D || bestDiscord.getPosition() == -1) {
        LOGGER.trace("breaking the outer search loop, discords found: {} last seen discord: {}",
            discords.getSize(), bestDiscord);
        break;
      }

      bestDiscord.setInfo(
          "position " + bestDiscord.getPosition() + ", NN distance " + bestDiscord.getNNDistance()
              + ", elapsed time: " + SAXProcessor.timeToString(start.getTime(), end.getTime())
              + ", " + bestDiscord.getInfo());
      // the info is data, not a format string, so it goes in as an argument
      LOGGER.debug("{}", bestDiscord.getInfo());

      // collect the result
      //
      discords.add(bestDiscord);

      // and maintain data structures
      //
      markerAlgorithm.markVisited(globalTrackVisitRegistry, bestDiscord.getPosition(),
          windowSize);
    }

    // done deal
    //
    return discords;
  }

  /**
   * This method reports the best found discord. Note, that this discord is approximately the best.
   * Due to the fuzzy-logic search with randomization and aggressive labeling of the magic array
   * locations.
   *
   * @param series The series we are looking for discord in.
   * @param windowSize The sliding window size.
   * @param hash The hash-based magic array.
   * @param globalRegistry The global visit registry, visited locations are skipped as candidates.
   * @param nThreshold the z-Normalization threshold.
   * @return The best discord instance; its position is -1 and its distance is 0 if none was found.
   * @throws Exception If error occurs.
   */
  @Deprecated
  private static DiscordRecord findBestDiscordWithHash(double[] series, int windowSize,
      HashMap<String, ArrayList<Integer>> hash, VisitRegistry globalRegistry, double nThreshold)
      throws Exception {

    // we extract all seen words from the hash and sort them, so the head of the array has the
    // rarest word
    ArrayList<FrequencyTableEntry> frequencies = hashToFreqEntries(hash);
    Collections.sort(frequencies);

    // init tracking variables
    int bestSoFarPosition = -1;
    double bestSoFarDistance = 0.0D;
    String bestSoFarWord = "";

    // discord search stats
    int iterationCounter = 0;
    int distanceCalls = 0;

    // walk the sorted entries by index, removing the head of an array list on each iteration
    // would shift all the remaining elements every time
    int totalEntries = frequencies.size();
    for (int entryIdx = 0; entryIdx < totalEntries; entryIdx++) {

      iterationCounter++;

      FrequencyTableEntry currentEntry = frequencies.get(entryIdx);
      int entriesToGo = totalEntries - entryIdx - 1;

      String currentWord = String.valueOf(currentEntry.getStr());
      int currentPos = currentEntry.getPosition();

      // make sure it is not previously found discord passed through the parameters array
      if (globalRegistry.isVisited(currentPos)) {
        continue;
      }

      // the registry of locations we are not going to try: the tail of the series and the
      // neighborhood of the candidate, clamped to the series bounds
      VisitRegistry randomRegistry = new VisitRegistry(series.length);
      randomRegistry.markVisited(series.length - windowSize, series.length);

      int markStart = currentPos - windowSize;
      if (markStart < 0) {
        markStart = 0;
      }
      int markEnd = currentPos + windowSize;
      if (markEnd > series.length) {
        markEnd = series.length;
      }
      randomRegistry.markVisited(markStart, markEnd);

      LOGGER.trace("conducting search for {} at {}, iteration {}, to go: {}", currentWord,
          currentPos, iterationCounter, entriesToGo);

      // fix the current subsequence trace
      double[] currentCandidateSeq = tp
          .znorm(tp.subseriesByCopy(series, currentPos, currentPos + windowSize), nThreshold);

      // let the search begin ..
      double nearestNeighborDist = Double.MAX_VALUE;
      boolean doRandomSearch = true;

      // WE ARE GOING TO ITERATE OVER THE CURRENT WORD OCCURRENCES HERE FIRST
      List<Integer> currentWordOccurrences = hash.get(currentWord);
      for (Integer nextOccurrence : currentWordOccurrences) {

        // just in case there is an overlap
        if (randomRegistry.isVisited(nextOccurrence.intValue())) {
          continue;
        }
        randomRegistry.markVisited(nextOccurrence.intValue());

        // get the subsequence and the distance
        double[] occurrenceSubsequence = tp.znorm(
            tp.subseriesByCopy(series, nextOccurrence, nextOccurrence + windowSize), nThreshold);
        double dist = ed.distance(currentCandidateSeq, occurrenceSubsequence);
        distanceCalls++;

        // keep track of best so far distance
        if (dist < nearestNeighborDist) {
          nearestNeighborDist = dist;
          LOGGER.trace(" ** current NN at {}, distance: {}, pos {}", nextOccurrence,
              nearestNeighborDist, currentPos);
        }
        // a neighbor closer than the best discord's one disqualifies this candidate
        if (dist < bestSoFarDistance) {
          LOGGER.trace(
              " ** abandoning the occurrences loop, distance {} is less than the best so far {}",
              dist, bestSoFarDistance);
          doRandomSearch = false;
          break;
        }
      }

      // check if we must continue with random neighbors
      if (doRandomSearch) {
        LOGGER.trace("starting random search");

        int visitCounter = 0;

        // while there are unvisited locations, -1 is what the registry answers with once it has
        // no position left to offer
        int randomPos = -1;
        while (-1 != (randomPos = randomRegistry.getNextRandomUnvisitedPosition())) {

          randomRegistry.markVisited(randomPos);

          double[] randomSubsequence = tp
              .znorm(tp.subseriesByCopy(series, randomPos, randomPos + windowSize), nThreshold);
          double dist = ed.distance(currentCandidateSeq, randomSubsequence);
          distanceCalls++;

          // early abandoning of the search:
          // the current word is not discord, we have seen better
          if (dist < bestSoFarDistance) {
            nearestNeighborDist = dist;
            LOGGER.trace(" ** abandoning random visits loop, seen distance {} at iteration {}",
                nearestNeighborDist, visitCounter);
            break;
          }

          // keep track
          if (dist < nearestNeighborDist) {
            LOGGER.trace(" ** current NN at {}, distance: {}, pos {}", randomPos, dist,
                currentPos);
            nearestNeighborDist = dist;
          }

          visitCounter = visitCounter + 1;
        } // while inner loop
      } // end of random search loop

      // MAX_VALUE means that there was not a single neighbor to compare with, such a candidate
      // must not be reported as a discord
      if (nearestNeighborDist > bestSoFarDistance && nearestNeighborDist < Double.MAX_VALUE) {
        bestSoFarDistance = nearestNeighborDist;
        bestSoFarPosition = currentPos;
        bestSoFarWord = currentWord;
        LOGGER.debug("discord updated: pos {}, dist {}", bestSoFarPosition, bestSoFarDistance);
      }

      LOGGER.trace(" . . iterated {} times, best distance: {} for a string {} at {}",
          iterationCounter, bestSoFarDistance, bestSoFarWord, bestSoFarPosition);
    } // outer loop

    LOGGER.trace("Distance calls: {}", distanceCalls);

    DiscordRecord res = new DiscordRecord(bestSoFarPosition, bestSoFarDistance, bestSoFarWord);
    res.setLength(windowSize);
    res.setInfo("distance calls: " + distanceCalls);
    return res;
  }

  /**
   * Translates the hash table into sortable array of substrings.
   *
   * @param hash The hash table: words mapped to the lists of their positions.
   * @return One entry per each position of each word; every entry carries the position, a copy of
   * the word's characters, and the number of positions the word has.
   */
  @Deprecated
  private static ArrayList<FrequencyTableEntry> hashToFreqEntries(
      HashMap<String, ArrayList<Integer>> hash) {
    ArrayList<FrequencyTableEntry> res = new ArrayList<FrequencyTableEntry>();
    for (Entry<String, ArrayList<Integer>> e : hash.entrySet()) {
      char[] payload = e.getKey().toCharArray();
      int frequency = e.getValue().size();
      for (Integer i : e.getValue()) {
        // each entry gets its own copy of the characters
        res.add(new FrequencyTableEntry(i, payload.clone(), frequency));
      }
    }
    return res;
  }

  /**
   * Calculates the Euclidean distance between a z-normalized subsequence and the z-normalized
   * interval of a series. Don't use this unless you need that.
   *
   * @param subseries The first subsequence -- ASSUMED TO BE Z-normalized.
   * @param series The series the second subsequence is extracted from.
   * @param from the initial index of the range to be copied, inclusive.
   * @param to the final index of the range to be copied, exclusive.
   * @param nThreshold z-Normalization threshold.
   * @return The Euclidean distance between z-Normalized versions of subsequences.
   * @throws Exception if error occurs.
   */
  private static double distance(double[] subseries, double[] series, int from, int to,
      double nThreshold) throws Exception {
    double[] subsequence = tp.znorm(tp.subseriesByCopy(series, from, to), nThreshold);
    // a primitive accumulator, so there is no boxing on each iteration
    double sum = 0D;
    for (int i = 0; i < subseries.length; i++) {
      double tmp = subseries[i] - subsequence[i];
      sum = sum + tmp * tmp;
    }
    return Math.sqrt(sum);
  }

}