package net.seninp.grammarviz.anomaly; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.Date; import java.util.HashSet; import java.util.Random; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import net.seninp.gi.logic.RuleInterval; import net.seninp.jmotif.distance.EuclideanDistance; import net.seninp.jmotif.sax.SAXProcessor; import net.seninp.jmotif.sax.TSProcessor; import net.seninp.jmotif.sax.discord.DiscordRecord; import net.seninp.jmotif.sax.discord.DiscordRecords; /** * Implements RRA algorithm. * * @author psenin * */ public class RRAImplementation { private static TSProcessor tp = new TSProcessor(); private static EuclideanDistance ed = new EuclideanDistance(); // static block - we instantiate the logger // private static final Logger LOGGER = LoggerFactory.getLogger(RRAImplementation.class); /** * Implements RRA -- an anomaly discovery algorithm based on discretization and grammar inference. * RRA stands for rare rule anomaly. * * @param series The series to find discord at. * @param discordCollectionSize How many discords to find. * @param intervals The intervals. In our implementation these come from the set of Sequitur * grammar rules. * @param zNormThreshold - the normalization threshold (dstance). * @return Discords. * @throws TSException If error occurs. */ public static DiscordRecords series2RRAAnomalies(double[] series, int discordCollectionSize, ArrayList<RuleInterval> intervals, double zNormThreshold) throws Exception { Date gStart = new Date(); // resulting discords collection DiscordRecords discords = new DiscordRecords(); if (intervals.isEmpty()) { return discords; } // visit registry HashSet<Integer> registry = new HashSet<Integer>( discordCollectionSize * intervals.get(0).getLength() * 2); // we conduct the search until the number of discords is less than desired // while (discords.getSize() < discordCollectionSize) { LOGGER.trace( "currently known discords: " + discords.getSize() + " out of " + discordCollectionSize); Date start = new Date(); DiscordRecord bestDiscord = findBestDiscordForIntervals(series, intervals, registry, zNormThreshold); Date end = new Date(); // if the discord is null we getting out of the search if (bestDiscord.getNNDistance() == Integer.MIN_VALUE || bestDiscord.getPosition() == Integer.MIN_VALUE) { LOGGER.trace("breaking the outer search loop, discords found: " + discords.getSize() + " last seen discord: " + bestDiscord.toString()); break; } bestDiscord.setInfo("position " + bestDiscord.getPosition() + ", length " + bestDiscord.getLength() + ", NN distance " + bestDiscord.getNNDistance() + ", elapsed time: " + SAXProcessor.timeToString(start.getTime(), end.getTime()) + ", " + bestDiscord.getInfo()); LOGGER.debug(bestDiscord.getInfo()); // collect the result // discords.add(bestDiscord); // mark the discord discovered // int markStart = bestDiscord.getPosition() - bestDiscord.getLength(); int markEnd = bestDiscord.getPosition() + bestDiscord.getLength(); if (markStart < 0) { markStart = 0; } if (markEnd > series.length) { markEnd = series.length; } for (int i = markStart; i < markEnd; i++) { registry.add(i); } } LOGGER.info(discords.getSize() + " discords found in " + SAXProcessor.timeToString(gStart.getTime(), new Date().getTime())); // done deal // return discords; } /** * * @param series * @param globalIntervals * @param registry * @param zNormThreshold * @return * @throws Exception */ public static DiscordRecord findBestDiscordForIntervals(double[] series, ArrayList<RuleInterval> globalIntervals, HashSet<Integer> registry, double zNormThreshold) throws Exception { // prepare the visits array, note that there can't be more points to visit that in a SAX index int[] visitArray = new int[globalIntervals.size()]; // this is outer loop heuristics ArrayList<RuleInterval> intervals = cloneIntervals(globalIntervals); Collections.sort(intervals, new Comparator<RuleInterval>() { public int compare(RuleInterval c1, RuleInterval c2) { return Double.compare(c1.getCoverage(), c2.getCoverage()); } }); // init variables int bestSoFarPosition = Integer.MIN_VALUE; int bestSoFarLength = Integer.MIN_VALUE; int bestSoFarRule = Integer.MIN_VALUE; double bestSoFarDistance = Integer.MIN_VALUE; // we will iterate over words from rarest to frequent ones - this is an OUTER LOOP of the best // discord search // int iterationCounter = 0; int distanceCalls = 0; LOGGER .trace("going to iterate over " + intervals.size() + " intervals looking for the discord"); for (int i = 0; i < intervals.size(); i++) { iterationCounter++; RuleInterval currentEntry = intervals.get(i); // make sure it is not a previously found discord if (registry.contains(currentEntry.getStart())) { continue; } int currentPos = currentEntry.getStart(); String currentRule = String.valueOf(currentEntry.getId()); LOGGER.trace("iteration " + i + ", out of " + intervals.size() + ", rule " + currentRule + " at " + currentPos + ", length " + currentEntry.getLength()); // other occurrences of the current rule // TODO : this can be taken out of here to optimize multiple discords discovery ArrayList<Integer> currentOccurences = listRuleOccurrences(currentEntry.getId(), intervals); LOGGER.trace(" there are " + currentOccurences.size() + " occurrences for the rule " + currentEntry.getId() + ", iterating..."); // organize visited so-far positions tracking // int markStart = currentPos - currentEntry.getLength(); if (markStart < 0) { markStart = 0; } int markEnd = currentPos + currentEntry.getLength(); if (markEnd > series.length) { markEnd = series.length; } // all the candidates we are not going to try HashSet<Integer> alreadyVisited = new HashSet<Integer>( currentOccurences.size() + (markEnd - markStart)); for (int j = markStart; j < markEnd; j++) { alreadyVisited.add(j); } // extract the subsequence & mark visited current substring // double[] currentSubsequence = tp.subseriesByCopy(series, currentEntry.getStart(), // currentEntry.getEnd()); // so, lets the search begin... double nearestNeighborDist = Double.MAX_VALUE; boolean doRandomSearch = true; // this is the first INNER LOOP for (Integer nextOccurrenceIdx : currentOccurences) { RuleInterval nextOccurrence = intervals.get(nextOccurrenceIdx); // skip the location we standing at, check if we overlap if (alreadyVisited.contains(nextOccurrence.getStart())) { continue; } else { alreadyVisited.add(nextOccurrence.getStart()); } // double[] occurrenceSubsequence = extractSubsequence(series, nextOccurrence); double dist = normalizedDistance(series, currentEntry, nextOccurrence, zNormThreshold); distanceCalls++; // keep track of best so far distance if (dist < nearestNeighborDist) { nearestNeighborDist = dist; LOGGER.trace(" ** current NN at interval " + nextOccurrence.getStart() + "-" + nextOccurrence.getEnd() + ", distance: " + nearestNeighborDist); } if (dist < bestSoFarDistance) { LOGGER.trace(" ** abandoning the occurrences iterations"); doRandomSearch = false; break; } } // check if we must continue with random neighbors if (doRandomSearch) { LOGGER.trace("starting random search"); // init the visit array // int visitCounter = 0; int cIndex = 0; for (int j = 0; j < intervals.size(); j++) { RuleInterval interval = intervals.get(j); if (!(alreadyVisited.contains(interval.getStart()))) { visitArray[cIndex] = j; cIndex++; } } cIndex--; // shuffle the visit array // Random rnd = new Random(); for (int j = cIndex; j > 0; j--) { int index = rnd.nextInt(j + 1); int a = visitArray[index]; visitArray[index] = visitArray[j]; visitArray[j] = a; } // while there are unvisited locations while (cIndex >= 0) { RuleInterval randomInterval = intervals.get(visitArray[cIndex]); cIndex--; // double[] randomSubsequence = extractSubsequence(series, randomInterval); double dist = normalizedDistance(series, currentEntry, randomInterval, zNormThreshold); distanceCalls++; // early abandoning of the search: // the current word is not discord, we have seen better if (dist < bestSoFarDistance) { nearestNeighborDist = dist; LOGGER.trace(" ** abandoning random visits loop, seen distance " + nearestNeighborDist + " at iteration " + visitCounter); break; } // keep track if (dist < nearestNeighborDist) { LOGGER.trace(" ** current NN id rule " + randomInterval.getId() + " at " + randomInterval.startPos + ", distance: " + dist); nearestNeighborDist = dist; } visitCounter = visitCounter + 1; } // while inner loop } // end of random search branch if (nearestNeighborDist > bestSoFarDistance) { LOGGER.trace(" updating discord candidate: rule " + currentEntry.getId() + " at " + currentEntry.getStart() + " len " + currentEntry.getLength() + " NN dist: " + bestSoFarDistance); bestSoFarDistance = nearestNeighborDist; bestSoFarPosition = currentEntry.getStart(); bestSoFarLength = currentEntry.getLength(); bestSoFarRule = currentEntry.getId(); } LOGGER.trace(" . . iterated " + iterationCounter + " times, best distance: " + bestSoFarDistance + " for a rule " + bestSoFarRule + " at " + bestSoFarPosition + " len " + bestSoFarLength); } // outer loop DiscordRecord res = new DiscordRecord(bestSoFarPosition, bestSoFarDistance, "pos,calls,len,rule " + bestSoFarPosition + " " + distanceCalls + " " + bestSoFarLength + " " + bestSoFarRule); res.setLength(bestSoFarLength); res.setRuleId(bestSoFarRule); res.setInfo("distance calls: " + distanceCalls); return res; } /** * Computes the normalized distance. The whole idea is that rules map to subsequences of different * length. * * @param series * @param reference * @param candidate * @param zNormThreshold * @return * @throws Exception */ private static double normalizedDistance(double[] series, RuleInterval reference, RuleInterval candidate, double zNormThreshold) throws Exception { double[] ref = Arrays.copyOfRange(series, reference.getStart(), reference.getEnd()); double[] cand = Arrays.copyOfRange(series, candidate.getStart(), candidate.getEnd()); double divisor = Integer.valueOf(ref.length).doubleValue(); // if the reference is the longest, we shrink it down with PAA // if (ref.length > cand.length) { ref = tp.paa(ref, cand.length); divisor = Integer.valueOf(cand.length).doubleValue(); // update the normalization value } // if the candidate is longest, we shrink it with PAA too // else { cand = tp.paa(cand, ref.length); } return ed.distance(tp.znorm(ref, zNormThreshold), tp.znorm(cand, zNormThreshold)) / divisor; } // /** // * Extracts a time series subsequence corresponding to the grammar rule adjusting for its // length. // * // * @param series // * @param randomInterval // * @return // */ // private static double[] extractSubsequence(double[] series, RuleInterval randomInterval) { // return Arrays.copyOfRange(series, randomInterval.getStart(), randomInterval.getEnd()); // } /** * Finds all the Sequitur rules with a given Id and populates their start and end into the array. * * @param id The rule Id. * @param intervals The rule intervals. * @return map of start - end. */ private static ArrayList<Integer> listRuleOccurrences(int id, ArrayList<RuleInterval> intervals) { ArrayList<Integer> res = new ArrayList<Integer>(100); for (int j = 0; j < intervals.size(); j++) { RuleInterval i = intervals.get(j); if (id == i.getId()) { res.add(j); } } return res; } /** * Cloning an array. * * @param source the source array. * @return the clone. */ private static ArrayList<RuleInterval> cloneIntervals(ArrayList<RuleInterval> source) { ArrayList<RuleInterval> res = new ArrayList<RuleInterval>(source.size()); for (RuleInterval r : source) { res.add(new RuleInterval(r.getId(), r.getStart(), r.getEnd(), r.getCoverage())); } return res; } }