RRAImplementation.java example

Explorer

grammarviz2_src-master
- src
  - main
    - java
      - net
        seninp
        grammarviz
        GrammarSizeSorter.java
        GrammarVizAnomaly.java
        GrammarVizAnomalyParameters.java
        GrammarVizGUI.java
        anomaly
        AnomalyAlgorithm.java
        RRAImplementation.java
        cli
        RulesWriter.java
        TS2GrammarParameters.java
        TS2SequiturGrammar.java
        controller
        GrammarVizController.java
        logic
        CoverageCountStrategy.java
        GrammarVizAnomalyFinder.java
        GrammarVizChartData.java
        IntegerValueFilter.java
        PackedRuleRecord.java
        RulePeriodicityRecord.java
        SAXMotif.java
        SAXPointsNumber.java
        SameLengthMotifs.java
        model
        GrammarVizMessage.java
        GrammarVizModel.java
        session
        UserSession.java
        tinker
        OneEuroFilter.java
        view
        AboutGrammarVizDialog.java
        GrammarRulesPanel.java
        GrammarVizAnomaliesPanel.java
        GrammarVizView.java
        GrammarvizChartPanel.java
        GrammarvizGuesserDialog.java
        GrammarvizGuesserPane.java
        GrammarvizOptionsDialog.java
        GrammarvizOptionsPane.java
        GrammarvizParamsSampler.java
        GrammarvizRuleChartPanel.java
        MouseMarker.java
        PackedRulesPanel.java
        RulesPeriodicityPanel.java
        table
        AnomalyTableColumns.java
        AnomalyTableModel.java
        CellDoubleRenderer.java
        GrammarvizRulesTableColumns.java
        GrammarvizRulesTableDataModel.java
        GrammarvizRulesTableModel.java
        PeriodicityTableColumns.java
        PeriodicityTableModel.java
        PrunedRulesTableColumns.java
        PrunedRulesTableDataModel.java
        PrunedRulesTableModel.java
        tinker
        Interval.java
        MovieMaker.java
        MovieUtils.java
        PagePrinter.java
        PaperDiscordFinder.java
        PaperDiscordFinderHOTSAX.java
        PaperWindowSampler.java
        ParamSampler.java
        SamplerAnomaly.java
        SamplerAnomalyParameters.java
        SamplerRecord.java
        SamplingSorter.java
        util
        SAXFileIOHelper.java
  - test
    - java
      - net
        seninp
        grammarviz
        anomaly
        TestRRAanomaly.java
        tinker
        TestInterval.java

package net.seninp.grammarviz.anomaly;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashSet;
import java.util.Random;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import net.seninp.gi.logic.RuleInterval;
import net.seninp.jmotif.distance.EuclideanDistance;
import net.seninp.jmotif.sax.SAXProcessor;
import net.seninp.jmotif.sax.TSProcessor;
import net.seninp.jmotif.sax.discord.DiscordRecord;
import net.seninp.jmotif.sax.discord.DiscordRecords;

/**
 * Implements RRA algorithm.
 * 
 * @author psenin
 *
 */
public class RRAImplementation {

  private static TSProcessor tp = new TSProcessor();
  private static EuclideanDistance ed = new EuclideanDistance();

  // the class-wide logger
  //
  private static final Logger LOGGER = LoggerFactory.getLogger(RRAImplementation.class);

  /**
   * Implements RRA -- an anomaly discovery algorithm based on discretization and grammar inference.
   * RRA stands for rare rule anomaly.
   * 
   * The discords are discovered one at a time: after each discovery the positions surrounding the
   * discord (one discord length to each side of its start) are registered, so that intervals
   * starting there are not reported again.
   * 
   * @param series The series to find discord at.
   * @param discordCollectionSize How many discords to find; if it is not positive, an empty
   * collection is returned.
   * @param intervals The intervals. In our implementation these come from the set of Sequitur
   * grammar rules. If null or empty, an empty collection is returned.
   * @param zNormThreshold - the normalization threshold (distance).
   * @return Discords; there may be fewer than requested if the search runs out of candidates.
   * @throws Exception If error occurs.
   */
  public static DiscordRecords series2RRAAnomalies(double[] series, int discordCollectionSize,
      ArrayList<RuleInterval> intervals, double zNormThreshold) throws Exception {

    long gStart = System.currentTimeMillis();

    // resulting discords collection
    DiscordRecords discords = new DiscordRecords();

    // nothing to search in, or nothing has been asked for
    if (intervals == null || intervals.isEmpty() || discordCollectionSize <= 0) {
      return discords;
    }

    // visit registry; the capacity is only a pre-sizing hint, so it is computed in long to avoid
    // the int overflow and capped by the series length -- only the positions within
    // [0, series.length) are ever registered below
    long capacityHint = 2L * discordCollectionSize * intervals.get(0).getLength();
    int registryCapacity = (int) Math.max(0L, Math.min(capacityHint, (long) series.length));
    HashSet<Integer> registry = new HashSet<Integer>(registryCapacity);

    // we conduct the search until the number of discords is less than desired
    //
    while (discords.getSize() < discordCollectionSize) {

      LOGGER.trace("currently known discords: {} out of {}", discords.getSize(),
          discordCollectionSize);

      long start = System.currentTimeMillis();
      DiscordRecord bestDiscord = findBestDiscordForIntervals(series, intervals, registry,
          zNormThreshold);
      long end = System.currentTimeMillis();

      // if no discord has been found (the search reports this with Integer.MIN_VALUE used as the
      // distance or position) we are getting out of the search
      if (bestDiscord.getNNDistance() == Integer.MIN_VALUE
          || bestDiscord.getPosition() == Integer.MIN_VALUE) {
        LOGGER.trace("breaking the outer search loop, discords found: {} last seen discord: {}",
            discords.getSize(), bestDiscord);
        break;
      }

      bestDiscord.setInfo("position " + bestDiscord.getPosition() + ", length "
          + bestDiscord.getLength() + ", NN distance " + bestDiscord.getNNDistance()
          + ", elapsed time: " + SAXProcessor.timeToString(start, end) + ", "
          + bestDiscord.getInfo());
      LOGGER.debug(bestDiscord.getInfo());

      // collect the result
      //
      discords.add(bestDiscord);

      // mark the discord discovered: one discord length to each side of its start, clipped to
      // the series boundaries
      //
      int markStart = Math.max(0, bestDiscord.getPosition() - bestDiscord.getLength());
      int markEnd = Math.min(series.length, bestDiscord.getPosition() + bestDiscord.getLength());
      for (int i = markStart; i < markEnd; i++) {
        registry.add(i);
      }
    }

    LOGGER.info("{} discords found in {}", discords.getSize(),
        SAXProcessor.timeToString(gStart, System.currentTimeMillis()));

    // done deal
    //
    return discords;
  }

  /**
   * Searches the rule intervals for the best discord candidate, i.e. the interval whose nearest
   * neighbor among the other intervals is the farthest away.
   * 
   * The supplied list is not modified, the search works on its copy sorted by ascending coverage.
   * For each interval that does not start at a registered position, the distances are computed
   * first to the other occurrences of the same rule and then, unless the candidate has been ruled
   * out already, to the remaining intervals in a random order. A candidate is abandoned as soon as
   * a neighbor closer than the best so far discord distance is seen.
   * 
   * @param series The time series the intervals refer to.
   * @param globalIntervals The rule intervals to search through.
   * @param registry The positions to skip: an interval whose start is in this set is not considered
   * as a discord candidate.
   * @param zNormThreshold The z-normalization threshold passed to the distance computation.
   * @return The discord record; it is built with the position and the NN distance equal to
   * Integer.MIN_VALUE if no candidate has been found.
   * @throws Exception If the distance computation fails.
   */
  public static DiscordRecord findBestDiscordForIntervals(double[] series,
      ArrayList<RuleInterval> globalIntervals, HashSet<Integer> registry, double zNormThreshold)
      throws Exception {

    // prepare the visits array, it keeps the indexes of the intervals for the random search, so
    // there can't be more entries to visit than there are intervals
    int[] visitArray = new int[globalIntervals.size()];

    // this is outer loop heuristics: the copy of intervals gets sorted by the ascending coverage
    ArrayList<RuleInterval> intervals = cloneIntervals(globalIntervals);
    Collections.sort(intervals, new Comparator<RuleInterval>() {
      @Override
      public int compare(RuleInterval c1, RuleInterval c2) {
        return Double.compare(c1.getCoverage(), c2.getCoverage());
      }
    });

    // init variables, Integer.MIN_VALUE stands for "nothing found yet"
    int bestSoFarPosition = Integer.MIN_VALUE;
    int bestSoFarLength = Integer.MIN_VALUE;
    int bestSoFarRule = Integer.MIN_VALUE;

    double bestSoFarDistance = Integer.MIN_VALUE;

    // we will iterate over words from rarest to frequent ones - this is an OUTER LOOP of the best
    // discord search
    //
    int iterationCounter = 0;
    int distanceCalls = 0;

    // a single generator serves all the shuffles below
    Random rnd = new Random();

    LOGGER.trace("going to iterate over {} intervals looking for the discord", intervals.size());

    for (int i = 0; i < intervals.size(); i++) {

      iterationCounter++;

      RuleInterval currentEntry = intervals.get(i);

      // make sure it is not a previously found discord
      if (registry.contains(currentEntry.getStart())) {
        continue;
      }

      int currentPos = currentEntry.getStart();

      LOGGER.trace("iteration {}, out of {}, rule {} at {}, length {}", i, intervals.size(),
          currentEntry.getId(), currentPos, currentEntry.getLength());

      // other occurrences of the current rule
      // TODO : this can be taken out of here to optimize multiple discords discovery
      ArrayList<Integer> currentOccurences = listRuleOccurrences(currentEntry.getId(), intervals);
      LOGGER.trace(" there are {} occurrences for the rule {}, iterating...",
          currentOccurences.size(), currentEntry.getId());

      // organize visited so-far positions tracking: everything within one interval length to
      // each side of the current start is treated as overlapping with the candidate
      //
      int markStart = currentPos - currentEntry.getLength();
      if (markStart < 0) {
        markStart = 0;
      }
      int markEnd = currentPos + currentEntry.getLength();
      if (markEnd > series.length) {
        markEnd = series.length;
      }

      // all the candidates we are not going to try
      HashSet<Integer> alreadyVisited = new HashSet<Integer>(
          currentOccurences.size() + (markEnd - markStart));
      for (int j = markStart; j < markEnd; j++) {
        alreadyVisited.add(j);
      }

      // so, lets the search begin...
      double nearestNeighborDist = Double.MAX_VALUE;
      boolean doRandomSearch = true;

      // this is the first INNER LOOP: the other occurrences of the same rule
      for (Integer nextOccurrenceIdx : currentOccurences) {

        RuleInterval nextOccurrence = intervals.get(nextOccurrenceIdx);

        // skip the location we standing at, check if we overlap
        if (alreadyVisited.contains(nextOccurrence.getStart())) {
          continue;
        }
        else {
          alreadyVisited.add(nextOccurrence.getStart());
        }

        double dist = normalizedDistance(series, currentEntry, nextOccurrence, zNormThreshold);
        distanceCalls++;

        // keep track of best so far distance
        if (dist < nearestNeighborDist) {
          nearestNeighborDist = dist;
          LOGGER.trace(" ** current NN at interval {}-{}, distance: {}",
              nextOccurrence.getStart(), nextOccurrence.getEnd(), nearestNeighborDist);
        }
        // a neighbor closer than the best discord distance rules the current candidate out
        if (dist < bestSoFarDistance) {
          LOGGER.trace(" ** abandoning the occurrences iterations");
          doRandomSearch = false;
          break;
        }
      }

      // check if we must continue with random neighbors
      if (doRandomSearch) {
        LOGGER.trace("starting random search");

        // init the visit array with the indexes of all not yet visited intervals
        //
        int visitCounter = 0;
        int cIndex = 0;
        for (int j = 0; j < intervals.size(); j++) {
          RuleInterval interval = intervals.get(j);
          if (!(alreadyVisited.contains(interval.getStart()))) {
            visitArray[cIndex] = j;
            cIndex++;
          }
        }
        // from now on cIndex points at the last filled slot, it is -1 if there is none
        cIndex--;

        // shuffle the filled part of the visit array
        //
        for (int j = cIndex; j > 0; j--) {
          int index = rnd.nextInt(j + 1);
          int a = visitArray[index];
          visitArray[index] = visitArray[j];
          visitArray[j] = a;
        }

        // while there are unvisited locations
        while (cIndex >= 0) {

          RuleInterval randomInterval = intervals.get(visitArray[cIndex]);
          cIndex--;

          double dist = normalizedDistance(series, currentEntry, randomInterval, zNormThreshold);
          distanceCalls++;

          // early abandoning of the search:
          // the current word is not discord, we have seen better
          if (dist < bestSoFarDistance) {
            nearestNeighborDist = dist;
            LOGGER.trace(" ** abandoning random visits loop, seen distance {} at iteration {}",
                nearestNeighborDist, visitCounter);
            break;
          }

          // keep track
          if (dist < nearestNeighborDist) {
            LOGGER.trace(" ** current NN id rule {} at {}, distance: {}", randomInterval.getId(),
                randomInterval.startPos, dist);
            nearestNeighborDist = dist;
          }

          visitCounter = visitCounter + 1;

        } // while inner loop

      } // end of random search branch

      // NOTE(review): if no neighbor has been evaluated at all, nearestNeighborDist is still
      // Double.MAX_VALUE here and the current interval becomes the best candidate -- confirm
      // that this is the intended behavior
      if (nearestNeighborDist > bestSoFarDistance) {
        LOGGER.trace(" updating discord candidate: rule {} at {} len {} NN dist: {}",
            currentEntry.getId(), currentEntry.getStart(), currentEntry.getLength(),
            nearestNeighborDist);
        bestSoFarDistance = nearestNeighborDist;
        bestSoFarPosition = currentEntry.getStart();
        bestSoFarLength = currentEntry.getLength();
        bestSoFarRule = currentEntry.getId();
      }

      LOGGER.trace(" . . iterated {} times, best distance:  {} for a rule {} at {} len {}",
          iterationCounter, bestSoFarDistance, bestSoFarRule, bestSoFarPosition, bestSoFarLength);

    } // outer loop

    DiscordRecord res = new DiscordRecord(bestSoFarPosition, bestSoFarDistance,
        "pos,calls,len,rule " + bestSoFarPosition + " " + distanceCalls + " " + bestSoFarLength
            + " " + bestSoFarRule);

    res.setLength(bestSoFarLength);
    res.setRuleId(bestSoFarRule);
    res.setInfo("distance calls: " + distanceCalls);

    return res;
  }

  /**
   * Computes the distance between the subsequences covered by two rule intervals. Since the rules
   * map to subsequences of different length, the longer subsequence is reduced with PAA to the
   * length of the shorter one, both are z-normalized, and their Euclidean distance is divided by
   * that common length.
   * 
   * @param series The time series both intervals refer to.
   * @param reference The interval of the current discord candidate.
   * @param candidate The interval the reference is compared with.
   * @param zNormThreshold The z-normalization threshold.
   * @return The Euclidean distance divided by the length of the shorter subsequence.
   * @throws Exception If the PAA, z-normalization or distance computation fails.
   */
  private static double normalizedDistance(double[] series, RuleInterval reference,
      RuleInterval candidate, double zNormThreshold) throws Exception {

    double[] refValues = Arrays.copyOfRange(series, reference.getStart(), reference.getEnd());
    double[] candValues = Arrays.copyOfRange(series, candidate.getStart(), candidate.getEnd());

    // the comparison is carried out at the length of the shorter subsequence, the same value
    // normalizes the resulting distance
    final int commonLength = Math.min(refValues.length, candValues.length);

    if (refValues.length > commonLength) {
      // the reference is the longest, shrink it down with PAA
      refValues = tp.paa(refValues, commonLength);
    }
    else {
      // otherwise the candidate goes through PAA
      candValues = tp.paa(candValues, commonLength);
    }

    double[] normalizedRef = tp.znorm(refValues, zNormThreshold);
    double[] normalizedCand = tp.znorm(candValues, zNormThreshold);

    return ed.distance(normalizedRef, normalizedCand) / (double) commonLength;

  }

  // /**
  // * Extracts a time series subsequence corresponding to the grammar rule adjusting for its
  // length.
  // *
  // * @param series
  // * @param randomInterval
  // * @return
  // */
  // private static double[] extractSubsequence(double[] series, RuleInterval randomInterval) {
  // return Arrays.copyOfRange(series, randomInterval.getStart(), randomInterval.getEnd());
  // }

  /**
   * Collects the positions, within the given list, of all the intervals that belong to the rule
   * with the specified Id.
   * 
   * @param id The rule Id.
   * @param intervals The rule intervals.
   * @return the indexes (into the intervals list) of the matching intervals, in ascending order.
   */
  private static ArrayList<Integer> listRuleOccurrences(int id, ArrayList<RuleInterval> intervals) {
    final ArrayList<Integer> matches = new ArrayList<Integer>(100);
    int position = 0;
    for (RuleInterval entry : intervals) {
      if (entry.getId() == id) {
        matches.add(position);
      }
      position++;
    }
    return matches;
  }

  /**
   * Makes a copy of the intervals list. Every element of the result is a new RuleInterval built
   * from the Id, start, end, and coverage of its source counterpart, the order is preserved.
   * 
   * @param source the source list.
   * @return the copy.
   */
  private static ArrayList<RuleInterval> cloneIntervals(ArrayList<RuleInterval> source) {
    final int count = source.size();
    ArrayList<RuleInterval> copy = new ArrayList<RuleInterval>(count);
    for (int idx = 0; idx < count; idx++) {
      RuleInterval original = source.get(idx);
      copy.add(new RuleInterval(original.getId(), original.getStart(), original.getEnd(),
          original.getCoverage()));
    }
    return copy;
  }
}