/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.addthis.hydra.data.util;
import java.util.ArrayList;
import java.util.Random;
import java.util.TreeMap;
import com.addthis.codec.codables.Codable;
/**
* Tools for finding change points in an integer array.
*/
public class ChangePoints implements Codable {
/**
* Finds statistically significant changes in an Integer Array
* Returns a Map: keys are indices of big changes
* vals are estimated sizes of these changes
*/
public static TreeMap<Integer, Integer> findChangePoints(Integer[] data, double threshold_ratio, int min_size, double inactive_threshold) {
ArrayList<Integer> candidates = FCPhelper(data, 0, data.length - 1);
return checkCandidates(candidates, data, threshold_ratio, min_size);
}
public static TreeMap<Integer, Integer> findPeaks2(Integer[] data, int max_width, int min_height) {
TreeMap<Integer, Integer> rv = new TreeMap<>();
int currIndex = 0;
int currHt = data[0];
int currWidth = 0;
boolean started = false;
if (data.length <= 1) return rv;
for (int i = 0; i < data.length; i++) {
if (data[i] > currHt && data[i] > min_height) {
started = true;
currIndex = i;
currHt = data[i];
currWidth = 0;
} else {
if (started) {
currWidth++;
if (currWidth >= max_width) {
started = false;
rv.put(currIndex, currHt);
currWidth = 0;
currIndex = -1;
currHt = 0;
}
}
}
}
if (started) {
rv.put(currIndex, currHt);
}
return rv;
}
public static TreeMap<Integer, Integer> findPeaks(Integer[] data, int min_size) {
TreeMap<Integer, Integer> rv = new TreeMap<>();
Integer lastPeak = -1;
for (int i = 1; i < data.length; i++) {
int lb, rb;
if (i < 3 || i > data.length - 5) {
continue;
}
lb = i - 3;
rb = i + 3;
Integer[] leftNbhd = slice(data, lb, i - 1);
Integer[] rightNbhd = slice(data, i + 1, rb);
Integer del = differedPeaky(leftNbhd, rightNbhd, data[i]);
if (Math.abs(del) > min_size) // A peak of width 1
{
rv.put(i, del);
} else // A peak of width 2
{
Integer pairMean = (data[i] + data[i + 1]) / 2;
leftNbhd = slice(data, lb, i - 1);
rightNbhd = slice(data, i + 2, rb + 1);
del = differedPeaky(leftNbhd, rightNbhd, pairMean);
if (Math.abs(del) > min_size) // A peak of width 1
{
rv.put(i, del);
}
}
}
return rv;
}
private static int differedPeaky(Integer[] A, Integer[] B, Integer z) {
// Switch to a linear regression approach?
double diffA = z - mean(A);
double diffB = z - mean(B);
if (Math.abs(diffA) > 3 * Math.sqrt(var(A))
&& Math.abs(diffB) > 3 * Math.sqrt(var(B))
&& (diffA > 0) == (diffB > 0))
{
return (int) diffA;
} else {
return 0;
}
}
public static TreeMap<String, Integer> findChangePointsDetails(Integer[] data, double threshold_ratio, int min_size, int inactive_threshold) {
ArrayList<Integer> candidates = FCPhelper(data, 0, data.length - 1);
return checkAndSortCandidates(candidates, data, threshold_ratio, min_size, inactive_threshold);
}
private static ArrayList<Integer> FCPhelper(Integer[] data, int a, int b) {
ArrayList<Integer> rv = new ArrayList<>();
if (b > a) {
Integer[] currSlice = slice(data, a, b);
if (changeOccured(currSlice)) {
int index = findMinMSE(currSlice) + a;
if (index - 1 > a) {
ArrayList<Integer> firstCPs = FCPhelper(data, a, index - 1);
for (int z : firstCPs) {
rv.add(z);
}
}
rv.add(index);
if (index + 1 < b) {
ArrayList<Integer> secondCPs = FCPhelper(data, index + 1, b);
for (int z : secondCPs) {
rv.add(z);
}
}
}
}
return rv;
}
private static TreeMap<Integer, Integer> checkCandidates(ArrayList<Integer> cands, Integer[] data, double threshold_ratio, int min_size) {
TreeMap<Integer, Integer> rv = new TreeMap<>();
for (int i = 0; i < cands.size(); i++) {
int z = cands.get(i);
int lb, rb;
if (z < 2 || z > data.length - 4) {
continue;
}
if (i == 0) {
lb = Math.min(z - 2, z - z / 4);
} else {
int prevCand = cands.get(i - 1);
lb = Math.min(z - 2, z - (z - prevCand) / 4);
}
if (i == cands.size() - 1) {
rb = Math.max(z + 3, z + (data.length - z) / 4);
} else {
int nextCand = cands.get(i + 1);
rb = Math.max(z + 3, z + (nextCand - z) / 4);
}
int del = differed(slice(data, lb, z), slice(data, z + 1, rb), threshold_ratio);
if (Math.abs(del) > min_size) {
rv.put(z, del);
}
}
return rv;
}
private static TreeMap<String, Integer> checkAndSortCandidates(ArrayList<Integer> cands, Integer[] data, double threshold_ratio, int min_size, int inactive_threshold) {
TreeMap<String, Integer> rv = new TreeMap<>();
for (int i = 0; i < cands.size(); i++) {
Integer z = cands.get(i);
int lb, rb;
if (z < 2 || z > data.length - 4) {
continue;
}
if (i == 0) {
lb = Math.min(z - 2, z - z / 4);
} else {
int prevCand = cands.get(i - 1);
lb = Math.min(z - 2, z - (z - prevCand) / 4);
}
if (i == cands.size() - 1) {
rb = Math.max(z + 3, z + (data.length - z) / 4);
} else {
int nextCand = cands.get(i + 1);
rb = Math.max(z + 3, z + (nextCand - z) / 4);
}
int del = differed(slice(data, lb, z), slice(data, z + 1, rb), threshold_ratio);
// Currently recalculates means.
double meanA = mean(slice(data, lb, z));
double meanB = mean(slice(data, z + 1, rb));
if (Math.abs(del) > min_size) {
rv.put(findChangeType(meanA, meanB, inactive_threshold) + z, Math.abs(del));
}
}
return rv;
}
/**
* Uses Linear Regression to test whether the second array of data is out
* of line with the trend in the first array. Returns an estimate for the size
* of the change if a change is detected, and zero otherwise.
*/
private static int differed(Integer[] A, Integer[] B, Double threshold_ratio) {
double sdA = Math.sqrt(var(A));
double[] lr = linReg(A);
double meanA = mean(A);
double meanB = mean(B);
double biggerMean = Math.max(Math.abs(meanA), Math.abs(meanB));
double diff = meanB - (lr[0] * (A.length + (double) (B.length - 1) / 2) + lr[1]);
if (Math.abs(diff) > 3 * sdA && Math.abs(diff) > threshold_ratio * biggerMean) {
return (int) (meanB - meanA);
} else {
return 0;
}
}
private static String findChangeType(double meanA, double meanB, int inactive_threshold) {
if (meanA < meanB) {
if (meanA < inactive_threshold) {
return "B"; // Begin
} else {
return "R"; // Rise
}
} else {
if (meanB < inactive_threshold) {
return "S"; // Stop
} else {
return "F"; // Fall
}
}
}
private static Integer[] slice(Integer[] data, int a, int b) {
int l = b - a + 1;
Integer[] rv = new Integer[l];
for (int i = 0; i < l; i++) {
rv[i] = data[a + i];
}
return rv;
}
private static boolean changeOccured(Integer[] data) {
double CONFIDENCE_LEVEL = .9;
int BOOTSTRAPS = 200;
double mean = mean(data);
double[] cs = cusum(data, mean);
double Sdiff = range(cs);
int numLess = 0;
for (int i = 0; i < BOOTSTRAPS; i++) {
double S0diff = range(cusum(bootstrap(data), mean));
numLess += (S0diff < Sdiff ? 1 : 0);
}
return (double) (numLess) / BOOTSTRAPS > CONFIDENCE_LEVEL;
}
private static Integer[] bootstrap(Integer[] data) {
Random random = new Random();
Integer[] copy = data.clone();
int l = data.length;
for (int i = 0; i < data.length; i++) {
int r = random.nextInt(l);
Integer temp = copy[i];
copy[i] = copy[r];
copy[r] = temp;
}
return copy;
}
private static double[] cusum(Integer[] data, double m) {
int l = data.length;
double[] cusum = new double[l];
cusum[0] = data[0] - m;
for (int i = 1; i < l; i++) {
cusum[i] = cusum[i - 1] + data[i] - m;
}
return cusum;
}
private static double range(double[] A) {
double[] minmax = minmax(A);
return minmax[1] - minmax[0];
}
private static double[] minmax(double[] A) {
double min = A[0];
double max = A[0];
for (int i = 1; i < A.length; i++) {
min = A[i] < min ? A[i] : min;
max = A[i] > max ? A[i] : max;
}
return new double[]
{
min, max
};
}
public static double mean(Integer[] A) {
return (double) (sum(A)) / A.length;
}
private static double var(Integer[] A) {
double m = mean(A);
double sum = 0;
for (Integer z : A) {
sum += Math.pow(z - m, 2);
}
return sum;
}
private static Long sum(Integer[] data) {
long sum = 0;
for (int i = 0; i < data.length; i++) {
sum += data[i];
}
return sum;
}
private static int findMinMSE(Integer[] data) {
int minInd = 0;
double minVal = MSE(data, 0);
for (int i = 1; i < data.length; i++) {
double newMSE = MSE(data, i);
if (newMSE < minVal) {
minInd = i;
minVal = newMSE;
}
}
return minInd;
}
private static double MSE(Integer[] data, int i) {
Integer[] firstPart = slice(data, 0, i);
Integer[] secondPart = slice(data, i + 1, data.length - 1);
double firstSum = 0;
double secondSum = 0;
double firstMean = mean(firstPart);
double secondMean = mean(secondPart);
for (Integer z : firstPart) {
firstSum += Math.pow(z - firstMean, 2);
}
for (Integer z : secondPart) {
secondSum += Math.pow(z - secondMean, 2);
}
return firstSum + secondSum;
}
/**
* Finds a,b such that ax+b is a good model for Data
*/
private static double[] linReg(Integer[] Data) {
double[] rv = new double[2];
int l = Data.length;
Integer[] xy = new Integer[l];
for (int i = 0; i < l; i++) {
xy[i] = i * Data[i];
}
Integer[] xx = new Integer[l];
for (int i = 0; i < l; i++) {
xx[i] = i * i;
}
Integer sumx = l * (l - 1) / 2;
rv[0] = (sum(xy) - (double) (sumx * sum(Data)) / l) / (sum(xx) - (double) (sumx * sumx) / l);
rv[1] = mean(Data) - rv[0] * (double) (l) / 2;
return rv;
}
}