package org.numenta.nupic.algorithms;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import static org.numenta.nupic.algorithms.Anomaly.KEY_MEAN;
import static org.numenta.nupic.algorithms.Anomaly.KEY_MODE;
import static org.numenta.nupic.algorithms.Anomaly.KEY_STDEV;
import static org.numenta.nupic.algorithms.Anomaly.KEY_VARIANCE;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.stream.IntStream;
import org.joda.time.DateTime;
import org.junit.Before;
import org.junit.Test;
import org.numenta.nupic.algorithms.Anomaly.AveragedAnomalyRecordList;
import org.numenta.nupic.algorithms.Anomaly.Mode;
import org.numenta.nupic.util.ArrayUtils;
import org.numenta.nupic.util.Condition;
import org.numenta.nupic.util.MersenneTwister;
import gnu.trove.iterator.TDoubleIterator;
import gnu.trove.map.TObjectDoubleMap;
import gnu.trove.map.hash.TObjectDoubleHashMap;
public class AnomalyLikelihoodTest {

    /** Instance under test; recreated before each test in {@link #setup()}. */
    private AnomalyLikelihood an;

    @Before
    public void setup() {
        Map<String, Object> params = new HashMap<>();
        params.put(KEY_MODE, Mode.LIKELIHOOD);
        an = (AnomalyLikelihood)Anomaly.create(params);
    }

    /**
     * Given the parameters of a distribution, generate numSamples points from it.
     * This routine is mostly for testing.
     *
     * @param random    source of randomness for the sampler
     * @param mean      mean of the normal distribution to sample from
     * @param variance  variance of the distribution (stdev is derived via sqrt)
     * @param size      number of samples to draw
     * @return an array of {@code size} samples
     */
    public static double[] sampleDistribution(Random random, double mean, double variance, int size) {
        SampleDistribution sampler = new SampleDistribution(mean, Math.sqrt(variance), size);
        return sampler.getSample(random);
    }

    /**
     * Generate 1440 samples of fake metrics data with a particular distribution
     * of anomaly scores and metric values. Here we generate values every minute
     * (24 hours x 60 minutes) on a fixed date, with a fixed RNG seed so the
     * data is deterministic across runs.
     *
     * @param mean            mean of the anomaly-score distribution
     * @param variance        variance of the anomaly-score distribution
     * @param metricMean      mean of the metric-value distribution
     * @param metricVariance  variance of the metric-value distribution
     * @return a list of 1440 {@link Sample}s, one per minute of the day
     */
    public static List<Sample> generateSampleData(double mean, double variance, double metricMean, double metricVariance) {
        List<Sample> retVal = new ArrayList<>();
        Random random = new MersenneTwister(42);
        double[] samples = sampleDistribution(random, mean, variance, 1440);
        double[] metricValues = sampleDistribution(random, metricMean, metricVariance, 1440);
        for(int hour : ArrayUtils.range(0, 24)) {
            for(int minute : ArrayUtils.range(0, 60)) {
                retVal.add(
                    new Sample(
                        new DateTime(2013, 2, 2, hour, minute),
                        metricValues[hour * 60 + minute],
                        samples[hour * 60 + minute]
                    )
                );
            }
        }
        return retVal;
    }

    /**
     * Asserts that {@code a} and {@code b} are within a default epsilon (0.001)
     * of each other. See the three-argument overload for details.
     */
    public static boolean assertWithinEpsilon(double a, double b) {
        return assertWithinEpsilon(a, b, 0.001);
    }

    /**
     * Asserts that {@code a} and {@code b} differ by no more than {@code epsilon}.
     *
     * BUG FIX: this method previously only *returned* a boolean without ever
     * asserting, so the many call sites that discard the return value (e.g. in
     * testNormalProbability) were silent no-ops that could never fail the test.
     * It now fails the enclosing test directly via {@link org.junit.Assert#assertTrue},
     * while still returning {@code true} so existing
     * {@code assertTrue(assertWithinEpsilon(...))} call sites remain valid.
     *
     * @param a        first value
     * @param b        second value
     * @param epsilon  maximum allowed absolute difference
     * @return always {@code true} (the method throws on failure)
     */
    public static boolean assertWithinEpsilon(double a, double b, double epsilon) {
        assertTrue("Expected |" + a + " - " + b + "| <= " + epsilon,
            Math.abs(a - b) <= epsilon);
        return true;
    }

    /**
     * This test attempts to find the anomaly-probability after create an
     * AnomalyLikelihood instance with default values for the learning period
     * and estimation samples. This used to generate an exception stating that
     * you must have at least one anomaly score.
     */
    @Test
    public void testConstructorWithDefaultLearningPeriodAndEstimationSamples() {
        // Passing merely requires that no exception is thrown.
        this.an.anomalyProbability(0.75, 0.5, null);
    }

    @Test
    public void testNormalProbability() {
        TObjectDoubleMap<String> p = new TObjectDoubleHashMap<>();
        p.put(KEY_MEAN, 0.0);
        p.put(KEY_VARIANCE, 1.0);
        p.put(KEY_STDEV, 1.0);

        // Test a standard normal distribution
        // Values taken from http://en.wikipedia.org/wiki/Standard_normal_table
        assertWithinEpsilon(an.normalProbability(0.0, p), 0.5);
        assertWithinEpsilon(an.normalProbability(0.3, p), 0.3820885780);
        assertWithinEpsilon(an.normalProbability(1.0, p), 0.1587);
        assertWithinEpsilon(1.0 - an.normalProbability(1.0, p), an.normalProbability(-1.0, p));
        assertWithinEpsilon(an.normalProbability(-0.3, p), 1.0 - an.normalProbability(0.3, p));

        // Non standard normal distribution
        // p = {"name": "normal", "mean": 1.0, "variance": 4.0, "stdev": 2.0}
        p.put(KEY_MEAN, 1.0);
        p.put(KEY_VARIANCE, 4.0);
        p.put(KEY_STDEV, 2.0);
        assertWithinEpsilon(an.normalProbability(1.0, p), 0.5);
        assertWithinEpsilon(an.normalProbability(2.0, p), 0.3085);
        assertWithinEpsilon(an.normalProbability(3.0, p), 0.1587);
        assertWithinEpsilon(an.normalProbability(3.0, p), 1.0 - an.normalProbability(-1.0, p));
        assertWithinEpsilon(an.normalProbability(0.0, p), 1.0 - an.normalProbability(2.0, p));

        // Non standard normal distribution
        // p = {"name": "normal", "mean": -2.0, "variance": 0.5, "stdev": math.sqrt(0.5)}
        p.put(KEY_MEAN, -2.0);
        p.put(KEY_VARIANCE, 0.5);
        p.put(KEY_STDEV, Math.sqrt(0.5));
        assertWithinEpsilon(an.normalProbability(-2.0, p), 0.5);
        assertWithinEpsilon(an.normalProbability(-1.5, p), 0.241963652);
        assertWithinEpsilon(an.normalProbability(-2.5, p), 1.0 - an.normalProbability(-1.5, p));
    }

    /**
     * This passes in a known set of data and ensures the estimateNormal
     * function returns the expected results.
     */
    @Test
    public void testEstimateNormal() {
        Map<String, Object> params = new HashMap<String, Object>();
        params.put(KEY_MODE, Mode.LIKELIHOOD);

        // 100 samples drawn from mean=0.4, stdev = 0.5
        double[] samples = new double[] {
            0.32259025, -0.44936321, -0.15784842, 0.72142628, 0.8794327,
            0.06323451, -0.15336159, -0.02261703, 0.04806841, 0.47219226,
            0.31102718, 0.57608799, 0.13621071, 0.92446815, 0.1870912,
            0.46366935, -0.11359237, 0.66582357, 1.20613048, -0.17735134,
            0.20709358, 0.74508479, 0.12450686, -0.15468728, 0.3982757,
            0.87924349, 0.86104855, 0.23688469, -0.26018254, 0.10909429,
            0.65627481, 0.39238532, 0.77150761, 0.47040352, 0.9676175,
            0.42148897, 0.0967786, -0.0087355, 0.84427985, 1.46526018,
            1.19214798, 0.16034816, 0.81105554, 0.39150407, 0.93609919,
            0.13992161, 0.6494196, 0.83666217, 0.37845278, 0.0368279,
            -0.10201944, 0.41144746, 0.28341277, 0.36759426, 0.90439446,
            0.05669459, -0.11220214, 0.34616676, 0.49898439, -0.23846184,
            1.06400524, 0.72202135, -0.2169164, 1.136582, -0.69576865,
            0.48603271, 0.72781008, -0.04749299, 0.15469311, 0.52942518,
            0.24816816, 0.3483905, 0.7284215, 0.93774676, 0.07286373,
            1.6831539, 0.3851082, 0.0637406, -0.92332861, -0.02066161,
            0.93709862, 0.82114131, 0.98631562, 0.05601529, 0.72214694,
            0.09667526, 0.3857222, 0.50313998, 0.40775344, -0.69624046,
            -0.4448494, 0.99403206, 0.51639049, 0.13951548, 0.23458214,
            1.00712699, 0.40939048, -0.06436434, -0.02753677, -0.23017904
        };

        Statistic result = an.estimateNormal(samples, true);
        assertTrue(assertWithinEpsilon(result.mean, 0.3721));
        assertTrue(assertWithinEpsilon(result.variance, 0.22294));
        assertTrue(assertWithinEpsilon(result.stdev, 0.47216));
    }

    /**
     * Test that sampleDistribution from a generated distribution returns roughly
     * the same parameters.
     */
    @Test
    public void testSampleDistribution() {
        TObjectDoubleMap<String> p = new TObjectDoubleHashMap<>();
        p.put(KEY_MEAN, 0.5);
        p.put(KEY_STDEV, Math.sqrt(0.1));
        p.put(KEY_VARIANCE, 0.1);

        double[] samples = sampleDistribution(new MersenneTwister(), 0.5, 0.1, 1000);
        Statistic np = an.estimateNormal(samples, true);
        // Loose epsilon (0.1) since the RNG is unseeded here.
        assertTrue(assertWithinEpsilon(p.get(KEY_MEAN), np.mean, 0.1));
        assertTrue(assertWithinEpsilon(p.get(KEY_VARIANCE), np.variance, 0.1));
        assertTrue(assertWithinEpsilon(p.get(KEY_STDEV), np.stdev, 0.1));
    }

    /**
     * This calls estimateAnomalyLikelihoods to estimate the distribution on fake
     * data and validates the results
     */
    @Test
    public void testEstimateAnomalyLikelihoods() {
        // Generate an estimate using fake distribution of anomaly scores.
        List<Sample> data = generateSampleData(0.2, 0.2, 0.2, 0.2).subList(0, 1000);

        AnomalyLikelihoodMetrics metrics = an.estimateAnomalyLikelihoods(data, 10, 0);
        assertEquals(1000, metrics.getLikelihoods().length);
        assertEquals(1000, metrics.getAvgRecordList().averagedRecords.size());
        assertTrue(an.isValidEstimatorParams(metrics.getParams()));

        // Get the total
        double total = 0;
        for(Sample sample : metrics.getAvgRecordList().averagedRecords) {
            total = total + sample.score;
        }

        // Check that the estimated mean is correct
        Statistic statistic = (Statistic)metrics.getParams().distribution();
        assertTrue(
            assertWithinEpsilon(
                statistic.mean, (total / (double)metrics.getAvgRecordList().averagedRecords.size())
            )
        );

        // Number of points with lower than 2% probability should be pretty low
        // but not zero. Can't use exact 2% here due to random variations.
        int count = ArrayUtils.where(metrics.getLikelihoods(), new Condition.Adapter<Double>() {
            public boolean eval(double d) { return d < 0.02; }
        }).length;
        assertTrue(count <= 50);
        assertTrue(count >= 1);
    }

    /**
     * NOTE: SKIPPED
     *
     * This calls {@link AnomalyLikelihood#estimateAnomalyLikelihoods(List, int, int)}
     * to estimate the distribution on fake data and validates the results
     */
    @Test
    public void testEstimateAnomalyLikelihoodsMalformedRecords() {
        // Skipped due to impossibility of forming bad Sample objects in Java
    }

    /**
     * This tests the anomalyProbability method with a number of calls that will
     * trigger copying of the sample array.
     */
    @Test
    public void testAnomalyProbabilityArrayCopying() {
        Map<String, Object> params = new HashMap<>();
        params.put(KEY_MODE, Mode.LIKELIHOOD);
        params.put(AnomalyLikelihood.KEY_LEARNING_PERIOD, 300);
        params.put(AnomalyLikelihood.KEY_ESTIMATION_SAMPLES, 300);
        an = (AnomalyLikelihood) Anomaly.create(params);
        // 2000 calls exceeds learningPeriod + estimationSamples, forcing the
        // internal sample buffer to be copied/compacted. Passing requires
        // merely that no exception is thrown.
        for (int i = 0; i < 2000; i++) {
            an.anomalyProbability(0.07, .5, null);
        }
    }

    /**
     * This calls estimateAnomalyLikelihoods with various values of skipRecords
     */
    @Test
    public void testSkipRecords() {
        // Generate an estimate using fake distribution of anomaly scores.
        List<Sample> data = generateSampleData(0.1, 0.2, 0.2, 0.2).subList(0, 200);
        data.addAll(generateSampleData(0.9, 0.2, 0.2, 0.2).subList(0, 200));

        // skipRecords = 200
        AnomalyLikelihoodMetrics metrics = an.estimateAnomalyLikelihoods(data, 10, 200);
        Statistic stats = (Statistic)metrics.getParams().distribution();
        // Check results are correct, i.e. we are actually skipping the first 200
        // records (the low-mean half), leaving only the mean=0.9 half.
        assertWithinEpsilon(stats.mean, 0.9, 0.1);

        // Check case where skipRecords > num records
        // In this case a null distribution should be returned which makes all
        // the likelihoods reasonably high
        metrics = an.estimateAnomalyLikelihoods(data, 10, 500);
        assertEquals(metrics.getLikelihoods().length, data.size());
        assertTrue(ArrayUtils.sum(metrics.getLikelihoods()) >= 0.3 * metrics.getLikelihoods().length);

        // Check the case where skipRecords == num records
        metrics = an.estimateAnomalyLikelihoods(data, 10, data.size());
        assertEquals(metrics.getLikelihoods().length, data.size());
        assertTrue(ArrayUtils.sum(metrics.getLikelihoods()) >= 0.3 * metrics.getLikelihoods().length);
    }

    /**
     * A slight more complex test. This calls estimateAnomalyLikelihoods
     * to estimate the distribution on fake data, followed by several calls
     * to updateAnomalyLikelihoods.
     */
    @Test
    public void testUpdateAnomalyLikelihoods() {

        //----------------------------------------
        // Step 1. Generate an initial estimate using fake distribution of anomaly scores.
        List<Sample> data1 = generateSampleData(0.2, 0.2, 0.2, 0.2).subList(0, 1000);
        AnomalyLikelihoodMetrics metrics1 = an.estimateAnomalyLikelihoods(data1, 5, 0);

        //----------------------------------------
        // Step 2. Generate some new data with a higher average anomaly
        // score. Using the estimator from step 1, to compute likelihoods. Now we
        // should see a lot more anomalies.
        List<Sample> data2 = generateSampleData(0.6, 0.2, 0.2, 0.2).subList(0, 300);
        AnomalyLikelihoodMetrics metrics2 = an.updateAnomalyLikelihoods(data2, metrics1.getParams());
        assertEquals(metrics2.getLikelihoods().length, data2.size());
        assertEquals(metrics2.getAvgRecordList().size(), data2.size());
        assertTrue(an.isValidEstimatorParams(metrics2.getParams()));

        // The new running total should be different
        assertFalse(metrics1.getAvgRecordList().total == metrics2.getAvgRecordList().total);

        // We should have many more samples where likelihood is < 0.01, but not all
        Condition<Double> cond = new Condition.Adapter<Double>() {
            public boolean eval(double d) { return d < 0.01; }
        };
        int conditionCount = ArrayUtils.where(metrics2.getLikelihoods(), cond).length;
        assertTrue(conditionCount >= 25);
        assertTrue(conditionCount <= 250);

        //----------------------------------------
        // Step 3. Generate some new data with the expected average anomaly score. We
        // should see fewer anomalies than in Step 2.
        List<Sample> data3 = generateSampleData(0.2, 0.2, 0.2, 0.2).subList(0, 1000);
        AnomalyLikelihoodMetrics metrics3 = an.updateAnomalyLikelihoods(data3, metrics2.getParams());
        assertEquals(metrics3.getLikelihoods().length, data3.size());
        assertEquals(metrics3.getAvgRecordList().size(), data3.size());
        assertTrue(an.isValidEstimatorParams(metrics3.getParams()));

        // The new running total should be different
        assertFalse(metrics1.getAvgRecordList().total == metrics3.getAvgRecordList().total);
        assertFalse(metrics2.getAvgRecordList().total == metrics3.getAvgRecordList().total);

        //------------------------------------------
        // Step 4. Validate that sending data incrementally is the same as sending
        // in one batch
        // We should have a small number of samples where likelihood is < 0.02
        conditionCount = ArrayUtils.where(metrics3.getLikelihoods(), cond).length;
        assertTrue(conditionCount >= 1);
        assertTrue(conditionCount <= 100);

        List<Sample> allData = new ArrayList<>();
        allData.addAll(data1);
        allData.addAll(data2);
        allData.addAll(data3);
        AveragedAnomalyRecordList recordList = an.anomalyScoreMovingAverage(allData, 5);

        double[] historicalValuesAll = new double[recordList.historicalValues.size()];
        int i = 0;
        for(TDoubleIterator it = recordList.historicalValues.iterator();it.hasNext();) {
            historicalValuesAll[i++] = it.next();
        }
        assertEquals(ArrayUtils.sum(historicalValuesAll), ArrayUtils.sum(
            metrics3.getParams().movingAverage().getSlidingWindow().toArray()), 0);
        assertEquals(recordList.total, metrics3.getParams().movingAverage().getTotal(), 0);
    }

    /**
     * This calls estimateAnomalyLikelihoods with flat distributions and
     * ensures things don't crash.
     */
    @Test
    public void testFlatAnomalyScores() {
        // Generate an estimate using fake distribution of anomaly scores.
        List<Sample> data1 = generateSampleData(42, 1e-10, 0.2, 0.2).subList(0, 1000);

        AnomalyLikelihoodMetrics metrics1 = an.estimateAnomalyLikelihoods(data1, 10, 0);
        assertEquals(metrics1.getLikelihoods().length, data1.size());
        assertEquals(metrics1.getAvgRecordList().size(), data1.size());
        assertTrue(an.isValidEstimatorParams(metrics1.getParams()));

        // Check that the estimated mean is correct
        Statistic stats = metrics1.getParams().distribution();
        assertWithinEpsilon(stats.mean, data1.get(0).score);

        // If you deviate from the mean, you should get probability 0
        // Test this by sending in just slightly different values.
        List<Sample> data2 = generateSampleData(42.5, 1e-10, 0.2, 0.2);
        AnomalyLikelihoodMetrics metrics2 = an.updateAnomalyLikelihoods(data2.subList(0, 10), metrics1.getParams());

        // The likelihoods should go to zero very quickly
        assertTrue(ArrayUtils.sum(metrics2.getLikelihoods()) <= 0.01);

        // Test edge case where anomaly scores are very close to 0
        // In this case we don't let likelihood to get too low. An average
        // anomaly score of 0.1 should be essentially zero, but an average
        // of 0.04 should be higher
        List<Sample> data3 = generateSampleData(0.01, 1e-6, 0.2, 0.2);
        AnomalyLikelihoodMetrics metrics3 = an.estimateAnomalyLikelihoods(data3.subList(0, 100), 10, 0);

        List<Sample> data4 = generateSampleData(0.1, 1e-6, 0.2, 0.2);
        AnomalyLikelihoodMetrics metrics4 = an.updateAnomalyLikelihoods(data4.subList(0, 20), metrics3.getParams());

        // Average of 0.1 should go to zero
        double[] likelihoods4 = Arrays.copyOfRange(metrics4.getLikelihoods(), 10, metrics4.getLikelihoods().length);
        assertTrue(ArrayUtils.average(likelihoods4) <= 0.002);

        List<Sample> data5 = generateSampleData(0.05, 1e-6, 0.2, 0.2);
        AnomalyLikelihoodMetrics metrics5 = an.updateAnomalyLikelihoods(data5.subList(0, 20), metrics4.getParams());

        // The likelihoods should be low but not near zero
        // BUG FIX: the slice end was previously metrics4.getLikelihoods().length
        // (copy-paste from the block above); it must slice metrics5's own array.
        double[] likelihoods5 = Arrays.copyOfRange(metrics5.getLikelihoods(), 10, metrics5.getLikelihoods().length);
        assertTrue(ArrayUtils.average(likelihoods5) <= 0.28);
        assertTrue(ArrayUtils.average(likelihoods5) > 0.015);
    }

    /**
     * This calls estimateAnomalyLikelihoods with flat metric values. In this case
     * we should use the null distribution, which gets reasonably high likelihood
     * for everything.
     */
    @Test
    public void testFlatMetricScores() {
        // Generate samples with very flat metric values
        List<Sample> data1 = generateSampleData(0.2, 0.2, 42, 1e-10).subList(0, 1000);

        // Check that we do indeed get reasonable likelihood values
        AnomalyLikelihoodMetrics metrics1 = an.estimateAnomalyLikelihoods(data1, 10, 0);
        assertEquals(metrics1.getLikelihoods().length, data1.size());
        double[] likelihoods = metrics1.getLikelihoods();
        assertTrue(ArrayUtils.sum(likelihoods) >= 0.4 * likelihoods.length);

        // Check that the estimated params are the null distribution.
        // (Removed a dead duplicate statement whose result was discarded.)
        assertTrue(metrics1.getParams().distribution().equals(an.nullDistribution()));
    }

    /**
     * This calls estimateAnomalyLikelihoods and updateAnomalyLikelihoods
     * with one or no scores.
     */
    @Test
    public void testVeryFewScores() {
        // Generate an estimate using two data points
        List<Sample> data1 = generateSampleData(42, 1e-10, 0.2, 0.2).subList(0, 2);
        AnomalyLikelihoodMetrics metrics1 = an.estimateAnomalyLikelihoods(data1, 10, 0);
        assertTrue(an.isValidEstimatorParams(metrics1.getParams()));

        // Check that the estimated mean is that value
        assertWithinEpsilon(metrics1.getParams().distribution().mean, data1.get(0).score);

        // Can't generate an estimate using no data points
        List<Sample> test = new ArrayList<>();
        try {
            an.estimateAnomalyLikelihoods(test, 10, 0);
            fail();
        }catch(Exception e) {
            // assertEquals gives a diff of the message on failure, unlike
            // assertTrue(e.getMessage().equals(...))
            assertEquals("Must have at least one anomaly score.", e.getMessage());
        }

        // Can't update without scores
        try {
            an.updateAnomalyLikelihoods(test, metrics1.getParams());
            fail();
        }catch(Exception e) {
            assertEquals("Must have at least one anomaly score.", e.getMessage());
        }
    }

    /**
     * NOTE: Not a valid test in java. Remnant of Python ability to substitute types, so we
     * just do a simple test
     */
    @Test
    public void testFilterLikelihoodsInputType() {
        double[] l2 = an.filterLikelihoods(new double[] { 0.0, 0.0, 0.3, 0.3, 0.5 });
        double[] filtered = new double[] { 0.0, 0.001, 0.3, 0.3, 0.5 };
        int i = 0;
        for(double d : l2) {
            assertEquals(d, filtered[i++], 0.01);
        }
    }

    /**
     * <pre>
     * Tests _filterLikelihoods function for several cases:
     *   i. Likelihood goes straight to redzone, skipping over yellowzone, repeats
     *   ii. Case (i) with different values, and numpy array instead of float list
     *   iii. A scenario where changing the redzone from four to five 9s should
     *        filter differently
     * </pre>
     */
    @Test
    public void testFilterLikelihoods() {
        double redThreshold = 0.9999;
        double yellowThreshold = 0.999;

        // Case (i): values at indices 1 and 7 should be filtered to yellow zone
        double[] l = { 1.0, 1.0, 0.9, 0.8, 0.5, 0.4, 1.0, 1.0, 0.6, 0.0 };
        l = Arrays.stream(l).map(d -> 1.0d - d).toArray();
        double[] l2a = Arrays.copyOf(l, l.length);
        l2a[1] = 1 - yellowThreshold;
        l2a[7] = 1 - yellowThreshold;
        double[] l3a = an.filterLikelihoods(l, redThreshold, yellowThreshold);

        int successIndexes =
            IntStream.range(0, l.length).map(i -> { assertEquals(l2a[i], l3a[i], 0.01); return 1; }).sum();
        assertEquals(successIndexes, l.length);

        // Case (ii): values at indices 1-10 should be filtered to yellow zone
        l = new double[] {
            0.999978229, 0.999978229, 0.999999897, 1, 1, 1, 1,
            0.999999994, 0.999999966, 0.999999966, 0.999994331,
            0.999516576, 0.99744487 };
        l = Arrays.stream(l).map(d -> 1.0d - d).toArray();
        double[] l2b = Arrays.copyOf(l, l.length);
        ArrayUtils.setIndexesTo(l2b, ArrayUtils.range(1, 11), 1 - yellowThreshold);
        double[] l3b = an.filterLikelihoods(l);

        successIndexes =
            IntStream.range(0, l.length).map(i -> { assertEquals(l2b[i], l3b[i], 0.01); return 1; }).sum();
        assertEquals(successIndexes, l.length);

        // Case (iii): redThreshold difference should be at index 2
        l = new double[] {
            0.999968329, 0.999999897, 1, 1, 1,
            1, 0.999999994, 0.999999966, 0.999999966,
            0.999994331, 0.999516576, 0.99744487
        };
        l = Arrays.stream(l).map(d -> 1.0d - d).toArray();
        double[] l2a2 = Arrays.copyOf(l, l.length);
        double[] l2b2 = Arrays.copyOf(l, l.length);
        ArrayUtils.setIndexesTo(l2a2, ArrayUtils.range(1, 10), 1 - yellowThreshold);
        ArrayUtils.setIndexesTo(l2b2, ArrayUtils.range(2, 10), 1 - yellowThreshold);
        double[] l3a2 = an.filterLikelihoods(l);
        double[] l3b2 = an.filterLikelihoods(l, 0.99999, yellowThreshold);

        successIndexes =
            IntStream.range(0, l2a2.length).map(i -> { assertEquals(l2a2[i], l3a2[i], 0.01); return 1; }).sum();
        assertEquals(successIndexes, l2a2.length);

        successIndexes =
            IntStream.range(0, l2b2.length).map(i -> { assertEquals(l2b2[i], l3b2[i], 0.01); return 1; }).sum();
        assertEquals(successIndexes, l2b2.length);
    }
}