// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.statistics.frequency;
import java.util.Random;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.talend.dataquality.statistics.frequency.impl.EFrequencyAlgorithm;
public class CMSFrequencyStatisticsTest {
private DataTypeFrequencyAnalyzer fta = new DataTypeFrequencyAnalyzer();
@Before
public void setUp() throws Exception {
fta.setAlgorithm(EFrequencyAlgorithm.COUNT_MIN_SKETCH);
}
@After
public void tearDown() throws Exception {
}
@Test
public void testAnalyze() {
String[] data = new String[] { "a", "b", "a", "b", "a", "c", "1", "2", "3" };
fta.init();
for (String col : data) {
fta.analyze(col);
}
Assert.assertEquals(3, fta.getResult().get(0).getFrequency("a"), 0);
Assert.assertEquals(2, fta.getResult().get(0).getFrequency("b"), 0);
Assert.assertEquals(1, fta.getResult().get(0).getFrequency("1"), 0);
Assert.assertEquals(1, fta.getResult().get(0).getFrequency("c"), 0);
// Test larger data
int seed = 7364181;
Random r = new Random(seed);
int numItems = 1000000;
int[] xs = new int[numItems];
int maxScale = 20;
fta.init();
for (int i = 0; i < numItems; i++) {
int scale = r.nextInt(maxScale);
xs[i] = r.nextInt(1 << scale);
fta.analyze(xs[i] + "");
}
int[] actualFreq = new int[1 << maxScale];
for (int x : xs) {
actualFreq[x]++;
}
int numErrors = 0;
double epsOfTotalCount = 0.0001;
double confidence = 0.99;
for (int i = 0; i < actualFreq.length; ++i) {
double ratio = ((double) (fta.getResult().get(0).getFrequency(i + "") - actualFreq[i])) / numItems;
if (ratio > epsOfTotalCount) {
numErrors++;
}
}
double pCorrect = 1.0 - ((double) numErrors) / actualFreq.length;
Assert.assertTrue("Confidence not reached: required " + confidence + ", reached " + pCorrect, pCorrect > confidence);
}
}