package org.streaminer.stream.frequency; import cern.colt.Arrays; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Random; import org.junit.Test; import static org.junit.Assert.*; import org.streaminer.stream.frequency.util.CountEntry; /** * * @author maycon */ public class SpaceSavingTest { @Test public void testAccuracy() throws FrequencyException { Random r = new Random(); int numItems = 1000000; int[] xs = new int[numItems]; int maxScale = 20; for (int i = 0; i < xs.length; ++i) { int scale = r.nextInt(maxScale); int num = Math.max(Integer.MAX_VALUE, (1 << scale)); xs[i] = r.nextInt(num); } double support = 0.01; double maxError = 0.1; SpaceSaving<Integer> counter = new SpaceSaving<Integer>(maxScale, support, maxError); for (int x : xs) { counter.add(x, 1); } int count = 0; List<CountEntry<Integer>> topk = counter.getFrequentItems(); Collections.sort(topk); List<Integer> frequentItems = new ArrayList<Integer>(); CountEntry<Integer> lastItem = topk.get(topk.size() - 1); double epsilon = 1.0/(double)maxScale; double threshold = epsilon * (double)numItems; for (CountEntry<Integer> item : topk) { System.out.println(item.getItem() + ": " + item.getFrequency()); count += item.getFrequency(); frequentItems.add(item.getItem()); } // sum of all counters should be equal to number of items on the stream (n) assertEquals("Sum of all counter should be equal to stream size", count, numItems); // smallest counter value should be at most epsilon*n assertTrue("Smallest counter value should be at most epsilon*n, actual: " + lastItem.getFrequency() + " <= " + threshold, lastItem.getFrequency() <= threshold); // calculates actual frequencies RealCounting<Integer> actualFreq = new RealCounting<Integer>(); for (int v : xs) { actualFreq.add(v); } for (CountEntry<Integer> item : actualFreq.getFrequentItems()) { // check if all items whose count > epsilon*n have been stored if (item.getFrequency() > (epsilon*numItems)) { assertTrue("Any item whose count > epsilon*n should be stored", frequentItems.contains(item.getItem())); } // check if non-stored items have count <= min count if (!counter.contains(item.getItem())) { assertTrue("Count of non-stored items should be at most the min count stored", item.getFrequency() <= lastItem.getFrequency()); } } } }