package brickhouse.analytics.uniques;

import brickhouse.udf.sketch.SetSimilarityUDF;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import org.junit.Assert;
import org.junit.Test;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.List;
import java.util.SortedMap;
import java.util.UUID;

/**
 * Tests for {@link SketchSet}: cardinality ("reach") estimation over raw hashes and
 * string items, combining sketches, min-hash extraction and set similarity.
 *
 * <p>Most inputs are generated with {@link Math#random()} or {@link UUID#randomUUID()},
 * so the accuracy assertions are statistical rather than exact.
 */
public class SketchSetTest {

    /** Three distinct hashes must yield an estimate of exactly 3. */
    @Test
    public void testSketchSet() {
        SketchSet ss = new SketchSet();
        ss.addHash(1);
        ss.addHash(2);
        ss.addHash(3);

        double card = ss.estimateReach();
        Assert.assertEquals(3.0, card, 0.0);
    }

    /** Repeated hashes must be counted once: four distinct values are added here. */
    @Test
    public void testDupSketchSet() {
        SketchSet ss = new SketchSet();
        ss.addHash(-11);
        ss.addHash(-11);
        ss.addHash(1);
        ss.addHash(2);
        ss.addHash(2);
        ss.addHash(3);
        ss.addHash(3);

        double card = ss.estimateReach();
        Assert.assertEquals(4.0, card, 0.0);
    }

    /**
     * Fills a sketch with between 5000 and 9095 random string items, then adds very
     * small raw hashes, each one twice, checking that the estimate keeps tracking the
     * number of distinct additions and that a repeated hash does not move the ratio
     * outside the accepted band.
     */
    @Test
    public void testDupSketchSetOver5000() {
        SketchSet ss = new SketchSet();
        int numHashes = (int) (5000 + Math.random() * 1024 * 4);
        // Smallest random value drawn so far. NOTE(review): the items are added as
        // strings and hashed by SketchSet, so this is not the smallest stored hash;
        // it is presumably only used to produce fresh, very negative raw hashes below.
        long minHash = Long.MAX_VALUE;
        for (int i = 0; i < numHashes; ++i) {
            double randHash = (Math.random() * ((double) Long.MAX_VALUE) * 2) - (double) Long.MAX_VALUE;
            if (randHash < minHash) {
                minHash = (long) randHash;
            }
            ss.addItem("" + randHash);
        }
        assertReachRatio(ss, numHashes, 0.9, 1.1);

        // New distinct hash: expected size grows by one.
        ss.addHash(minHash - 1);
        numHashes++;
        assertReachRatio(ss, numHashes, 0.95, 1.05);

        // Same hash again: expected size is unchanged.
        ss.addHash(minHash - 1);
        assertReachRatio(ss, numHashes, 0.95, 1.05);

        ss.addHash(minHash - 2);
        numHashes++;
        assertReachRatio(ss, numHashes, 0.95, 1.05);

        ss.addHash(minHash - 2);
        assertReachRatio(ss, numHashes, 0.95, 1.05);

        // One below whatever the sketch reports as its last hash.
        long lastHash = ss.lastHash();
        ss.addHash(lastHash - 1);
        numHashes++;
        assertReachRatio(ss, numHashes, 0.95, 1.05);
    }

    /** Adds 1024 * 1024 random signed hashes and expects the estimate within 5%. */
    @Test
    public void testRandomHashes() {
        SketchSet ss = new SketchSet();
        int numHashes = 1024 * 1024;
        for (int i = 0; i < numHashes; ++i) {
            ss.addHash(randomSignedHash());
        }

        double card = ss.estimateReach();
        double tolerance = 0.05;
        int diff = (int) Math.abs(card - numHashes);
        double diffRatio = ((double) diff) / numHashes;
        System.out.println(" Estimated cardinality is " + card + " ; Expected " + numHashes
                + " ; Difference was " + diff + " ; diff ratio is " + diffRatio);
        Assert.assertTrue("diff ratio " + diffRatio + " exceeds tolerance " + tolerance,
                diffRatio <= tolerance);
        System.out.println(" Estimated cardinality is " + card);
        Assert.assertEquals(numHashes, card, numHashes * tolerance);
    }

    /**
     * Repeats the random-hash accuracy check 512 times with random sizes and reports
     * the maximum and average relative error plus the elapsed seconds.
     *
     * <p>Not run by JUnit: the {@code @Test} annotation is commented out
     * (NOTE(review): presumably because of its run time — confirm before re-enabling).
     */
    // @Test
    public void testManyRandomHashes() {
        double maxDiff = 0;
        double totDiff = 0;
        int numRuns = 512;
        long now = System.currentTimeMillis();
        for (int j = 0; j < numRuns; ++j) {
            SketchSet ss = new SketchSet();
            int numHashes = (int) (Math.random() * 1024 * 514);
            for (int i = 0; i < numHashes; ++i) {
                ss.addHash(randomSignedHash());
            }

            double card = ss.estimateReach();
            double tolerance = 0.05;
            int diff = (int) Math.abs(card - numHashes);
            double diffRatio = ((double) diff) / numHashes;
            System.out.println(" J = " + j);
            System.out.println(" Estimated cardinality is " + card + " ; Expected " + numHashes
                    + " ; Difference was " + diff + " ; diff ratio is " + diffRatio);
            Assert.assertTrue("diff ratio " + diffRatio + " exceeds tolerance " + tolerance,
                    diffRatio <= tolerance);
            if (diffRatio > maxDiff) {
                maxDiff = diffRatio;
            }
            totDiff += diffRatio;
            System.out.println(" Estimated cardinality is " + card);
            Assert.assertEquals(numHashes, card, numHashes * tolerance);
        }
        long later = System.currentTimeMillis();
        int numSecs = (int) ((later - now) / 1000.0);
        System.out.println(" Max Diff Ratio = " + maxDiff);
        double avgDiff = totDiff / (double) numRuns;
        System.out.println(" Avg Diff Ratio = " + avgDiff);
        System.out.print(numRuns + " took " + numSecs);
    }

    /** Adds 512 * 1024 random UUID strings and expects the estimate within 5%. */
    @Test
    public void testHashStrings() {
        SketchSet ss = new SketchSet(5000);
        int numHashes = 512 * 1024;
        addRandomUuids(ss, numHashes);

        double card = ss.estimateReach();
        double tolerance = 0.05;
        int diff = (int) Math.abs(card - numHashes);
        double diffRatio = ((double) diff) / numHashes;
        System.out.println(" Estimated cardinality is " + card + " ; Expected " + numHashes
                + " ; Difference was " + diff + " ; diff ratio is " + diffRatio);
        Assert.assertTrue("diff ratio " + diffRatio + " exceeds tolerance " + tolerance,
                diffRatio <= tolerance);
    }

    /**
     * Builds two sketches from independent random UUIDs, combines the second into the
     * first and prints the individual and combined estimates. Makes no assertions.
     */
    @Test
    public void testDistinctSets() {
        SketchSet ss = new SketchSet(5000);
        int numHashes1 = (int) ((double) (1024 * 512) * Math.random());
        System.out.println(" Number of hashes one = " + numHashes1);
        addRandomUuids(ss, numHashes1);
        double card = ss.estimateReach();
        System.out.println(" Estimated Card 1 = " + card);

        SketchSet ss2 = new SketchSet(5000);
        int numHashes2 = (int) ((double) (1024 * 512) * Math.random());
        System.out.println(" Number of hashes two = " + numHashes2);
        addRandomUuids(ss2, numHashes2);
        double card2 = ss2.estimateReach();
        System.out.println(" Estimated Card 2 = " + card2);

        ss.combine(ss2);
        double newCard = ss.estimateReach();
        System.out.println(" Sum of hashes = " + (card + card2));
        System.out.println(" Estimated Combined Card = " + newCard);
        System.out.println(" New Card = " + newCard);
    }

    /**
     * Builds three sketches from independent random UUIDs, combines them so that the
     * second set is shared by two unions, then combines those unions and prints every
     * estimate next to the corresponding raw item counts. Makes no assertions.
     */
    @Test
    public void testOverlapSets() {
        SketchSet ss = new SketchSet(5000);
        int numHashes1 = (int) ((double) (1024 * 512) * Math.random());
        System.out.println(" Number of hashes one = " + numHashes1);
        addRandomUuids(ss, numHashes1);
        double card = ss.estimateReach();
        System.out.println(" Estimated Card 1 = " + card);

        SketchSet ss2 = new SketchSet(5000);
        int numHashes2 = (int) ((double) (1024 * 512) * Math.random());
        System.out.println(" Number of hashes two = " + numHashes2);
        addRandomUuids(ss2, numHashes2);
        double card2 = ss2.estimateReach();
        System.out.println(" Estimated Card 2 = " + card2);

        SketchSet ss3 = new SketchSet(5000);
        int numHashes3 = (int) ((double) (1024 * 512) * Math.random());
        System.out.println(" Number of hashes three = " + numHashes3);
        addRandomUuids(ss3, numHashes3);
        double card3 = ss3.estimateReach();
        System.out.println(" Estimated Card 3= " + card3);

        // ss2 is combined into ss twice, as in the original test.
        // NOTE(review): presumably exercises re-combining the same sketch; nothing asserts on it.
        ss.combine(ss2);
        ss.combine(ss2);
        double cardCombine1 = ss.estimateReach();
        System.out.println(" Combine 1 = " + cardCombine1);
        System.out.println(" 1 +2 " + (numHashes1 + numHashes2));

        ss3.combine(ss2);
        double cardCombine3 = ss3.estimateReach();
        System.out.println(" Combine 3 = " + cardCombine3);
        System.out.println(" 2 + 3 " + (numHashes3 + numHashes2));

        SketchSet overLap = new SketchSet();
        overLap.combine(ss);
        overLap.combine(ss3);
        double cardOverlap = overLap.estimateReach();
        System.out.println(" Overlap = " + cardOverlap);
        System.out.println(" All hashes = " + (numHashes1 + numHashes2 + numHashes3));
    }

    /**
     * Feeds every line of {@code src/test/resources/obama.txt} into two sketches, one
     * via {@code addItem} and one via {@code addHashItem} with an explicit MD5-derived
     * hash, and expects both estimates to be within 3% of the number of lines read.
     *
     * @throws IOException if the resource file cannot be opened or read
     */
    @Test
    public void testObama() throws IOException {
        SketchSet ss = new SketchSet();
        SketchSet ss2 = new SketchSet();
        HashFunction md5 = Hashing.md5();
        System.out.println(" Directory is " + System.getProperty("user.dir"));

        int cnt = 0;
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream("src/test/resources/obama.txt")));
        // Close the reader (and the underlying file stream) even when reading or adding
        // an item throws; previously the stream was never closed. try/finally is used
        // instead of try-with-resources so no newer language level is required.
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                ss.addItem(line);
                ss2.addHashItem(md5.hashUnencodedChars(line).asLong(), line);
                cnt++;
            }
        } finally {
            reader.close();
        }

        System.out.println(" Estimated Reach = " + ss.estimateReach() + " count = " + cnt);
        double diff = cnt - ss.estimateReach();
        double pctDiff = Math.abs(diff / (double) cnt);
        System.out.println(" Difference is " + pctDiff);
        Assert.assertTrue("addItem estimate off by " + pctDiff, pctDiff < 0.03);

        System.out.println(" Estimated Reach = " + ss2.estimateReach() + " count = " + cnt);
        diff = cnt - ss2.estimateReach();
        pctDiff = Math.abs(diff / (double) cnt);
        System.out.println(" Difference is " + pctDiff);
        Assert.assertTrue("addHashItem estimate off by " + pctDiff, pctDiff < 0.03);

        SortedMap<Long, String> hashItemMap = ss.getHashItemMap();
        System.out.println(" First Key is " + hashItemMap.firstKey());
        System.out.println(" Last Key is " + hashItemMap.lastKey());
    }

    /**
     * Checks that {@code getMinHashes()} returns strictly increasing values and that
     * {@code SketchSet.EstimatedReach} applied to the largest of them, together with
     * {@code getMaxItems()}, lands within 5% of the number of items added.
     */
    @Test
    public void testGetMinHashes() {
        SketchSet ss = new SketchSet();
        int numHashes = 5100 + (int) (Math.random() * 15000);
        addRandomUuids(ss, numHashes);

        List<Long> md5Hashes = ss.getMinHashes();
        long last = Long.MIN_VALUE;
        for (long md5 : md5Hashes) {
            Assert.assertTrue("min hashes are not strictly increasing at " + md5, md5 > last);
            last = md5;
        }

        double estReach = SketchSet.EstimatedReach(last, ss.getMaxItems());
        double ratio = estReach / (double) numHashes;
        System.out.println(" Estimated Reach = " + estReach + " num Hashes = " + numHashes
                + " ; Ratio = " + ratio);
        Assert.assertTrue("reach ratio " + ratio + " outside (0.95, 1.05)",
                ratio < 1.05 && ratio > 0.95);
    }

    /**
     * Exercises {@link SetSimilarityUDF} on the min-hash items of three independently
     * filled sketches: a sketch compared with itself must score 1.0, two unrelated
     * sketches about 0, and two sketches that each absorbed the same third sketch
     * about one third.
     */
    @Test
    public void testSetSimilarity() {
        int numHashes = 200000;
        SketchSet a = new SketchSet();
        SketchSet b = new SketchSet();
        SketchSet c = new SketchSet();
        addRandomUuids(a, numHashes);
        addRandomUuids(b, numHashes);
        addRandomUuids(c, numHashes);

        SetSimilarityUDF simUDF = new SetSimilarityUDF();
        double same = simUDF.evaluate(a.getMinHashItems(), a.getMinHashItems());
        System.out.println("Similarity with self = " + same);
        Assert.assertEquals(1.0, same, 0);

        double diff = simUDF.evaluate(a.getMinHashItems(), b.getMinHashItems());
        System.out.println("Similarity with different = " + diff);
        // Might not be quite zero
        Assert.assertEquals(0, diff, 0.03);

        a.combine(c);
        b.combine(c);
        double mixed = simUDF.evaluate(a.getMinHashItems(), b.getMinHashItems());
        System.out.println("Similarity with mixed = " + mixed);
        // Should be about a third
        Assert.assertEquals(0.333333333, mixed, 0.03);
    }

    /** Adds {@code count} freshly generated random UUID strings to {@code sketch}. */
    private static void addRandomUuids(SketchSet sketch, int count) {
        for (int i = 0; i < count; ++i) {
            sketch.addItem(UUID.randomUUID().toString());
        }
    }

    /** Returns a random hash with random sign and magnitude below {@link Long#MAX_VALUE}. */
    private static long randomSignedHash() {
        double randHash = Math.random() * ((double) Long.MAX_VALUE);
        if (Math.random() < 0.5) {
            randHash = -1 * randHash;
        }
        return (long) randHash;
    }

    /**
     * Prints the sketch's estimate next to the expected size and asserts that their
     * ratio lies strictly between {@code lowerBound} and {@code upperBound}.
     */
    private static void assertReachRatio(SketchSet sketch, int expectedSize,
                                         double lowerBound, double upperBound) {
        double ratio = sketch.estimateReach() / (double) expectedSize;
        System.out.println(" Estimate reach = " + sketch.estimateReach() + " size = " + expectedSize
                + " ratio = " + ratio);
        Assert.assertTrue("reach ratio " + ratio + " outside (" + lowerBound + ", " + upperBound + ")",
                ratio > lowerBound && ratio < upperBound);
    }
}