/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.test; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import org.apache.pig.builtin.Bloom; import org.apache.pig.builtin.BuildBloom; import org.apache.pig.data.BagFactory; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataByteArray; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.junit.Test; /** * This class unit tests the built in UDFs BuildBloom and Bloom. */ public class TestBloom { static class TestBuildBloom extends BuildBloom { TestBuildBloom(String numElements, String desiredFalsePositive) { super("jenkins", numElements, desiredFalsePositive); } int getSize() { return vSize; } int getNumHash() { return numHash; } } @Test public void testSizeCalc() throws Exception { TestBuildBloom tbb = new TestBuildBloom("1000", "0.01"); assertEquals(9585, tbb.getSize()); assertEquals(6, tbb.getNumHash()); tbb = new TestBuildBloom("1000000", "0.01"); assertEquals(9585058, tbb.getSize()); assertEquals(6, tbb.getNumHash()); tbb = new TestBuildBloom("1000", "0.0001"); assertEquals(19170, tbb.getSize()); assertEquals(13, tbb.getNumHash()); tbb = new TestBuildBloom("1000000", "0.00001"); assertEquals(23962645, tbb.getSize()); assertEquals(16, tbb.getNumHash()); } @Test(expected = RuntimeException.class) public void testBadHash() throws Exception { String size = "100"; String numHash = "3"; String hashFunc = "nosuchhash"; try { BuildBloom bb = new BuildBloom(hashFunc, "fixed", size, numHash); } catch (RuntimeException re) { assertTrue(re.getMessage().contains("Unknown hash type")); throw re; } } @Test public void testFuncNames() throws Exception { String size = "100"; String numHash = "3"; String hashFunc = "JENKINS_HASH"; BuildBloom bb = new BuildBloom(hashFunc, "fixed", size, numHash); assertEquals("org.apache.pig.builtin.BuildBloom$Initial", bb.getInitial()); assertEquals("org.apache.pig.builtin.BuildBloom$Intermediate", bb.getIntermed()); assertEquals("org.apache.pig.builtin.BuildBloom$Final", bb.getFinal()); } @Test public void testMap() throws Exception { String size = "100"; String numHash = "3"; String hashFunc = "JENKINS"; TupleFactory tf = TupleFactory.getInstance(); BagFactory bf = BagFactory.getInstance(); Tuple t = tf.newTuple(1); t.set(0, 1); DataBag b = bf.newDefaultBag(); b.add(t); Tuple input = tf.newTuple(b); BuildBloom.Initial map = new BuildBloom.Initial(hashFunc, "fixed", size, numHash); t = map.exec(input); Bloom bloom = new Bloom("bla"); bloom.setFilter((DataByteArray)t.get(0)); // Test that everything we put in passes. Tuple t1 = tf.newTuple(1); t1.set(0, 1); assertTrue(bloom.exec(t1)); // A few that don't pass for (int i = 100; i < 10; i++) { Tuple t2 = tf.newTuple(1); t2.set(0, i); assertFalse(bloom.exec(t2)); } } @Test public void testCombiner() throws Exception { String size = "100"; String numHash = "3"; String hashFunc = "jenkins"; TupleFactory tf = TupleFactory.getInstance(); BagFactory bf = BagFactory.getInstance(); DataBag combinerBag = bf.newDefaultBag(); for (int j = 0; j < 3; j++) { // map loop Tuple t = tf.newTuple(1); t.set(0, 10 + j); DataBag mapBag = bf.newDefaultBag(); mapBag.add(t); Tuple input = tf.newTuple(mapBag); BuildBloom.Initial map = new BuildBloom.Initial(hashFunc, "fixed", size, numHash); combinerBag.add(map.exec(input)); } Tuple t = tf.newTuple(1); t.set(0, combinerBag); BuildBloom.Intermediate combiner = new BuildBloom.Intermediate(hashFunc, "fixed", size, numHash); t = combiner.exec(t); Bloom bloom = new Bloom("bla"); bloom.setFilter((DataByteArray)t.get(0)); // Test that everything we put in passes. for (int j = 0; j < 3; j++) { Tuple t1 = tf.newTuple(1); t1.set(0, 10 + j); assertTrue(bloom.exec(t1)); } // A few that don't pass for (int i = 100; i < 10; i++) { Tuple t2 = tf.newTuple(1); t2.set(0, i); assertFalse(bloom.exec(t2)); } } @Test public void testSingleKey() throws Exception { String size = "100"; String numHash = "3"; String hashFunc = "MURMUR"; TupleFactory tf = TupleFactory.getInstance(); BagFactory bf = BagFactory.getInstance(); DataBag reducerBag = bf.newDefaultBag(); for (int i = 0; i < 3; i++) { // combiner loop DataBag combinerBag = bf.newDefaultBag(); for (int j = 0; j < 3; j++) { // map loop Tuple t = tf.newTuple(1); t.set(0, i * 10 + j); DataBag mapBag = bf.newDefaultBag(); mapBag.add(t); Tuple input = tf.newTuple(mapBag); BuildBloom.Initial map = new BuildBloom.Initial(hashFunc, "fixed", size, numHash); combinerBag.add(map.exec(input)); } Tuple t = tf.newTuple(1); t.set(0, combinerBag); BuildBloom.Intermediate combiner = new BuildBloom.Intermediate(hashFunc, "fixed", size, numHash); reducerBag.add(combiner.exec(t)); } Tuple t = tf.newTuple(1); t.set(0, reducerBag); BuildBloom.Final reducer = new BuildBloom.Final(hashFunc, "fixed", size, numHash); DataByteArray dba = reducer.exec(t); Bloom bloom = new Bloom("bla"); bloom.setFilter(dba); // Test that everything we put in passes. for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) { Tuple t1 = tf.newTuple(1); t1.set(0, i * 10 + j); assertTrue(bloom.exec(t1)); } } // A few that don't pass for (int i = 100; i < 10; i++) { Tuple t1 = tf.newTuple(1); t1.set(0, i); assertFalse(bloom.exec(t1)); } } @Test public void testMultiKey() throws Exception { String numElements = "10"; String falsePositive = "0.001"; String hashFunc = "murmur"; TupleFactory tf = TupleFactory.getInstance(); BagFactory bf = BagFactory.getInstance(); String[][] strs = { { "fred", "joe", "bob" }, { "mary", "sally", "jane" }, { "fido", "spot", "fluffly" } }; DataBag reducerBag = bf.newDefaultBag(); for (int i = 0; i < 3; i++) { // combiner loop DataBag combinerBag = bf.newDefaultBag(); for (int j = 0; j < 3; j++) { // map loop Tuple t = tf.newTuple(2); t.set(0, i * 10 + j); t.set(1, strs[i][j]); DataBag mapBag = bf.newDefaultBag(); mapBag.add(t); Tuple input = tf.newTuple(mapBag); BuildBloom.Initial map = new BuildBloom.Initial(hashFunc, numElements, falsePositive); combinerBag.add(map.exec(input)); } Tuple t = tf.newTuple(1); t.set(0, combinerBag); BuildBloom.Intermediate combiner = new BuildBloom.Intermediate(hashFunc, numElements, falsePositive); reducerBag.add(combiner.exec(t)); } Tuple t = tf.newTuple(1); t.set(0, reducerBag); BuildBloom.Final reducer = new BuildBloom.Final(hashFunc, numElements, falsePositive); DataByteArray dba = reducer.exec(t); Bloom bloom = new Bloom("bla"); bloom.setFilter(dba); // Test that everything we put in passes. for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) { Tuple t1 = tf.newTuple(2); t1.set(0, i * 10 + j); t1.set(1, strs[i][j]); assertTrue(bloom.exec(t1)); } } // A few that don't pass for (int i = 100; i < 10; i++) { Tuple t1 = tf.newTuple(2); t1.set(0, i); t1.set(1, "ichabod"); assertFalse(bloom.exec(t1)); } } }