/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.builtin;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.Locale;

import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.hash.Hash;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;

/**
 * A Base class for BuildBloom and its Algebraic implementations.
 *
 * <p>Holds the filter parameters (vector size, number of hash functions and
 * hash type) and provides helpers to serialize a {@link BloomFilter} to a
 * {@link DataByteArray}, read one back, and OR a bag of serialized filters
 * together.
 */
public abstract class BuildBloomBase<T> extends EvalFunc<T> {

    /** Vector size passed to the {@link BloomFilter} constructor. */
    protected int vSize;

    /** Number of hash functions passed to the {@link BloomFilter} constructor. */
    protected int numHash;

    /** Hash type constant, one of the values defined on {@link Hash}. */
    protected int hType;

    /** The filter currently being built; written out by {@link #bloomOut()}. */
    protected BloomFilter filter;

    /**
     * Creates an instance without setting any filter parameters.
     */
    protected BuildBloomBase() {
    }

    /**
     * @param hashType type of the hashing function (see
     * {@link org.apache.hadoop.util.hash.Hash}).
     * @param mode Will be ignored, though by convention it should be
     * "fixed" or "fixedsize"
     * @param vectorSize The vector size of <i>this</i> filter.
     * @param nbHash The number of hash functions to consider.
     * @throws NumberFormatException if vectorSize or nbHash is not an integer.
     * @throws IllegalArgumentException if hashType names neither jenkins nor
     * murmur.
     */
    public BuildBloomBase(String hashType,
                          String mode,
                          String vectorSize,
                          String nbHash) {
        vSize = Integer.parseInt(vectorSize);
        numHash = Integer.parseInt(nbHash);
        hType = convertHashType(hashType);
    }

    /**
     * @param hashType type of the hashing function (see
     * {@link org.apache.hadoop.util.hash.Hash}).
     * @param numElements The number of distinct elements expected to be
     * placed in this filter.
     * @param desiredFalsePositive the acceptable rate of false positives.
     * This should be a floating point value strictly between 0 and 1.0,
     * where 1.0 would be 100% (ie, a totally useless filter).
     * @throws NumberFormatException if numElements or desiredFalsePositive
     * cannot be parsed as a number.
     * @throws IllegalArgumentException if the parsed values are out of range
     * or hashType names neither jenkins nor murmur.
     */
    public BuildBloomBase(String hashType,
                          String numElements,
                          String desiredFalsePositive) {
        setSize(numElements, desiredFalsePositive);
        hType = convertHashType(hashType);
    }

    /**
     * Builds a fresh filter from the configured parameters and ORs into it
     * every serialized filter found in the input bag.
     *
     * @param input tuple whose first field is a {@link DataBag}; the first
     * field of each tuple in that bag is a {@link DataByteArray} holding a
     * serialized {@link BloomFilter}.
     * @return the serialized form of the combined filter.
     * @throws IOException if a field cannot be read or a filter cannot be
     * (de)serialized.
     */
    protected DataByteArray bloomOr(Tuple input) throws IOException {
        filter = new BloomFilter(vSize, numHash, hType);

        try {
            DataBag values = (DataBag) input.get(0);
            Iterator<Tuple> it = values.iterator();
            while (it.hasNext()) {
                Tuple t = it.next();
                filter.or(bloomIn((DataByteArray) t.get(0)));
            }
        } catch (ExecException ee) {
            throw new IOException(ee);
        }

        return bloomOut();
    }

    /**
     * Serializes the current {@link #filter} into a byte array.
     *
     * @return the bytes written by the filter, wrapped in a
     * {@link DataByteArray}.
     * @throws IOException if the filter fails to write itself.
     */
    protected DataByteArray bloomOut() throws IOException {
        // vSize / 8 is only the initial buffer capacity; the stream grows
        // as needed.
        ByteArrayOutputStream baos = new ByteArrayOutputStream(vSize / 8);
        DataOutputStream dos = new DataOutputStream(baos);
        filter.write(dos);
        return new DataByteArray(baos.toByteArray());
    }

    /**
     * Deserializes a filter from the given bytes.
     *
     * @param b bytes of a serialized {@link BloomFilter}.
     * @return a new filter populated from those bytes.
     * @throws IOException if the bytes cannot be read as a filter.
     */
    protected BloomFilter bloomIn(DataByteArray b) throws IOException {
        DataInputStream dis =
            new DataInputStream(new ByteArrayInputStream(b.get()));
        BloomFilter f = new BloomFilter();
        f.readFields(dis);
        return f;
    }

    /**
     * Maps a user supplied hash name to the matching {@link Hash} constant.
     * Matching is case-insensitive and by substring.
     *
     * @param hashType name containing "jenkins" or "murmur".
     * @return {@link Hash#JENKINS_HASH} or {@link Hash#MURMUR_HASH}.
     * @throws IllegalArgumentException if the name is null or matches neither.
     */
    private int convertHashType(String hashType) {
        // Locale.ROOT keeps the match independent of the JVM default locale;
        // e.g. a Turkish locale lower-cases 'I' to a dotless 'i', so
        // "JENKINS" would otherwise not match.
        String normalized =
            hashType == null ? "" : hashType.toLowerCase(Locale.ROOT);

        if (normalized.contains("jenkins")) {
            return Hash.JENKINS_HASH;
        }
        if (normalized.contains("murmur")) {
            return Hash.MURMUR_HASH;
        }
        throw new IllegalArgumentException("Unknown hash type " + hashType
            + ". Valid values are jenkins and murmur.");
    }

    /**
     * Derives {@link #vSize} and {@link #numHash} from the expected element
     * count and the acceptable false positive rate.
     *
     * @param numElements expected number of distinct elements; must parse to
     * an integer of at least 1.
     * @param desiredFalsePositive acceptable false positive rate; must parse
     * to a value strictly between 0 and 1.
     * @throws NumberFormatException if either argument cannot be parsed.
     * @throws IllegalArgumentException if either parsed value is out of range.
     */
    private void setSize(String numElements, String desiredFalsePositive) {
        int num = Integer.parseInt(numElements);
        float fp = Float.parseFloat(desiredFalsePositive);

        // Written as a negated "in range" test so that NaN is rejected too.
        // A rate of exactly 0 is rejected because log(0) is -Infinity, which
        // would saturate the vector size to Integer.MAX_VALUE.
        boolean fpInRange = fp > 0.0f && fp < 1.0f;
        if (num < 1 || !fpInRange) {
            throw new IllegalArgumentException("Number of elements must be greater "
                + "than zero and desiredFalsePositive must be between 0 "
                + "and 1.");
        }

        // Truncation can yield 0 for rates close to 1; a filter with zero
        // bits or zero hash functions is unusable, so clamp both to 1.
        double bits = -1 * (num * Math.log(fp)) / Math.pow(Math.log(2), 2);
        vSize = Math.max(1, (int) bits);
        log.info("BuildBloom setting vector size to " + vSize);

        numHash = Math.max(1, (int) (0.7 * vSize / num));
        log.info("BuildBloom setting number of hashes to " + numHash);
    }
}