/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.udf.generic; import java.util.Random; import javolution.util.FastBitSet; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.common.classification.InterfaceAudience; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.util.JavaDataModel; import org.apache.hadoop.io.Text; public class NumDistinctValueEstimator { static final Logger LOG = LoggerFactory.getLogger(NumDistinctValueEstimator.class.getName()); /* We want a,b,x to come from a finite field of size 0 to k, where k is a prime number. * 2^p - 1 is prime for p = 31. Hence bitvectorSize has to be 31. Pick k to be 2^p -1. * If a,b,x didn't come from a finite field ax1 + b mod k and ax2 + b mod k will not be pair wise * independent. As a consequence, the hash values will not distribute uniformly from 0 to 2^p-1 * thus introducing errors in the estimates. */ private static final int BIT_VECTOR_SIZE = 31; private final int numBitVectors; // Refer to Flajolet-Martin'86 for the value of phi private static final double PHI = 0.77351; private final int[] a; private final int[] b; private final FastBitSet[] bitVector; private final Random aValue; private final Random bValue; /* Create a new distinctValueEstimator */ public NumDistinctValueEstimator(int numBitVectors) { this.numBitVectors = numBitVectors; bitVector = new FastBitSet[numBitVectors]; for (int i=0; i< numBitVectors; i++) { bitVector[i] = new FastBitSet(BIT_VECTOR_SIZE); } a = new int[numBitVectors]; b = new int[numBitVectors]; /* Use a large prime number as a seed to the random number generator. * Java's random number generator uses the Linear Congruential Generator to generate random * numbers using the following recurrence relation, * * X(n+1) = (a X(n) + c ) mod m * * where X0 is the seed. Java implementation uses m = 2^48. This is problematic because 2^48 * is not a prime number and hence the set of numbers from 0 to m don't form a finite field. * If these numbers don't come from a finite field any give X(n) and X(n+1) may not be pair * wise independent. * * However, empirically passing in prime numbers as seeds seems to work better than when passing * composite numbers as seeds. Ideally Java's Random should pick m such that m is prime. * */ aValue = new Random(99397); bValue = new Random(9876413); for (int i = 0; i < numBitVectors; i++) { int randVal; /* a and b shouldn't be even; If a and b are even, then none of the values * will set bit 0 thus introducing errors in the estimate. Both a and b can be even * 25% of the times and as a result 25% of the bit vectors could be inaccurate. To avoid this * always pick odd values for a and b. */ do { randVal = aValue.nextInt(); } while (randVal % 2 == 0); a[i] = randVal; do { randVal = bValue.nextInt(); } while (randVal % 2 == 0); b[i] = randVal; if (a[i] < 0) { a[i] = a[i] + (1 << BIT_VECTOR_SIZE - 1); } if (b[i] < 0) { b[i] = b[i] + (1 << BIT_VECTOR_SIZE - 1); } } } public NumDistinctValueEstimator(String s, int numBitVectors) { this.numBitVectors = numBitVectors; FastBitSet bitVectorDeser[] = deserialize(s, numBitVectors); bitVector = new FastBitSet[numBitVectors]; for(int i=0; i <numBitVectors; i++) { bitVector[i] = new FastBitSet(BIT_VECTOR_SIZE); bitVector[i].clear(); bitVector[i].or(bitVectorDeser[i]); } a = null; b = null; aValue = null; bValue = null; } /** * Resets a distinctValueEstimator object to its original state. */ public void reset() { for (int i=0; i< numBitVectors; i++) { bitVector[i].clear(); } } public FastBitSet getBitVector(int index) { return bitVector[index]; } public int getnumBitVectors() { return numBitVectors; } public int getBitVectorSize() { return BIT_VECTOR_SIZE; } public void printNumDistinctValueEstimator() { String t = new String(); LOG.debug("NumDistinctValueEstimator"); LOG.debug("Number of Vectors: {}", numBitVectors); LOG.debug("Vector Size: {}", BIT_VECTOR_SIZE); for (int i=0; i < numBitVectors; i++) { t = t + bitVector[i].toString(); } LOG.debug("Serialized Vectors: "); LOG.debug(t); } /* Serializes a distinctValueEstimator object to Text for transport. * */ public Text serialize() { String s = new String(); for(int i=0; i < numBitVectors; i++) { s = s + (bitVector[i].toString()); } return new Text(s); } /* Deserializes from string to FastBitSet; Creates a NumDistinctValueEstimator object and * returns it. */ private FastBitSet[] deserialize(String s, int numBitVectors) { FastBitSet[] b = new FastBitSet[numBitVectors]; for (int j=0; j < numBitVectors; j++) { b[j] = new FastBitSet(BIT_VECTOR_SIZE); b[j].clear(); } int vectorIndex =0; /* Parse input string to obtain the indexes that are set in the bitvector. * When a toString() is called on a FastBitSet object to serialize it, the serialization * adds { and } to the beginning and end of the return String. * Skip "{", "}", ",", " " in the input string. */ for(int i=1; i < s.length()-1;) { char c = s.charAt(i); i = i + 1; // Move on to the next bit vector if (c == '}') { vectorIndex = vectorIndex + 1; } // Encountered a numeric value; Extract out the entire number if (c >= '0' && c <= '9') { String t = new String(); t = t + c; c = s.charAt(i); i = i + 1; while (c != ',' && c!= '}') { t = t + c; c = s.charAt(i); i = i + 1; } int bitIndex = Integer.parseInt(t); assert(bitIndex >= 0); assert(vectorIndex < numBitVectors); b[vectorIndex].set(bitIndex); if (c == '}') { vectorIndex = vectorIndex + 1; } } } return b; } private int generateHash(long v, int hashNum) { int mod = (1<<BIT_VECTOR_SIZE) - 1; long tempHash = a[hashNum] * v + b[hashNum]; tempHash %= mod; int hash = (int) tempHash; /* Hash function should map the long value to 0...2^L-1. * Hence hash value has to be non-negative. */ if (hash < 0) { hash = hash + mod; } return hash; } private int generateHashForPCSA(long v) { int mod = 1 << (BIT_VECTOR_SIZE - 1) - 1; long tempHash = a[0] * v + b[0]; tempHash %= mod; int hash = (int) tempHash; /* Hash function should map the long value to 0...2^L-1. * Hence hash value has to be non-negative. */ if (hash < 0) { hash = hash + mod + 1; } return hash; } public void addToEstimator(long v) { /* Update summary bitVector : * Generate hash value of the long value and mod it by 2^bitVectorSize-1. * In this implementation bitVectorSize is 31. */ for (int i = 0; i<numBitVectors; i++) { int hash = generateHash(v,i); int index; // Find the index of the least significant bit that is 1 for (index=0; index<BIT_VECTOR_SIZE; index++) { if (hash % 2 != 0) { break; } hash = hash >> 1; } // Set bitvector[index] := 1 bitVector[i].set(index); } } public void addToEstimatorPCSA(long v) { int hash = generateHashForPCSA(v); int rho = hash/numBitVectors; int index; // Find the index of the least significant bit that is 1 for (index=0; index<BIT_VECTOR_SIZE; index++) { if (rho % 2 != 0) { break; } rho = rho >> 1; } // Set bitvector[index] := 1 bitVector[hash%numBitVectors].set(index); } public void addToEstimator(double d) { int v = new Double(d).hashCode(); addToEstimator(v); } public void addToEstimatorPCSA(double d) { int v = new Double(d).hashCode(); addToEstimatorPCSA(v); } public void addToEstimator(HiveDecimal decimal) { int v = decimal.hashCode(); addToEstimator(v); } public void addToEstimatorPCSA(HiveDecimal decimal) { int v = decimal.hashCode(); addToEstimatorPCSA(v); } public void mergeEstimators(NumDistinctValueEstimator o) { // Bitwise OR the bitvector with the bitvector in the agg buffer for (int i=0; i<numBitVectors; i++) { bitVector[i].or(o.getBitVector(i)); } } public long estimateNumDistinctValuesPCSA() { double numDistinctValues = 0.0; long S = 0; for (int i=0; i < numBitVectors; i++) { int index = 0; while (bitVector[i].get(index) && index < BIT_VECTOR_SIZE) { index = index + 1; } S = S + index; } numDistinctValues = ((numBitVectors/PHI) * Math.pow(2.0, S/numBitVectors)); return ((long)numDistinctValues); } /* We use the Flajolet-Martin estimator to estimate the number of distinct values.FM uses the * location of the least significant zero as an estimate of log2(phi*ndvs). */ public long estimateNumDistinctValues() { int sumLeastSigZero = 0; double avgLeastSigZero; double numDistinctValues; for (int i=0; i< numBitVectors; i++) { int leastSigZero = bitVector[i].nextClearBit(0); sumLeastSigZero += leastSigZero; } avgLeastSigZero = sumLeastSigZero/(numBitVectors * 1.0) - (Math.log(PHI)/Math.log(2.0)); numDistinctValues = Math.pow(2.0, avgLeastSigZero); return ((long)(numDistinctValues)); } @InterfaceAudience.LimitedPrivate(value = { "Hive" }) static int lengthFor(JavaDataModel model, Integer numVector) { int length = model.object(); length += model.primitive1() * 2; // two int length += model.primitive2(); // one double length += model.lengthForRandom() * 2; // two Random if (numVector == null) { numVector = 16; // HiveConf hive.stats.ndv.error default produces 16 vectors } if (numVector > 0) { length += model.array() * 3; // three array length += model.primitive1() * numVector * 2; // two int array length += (model.object() + model.array() + model.primitive1() + model.primitive2()) * numVector; // bitset array } return length; } public int lengthFor(JavaDataModel model) { return lengthFor(model, getnumBitVectors()); } }