/*
* Copyright 2012, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.LinkBench.distributions;
import java.util.ArrayList;
import java.util.Properties;
import java.util.Random;
import org.apache.commons.math3.util.FastMath;
import org.apache.log4j.Logger;
import com.facebook.LinkBench.Config;
import com.facebook.LinkBench.ConfigUtil;
/**
 * Zipf (power-law) probability distribution over the integer ids in
 * [min, max). The id {@code min} has rank 1 and is the most likely value;
 * the probability of rank k is proportional to {@code 1 / k^shape}.
 *
 * An instance must be configured with {@link #init} before any other method
 * is used. Instance state is written by init without synchronization; only
 * the shared zetan cache is guarded (by the class lock).
 */
public class ZipfDistribution implements ProbabilityDistribution {
  private final Logger logger = Logger.getLogger(ConfigUtil.LINKBENCH_LOGGER);

  /** Inclusive lower bound of the domain */
  private long min = 0;
  /** Exclusive upper bound of the domain */
  private long max = 1;
  /** Zipf exponent (theta); validated to be strictly positive in init */
  private double shape = 0.0;

  /**
   * Multiplier applied to the pdf by expectedCount: (max - min) times the
   * configured mean if one is provided, otherwise 1.0
   */
  private double scale;

  // precomputed values
  /** 1 / (1 - shape); unused (0.0) when shape is exactly 1.0 */
  private double alpha = 0.0;
  /** eta term of Gray's formula; unused (0.0) when shape is exactly 1.0 */
  private double eta = 0.0;
  /** Generalized harmonic number of (max - min): the normalizing constant */
  private double zetan = 0.0;
  /** 0.5^shape, i.e. the unnormalized weight of rank 2 */
  private double point5theta = 0.0;
  /**
   * Exponential rate used by quantile in place of alpha and eta when shape
   * is exactly 1.0: ln(n / 2) / (1 - zeta(2) / zeta(n))
   */
  private double unitShapeRate = 0.0;

  /**
   * Configure the distribution and precompute the constants used by the
   * other methods.
   *
   * @param min inclusive lower bound of the domain
   * @param max exclusive upper bound of the domain, must be > min
   * @param props properties holding keyPrefix + "shape" (required) and
   *              optionally keyPrefix + Config.PROB_MEAN
   * @param keyPrefix prefix prepended to the property names
   * @throws IllegalArgumentException if max <= min, if no shape can be read,
   *         or if the shape is not a positive number
   */
  @Override
  public void init(long min, long max, Properties props, String keyPrefix) {
    if (max <= min) {
      throw new IllegalArgumentException("max = " + max + " <= min = " + min +
          ": probability distribution cannot have zero or negative domain");
    }
    this.min = min;
    this.max = max;

    String shapeS = props != null ? ConfigUtil.getPropertyRequired(props,
        keyPrefix + "shape") : null;
    if (shapeS == null) {
      throw new IllegalArgumentException("ZipfDistribution must be provided " +
          keyPrefix + "shape parameter");
    }
    shape = Double.parseDouble(shapeS);
    // Written as a negated "greater than" so that NaN is rejected as well
    if (!(shape > 0.0)) {
      throw new IllegalArgumentException("Zipf shape parameter " + shape +
          " is not positive");
    }

    if (props != null && props.containsKey(keyPrefix + Config.PROB_MEAN)) {
      scale = (max - min) * ConfigUtil.getDouble(props,
          keyPrefix + Config.PROB_MEAN);
    } else {
      scale = 1.0;
    }

    // Precompute some values to speed up future method calls
    long n = max - min;
    zetan = calcZetan(n);
    point5theta = FastMath.pow(0.5, shape);
    // Share of the total weight that lies beyond the first two ranks
    double tailShare = 1 - Harmonic.generalizedHarmonic(2, shape) / zetan;
    if (shape == 1.0) {
      // alpha = 1 / (1 - shape) would be infinite and eta would be 0, so the
      // power form used by quantile degenerates to pow(1, infinity) = NaN.
      // Precompute the rate of the limiting exponential form instead.
      alpha = 0.0;
      eta = 0.0;
      unitShapeRate = FastMath.log(n / 2.0) / tailShare;
    } else {
      alpha = 1 / (1 - shape);
      eta = (1 - FastMath.pow(2.0 / n, 1 - shape)) / tailShare;
      unitShapeRate = 0.0;
    }
  }

  // For large n, calculating zetan takes a long time. This is a simple
  // but effective caching technique that speeds up startup a lot
  // when multiple instances of the distribution are initialized in
  // close succession.
  private static class CacheEntry {
    long n;
    double shape;
    double zetan;
  }

  /** Min value of n to cache */
  private static final long MIN_CACHE_VALUE = 1000;
  /** Upper bound on cached entries; the oldest entry is evicted first */
  private static final int MAX_CACHE_ENTRIES = 1024;
  /** Shared cache; every access must hold the ZipfDistribution.class lock */
  private static final ArrayList<CacheEntry> zetanCache =
      new ArrayList<CacheEntry>(MAX_CACHE_ENTRIES);

  /**
   * Linear search of the cache for an exact (n, shape) match.
   * The caller must hold the ZipfDistribution.class lock.
   *
   * @return the matching entry, or null if there is none
   */
  private static CacheEntry findCacheEntry(long n, double shape) {
    for (int i = 0; i < zetanCache.size(); i++) {
      CacheEntry ce = zetanCache.get(i);
      if (ce.n == n && ce.shape == shape) {
        return ce;
      }
    }
    return null;
  }

  /**
   * Return the normalizing constant for n items and the current shape,
   * consulting the shared cache when n >= MIN_CACHE_VALUE.
   */
  private double calcZetan(long n) {
    if (n < MIN_CACHE_VALUE) {
      return uncachedCalcZetan(n);
    }

    synchronized (ZipfDistribution.class) {
      CacheEntry hit = findCacheEntry(n, shape);
      if (hit != null) {
        return hit.zetan;
      }
    }

    // Computed outside the lock so that other threads are not blocked
    // behind a potentially slow calculation
    double calcZetan = uncachedCalcZetan(n);

    synchronized (ZipfDistribution.class) {
      // Another thread may have inserted the same key while this one was
      // calculating; do not waste a cache slot on a duplicate
      if (findCacheEntry(n, shape) == null) {
        CacheEntry ce = new CacheEntry();
        ce.zetan = calcZetan;
        ce.n = n;
        ce.shape = shape;
        if (zetanCache.size() >= MAX_CACHE_ENTRIES) {
          zetanCache.remove(0);
        }
        zetanCache.add(ce);
      }
    }
    return calcZetan;
  }

  /**
   * Calculate the generalized harmonic number of n for the current shape,
   * using the approximation when shape <= 1.0 and the exact calculation
   * otherwise.
   */
  private double uncachedCalcZetan(long n) {
    double calcZetan;
    if (shape <= 1.0) {
      // use approximation
      calcZetan = ApproxHarmonic.generalizedHarmonic(n, shape);
    } else {
      // Can't use approximation
      // If calculation will take more than 5 or so seconds, let user know
      // what is happening
      if (n > 20000000) {
        logger.info("Precalculating constants for Zipf distribution over "
            + n + " items with shape = " + shape
            + ". Please be patient, this can take a little time.");
      }
      calcZetan = Harmonic.generalizedHarmonic(n, shape);
    }
    return calcZetan;
  }

  /**
   * @return probability of choosing id, or 0.0 if id is outside [min, max)
   */
  @Override
  public double pdf(long id) {
    return scaledPDF(id, 1.0);
  }

  /**
   * @return pdf(id) multiplied by the scale computed in init, or 0.0 if id
   *         is outside [min, max)
   */
  @Override
  public double expectedCount(long id) {
    return scaledPDF(id, scale);
  }

  /**
   * @return scale / (rank^shape * zetan) where rank = id + 1 - min, or 0.0
   *         if id is outside [min, max)
   */
  private double scaledPDF(long id, double scale) {
    // Calculate this way to avoid losing precision by calculating very
    // small pdf number
    if (id < min || id >= max) {
      return 0.0;
    }
    return (scale / FastMath.pow(id + 1 - min, shape)) / zetan;
  }

  /**
   * Cumulative distribution function.
   *
   * @return 0.0 for id < min, 1.0 for id >= max, otherwise the generalized
   *         harmonic number of the rank of id divided by zetan
   */
  @Override
  public double cdf(long id) {
    if (id < min) {
      return 0.0;
    }
    if (id >= max) {
      return 1.0;
    }
    double harm;
    if (shape <= 1.0) {
      harm = ApproxHarmonic.generalizedHarmonic(id + 1 - min, shape);
    } else {
      harm = Harmonic.generalizedHarmonic(id + 1 - min, shape);
    }
    return harm / zetan;
  }

  /**
   * Pick a value in range [min, max) according to zipf distribution,
   * with min being the most likely to be chosen
   *
   * @param rng source of the uniform random number fed to quantile
   */
  @Override
  public long choose(Random rng) {
    return quantile(rng.nextDouble());
  }

  /**
   * Quantile function
   *
   * Algorithm from "Quickly Generating Billion-Record Synthetic Databases",
   * Gray et. al., 1994
   *
   * parts of formula are precomputed in init since they are expensive
   * to calculate and only depend on the distribution parameters
   *
   * @param p cumulative probability, expected to be in [0, 1]
   * @return an id in [min, max)
   */
  public long quantile(double p) {
    double uz = p * zetan;
    long n = max - min;
    if (uz < 1) {
      return min;
    }
    if (uz < 1 + point5theta) {
      // With a single-item domain min + 1 would equal max, which is outside
      // the range, so fall back to the only valid id
      return n > 1 ? min + 1 : min;
    }
    double fraction;
    if (shape == 1.0) {
      // Limit of pow(eta * p - eta + 1, alpha) as shape approaches 1
      fraction = FastMath.exp(unitShapeRate * (p - 1));
    } else {
      fraction = FastMath.pow(eta * p - eta + 1, alpha);
    }
    long offset = (long) (n * fraction);
    if (offset >= n) {
      return max - 1;
    }
    return min + offset;
  }
}