/*
* Copyright 2012, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.LinkBench.distributions;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Random;
/**
* A distribution where the cumulative density function is an arbitrary
* piecewise linear function.
*
* Rather confusingly there are two possible ways of looking at the
* distribution. The first is to divide the keyspace by ids, and order
* these IDs by the number of accesses. Then DIST-A determines how likely
it is that a given key will be chosen. The second is to divide the
* keyspace into buckets, where there are multiple keys in each bucket which
* have been accessed the same number of times. There DIST-B determines how
* likely a random key is to fall into each bucket. The input data is
* represented as DIST-B, but the probability distribution represented by
* this class is DIST-A, so we need to convert from one representation to
* another.
*
* The conversion process works as follows.
* Suppose you have items numbered 0 to n - 1. Then item i gets assigned
* the percentile rank p = i / (n - 1), a number between 0 and 1.
*
The input is a set of tuples (p, v), where v is the total number of
observations of the item at percentile rank p. So the values are
denominated not in probability density, but rather in number of observations.
*
* This means that to convert the input to a probability density distribution,
* we need to calculate the expected value of the distribution, and then divide
* the value by that.
*
* This is an abstract class: the init method needs to be implemented
* @author tarmstrong
*
*/
public abstract class PiecewiseLinearDistribution implements ProbabilityDistribution {

  /**
   * Helper class storing a (value, cumulative probability) pair of the
   * input distribution.
   */
  public static class Point implements Comparable<Point> {
    public int value;
    public double probability;

    public Point(int input_value, double input_probability) {
      this.value = input_value;
      this.probability = input_probability;
    }

    /**
     * Order points by value. Integer.compare avoids the overflow that
     * the subtraction idiom (this.value - p.value) suffers when the two
     * values have opposite sign near the int extremes.
     */
    @Override
    public int compareTo(Point p) {
      return Integer.compare(this.value, p.value);
    }

    @Override
    public String toString() {
      return "(" + value + ", " + probability + ")";
    }
  }

  /**
   * Initialize the distribution over the id range [min, max), deriving
   * the cumulative-sum and per-bucket state arrays from the supplied
   * CDF points.
   * @param min lowest id (inclusive)
   * @param max highest id (exclusive)
   * @param cdf piecewise-linear CDF points sorted by value; must be non-empty
   */
  protected void init(long min, long max, ArrayList<Point> cdf) {
    double pdf[] = getPDF(cdf);
    double ccdf[] = getCCDF(pdf);
    double cs[] = getCumulativeSum(ccdf);
    long right_points[] = new long[cs.length];
    init(min, max, cdf, cs, right_points, expectedValue(cdf));
  }

  /**
   * Init with precalculated values
   * @param min lowest id (inclusive)
   * @param max highest id (exclusive)
   * @param cdf piecewise-linear CDF points
   * @param cs cumulative sum over the complementary CDF (see getCumulativeSum)
   * @param right_points per-bucket rotation state consumed by choose()
   * @param expectedValue expected value of the distribution
   */
  protected void init(long min, long max, ArrayList<Point> cdf,
      double cs[], long right_points[], double expectedValue) {
    this.min = min;
    this.max = max;
    this.cdf = cdf;
    this.cs = cs;
    this.right_points = right_points;
    this.expected_val = expectedValue;
  }

  protected long max;
  protected long min;
  protected ArrayList<Point> cdf;
  protected double[] cs;
  protected long[] right_points;

  /**
   * Total number of observations in data
   */
  private double expected_val;

  @Override
  public double pdf(long id) {
    long n = (max - min);
    // Normalize the per-id expected count by the total expected
    // observations over the whole id range
    double totalSum = expected_val * n;
    return expectedCount(id) / totalSum;
  }

  @Override
  public double expectedCount(long id) {
    return expectedCount(min, max, id, cdf);
  }

  /**
   * Expected number of observations for a given id, assuming ids lie in
   * [min, max) with the most popular id at min.
   * @return 0.0 for ids outside [min, max)
   */
  public static double expectedCount(long min, long max, long id,
      ArrayList<Point> cdf) {
    if (id < min || id >= max) {
      return 0.0;
    }
    long n = (max - min);
    // Put id into range [0.0, 1.0] with most popular at 0.0
    double u = 1.0 - (id - min) / (double) n;
    int ix = binarySearch(cdf, u);
    Point p1 = cdf.get(ix);
    assert (u <= p1.probability);
    // Assuming piecewise linear, so equally as probable as p1.value
    return p1.value;
  }

  @Override
  public double cdf(long id) {
    // Since this should be the CDF function for DIST-A, rather
    // than DIST-B, it is non-trivial to calculate (requires some kind
    // of integration of DIST-B).
    throw new RuntimeException("Cdf not implemented yet");
  }

  @Override
  public long quantile(double p) {
    // This is not implemented, due to similar reasons to cdf
    throw new RuntimeException("Quantile not implemented yet");
  }

  @Override
  public long choose(Random rng) {
    return choose(rng, min, max, cs, right_points);
  }

  /**
   * Pick an id in [startid1, maxid1): select a bucket with probability
   * proportional to its weight in cs, then rotate through candidate ids
   * via the shared right_points state.
   */
  protected static long choose(Random rng, long startid1, long maxid1,
      double[] cs, long[] right_points) {
    double max_probability = cs[cs.length - 1];
    double p = max_probability * rng.nextDouble();
    int idx = binarySearch(cs, p);
    // cs[0] is a sentinel zero, never a real bucket
    if (idx == 0) idx = 1;
    /*
     * TODO: this algorithm does not appear to generate data
     * faithful to the distribution.
     * Additional problems include data races if multiple threads are
     * concurrently modifying the shared arrays, and the fact
     * that a workload cannot be reproduced.
     */
    long result = right_points[idx] % (maxid1 - startid1);
    right_points[idx] = (result + 1) % (maxid1 - startid1);
    long id1 = startid1 + result;
    return id1;
  }

  /**
   * Get the expected value of the distribution (e.g. the
   * average number of links)
   * @param cdf piecewise-linear CDF points
   * @return the expected value, or 0 for an empty list
   */
  protected static double expectedValue(ArrayList<Point> cdf) {
    // This function is not entirely precise since it assumes
    // that the ID space is continuous, which is not an accurate
    // approximation for small ID counts
    if (cdf.size() == 0) return 0;
    // Assume CDF is piecewise linear
    double sum = cdf.get(0).probability * cdf.get(0).value;
    for (int i = 1; i < cdf.size(); i++) {
      Point prev = cdf.get(i - 1);
      Point curr = cdf.get(i);
      double p = curr.probability - prev.probability;
      sum += p * curr.value;
    }
    return sum;
  }

  /**
   * Find the index of the first point whose probability is &gt;= p,
   * or points.size() if there is no such point.
   * Precondition: points is non-empty and sorted by probability.
   */
  public static int binarySearch(ArrayList<Point> points, double p) {
    int left = 0, right = points.size() - 1;
    while (left < right) {
      // Unsigned shift avoids int overflow of (left + right)
      int mid = (left + right) >>> 1;
      if (points.get(mid).probability >= p) {
        right = mid;
      } else {
        left = mid + 1;
      }
    }
    if (points.get(left).probability >= p) {
      return left;
    } else {
      return left + 1;
    }
  }

  /**
   * Find the index of p in sorted array a, or the insertion point
   * (index of the first element &gt; p) if p is not present.
   */
  public static int binarySearch(double[] a, double p) {
    // Use built-in binary search
    int res = Arrays.binarySearch(a, p);
    if (res >= 0) {
      return res;
    } else {
      // Arrays.binarySearch returns (-(insertion point) - 1) when not found
      return -(res + 1);
    }
  }

  /**
   * Convert CDF points to a dense PDF array indexed by value.
   * @param cdf non-empty list of points sorted by value
   */
  protected static double[] getPDF(ArrayList<Point> cdf) {
    int max_value = cdf.get(cdf.size() - 1).value;
    // Java arrays are zero-initialized, so no explicit fill is needed
    double[] pdf = new double[max_value + 1];
    // convert cdf to pdf: each entry is the probability mass at that value
    pdf[cdf.get(0).value] = cdf.get(0).probability;
    for (int i = 1; i < cdf.size(); ++i) {
      pdf[cdf.get(i).value] = cdf.get(i).probability -
          cdf.get(i - 1).probability;
    }
    return pdf;
  }

  /**
   * Complementary cumulative distribution: ccdf[i] is the sum of
   * pdf[i..length-1].
   */
  protected static double[] getCCDF(double[] pdf) {
    int length = pdf.length;
    double[] ccdf = new double[length];
    ccdf[length - 1] = pdf[length - 1];
    for (int i = length - 2; i >= 0; --i) {
      ccdf[i] = ccdf[i + 1] + pdf[i];
    }
    return ccdf;
  }

  /**
   * Running sum of ccdf[1..i]. Element 0 is deliberately excluded so
   * that cs[0] == 0 acts as a sentinel (see choose()).
   * @param ccdf complementary CDF as produced by getCCDF
   */
  protected static double[] getCumulativeSum(double[] ccdf) {
    int length = ccdf.length;
    double[] cs = new double[length];
    cs[0] = 0; // ignore ccdf[0]
    for (int i = 1; i < length; ++i) {
      cs[i] = cs[i - 1] + ccdf[i];
    }
    return cs;
  }
}