/*
 * Copyright 2012, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.LinkBench.distributions;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Random;

/**
 * A distribution where the cumulative distribution function is an
 * arbitrary piecewise linear function.
 *
 * Rather confusingly, there are two possible ways of looking at the
 * distribution. The first is to divide the keyspace by ids and order
 * these IDs by the number of accesses. Then DIST-A determines how likely
 * it is that a given key will be chosen. The second is to divide the
 * keyspace into buckets, where each bucket holds multiple keys that
 * have been accessed the same number of times. Then DIST-B determines how
 * likely a random key is to fall into each bucket. The input data is
 * represented as DIST-B, but the probability distribution represented by
 * this class is DIST-A, so we need to convert from one representation to
 * the other.
 *
 * The conversion process works as follows.
 * Suppose you have items numbered 0 to n - 1. Then item i gets assigned
 * the percentile rank p = i / (n - 1), a number between 0 and 1.
 *
 * The input is a set of tuples (p, v), where v is the total number of
 * observations of the item at percentile p. So the input values are
 * denominated not in probability density, but rather in number of
 * observations.
 *
 * This means that to convert the input to a probability density
 * distribution, we need to calculate the expected value of the
 * distribution, and then divide each value by that.
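 *
 * As a worked example (the numbers here are illustrative, not taken from
 * any real workload): suppose the input points are (1, 0.5), (4, 0.9)
 * and (10, 1.0), i.e. 50% of items were observed once, a further 40%
 * four times, and the final 10% ten times. The expected value is then
 * 0.5 * 1 + 0.4 * 4 + 0.1 * 10 = 3.1 observations per item, and a count
 * is converted to a probability density by dividing it by 3.1 * n
 * (see pdf() below).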
 *
 * This is an abstract class: the init method needs to be implemented
 * by subclasses.
 * @author tarmstrong
 */
public abstract class PiecewiseLinearDistribution
                                      implements ProbabilityDistribution {

  /** Helper class to store a (value, probability) pair */
  public static class Point implements Comparable<Point> {
    public int value;
    public double probability;

    public Point(int input_value, double input_probability) {
      this.value = input_value;
      this.probability = input_probability;
    }

    @Override
    public int compareTo(Point p) {
      return this.value - p.value;
    }

    @Override
    public String toString() {
      return "(" + value + ", " + probability + ")";
    }
  }

  protected void init(long min, long max, ArrayList<Point> cdf) {
    double pdf[] = getPDF(cdf);
    double ccdf[] = getCCDF(pdf);
    double cs[] = getCumulativeSum(ccdf);
    long right_points[] = new long[cs.length];
    init(min, max, cdf, cs, right_points, expectedValue(cdf));
  }

  /**
   * Init with precalculated values
   * @param min lowest id (inclusive)
   * @param max highest id (exclusive)
   * @param cdf input cumulative distribution points
   * @param cs cumulative sums over the complementary cdf
   * @param right_points scratch array tracking the next id in each bucket
   * @param expectedValue expected number of observations per id
   */
  protected void init(long min, long max, ArrayList<Point> cdf,
      double cs[], long right_points[], double expectedValue) {
    this.min = min;
    this.max = max;
    this.cdf = cdf;
    this.cs = cs;
    this.right_points = right_points;
    this.expected_val = expectedValue;
  }

  protected long max;
  protected long min;
  protected ArrayList<Point> cdf;
  protected double[] cs;
  protected long[] right_points;

  /** Expected number of observations per id in the data */
  private double expected_val;

  @Override
  public double pdf(long id) {
    long n = (max - min);
    double totalSum = expected_val * n;
    return expectedCount(id) / totalSum;
  }

  @Override
  public double expectedCount(long id) {
    return expectedCount(min, max, id, cdf);
  }

  public static double expectedCount(long min, long max, long id,
                                     ArrayList<Point> cdf) {
    if (id < min || id >= max) {
      return 0.0;
    }
    long n = (max - min);
    // Put id into the range [0.0, 1.0] with most popular at 0.0
    double u = 1.0 - (id - min) / (double) n;
    int ix = binarySearch(cdf, u);
    Point p1 = cdf.get(ix);
    assert(u <= p1.probability);
    // Assuming piecewise linear, so equally as probable as p1.value
    return p1.value;
  }

  @Override
  public double cdf(long id) {
    // Since this should be the CDF function for DIST-A, rather
    // than DIST-B, it is non-trivial to calculate (it requires some kind
    // of integration of DIST-B).
    throw new RuntimeException("Cdf not implemented yet");
  }

  @Override
  public long quantile(double p) {
    // Not implemented, for reasons similar to cdf
    throw new RuntimeException("Quantile not implemented yet");
  }

  @Override
  public long choose(Random rng) {
    return choose(rng, min, max, cs, right_points);
  }

  protected static long choose(Random rng, long startid1, long maxid1,
                               double[] cs, long[] right_points) {
    double max_probability = cs[cs.length - 1];
    double p = max_probability * rng.nextDouble();
    int idx = binarySearch(cs, p);
    if (idx == 0) idx = 1;

    /*
     * TODO: this algorithm does not appear to generate data
     * faithful to the distribution.
     * Additional problems include data races if multiple threads are
     * concurrently modifying the shared arrays, and the fact
     * that a workload cannot be reproduced.
     */
    long result = right_points[idx] % (maxid1 - startid1);
    right_points[idx] = (result + 1) % (maxid1 - startid1);
    long id1 = startid1 + result;
    return id1;
  }
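
  /*
   * Illustrative trace of choose() (hypothetical numbers, not from the
   * original source): with cs = {0.0, 0.5, 0.9, 1.0} and a draw of
   * p = 0.7, binarySearch(cs, p) returns idx = 2, so the next id is
   * taken from bucket 2; right_points[2] then advances by one, cycling
   * through the ids of that bucket on subsequent calls.
   */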

  /**
   * Get the expected value of the distribution (e.g. the average
   * number of links)
   * @param cdf
   * @return the expected value
   */
  protected static double expectedValue(ArrayList<Point> cdf) {
    // This function is not entirely precise, since it assumes
    // that the ID space is continuous, which is not an accurate
    // approximation for small ID counts
    if (cdf.size() == 0) return 0;

    // Assume CDF is piecewise linear
    double sum = cdf.get(0).probability * cdf.get(0).value;
    for (int i = 1; i < cdf.size(); i++) {
      Point prev = cdf.get(i - 1);
      Point curr = cdf.get(i);
      double p = curr.probability - prev.probability;
      sum += p * curr.value;
    }
    return sum;
  }

  public static int binarySearch(ArrayList<Point> points, double p) {
    int left = 0, right = points.size() - 1;
    while (left < right) {
      int mid = (left + right) / 2;
      if (points.get(mid).probability >= p) {
        right = mid;
      } else {
        left = mid + 1;
      }
    }
    if (points.get(left).probability >= p) {
      return left;
    } else {
      return left + 1;
    }
  }

  public static int binarySearch(double[] a, double p) {
    // Use built-in binary search
    int res = Arrays.binarySearch(a, p);
    if (res >= 0) {
      return res;
    } else {
      // Arrays.binarySearch returns (-(insertion point) - 1) when not found
      return -(res + 1);
    }
  }

  protected static double[] getPDF(ArrayList<Point> cdf) {
    int max_value = cdf.get(cdf.size() - 1).value;
    // Java arrays are zero-initialized, so absent values have mass 0
    double[] pdf = new double[max_value + 1];

    // Convert cdf to pdf: the mass at each value is the difference
    // between adjacent cdf points
    pdf[cdf.get(0).value] = cdf.get(0).probability;
    for (int i = 1; i < cdf.size(); ++i) {
      pdf[cdf.get(i).value] = cdf.get(i).probability
                            - cdf.get(i - 1).probability;
    }
    return pdf;
  }

  protected static double[] getCCDF(double[] pdf) {
    int length = pdf.length;
    double[] ccdf = new double[length];
    ccdf[length - 1] = pdf[length - 1];
    for (int i = length - 2; i >= 0; --i) {
      ccdf[i] = ccdf[i + 1] + pdf[i];
    }
    return ccdf;
  }

  protected static double[] getCumulativeSum(double[] cdf) {
    int length = cdf.length;
    double[] cs = new double[length];
    cs[0] = 0; // ignore cdf[0]
    for (int i = 1; i < length; ++i) {
      cs[i] = cs[i - 1] + cdf[i];
    }
    return cs;
  }
}
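
/*
 * A minimal sketch of how a concrete subclass might feed data into the
 * protected init above. This is a hypothetical illustration only: the
 * class name, the setup entry point, and the hard-coded points are all
 * assumptions, and the remaining ProbabilityDistribution methods are
 * elided.
 *
 *   class SampleLinearDistribution extends PiecewiseLinearDistribution {
 *     void setup(long min, long max) {
 *       ArrayList<Point> cdf = new ArrayList<Point>();
 *       cdf.add(new Point(1, 0.5));   // 50% of items observed once
 *       cdf.add(new Point(4, 0.9));   // a further 40% observed four times
 *       cdf.add(new Point(10, 1.0));  // the last 10% observed ten times
 *       init(min, max, cdf);  // derives pdf, ccdf and cumulative sums
 *     }
 *   }
 */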