/** * Copyright (c) 2010 Yahoo! Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You * may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. See accompanying * LICENSE file. */ package com.couchbase.loadgen.generator; import java.util.Random; /** * A generator of a zipfian distribution. It produces a sequence of items, such * that some items are more popular than others, according to a zipfian * distribution. When you construct an instance of this class, you specify the * number of items in the set to draw from, either by specifying an itemcount * (so that the sequence is of items from 0 to itemcount-1) or by specifying a * min and a max (so that the sequence is of items from min to max inclusive). * After you construct the instance, you can change the number of items by * calling nextInt(itemcount) or nextLong(itemcount). * * Note that the popular items will be clustered together, e.g. item 0 is the * most popular, item 1 the second most popular, and so on (or min is the most * popular, min+1 the next most popular, etc.) If you don't want this * clustering, and instead want the popular items scattered throughout the item * space, then use ScrambledZipfianGenerator instead. * * Be aware: initializing this generator may take a long time if there are lots * of items to choose from (e.g. over a minute for 100 million objects). This is * because certain mathematical values need to be computed to properly generate * a zipfian skew, and one of those values (zeta) is a sum sequence from 1 to n, * where n is the itemcount. Note that if you increase the number of items in * the set, we can compute a new zeta incrementally, so it should be fast unless * you have added millions of items. However, if you decrease the number of * items, we recompute zeta from scratch, so this can take a long time. * * The algorithm used here is from * "Quickly Generating Billion-Record Synthetic Databases", Jim Gray et al, * SIGMOD 1994. */ public class ZipfianGenerator extends IntegerGenerator { public static final double ZIPFIAN_CONSTANT = 0.99; /** * Number of items. */ long items; /** * Min item to generate. */ long base; /** * The zipfian constant to use. */ double zipfianconstant; /** * Computed parameters for generating the distribution. */ double alpha, zetan, eta, theta, zeta2theta; Random random; /** * The number of items used to compute zetan the last time. */ long countforzeta; /** * Flag to prevent problems. If you increase the number of items the zipfian * generator is allowed to choose from, this code will incrementally compute * a new zeta value for the larger itemcount. However, if you decrease the * number of items, the code computes zeta from scratch; this is expensive * for large itemsets. Usually this is not intentional; e.g. one thread * thinks the number of items is 1001 and calls "nextLong()" with that item * count; then another thread who thinks the number of items is 1000 calls * nextLong() with itemcount=1000 triggering the expensive recomputation. * (It is expensive for 100 million items, not really for 1000 items.) Why * did the second thread think there were only 1000 items? maybe it read the * item count before the first thread incremented it. So this flag allows * you to say if you really do want that recomputation. If true, then the * code will recompute zeta if the itemcount goes down. If false, the code * will assume itemcount only goes up, and never recompute. */ boolean allowitemcountdecrease = false; /******************************* Constructors **************************************/ /** * Create a zipfian generator for the specified number of items. * * @param _items * The number of items in the distribution. */ public ZipfianGenerator(long _items) { this(0, _items - 1); } /** * Create a zipfian generator for items between min and max. * * @param _min * The smallest integer to generate in the sequence. * @param _max * The largest integer to generate in the sequence. */ public ZipfianGenerator(long _min, long _max) { this(_min, _max, ZIPFIAN_CONSTANT); } /** * Create a zipfian generator for the specified number of items using the * specified zipfian constant. * * @param _items * The number of items in the distribution. * @param _zipfianconstant * The zipfian constant to use. */ public ZipfianGenerator(long _items, double _zipfianconstant) { this(0, _items - 1, _zipfianconstant); } /** * Create a zipfian generator for items between min and max (inclusive) for * the specified zipfian constant. * * @param min * The smallest integer to generate in the sequence. * @param max * The largest integer to generate in the sequence. * @param _zipfianconstant * The zipfian constant to use. */ public ZipfianGenerator(long min, long max, double _zipfianconstant) { this(min, max, _zipfianconstant, zetastatic(max - min + 1, _zipfianconstant)); } /** * Create a zipfian generator for items between min and max (inclusive) for * the specified zipfian constant, using the precomputed value of zeta. * * @param min * The smallest integer to generate in the sequence. * @param max * The largest integer to generate in the sequence. * @param _zipfianconstant * The zipfian constant to use. * @param _zetan * The precomputed zeta constant. */ public ZipfianGenerator(long min, long max, double _zipfianconstant, double _zetan) { items = max - min + 1; base = min; zipfianconstant = _zipfianconstant; random = new Random(); theta = zipfianconstant; zeta2theta = zeta(2, theta); alpha = 1.0 / (1.0 - theta); // zetan=zeta(items,theta); zetan = _zetan; countforzeta = items; eta = (1 - Math.pow(2.0 / items, 1 - theta)) / (1 - zeta2theta / zetan); // System.out.println("XXXX 3 XXXX"); nextInt(); // System.out.println("XXXX 4 XXXX"); } /**************************************************************************/ /** * Compute the zeta constant needed for the distribution. Do this from * scratch for a distribution with n items, using the zipfian constant * theta. Remember the value of n, so if we change the itemcount, we can * recompute zeta. * * @param n * The number of items to compute zeta over. * @param theta * The zipfian constant. */ double zeta(long n, double theta) { countforzeta = n; return zetastatic(n, theta); } /** * Compute the zeta constant needed for the distribution. Do this from * scratch for a distribution with n items, using the zipfian constant * theta. This is a static version of the function which will not remember * n. * * @param n * The number of items to compute zeta over. * @param theta * The zipfian constant. */ static double zetastatic(long n, double theta) { return zetastatic(0, n, theta, 0); } /** * Compute the zeta constant needed for the distribution. Do this * incrementally for a distribution that has n items now but used to have st * items. Use the zipfian constant theta. Remember the new value of n so * that if we change the itemcount, we'll know to recompute zeta. * * @param st * The number of items used to compute the last initialsum * @param n * The number of items to compute zeta over. * @param theta * The zipfian constant. * @param initialsum * The value of zeta we are computing incrementally from. */ double zeta(long st, long n, double theta, double initialsum) { countforzeta = n; return zetastatic(st, n, theta, initialsum); } /** * Compute the zeta constant needed for the distribution. Do this * incrementally for a distribution that has n items now but used to have st * items. Use the zipfian constant theta. Remember the new value of n so * that if we change the itemcount, we'll know to recompute zeta. * * @param st * The number of items used to compute the last initialsum * @param n * The number of items to compute zeta over. * @param theta * The zipfian constant. * @param initialsum * The value of zeta we are computing incrementally from. */ static double zetastatic(long st, long n, double theta, double initialsum) { double sum = initialsum; for (long i = st; i < n; i++) { sum += 1 / (Math.pow(i + 1, theta)); } // System.out.println("countforzeta="+countforzeta); return sum; } /****************************************************************************************/ /** * Generate the next item. this distribution will be skewed toward lower * integers; e.g. 0 will be the most popular, 1 the next most popular, etc. * * @param itemcount * The number of items in the distribution. * @return The next item in the sequence. */ public int nextInt(int itemcount) { return (int) nextLong(itemcount); } /** * Generate the next item as a long. * * @param itemcount * The number of items in the distribution. * @return The next item in the sequence. */ public long nextLong(long itemcount) { // from "Quickly Generating Billion-Record Synthetic Databases", Jim // Gray et al, SIGMOD 1994 if (itemcount != countforzeta) { // have to recompute zetan and eta, since they depend on itemcount synchronized (this) { if (itemcount > countforzeta) { // System.err.println("WARNING: Incrementally recomputing Zipfian distribtion. (itemcount="+itemcount+" countforzeta="+countforzeta+")"); // we have added more items. can compute zetan // incrementally, which is cheaper zetan = zeta(countforzeta, itemcount, theta, zetan); eta = (1 - Math.pow(2.0 / items, 1 - theta)) / (1 - zeta2theta / zetan); } else if ((itemcount < countforzeta) && (allowitemcountdecrease)) { // have to start over with zetan // note : for large itemsets, this is very slow. so don't do // it! // TODO: can also have a negative incremental computation, // e.g. if you decrease the number of items, then just // subtract // the zeta sequence terms for the items that went away. // This would be faster than recomputing from scratch when // the number of items // decreases System.err .println("WARNING: Recomputing Zipfian distribtion. This is slow and should be avoided. (itemcount=" + itemcount + " countforzeta=" + countforzeta + ")"); zetan = zeta(itemcount, theta); eta = (1 - Math.pow(2.0 / items, 1 - theta)) / (1 - zeta2theta / zetan); } } } double u = random.nextDouble(); double uz = u * zetan; if (uz < 1.0) { return 0; } if (uz < 1.0 + Math.pow(0.5, theta)) { return 1; } long ret = base + (long) ((itemcount) * Math.pow(eta * u - eta + 1, alpha)); setLastInt((int) ret); return ret; } /** * Return the next value, skewed by the Zipfian distribution. The 0th item * will be the most popular, followed by the 1st, followed by the 2nd, etc. * (Or, if min != 0, the min-th item is the most popular, the min+1th item * the next most popular, etc.) If you want the popular items scattered * throughout the item space, use ScrambledZipfianGenerator instead. */ @Override public int nextInt() { return (int) nextLong(items); } /** * Return the next value, skewed by the Zipfian distribution. The 0th item * will be the most popular, followed by the 1st, followed by the 2nd, etc. * (Or, if min != 0, the min-th item is the most popular, the min+1th item * the next most popular, etc.) If you want the popular items scattered * throughout the item space, use ScrambledZipfianGenerator instead. */ public long nextLong() { return nextLong(items); } public static void main(String[] args) { new ZipfianGenerator(ScrambledZipfianGenerator.ITEM_COUNT); } }