/* * Copyright 2012, Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.facebook.LinkBench.generators; import java.util.Properties; import java.util.Random; import com.facebook.LinkBench.Config; import com.facebook.LinkBench.ConfigUtil; import com.facebook.LinkBench.LinkBenchConfigError; /** * A simple data generator where the same sequences of bytes, or "motifs" occur * multiple times. This is designed to emulate one particular property of real * data that is exploited by compression algorithms. Typically a short sequence * of data generated by this generator will not be very compressible on its own, * as no motifs will recur, but if multiple output strings are concatenated * together then the same motifs will recur repeatedly and the data will be * compressible. * * The motif data generator has a buffer of "shared" motifs, which reoccur * frequently in the output of the generator * * The data generator generates bytes from within the range of values [min, max). * There is an additional parameter, which is called uniqueness for lack of a * better name. The generator fills a buffer with data in chunks. A chunk * is either generated as random new bytes, or is drawn from the "motifs", * * The uniqueness parameter controls the proportion of new chunks versus duplicated * motifs. It is a probability between 0.0 and 1.0. It can also be seen as the expected * percentage of bytes are generated from scratch. * * Control how often motifs appear in data * uniqueness = 0.0: all data drawn from motifs * uniqueness 1.0: completely independent bytes */ public class MotifDataGenerator implements DataGenerator { private static final int MAX_CHUNK_SIZE = 128; public static final int DEFAULT_MOTIF_BUFFER_SIZE = 512; /** Lowest byte to appear in output */ private int start; /** Number of distinct bytes to appear in output */ private int range; /** percentage of data drawn from motifs */ private double uniqueness; /** * Buffer with a sequence of random bytes that are * pasted into output. Starts off null, initialized * on demand. */ private byte motifs[]; /** Size of motif buffer */ private int motifBytes; public MotifDataGenerator() { start = '\0'; range = 1; uniqueness = 0.0; } /** * Generate characters from start to end (inclusive both ends) * @param start * @param end */ public void init(int start, int end, double uniqueness) { init(start, end, uniqueness, DEFAULT_MOTIF_BUFFER_SIZE); } public void init(int start, int end, double uniqueness, int motifBytes) { if (start < 0 || start >= 256) { throw new LinkBenchConfigError("start " + start + " out of range [0,255]"); } if (end < 0 || end >= 256) { throw new LinkBenchConfigError("endbyte " + end + " out of range [0,255]"); } if (start >= end) { throw new LinkBenchConfigError("startByte " + start + " >= endByte " + end); } this.start = (byte)start; this.range = end - start + 1; this.uniqueness = uniqueness; this.motifBytes = motifBytes; this.motifs = null; } @Override public void init(Properties props, String keyPrefix) { int startByte = ConfigUtil.getInt(props, keyPrefix + Config.UNIFORM_GEN_STARTBYTE); int endByte = ConfigUtil.getInt(props, keyPrefix + Config.UNIFORM_GEN_ENDBYTE); double uniqueness = ConfigUtil.getDouble(props, keyPrefix + Config.MOTIF_GEN_UNIQUENESS); if (props.contains(keyPrefix + Config.MOTIF_GEN_LENGTH)) { int motifBytes = ConfigUtil.getInt(props, keyPrefix + Config.MOTIF_GEN_LENGTH); init(startByte, endByte, uniqueness, motifBytes); } else { init(startByte, endByte, uniqueness); } } /** * Give an upper bound for the compression ratio for the algorithm * @return number between 0.0 and 1.0 - 0.0 is perfectly compressible, * 1.0 is incompressible */ public double estMaxCompression() { // Avg bytes required to represent each character (uniformly distributed) double charCompression = range / (double) 255; // random data shouldn't have any inter-character correlations that can // be compressed. Upper bound derived by assuming motif is completely // compressible return charCompression * uniqueness; } @Override public byte[] fill(Random rng, byte[] data) { // Fill motifs now so that we can use rng if (motifs == null) { motifs = new byte[motifBytes]; for (int i = 0; i < motifs.length; i++) { motifs[i] = (byte) (start + rng.nextInt(range)); } } int n = data.length; int chunk = Math.min(MAX_CHUNK_SIZE, motifBytes); for (int i = 0; i < n; i += chunk) { if (rng.nextDouble() < uniqueness) { int chunkEnd = Math.min(n, i + chunk); // New sequence of unique bytes for (int j = i; j < chunkEnd; j++) { data[j] = (byte) (start + rng.nextInt(range)); } } else { int thisChunk = Math.min(chunk, n - i); int k = rng.nextInt(motifBytes - thisChunk + 1); // Copy previous sequence of bytes System.arraycopy(motifs, k, data, i, thisChunk); } } return data; } }