/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.zebra.mapreduce;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.hadoop.zebra.tfile.RandomDistribution.Binomial;
import org.apache.hadoop.zebra.tfile.RandomDistribution.DiscreteRNG;
import org.apache.hadoop.zebra.tfile.RandomDistribution.Zipf;
/**
 * A dictionary of randomly generated lowercase words. Word lengths are drawn
 * from a Binomial distribution and words are handed out by {@link #nextWord()}
 * with frequencies drawn from a Zipf distribution. The dictionary also keeps a
 * per-word count of how often each word has been handed out.
 */
class Dictionary {
  /** Success probability passed to the Binomial word-length generator. */
  private static final double BINOMIAL_P = 0.3;
  /**
   * Exponent shared by the {@code lead} computation and the Zipf generator;
   * both must use the same value for the frequency-ratio formula to hold.
   */
  private static final double SIGMA = 1.1;

  /** Offset of the Zipf range; subtracted from samples to get array indices. */
  private final int lead;
  /** Generator used to pick which word is returned next. */
  private final Zipf zipf;
  /** The distinct words of this dictionary. */
  private final String[] dict;
  /** {@code wordCnts[i]} is how many times {@code dict[i]} has been returned. */
  private final long[] wordCnts;

  /**
   * Build one random word made of the letters 'a'..'z'.
   *
   * @param rng
   *          Supplies the word length.
   * @param random
   *          Supplies the individual letters.
   * @return The generated word; its length is the value drawn from
   *         {@code rng}.
   */
  private static String makeWord(DiscreteRNG rng, Random random) {
    int len = rng.nextInt();
    StringBuilder sb = new StringBuilder(len);
    for (int i = 0; i < len; ++i) {
      sb.append((char) ('a' + random.nextInt(26)));
    }
    return sb.toString();
  }

  /**
   * Constructor
   *
   * @param random
   *          Source of randomness shared by the word-length generator, the
   *          word-frequency generator and the letter selection.
   * @param entries
   *          How many words exist in the dictionary.
   * @param minWordLen
   *          Minimum word length.
   * @param maxWordLen
   *          Maximum word length.
   * @param freqRatio
   *          Expected ratio between the most frequent words and the least
   *          frequent words. (e.g. 100)
   */
  public Dictionary(Random random, int entries, int minWordLen, int maxWordLen,
      int freqRatio) {
    Binomial binomial = new Binomial(random, minWordLen, maxWordLen, BINOMIAL_P);
    // Shift the Zipf range by "lead" ranks; the shift is derived from
    // freqRatio and SIGMA and is clamped so it never goes negative.
    lead = Math.max(0,
        (int) (entries / (Math.exp(Math.log(freqRatio) / SIGMA) - 1)) - 1);
    // Must use the same exponent as the lead computation above.
    zipf = new Zipf(random, lead, entries + lead, SIGMA);
    dict = new String[entries];
    // Use a set to ensure no dup words in dictionary
    Set<String> seen = new HashSet<String>();
    for (int i = 0; i < entries; ++i) {
      // Retry until a word not generated before shows up. NOTE: this never
      // terminates if the generator cannot produce "entries" distinct words
      // (e.g. a very narrow word-length range with a large dictionary).
      String word;
      do {
        word = makeWord(binomial, random);
      } while (!seen.add(word)); // add() returns false for a duplicate
      dict[i] = word;
    }
    wordCnts = new long[dict.length];
  }

  /**
   * Get the next word from the dictionary and increment its usage count.
   *
   * @return The next word from the dictionary.
   */
  public String nextWord() {
    // Presumably zipf yields values in [lead, entries + lead); subtracting
    // lead maps them onto dict indices -- confirm against Zipf.
    int index = zipf.nextInt() - lead;
    ++wordCnts[index];
    return dict[index];
  }

  /**
   * Reset the usage count of every word to zero.
   */
  public void resetWordCnts() {
    for (int i = 0; i < wordCnts.length; ++i) {
      wordCnts[i] = 0;
    }
  }

  /**
   * Get the usage counts accumulated since construction or the last reset.
   *
   * @return A newly created map from word to the number of times it has been
   *         returned by {@link #nextWord()}; words that were never returned
   *         are omitted.
   */
  public Map<String, Long> getWordCounts() {
    Map<String, Long> ret = new HashMap<String, Long>();
    for (int i = 0; i < dict.length; ++i) {
      if (wordCnts[i] > 0) {
        ret.put(dict[i], wordCnts[i]);
      }
    }
    return ret;
  }
}