package org.apache.cassandra.stress.generatedata; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import org.apache.commons.math3.distribution.EnumeratedDistribution; import org.apache.commons.math3.util.Pair; import static com.google.common.base.Charsets.UTF_8; public class DataGenStringDictionary extends DataGen { private final byte space = ' '; private final EnumeratedDistribution<byte[]> words; public DataGenStringDictionary(EnumeratedDistribution<byte[]> wordDistribution) { words = wordDistribution; } @Override public void generate(ByteBuffer fill, long index) { fill(fill, 0); } @Override public void generate(List<ByteBuffer> fills, long index) { for (int i = 0 ; i < fills.size() ; i++) fill(fills.get(0), i); } private void fill(ByteBuffer fill, int column) { fill.clear(); byte[] trg = fill.array(); int i = 0; while (i < trg.length) { if (i > 0) trg[i++] = space; byte[] src = words.sample(); System.arraycopy(src, 0, trg, i, Math.min(src.length, trg.length - i)); i += src.length; } } @Override public boolean isDeterministic() { return true; } public static DataGenFactory getFactory(File file) throws IOException { final List<Pair<byte[], Double>> words = new ArrayList<>(); try (final BufferedReader reader = new BufferedReader(new FileReader(file))) { String line; while ( null != (line = reader.readLine()) ) { String[] pair = line.split(" +"); if (pair.length != 2) throw new IllegalArgumentException("Invalid record in dictionary: \"" + line + "\""); words.add(new Pair<>(pair[1].getBytes(UTF_8), Double.parseDouble(pair[0]))); } final EnumeratedDistribution<byte[]> dist = new EnumeratedDistribution<byte[]>(words); return new DataGenFactory() { @Override public DataGen get() { return new DataGenStringDictionary(dist); } }; } } }