package ch.unibe.scg.cells.benchmarks;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.NumberFormat;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import ch.unibe.scg.cells.Cell;
import ch.unibe.scg.cells.Cells;
import ch.unibe.scg.cells.Codec;
import ch.unibe.scg.cells.InMemoryPipeline;
import ch.unibe.scg.cells.LocalExecutionModule;
import ch.unibe.scg.cells.Mapper;
import ch.unibe.scg.cells.OneShotIterable;
import ch.unibe.scg.cells.Pipeline;
import ch.unibe.scg.cells.Sink;
import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.io.CharStreams;
import com.google.common.primitives.Ints;
import com.google.inject.Guice;
import com.google.protobuf.ByteString;
/**
* Benchmarks cells performance on a local machine with wordcount problem.
* Input folder can be specified via command line argument.
*/
public class CellsInMemoryWordCountBenchmark {
private final static int TIMES = 50;
final static class WordCount {
final String word;
final String fileName;
int count;
WordCount(String word, String fileName, int count) {
this.count = count;
this.word = word;
this.fileName = fileName;
}
@Override
public String toString() {
return word + ": " + count;
}
}
final static class WordCountCodec implements Codec<WordCount> {
private static final long serialVersionUID = 1L;
@Override
public Cell<WordCount> encode(WordCount s) {
return Cell.make(ByteString.copyFromUtf8(s.word),
ByteString.copyFromUtf8(s.fileName),
ByteString.copyFrom(Ints.toByteArray(s.count)));
}
@Override
public WordCount decode(Cell<WordCount> encoded) throws IOException {
return new WordCount(encoded.getRowKey().toStringUtf8(),
encoded.getColumnKey().toStringUtf8(),
Ints.fromByteArray(encoded.getCellContents().toByteArray()));
}
}
final static class FileContent {
final String fileName;
final String content;
FileContent(String fileName, String content) {
this.fileName = fileName;
this.content = content;
}
}
final static class FileContentCodec implements Codec<FileContent> {
private static final long serialVersionUID = 1L;
private static ByteString colKey = ByteString.copyFromUtf8("c");
@Override
public Cell<FileContent> encode(FileContent b) {
return Cell.make(ByteString.copyFromUtf8(b.fileName),
colKey,
ByteString.copyFromUtf8(b.content));
}
@Override
public FileContent decode(Cell<FileContent> encoded) throws IOException {
return new FileContent(encoded.getRowKey().toStringUtf8(),
encoded.getCellContents().toStringUtf8());
}
}
final static class WordParser implements Mapper<FileContent, WordCount> {
private static final long serialVersionUID = 1L;
@Override
public void close() throws IOException { }
@Override
public void map(FileContent first, OneShotIterable<FileContent> row, Sink<WordCount> sink)
throws IOException, InterruptedException {
Map<String, WordCount> dictionary = new HashMap<>();
for (FileContent file : row) {
for (String word: file.content.split("\\s+")) {
if (!word.isEmpty()) {
if (!dictionary.containsKey(word)) {
dictionary.put(word, new WordCount(word, file.fileName, 0));
}
dictionary.get(word).count++;
}
}
}
for (WordCount wc : dictionary.values()) {
sink.write(wc);
}
}
}
final static class WordCounter implements Mapper<WordCount, WordCount> {
private static final long serialVersionUID = 1L;
@Override
public void close() throws IOException { }
@Override
public void map(WordCount first, OneShotIterable<WordCount> row, Sink<WordCount> sink)
throws IOException, InterruptedException {
int count = 0;
for (WordCount wc : row) {
count += wc.count;
}
sink.write(new WordCount(first.word, first.fileName, count));
}
}
/**
* Runs a wordcount benchmark. You can specify input folder with first argument.
* The default input folder is "benchmarks/data"
*/
public static void main(String[] args) throws IOException, InterruptedException {
String input = "benchmarks/data";
if (args.length > 0) {
input = args[0];
}
double[] timings = new double[TIMES];
NumberFormat f = NumberFormat.getInstance();
f.setMaximumFractionDigits(2);
for (int i = 0; i < TIMES; i++) {
long startTime = System.nanoTime();
try (InMemoryPipeline<FileContent, WordCount> pipe
= Guice.createInjector(new LocalExecutionModule()).getInstance(InMemoryPipeline.Builder.class)
.make(Cells.shard(Cells.encode(readFilesFromDisk(input), new FileContentCodec())))) {
run(pipe);
long total = 0;
for (Iterable<WordCount> wcs : pipe.lastEfflux()) {
total += Iterables.size(wcs);
}
timings[i] = (System.nanoTime() - startTime) / 1_000_000_000.0;
System.out.println(f.format(timings[i]));
System.out.println("Total words: " + total);
}
}
System.out.println("--------------");
System.out.println(String.format("median: %s", f.format(median(timings))));
System.out.println(String.format("min: %s", f.format(min(timings))));
}
static void run(Pipeline<FileContent, WordCount> pipe) throws IOException, InterruptedException {
pipe.influx(new FileContentCodec())
.map(new WordParser())
.shuffle(new WordCountCodec())
.mapAndEfflux(new WordCounter(), new WordCountCodec());
}
static Iterable<FileContent> readFilesFromDisk(String path) {
final ImmutableList.Builder<FileContent> ret = ImmutableList.builder();
for (File f : new File(path).listFiles()) {
try {
ret.add(new FileContent(f.getName(),
CharStreams.toString(new InputStreamReader(new FileInputStream(f), Charsets.UTF_8))));
} catch (IOException e) {
e.printStackTrace();
}
}
return ret.build();
}
private static double median(double[] d) {
if (d == null || d.length == 0) {
throw new IllegalArgumentException("Median of 0 elements is undefined.");
}
double[] copy = Arrays.copyOf(d, d.length);
Arrays.sort(copy);
return copy[copy.length / 2];
}
static double min(double[] d) {
if (d == null || d.length == 0) {
throw new IllegalArgumentException("Min of 0 elements is undefined.");
}
double min = d[0];
for (int i = 1; i < d.length; i++) {
if (d[i] < min) {
min = d[i];
}
}
return min;
}
}