package ch.unibe.scg.cells.hadoop;
import static java.lang.annotation.ElementType.FIELD;
import static java.lang.annotation.ElementType.METHOD;
import static java.lang.annotation.ElementType.PARAMETER;
import static java.lang.annotation.RetentionPolicy.RUNTIME;
import static org.hamcrest.core.Is.is;
import static org.junit.Assert.assertThat;
import java.io.IOException;
import java.io.InputStreamReader;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
import java.nio.ByteBuffer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.inject.Qualifier;
import org.apache.hadoop.conf.Configuration;
import org.junit.Test;
import ch.unibe.scg.cells.Cell;
import ch.unibe.scg.cells.Cells;
import ch.unibe.scg.cells.CellsModule;
import ch.unibe.scg.cells.Codec;
import ch.unibe.scg.cells.InMemoryPipeline;
import ch.unibe.scg.cells.LocalExecutionModule;
import ch.unibe.scg.cells.Mapper;
import ch.unibe.scg.cells.OneShotIterable;
import ch.unibe.scg.cells.Pipeline;
import ch.unibe.scg.cells.Sink;
import ch.unibe.scg.cells.Source;
import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.io.CharStreams;
import com.google.common.primitives.Ints;
import com.google.common.primitives.Longs;
import com.google.inject.Guice;
import com.google.inject.Injector;
import com.google.inject.Key;
import com.google.inject.Module;
import com.google.inject.TypeLiteral;
import com.google.protobuf.ByteString;
@SuppressWarnings("javadoc")
public final class HadoopPipelineTest {
/** Column family used for every temporary HBase table created by this test. */
private final static ByteString FAMILY = ByteString.copyFromUtf8("f");
/** One act of the play: its ordinal position and its full text. */
static class Act {
	final int number;
	final String content;

	Act(int n, String text) {
		number = n;
		content = text;
	}
}
/**
 * Serializes an {@link Act}: row key is the big-endian act number,
 * column is the fixed tag {@code "t"}, cell contents are the UTF-8 text.
 */
static class ActCodec implements Codec<Act> {
	private static final long serialVersionUID = 1L;

	@Override
	public Cell<Act> encode(Act act) {
		ByteString row = ByteString.copyFrom(Ints.toByteArray(act.number));
		ByteString col = ByteString.copyFromUtf8("t");
		ByteString contents = ByteString.copyFromUtf8(act.content);
		return Cell.make(row, col, contents);
	}

	@Override
	public Act decode(Cell<Act> encoded) throws IOException {
		int number = Ints.fromByteArray(encoded.getRowKey().toByteArray());
		String content = encoded.getCellContents().toStringUtf8();
		return new Act(number, content);
	}
}
/** A single word occurrence: which act it appears in, and its character offset there. */
private static class Word {
	final String word;
	final int act;
	final int pos;

	Word(String w, int actNumber, int position) {
		word = w;
		act = actNumber;
		pos = position;
	}
}
/**
 * Serializes a {@link Word}: row key is the word itself, the column key packs
 * (act, pos) as two big-endian ints, and the cell contents are empty.
 */
private static class WordCodec implements Codec<Word> {
	private static final long serialVersionUID = 1L;

	@Override
	public Cell<Word> encode(Word w) {
		ByteBuffer packed = ByteBuffer.allocate(2 * Ints.BYTES);
		packed.putInt(w.act).putInt(w.pos);
		// Rewind so copyFrom sees the 8 bytes just written (buffer is filled to capacity).
		packed.flip();
		return Cell.make(
				ByteString.copyFromUtf8(w.word),
				ByteString.copyFrom(packed),
				ByteString.EMPTY);
	}

	@Override
	public Word decode(Cell<Word> encoded) throws IOException {
		ByteBuffer packed = encoded.getColumnKey().asReadOnlyByteBuffer();
		// Java evaluates arguments left to right: act first, then pos.
		return new Word(encoded.getRowKey().toStringUtf8(), packed.getInt(), packed.getInt());
	}
}
/** A word paired with the number of times it occurred. */
private static class WordCount {
	final String word;
	final long count;

	WordCount(String word, long count) {
		this.word = word;
		this.count = count;
	}

	/** Formats as {@code "<word> <count>"}, e.g. {@code "your 239"}. */
	@Override
	public String toString() {
		return new StringBuilder(word).append(' ').append(count).toString();
	}
}
/**
 * Serializes a {@link WordCount}: row key is the word, column is the fixed tag
 * {@code "1"}, cell contents are the big-endian count.
 */
private static class WordCountCodec implements Codec<WordCount> {
	private static final long serialVersionUID = 1L;

	@Override
	public Cell<WordCount> encode(WordCount wc) {
		ByteString row = ByteString.copyFromUtf8(wc.word);
		ByteString col = ByteString.copyFromUtf8("1");
		ByteString contents = ByteString.copyFrom(Longs.toByteArray(wc.count));
		return Cell.make(row, col, contents);
	}

	@Override
	public WordCount decode(Cell<WordCount> encoded) throws IOException {
		String word = encoded.getRowKey().toStringUtf8();
		long count = Longs.fromByteArray(encoded.getCellContents().toByteArray());
		return new WordCount(word, count);
	}
}
/** Splits each act's text into words, emitting one {@link Word} per regex match. */
private static class WordParseMapper implements Mapper<Act, Word> {
	private static final long serialVersionUID = 1L;
	// Compiled once: the original compiled the regex inside the per-act loop,
	// re-parsing it for every element of every row. Pattern is thread-safe,
	// and static fields are not serialized, so this is safe for a Mapper.
	private static final Pattern WORD = Pattern.compile("\\w+");

	@Override
	public void close() { }

	@Override
	public void map(Act first, OneShotIterable<Act> row, Sink<Word> sink) throws IOException,
			InterruptedException {
		for (Act act : row) {
			Matcher matcher = WORD.matcher(act.content);
			while (matcher.find()) {
				// All words in this row are attributed to `first`'s act number,
				// as in the original: the row is keyed by act number anyway.
				sink.write(new Word(matcher.group(), first.number, matcher.start()));
			}
		}
	}
}
/** Collapses a row of identical words into a single {@link WordCount} with their total. */
private static class WordAdderMapper implements Mapper<Word, WordCount> {
	private static final long serialVersionUID = 1L;

	@Override
	public void close() { }

	@Override
	public void map(Word first, OneShotIterable<Word> row, Sink<WordCount> sink)
			throws IOException, InterruptedException {
		// The row groups all occurrences of one word, so its size is the word's count.
		long occurrences = Iterables.size(row);
		sink.write(new WordCount(first.word, occurrences));
	}
}
/** Passes through only counts for the word "your"; everything else is dropped. */
private static class WordFilter implements Mapper<WordCount, WordCount> {
	private static final long serialVersionUID = 1L;

	@Override
	public void close() throws IOException { }

	@Override
	public void map(WordCount first, OneShotIterable<WordCount> row, Sink<WordCount> sink) throws IOException,
			InterruptedException {
		for (WordCount candidate : row) {
			if (!candidate.word.equals("your")) {
				continue; // not the word under test
			}
			sink.write(candidate);
		}
	}
}
@Test
public void testHadoopWordcount() throws IOException, InterruptedException {
// End-to-end word count over real HBase-backed tables; cluster config comes from UnibeModule.
TableAdmin tableAdmin = Guice.createInjector(new UnibeModule()).getInstance(TableAdmin.class);
// Temporary tables are dropped on close, so a failing run leaves no residue behind.
try (Table<Act> in = tableAdmin.createTemporaryTable(FAMILY);
Table<WordCount> eff = tableAdmin.createTemporaryTable(FAMILY)) {
// Bind the influx table (Acts, qualified @In) and efflux table (WordCounts, qualified @Eff).
Module tab = new CellsModule() {
@Override
protected void configure() {
installTable(
In.class,
new TypeLiteral<Act>() {},
ActCodec.class,
new HBaseStorage(), new HBaseTableModule<>(in));
installTable(
Eff.class,
new TypeLiteral<WordCount>() {},
WordCountCodec.class,
new HBaseStorage(), new HBaseTableModule<>(eff));
}
};
Injector injector = Guice.createInjector(tab, new UnibeModule());
// Load the play into the input table; the sink must be closed to flush before the run.
try (Sink<Act> sink = injector.getInstance(Key.get(new TypeLiteral<Sink<Act>>() {}, In.class))) {
Iterable<Act> acts = readActsFromDisk();
for (Act act : acts) {
sink.write(act);
}
}
run(HadoopPipeline.fromTableToTable(injector.getInstance(Configuration.class), in, eff));
// WordFilter keeps only "your", so the efflux should hold exactly one cell.
try (Source<WordCount> src = injector.getInstance(Key.get(new TypeLiteral<Source<WordCount>>() {}, Eff.class))) {
WordCount wc = Iterables.getOnlyElement(Iterables.getOnlyElement(src));
assertThat(wc.word, is("your"));
// 239 is the known number of occurrences of "your" in Richard III.
assertThat(wc.count, is(239L));
}
}
}
/** Same word-count pipeline as the Hadoop test, but run entirely in memory. */
@Test
public void testInMemoryWordCount() throws IOException, InterruptedException {
	try (InMemoryPipeline<Act, WordCount> pipe
			= Guice.createInjector(new LocalExecutionModule())
				.getInstance(InMemoryPipeline.Builder.class)
				.make(Cells.shard(Cells.encode(readActsFromDisk(), new ActCodec())))) {
		run(pipe);

		// Scan all partitions of the efflux for the single surviving word.
		long cnt = -1;
		for (Iterable<WordCount> wcs : pipe.lastEfflux()) {
			for (WordCount wc : wcs) {
				if (wc.word.equals("your")) {
					cnt = wc.count;
				}
			}
		}
		// Assert once, after the full scan. The original asserted inside the
		// partition loop, failing spuriously when the first partition did not
		// contain "your" — and never asserting at all on an empty efflux.
		assertThat(cnt, is(239L));
	}
}
/**
 * Runs the shared three-stage word count on the given pipeline:
 * parse acts into words, count occurrences per word, then keep only "your".
 * Shared by the Hadoop-backed and in-memory tests so both exercise identical logic.
 */
void run(Pipeline<Act, WordCount> pipeline) throws IOException, InterruptedException {
pipeline.influx(new ActCodec())
.map(new WordParseMapper())
.shuffle(new WordCodec())
.map(new WordAdderMapper())
.shuffle(new WordCountCodec())
.mapAndEfflux(new WordFilter(), new WordCountCodec());
}
/** Guice binding qualifier marking the pipeline's input (influx) table. */
@Qualifier
@Target({ FIELD, PARAMETER, METHOD })
@Retention(RUNTIME)
public static @interface In {}
/** Guice binding qualifier marking the pipeline's output (efflux) table. */
@Qualifier
@Target({ FIELD, PARAMETER, METHOD })
@Retention(RUNTIME)
public static @interface Eff {}
/**
 * Reads Richard III from the classpath and splits it into acts.
 * Everything before the first "ACT I" marker (title, dramatis personae, …)
 * becomes act 0; each subsequent act keeps its ordinal.
 *
 * @throws IOException if the resource cannot be read.
 */
static Iterable<Act> readActsFromDisk() throws IOException {
	ImmutableList.Builder<Act> ret = ImmutableList.builder();
	String richard;
	// try-with-resources: the original never closed this reader (CharStreams.toString
	// does not close its argument), leaking the resource stream on every call.
	try (InputStreamReader in = new InputStreamReader(
			HadoopPipelineTest.class.getResourceAsStream("richard-iii.txt"), Charsets.UTF_8)) {
		richard = CharStreams.toString(in);
	}
	String[] actStrings = richard.split("\\bACT\\s[IVX]+");
	for (int i = 0; i < actStrings.length; i++) {
		ret.add(new Act(i, actStrings[i]));
	}
	return ret.build();
}
}