package ch.unibe.scg.cells.hadoop; import static org.hamcrest.core.Is.is; import static org.junit.Assert.assertThat; import java.io.IOException; import java.nio.ByteBuffer; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.hadoop.conf.Configuration; import org.junit.Before; import org.junit.Test; import ch.unibe.scg.cells.Cell; import ch.unibe.scg.cells.Cells; import ch.unibe.scg.cells.CellsModule; import ch.unibe.scg.cells.Codec; import ch.unibe.scg.cells.InMemoryPipeline; import ch.unibe.scg.cells.LocalExecutionModule; import ch.unibe.scg.cells.Mapper; import ch.unibe.scg.cells.OneShotIterable; import ch.unibe.scg.cells.Pipeline; import ch.unibe.scg.cells.Sink; import ch.unibe.scg.cells.Source; import ch.unibe.scg.cells.hadoop.HadoopPipelineTest.Act; import ch.unibe.scg.cells.hadoop.HadoopPipelineTest.ActCodec; import ch.unibe.scg.cells.hadoop.HadoopPipelineTest.In; import com.google.common.collect.Iterables; import com.google.common.primitives.Ints; import com.google.common.primitives.Longs; import com.google.inject.Guice; import com.google.inject.Injector; import com.google.inject.Key; import com.google.inject.Module; import com.google.inject.TypeLiteral; import com.google.protobuf.ByteString; @SuppressWarnings("javadoc") public final class ScgDemo { final private ByteString FAMILY = ByteString.copyFromUtf8("f"); /** Generates data before executing tests. */ @Before public void createRichard() throws IOException, InterruptedException { final String tableName = "richard-iii"; Module tab = new CellsModule() { @Override protected void configure() { installTable( In.class, new TypeLiteral<Act>() {}, ActCodec.class, new HBaseStorage(), new HBaseTableModule<>(tableName, FAMILY)); } }; Injector i = Guice.createInjector(new UnibeModule(), tab); i.getInstance(TableAdmin.class).deleteTable("richard-iii"); i.getInstance(TableAdmin.class).createTable("richard-iii", FAMILY); try (Sink<Act> s = i.getInstance(Key.get(new TypeLiteral<Sink<Act>>() {}, In.class))) { for (Act act : HadoopPipelineTest.readActsFromDisk()) { s.write(act); } } } static class Word { final String word; final int act; final int pos; Word(String word, int act, int pos) { this.word = word; this.act = act; this.pos = pos; } } static class WordCodec implements Codec<Word> { private static final long serialVersionUID = 1L; @Override public Cell<Word> encode(Word s) { ByteBuffer col = ByteBuffer.allocate(2 * Ints.BYTES); col.putInt(s.act); col.putInt(s.pos); col.rewind(); return Cell.make(ByteString.copyFromUtf8(s.word), ByteString.copyFrom(col), ByteString.EMPTY); } @Override public Word decode(Cell<Word> encoded) throws IOException { ByteBuffer col = encoded.getColumnKey().asReadOnlyByteBuffer(); int act = col.getInt(); int pos = col.getInt(); return new Word(encoded.getRowKey().toStringUtf8(), act, pos); } } static class WordParser implements Mapper<Act, Word> { private static final long serialVersionUID = 1L; @Override public void close() throws IOException { } @Override public void map(Act first, OneShotIterable<Act> row, Sink<Word> sink) throws IOException, InterruptedException { for (Act act : row) { Matcher matcher = Pattern.compile("\\w+").matcher(act.content); while (matcher.find()) { sink.write(new Word(matcher.group(), first.number, matcher.start())); } } } } static class WordCounter implements Mapper<Word, WordCount> { private static final long serialVersionUID = 1L; @Override public void close() throws IOException { } @Override // TODO: think of better name for first?! public void map(Word first, OneShotIterable<Word> row, Sink<WordCount> sink) throws IOException, InterruptedException { long count = Iterables.size(row); sink.write(new WordCount(count, first.word)); } } static class WordCount { final long count; final String word; WordCount(long count, String word) { this.count = count; this.word = word; } @Override public String toString() { return word + " " + count; } } static class WordCountCodec implements Codec<WordCount> { private static final long serialVersionUID = 1L; @Override public Cell<WordCount> encode(WordCount s) { return Cell.make(ByteString.copyFromUtf8(s.word), ByteString.copyFrom(Longs.toByteArray(s.count)), ByteString.EMPTY); } @Override public WordCount decode(Cell<WordCount> encoded) throws IOException { return new WordCount(Longs.fromByteArray(encoded.getColumnKey().toByteArray()), encoded.getRowKey().toStringUtf8()); } } @Test public void runInHadoop() throws IOException, InterruptedException { Injector i = Guice.createInjector(new UnibeModule()); TableAdmin tableAdmin = i.getInstance(TableAdmin.class); long cnt = -1L; try (Table<Act> in = tableAdmin.existing("richard-iii", ByteString.copyFromUtf8("f")); Table<WordCount> eff = tableAdmin.createTemporaryTable(ByteString.copyFromUtf8("f"))) { HadoopPipeline<Act, WordCount> pipe = HadoopPipeline.fromTableToTable(i.getInstance(Configuration.class), in, eff); run(pipe); for (Iterable<WordCount> wcs : Cells.decodeSource(eff.asCellSource(), new WordCountCodec())) { for (WordCount wc : wcs) { if (wc.word.equals("your")) { cnt = wc.count; } } } } assertThat(cnt, is(239L)); } @Test public void countWords() throws IOException, InterruptedException { Injector i = Guice.createInjector(new UnibeModule(), new LocalExecutionModule()); TableAdmin tableAdmin = i.getInstance(TableAdmin.class); try (Table<Act> in = tableAdmin.existing("richard-iii", ByteString.copyFromUtf8("f")); InMemoryPipeline<Act, WordCount> pipe = i.getInstance(InMemoryPipeline.Builder.class).make(in.asCellSource())) { run(pipe); long cnt = -1L; try (Source<WordCount> rows = pipe.lastEfflux()) { for (Iterable<WordCount> wcs : rows) { for (WordCount wc : wcs) { if (wc.word.equals("your")) { cnt = wc.count; } } } } assertThat(cnt, is(239L)); } } void run(Pipeline<Act, WordCount> pipe) throws IOException, InterruptedException { pipe.influx(new ActCodec()) .map(new WordParser()) .shuffle(new WordCodec()) .mapAndEfflux(new WordCounter(), new WordCountCodec()); } }