HadoopPipelineTest.java example

Explorer

CC-master
- src
  - ch
    - unibe
      - scg
        cc
        Annotations.java
        CCModule.java
        CannotBeHashedException.java
        CharsetDetector.java
        CharsetTest.java
        CloneExpander.java
        CloneExpanderTest.java
        CloneGroupClusterer.java
        CloneGroupClustererTest.java
        CloneGroupCodec.java
        Function2FineCloner.java
        Function2FineClonerTest.java
        Function2RoughCloner.java
        Function2RoughClonerTest.java
        Function2RoughClonesCodec.java
        FunctionStringCodec.java
        FunctionStringCodecTest.java
        GitInputFormat.java
        GitInputFormatTest.java
        GitPopulator.java
        GitPopulatorTest.java
        HadoopConfigurationTest.java
        Hasher.java
        Main.java
        Normalizer.java
        PipelineRunner.java
        PopularSnippetMaps.java
        PopularSnippetsCodec.java
        Populator.java
        PopulatorCodec.java
        ReplacerNormalizer.java
        ReplacerProvider.java
        ShingleHasher.java
        ShingleHasherTest.java
        Snippet2FunctionsCodec.java
        SnippetSimilarTest.java
        SpamDetector.java
        SpamDetectorTest.java
        StandardHasher.java
        Str.java
        Tokenizer.java
        Type2ReplacerFactory.java
        Type2ReplacerFactoryTest.java
        Utils.java
        javaFrontend
        JavaModule.java
        JavaTokenizer.java
        JavaTokenizerTest.java
        JavaType1ReplacerFactory.java
        JavaType1ReplacerFactoryTest.java
        NormalizerTest.java
        lines
        StringOfLines.java
        StringOfLinesFactory.java
        StringOfLinesTest.java
        regex
        Replace.java
        ReplaceTest.java
        ReplacementString.java
        ReplacementStringTest.java
        cells
        AdapterOneShotIterable.java
        Cell.java
        CellLookupTable.java
        CellSink.java
        CellSource.java
        Cells.java
        CellsModule.java
        CellsModuleTest.java
        Codec.java
        Counter.java
        CounterModule.java
        CounterRegistry.java
        EncodingException.java
        InMemoryPipeline.java
        InMemoryPipelineTest.java
        InMemoryShuffler.java
        InMemoryShufflerTest.java
        InMemorySource.java
        InMemorySourceTest.java
        InMemoryStorage.java
        LexicographicalComparator.java
        LocalCounter.java
        LocalCounterModule.java
        LocalCounterTest.java
        LocalExecutionModule.java
        LookupTable.java
        Mapper.java
        OfflineMapper.java
        OneShotIterable.java
        Pipeline.java
        PipelineStageScope.java
        PipelineStageScoped.java
        ShallowSerializingCopy.java
        Sink.java
        Source.java
        StorageModule.java
        TableModule.java
        benchmarks
        CellsHadoopSVMBenchmark.java
        CellsHadoopWordCountBenchmark.java
        CellsInMemorySVMBenchmark.java
        CellsInMemoryWordCountBenchmark.java
        HadoopBenchmark.java
        HadoopSVMBenchmark.java
        RawFileFormat.java
        SVM.java
        hadoop
        Annotations.java
        CellsTestSuite.java
        HBaseCellLookupTable.java
        HBaseCellSink.java
        HBaseCellSinkTest.java
        HBaseCellSource.java
        HBaseStorage.java
        HBaseTableModule.java
        HTableFactory.java
        HTableProvider.java
        HadoopCounter.java
        HadoopCounterModule.java
        HadoopCounterTest.java
        HadoopPipeline.java
        HadoopPipelineHDFSInputTest.java
        HadoopPipelineTest.java
        JUnitRunner.java
        ScgDemo.java
        SerializableHTable.java
        Table.java
        TableAdmin.java
        UnibeModule.java

package ch.unibe.scg.cells.hadoop;

import static java.lang.annotation.ElementType.FIELD;
import static java.lang.annotation.ElementType.METHOD;
import static java.lang.annotation.ElementType.PARAMETER;
import static java.lang.annotation.RetentionPolicy.RUNTIME;
import static org.hamcrest.core.Is.is;
import static org.junit.Assert.assertThat;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
import java.nio.ByteBuffer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.inject.Qualifier;

import org.apache.hadoop.conf.Configuration;
import org.junit.Test;

import ch.unibe.scg.cells.Cell;
import ch.unibe.scg.cells.Cells;
import ch.unibe.scg.cells.CellsModule;
import ch.unibe.scg.cells.Codec;
import ch.unibe.scg.cells.InMemoryPipeline;
import ch.unibe.scg.cells.LocalExecutionModule;
import ch.unibe.scg.cells.Mapper;
import ch.unibe.scg.cells.OneShotIterable;
import ch.unibe.scg.cells.Pipeline;
import ch.unibe.scg.cells.Sink;
import ch.unibe.scg.cells.Source;

import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.io.CharStreams;
import com.google.common.primitives.Ints;
import com.google.common.primitives.Longs;
import com.google.inject.Guice;
import com.google.inject.Injector;
import com.google.inject.Key;
import com.google.inject.Module;
import com.google.inject.TypeLiteral;
import com.google.protobuf.ByteString;

@SuppressWarnings("javadoc")
public final class HadoopPipelineTest {
	private final static ByteString FAMILY = ByteString.copyFromUtf8("f");

	/** One act of the play: its running number and its full text. Immutable. */
	static class Act {
		/** Number identifying this act. */
		final int number;
		/** Text of the act. */
		final String content;

		/**
		 * @param actNumber number identifying the act
		 * @param actText text of the act
		 */
		Act(int actNumber, String actText) {
			content = actText;
			number = actNumber;
		}
	}

	/**
	 * Codec for {@link Act}: the act number becomes the row key (big-endian int bytes),
	 * the column key is the constant "t", and the act text is the cell contents.
	 */
	static class ActCodec implements Codec<Act> {
		private static final long serialVersionUID = 1L;

		/** Turns an act into a cell keyed by the act's number. */
		@Override
		public Cell<Act> encode(Act act) {
			ByteString rowKey = ByteString.copyFrom(Ints.toByteArray(act.number));
			ByteString columnKey = ByteString.copyFromUtf8("t");
			ByteString contents = ByteString.copyFromUtf8(act.content);
			return Cell.make(rowKey, columnKey, contents);
		}

		/** Rebuilds an act from the cell's row key (number) and contents (text); the column key is ignored. */
		@Override
		public Act decode(Cell<Act> encoded) throws IOException {
			int number = Ints.fromByteArray(encoded.getRowKey().toByteArray());
			String content = encoded.getCellContents().toStringUtf8();
			return new Act(number, content);
		}
	}

	/** A single word occurrence: the word itself, the act it was found in, and its position there. */
	private static class Word {
		/** The matched word. */
		final String word;
		/** Number of the act the word was found in. */
		final int act;
		/** Position of the word within the act's text. */
		final int pos;

		/**
		 * @param text the matched word
		 * @param actNumber number of the act containing the word
		 * @param position position of the word within the act's text
		 */
		Word(String text, int actNumber, int position) {
			pos = position;
			act = actNumber;
			word = text;
		}
	}

	/**
	 * Codec for {@link Word}: the word is the row key, and the column key packs the act
	 * number followed by the position as two consecutive ints. Cell contents are empty.
	 */
	private static class WordCodec implements Codec<Word> {
		private static final long serialVersionUID = 1L;

		/** Encodes a word occurrence; act and position go into the column key. */
		@Override
		public Cell<Word> encode(Word s) {
			ByteBuffer packed = ByteBuffer.allocate(2 * Ints.BYTES);
			packed.putInt(s.act);
			packed.putInt(s.pos);
			// Make the bytes just written readable from the start of the buffer.
			packed.flip();

			ByteString rowKey = ByteString.copyFromUtf8(s.word);
			ByteString columnKey = ByteString.copyFrom(packed);
			return Cell.make(rowKey, columnKey, ByteString.EMPTY);
		}

		/** Decodes a word occurrence, reading act and position back in the order they were written. */
		@Override
		public Word decode(Cell<Word> encoded) throws IOException {
			ByteBuffer packed = encoded.getColumnKey().asReadOnlyByteBuffer();
			int actNumber = packed.getInt();
			int position = packed.getInt();
			String text = encoded.getRowKey().toStringUtf8();
			return new Word(text, actNumber, position);
		}
	}

	/** A word paired with a count. Immutable. */
	private static class WordCount {
		/** The counted word. */
		final String word;
		/** The count recorded for {@link #word}. */
		final long count;

		/**
		 * @param text the counted word
		 * @param total the count recorded for the word
		 */
		WordCount(String text, long total) {
			count = total;
			word = text;
		}

		/** Renders as the word, a single space, then the count. */
		@Override
		public String toString() {
			return new StringBuilder().append(word).append(' ').append(count).toString();
		}
	}

	/**
	 * Codec for {@link WordCount}: the word is the row key, the column key is the
	 * constant "1", and the count is stored as eight big-endian bytes in the cell contents.
	 */
	private static class WordCountCodec implements Codec<WordCount> {
		private static final long serialVersionUID = 1L;

		/** Encodes a word count into a cell keyed by the word. */
		@Override
		public Cell<WordCount> encode(WordCount s) {
			ByteString rowKey = ByteString.copyFromUtf8(s.word);
			ByteString columnKey = ByteString.copyFromUtf8("1");
			ByteString contents = ByteString.copyFrom(Longs.toByteArray(s.count));
			return Cell.make(rowKey, columnKey, contents);
		}

		/** Rebuilds a word count from the cell's row key (word) and contents (count). */
		@Override
		public WordCount decode(Cell<WordCount> encoded) throws IOException {
			String word = encoded.getRowKey().toStringUtf8();
			long count = Longs.fromByteArray(encoded.getCellContents().toByteArray());
			return new WordCount(word, count);
		}
	}

	/** Splits the text of each act in a row into words and emits one {@link Word} per match. */
	private static class WordParseMapper implements Mapper<Act, Word> {
		private static final long serialVersionUID = 1L;

		/**
		 * Matches one run of word characters. Compiled once and shared: {@link Pattern}
		 * instances are immutable and safe to reuse, and being static it is not part of
		 * the mapper's serialized state.
		 */
		private static final Pattern WORD_PATTERN = Pattern.compile("\\w+");

		/** Nothing to release. */
		@Override
		public void close() { }

		/**
		 * Writes a {@link Word} for every word found in the content of every act in {@code row}.
		 *
		 * @param first act whose number is attached to every emitted word
		 * @param row the acts whose content is scanned
		 * @param sink receives one {@link Word} per match, carrying the matched text,
		 *        {@code first.number}, and the match's start offset within the act content
		 */
		@Override
		public void map(Act first, OneShotIterable<Act> row, Sink<Word> sink) throws IOException,
				InterruptedException {
			for (Act act : row) {
				Matcher matcher = WORD_PATTERN.matcher(act.content);
				while (matcher.find()) {
					sink.write(new Word(matcher.group(), first.number, matcher.start()));
				}
			}
		}
	}

	/** Collapses a row of {@link Word}s into a single {@link WordCount} holding the row's size. */
	private static class WordAdderMapper implements Mapper<Word, WordCount> {
		private static final long serialVersionUID = 1L;

		/** Nothing to release. */
		@Override
		public void close() { }

		/**
		 * Counts the elements of {@code row} and writes one {@link WordCount} for
		 * {@code first.word} with that count.
		 */
		@Override
		public void map(Word first, OneShotIterable<Word> row, Sink<WordCount> sink)
				throws IOException, InterruptedException {
			// Consume the row before building the result.
			long occurrences = Iterables.size(row);
			WordCount total = new WordCount(first.word, occurrences);
			sink.write(total);
		}
	}

	/** Passes through only the counts for the word "your"; everything else is dropped. */
	private static class WordFilter implements Mapper<WordCount, WordCount> {
		private static final long serialVersionUID = 1L;

		/** Nothing to release. */
		@Override
		public void close() throws IOException { }

		/** Writes every element of {@code row} whose word equals "your" to {@code sink}. */
		@Override
		public void map(WordCount first, OneShotIterable<WordCount> row, Sink<WordCount> sink) throws IOException,
				InterruptedException {
			for (WordCount candidate : row) {
				if (!candidate.word.equals("your")) {
					continue;
				}
				sink.write(candidate);
			}
		}
	}

	/**
	 * Runs the word-count pipeline from one temporary table to another via
	 * {@link HadoopPipeline} and checks the count recorded for the word "your".
	 */
	@Test
	public void testHadoopWordcount() throws IOException, InterruptedException {
		TableAdmin tableAdmin = Guice.createInjector(new UnibeModule()).getInstance(TableAdmin.class);

		// Both tables are opened in try-with-resources, so they are closed when the test ends.
		try (Table<Act> in = tableAdmin.createTemporaryTable(FAMILY);
				Table<WordCount> eff = tableAdmin.createTemporaryTable(FAMILY)) {
			// Bind the input table under @In (acts) and the output table under @Eff (word counts).
			Module tab = new CellsModule() {
				@Override
				protected void configure() {
					installTable(
							In.class,
							new TypeLiteral<Act>() {},
							ActCodec.class,
							new HBaseStorage(), new HBaseTableModule<>(in));
					installTable(
							Eff.class,
							new TypeLiteral<WordCount>() {},
							WordCountCodec.class,
							new HBaseStorage(), new HBaseTableModule<>(eff));
				}
			};

			Injector injector = Guice.createInjector(tab, new UnibeModule());
			// Fill the input table; the sink is closed before the pipeline runs.
			try (Sink<Act> sink = injector.getInstance(Key.get(new TypeLiteral<Sink<Act>>() {}, In.class))) {
				Iterable<Act> acts = readActsFromDisk();
				for (Act act : acts) {
					sink.write(act);
				}
			}

			run(HadoopPipeline.fromTableToTable(injector.getInstance(Configuration.class), in, eff));

			// The output must hold exactly one row containing exactly one word count.
			try (Source<WordCount> src = injector.getInstance(Key.get(new TypeLiteral<Source<WordCount>>() {}, Eff.class))) {
				WordCount wc = Iterables.getOnlyElement(Iterables.getOnlyElement(src));
				assertThat(wc.word, is("your"));
				assertThat(wc.count, is(239L));
			}
		}
	}

	/**
	 * Runs the word-count pipeline in memory and checks the count recorded for the
	 * word "your" in the pipeline's last efflux.
	 */
	@Test
	public void testInMemoryWordCount() throws IOException, InterruptedException {
		try (InMemoryPipeline<Act, WordCount> pipe
				= Guice.createInjector(new LocalExecutionModule())
					.getInstance(InMemoryPipeline.Builder.class)
						.make(Cells.shard(Cells.encode(readActsFromDisk(), new ActCodec())))) {
			run(pipe);
			// Stays at -1 unless a count for "your" is found.
			long cnt = -1;
			for (Iterable<WordCount> wcs : pipe.lastEfflux()) {
				for (WordCount wc : wcs) {
					if (wc.word.equals("your")) {
						cnt = wc.count;
					}
				}
			}
			// Assert once, after scanning every row: inside the loop, an empty
			// efflux would skip the assertion and the test would pass vacuously.
			assertThat(cnt, is(239L));
		}
	}

	/**
	 * Wires the word-count stages onto {@code pipeline}: parse acts into words, shuffle,
	 * add up each word's occurrences, shuffle again, then keep only the filtered counts
	 * as the pipeline's efflux.
	 */
	void run(Pipeline<Act, WordCount> pipeline) throws IOException, InterruptedException {
		// Name every stage up front so the chain below reads as the bare pipeline shape.
		ActCodec actCodec = new ActCodec();
		WordParseMapper parser = new WordParseMapper();
		WordCodec wordCodec = new WordCodec();
		WordAdderMapper adder = new WordAdderMapper();
		WordCountCodec shuffleCodec = new WordCountCodec();
		WordFilter filter = new WordFilter();
		WordCountCodec effluxCodec = new WordCountCodec();

		pipeline.influx(actCodec)
			.map(parser)
			.shuffle(wordCodec)
			.map(adder)
			.shuffle(shuffleCodec)
			.mapAndEfflux(filter, effluxCodec);
	}

	/**
	 * Injection qualifier under which the Hadoop test installs its input table of
	 * {@link Act}s and later looks up the matching {@code Sink<Act>}.
	 */
	@Qualifier
	@Target({ FIELD, PARAMETER, METHOD })
	@Retention(RUNTIME)
	public static @interface In {}

	/**
	 * Injection qualifier under which the Hadoop test installs its output table of
	 * {@link WordCount}s and later looks up the matching {@code Source<WordCount>}.
	 */
	@Qualifier
	@Target({ FIELD, PARAMETER, METHOD })
	@Retention(RUNTIME)
	public static @interface Eff {}

	/**
	 * Loads the classpath resource {@code richard-iii.txt} (resolved relative to this
	 * class) as UTF-8 and splits it at every act heading ("ACT" followed by one
	 * whitespace character and a roman numeral built from I, V and X).
	 *
	 * <p>Acts are numbered by their index in the split result, so number 0 holds
	 * whatever text precedes the first heading.
	 *
	 * @return the pieces of the play, in document order
	 * @throws IOException if the resource is missing or cannot be read
	 */
	static Iterable<Act> readActsFromDisk() throws IOException {
		final String resourceName = "richard-iii.txt";
		InputStream raw = HadoopPipelineTest.class.getResourceAsStream(resourceName);
		if (raw == null) {
			// getResourceAsStream signals a missing resource with null; fail with a clear message
			// instead of a NullPointerException from deep inside the reader.
			throw new IOException("Test resource not found on classpath: " + resourceName);
		}

		String richard;
		// Closing the reader also closes the underlying resource stream.
		try (InputStreamReader reader = new InputStreamReader(raw, Charsets.UTF_8)) {
			richard = CharStreams.toString(reader);
		}

		ImmutableList.Builder<Act> ret = ImmutableList.builder();
		String[] actStrings = richard.split("\\bACT\\s[IVX]+");
		for (int i = 0; i < actStrings.length; i++) {
			ret.add(new Act(i, actStrings[i]));
		}
		return ret.build();
	}
}