package ch.unibe.scg.cc;
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.junit.Ignore;
import org.junit.Test;
import ch.unibe.scg.cc.GitPopulator.PackedRef;
import ch.unibe.scg.cc.GitPopulator.PackedRefParser;
import ch.unibe.scg.cc.Protos.GitRepo;
import ch.unibe.scg.cells.Cell;
import ch.unibe.scg.cells.Codec;
import ch.unibe.scg.cells.Mapper;
import ch.unibe.scg.cells.OneShotIterable;
import ch.unibe.scg.cells.Sink;
import ch.unibe.scg.cells.hadoop.HadoopPipeline;
import ch.unibe.scg.cells.hadoop.Table;
import ch.unibe.scg.cells.hadoop.TableAdmin;
import ch.unibe.scg.cells.hadoop.UnibeModule;
import com.google.common.primitives.Ints;
import com.google.inject.Guice;
import com.google.inject.Injector;
import com.google.protobuf.ByteString;
@SuppressWarnings("javadoc")
public final class GitInputFormatTest {
/**
 * Cell codec for {@link RefCount}: the project name becomes the row key (UTF-8),
 * the big-endian count becomes the column key, and the cell value stays empty.
 */
private static class RefCountCodec implements Codec<RefCount> {
	private static final long serialVersionUID = 1L;

	@Override
	public Cell<RefCount> encode(RefCount refCount) {
		ByteString row = ByteString.copyFromUtf8(refCount.name);
		ByteString col = ByteString.copyFrom(Ints.toByteArray(refCount.count));
		return Cell.make(row, col, ByteString.EMPTY);
	}

	@Override
	public RefCount decode(Cell<RefCount> encoded) throws IOException {
		// Mirror of encode: count from the column key, name from the row key.
		int count = Ints.fromByteArray(encoded.getColumnKey().toByteArray());
		String name = encoded.getRowKey().toStringUtf8();
		return new RefCount(count, name);
	}
}
/** Immutable pairing of a project name with the number of packed refs found in it. */
private static class RefCount {
	final int count;
	final String name;

	/**
	 * @param count number of packed refs in the project.
	 * @param name project name, used as the row key by {@code RefCountCodec}.
	 */
	RefCount(int count, String name) {
		this.count = count;
		this.name = name;
	}
}
/** Emits one {@link RefCount} per repository: the repo's project name and its packed-ref count. */
private static class PackRefCounter implements Mapper<GitRepo, RefCount> {
	private static final long serialVersionUID = 1L;

	@Override
	public void close() throws IOException { }

	@Override
	public void map(GitRepo first, OneShotIterable<GitRepo> row, Sink<RefCount> sink) throws IOException,
			InterruptedException {
		for (GitRepo repo : row) {
			// Parse the packed-refs blob of this repo and count the refs it lists.
			int refCount = new PackedRefParser().parse(repo.getPackRefs().newInput()).size();
			sink.write(new RefCount(refCount, repo.getProjectName()));
		}
	}
}
@Test
@Ignore // Much too slow, atm. We need something to take only the top 10 files.
public void test() throws IOException, InterruptedException {
	Injector injector = Guice.createInjector(new UnibeModule());

	Configuration conf = injector.getInstance(Configuration.class);
	// Repo archives are big; bump mapper memory above the defaults.
	conf.set(MRJobConfig.MAP_MEMORY_MB, "4000");
	conf.set(MRJobConfig.MAP_JAVA_OPTS, "-Xmx3300m");

	ByteString family = ByteString.copyFromUtf8("f");
	try (Table<RefCount> tab = injector.getInstance(TableAdmin.class).createTemporaryTable(family)) {
		Path input = new Path("har://hdfs-haddock.unibe.ch:/projects/datasetbackup.har/repos/");
		HadoopPipeline<GitRepo, RefCount> pipe
				= HadoopPipeline.fromHDFSToTable(conf, GitInputFormat.class, input, tab);
		// Read repos from the HAR archive, count packed refs, write counts into the temp table.
		pipe.influx(new GitInputFormat.GitRepoCodec())
				.mapAndEfflux(new PackRefCounter(), new RefCountCodec());
	}
}
}