package ch.unibe.scg.cells.hadoop;

import static org.hamcrest.core.Is.is;
import static org.junit.Assert.assertThat;

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.junit.Test;

import ch.unibe.scg.cells.Cell;
import ch.unibe.scg.cells.Cells;
import ch.unibe.scg.cells.Codec;
import ch.unibe.scg.cells.Mapper;
import ch.unibe.scg.cells.OneShotIterable;
import ch.unibe.scg.cells.Sink;
import ch.unibe.scg.cells.Source;

import com.google.common.collect.Iterables;
import com.google.common.primitives.Longs;
import com.google.inject.Guice;
import com.google.inject.Injector;
import com.google.protobuf.ByteString;

@SuppressWarnings("javadoc")
public final class HadoopPipelineHDFSInputTest {
	final private static ByteString family = ByteString.copyFromUtf8("f");

	/**
	 * Like SequenceFileAsBinaryInputFormat, but reads plain text through
	 * {@link TextInputFormat} and exposes both keys and values as
	 * {@link ImmutableBytesWritable} in the generic type.
	 */
	private static class RawTextFileFormat
			extends FileInputFormat<ImmutableBytesWritable, ImmutableBytesWritable> {
		final private TextInputFormat underlying = new TextInputFormat();

		@Override
		public RecordReader<ImmutableBytesWritable, ImmutableBytesWritable> createRecordReader(
				InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
			return new RecordReaderProxy(underlying.createRecordReader(split, context));
		}
	}

	private static class RecordReaderProxy
			extends RecordReader<ImmutableBytesWritable, ImmutableBytesWritable> {
		final private RecordReader<LongWritable, Text> underlying;

		RecordReaderProxy(RecordReader<LongWritable, Text> recordReader) {
			this.underlying = recordReader;
		}

		@Override
		public void close() throws IOException {
			underlying.close();
		}

		@Override
		public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
			return new ImmutableBytesWritable(Longs.toByteArray(underlying.getCurrentKey().get()));
		}

		@Override
		public ImmutableBytesWritable getCurrentValue() throws IOException, InterruptedException {
			Text value = underlying.getCurrentValue();
			// Text.getBytes() exposes the whole backing buffer; copy only the valid prefix.
			return new ImmutableBytesWritable(Arrays.copyOf(value.getBytes(), value.getLength()));
		}

		@Override
		public float getProgress() throws IOException, InterruptedException {
			return underlying.getProgress();
		}

		@Override
		public void initialize(InputSplit split, TaskAttemptContext ctx) throws IOException, InterruptedException {
			underlying.initialize(split, ctx);
		}

		@Override
		public boolean nextKeyValue() throws IOException, InterruptedException {
			return underlying.nextKeyValue();
		}
	}

	private static class IdentityMapper implements Mapper<File, File> {
		private static final long serialVersionUID = 1L;

		@Override
		public void close() throws IOException { }

		@Override
		public void map(File first, OneShotIterable<File> row, Sink<File> sink)
				throws IOException, InterruptedException {
			for (File f : row) {
				sink.write(f);
			}
		}
	}

	private static class File {
		final long lineNumber;
		final String contents;

		File(long lineNumber, String contents) {
			this.lineNumber = lineNumber;
			this.contents = contents;
		}
	}
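	/**
	 * Encodes a {@link File} as a cell whose row key is the 8-byte big-endian
	 * encoding of {@code lineNumber}, whose column key is the constant {@code "1"},
	 * and whose contents are the UTF-8 bytes of {@code contents}. The column
	 * family ({@code "f"}) is supplied separately when the temporary table is created.
	 */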
	private static class FileCodec implements Codec<File> {
		private static final long serialVersionUID = 1L;

		@Override
		public Cell<File> encode(File f) {
			return Cell.make(ByteString.copyFrom(Longs.toByteArray(f.lineNumber)),
					ByteString.copyFromUtf8("1"),
					ByteString.copyFromUtf8(f.contents));
		}

		@Override
		public File decode(Cell<File> encoded) throws IOException {
			return new File(Longs.fromByteArray(encoded.getRowKey().toByteArray()),
					encoded.getCellContents().toStringUtf8());
		}
	}

	@Test
	public void testReadBigScript() throws IOException, InterruptedException {
		Injector injector = Guice.createInjector(new UnibeModule());

		int cnt = 0;
		try (Table<File> tab = injector.getInstance(TableAdmin.class).createTemporaryTable(family)) {
			HadoopPipeline<File, File> pipe = HadoopPipeline.fromHDFSToTable(
					injector.getInstance(Configuration.class),
					RawTextFileFormat.class,
					new Path("hdfs://haddock.unibe.ch/user/nes/upgrade-squeezestep.script"),
					tab);

			pipe
					.influx(new FileCodec())
					.mapAndEfflux(new IdentityMapper(), new FileCodec());

			try (Source<File> files = Cells.decodeSource(tab.asCellSource(), new FileCodec())) {
				for (Iterable<File> row : files) {
					cnt += Iterables.size(row);
				}
			}
		}

		assertThat(cnt, is(7836)); // TODO: wc reports the file size as 4798. Why the difference?
	}
}