package ch.unibe.scg.cells.hadoop;

import static com.google.common.io.BaseEncoding.base64;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.NoSuchElementException;
import java.util.logging.Logger;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.partition.BinaryPartitioner;

import ch.unibe.scg.cells.AdapterOneShotIterable;
import ch.unibe.scg.cells.Cell;
import ch.unibe.scg.cells.CellSink;
import ch.unibe.scg.cells.Cells;
import ch.unibe.scg.cells.Codec;
import ch.unibe.scg.cells.Mapper;
import ch.unibe.scg.cells.OneShotIterable;
import ch.unibe.scg.cells.Pipeline;
import ch.unibe.scg.cells.Sink;
import ch.unibe.scg.cells.Source;

import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.UnmodifiableIterator;
import com.google.common.primitives.Bytes;
import com.google.common.primitives.Ints;
import com.google.protobuf.ByteString;

/** The Hadoop version of a {@link Pipeline}. */
public class HadoopPipeline<IN, EFF> implements Pipeline<IN, EFF> {
    final private static byte[] fam = ByteString.copyFromUtf8("f").toByteArray();
    final static private Logger logger = Logger.getLogger(HadoopPipeline.class.getName());

    final private Configuration baseConfiguration;
    final private MapConfigurer<IN> firstMapConfigurer;
    final private Table<EFF> efflux;
    final private TableAdmin admin;

    private HadoopPipeline(Configuration baseConfiguration, MapConfigurer<IN> firstMapConfigurer,
            Table<EFF> efflux, TableAdmin admin) {
        this.baseConfiguration = baseConfiguration;
        this.firstMapConfigurer = firstMapConfigurer;
        this.efflux = efflux;
        this.admin = admin;
    }

    /** @return a Pipeline that will run map/reduce jobs in the cluster. */
    public static <IN, EFF> HadoopPipeline<IN, EFF> fromTableToTable(Configuration configuration,
            Table<IN> influx, Table<EFF> efflux) {
        return new HadoopPipeline<>(configuration, new TableInputConfigurer<>(influx), efflux,
                new TableAdmin(configuration, new HTableFactory(configuration)));
    }
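    /*
     * Rough usage sketch. The tables, codecs and mappers below are placeholders assumed to
     * be provided by the surrounding Cells modules:
     *
     *   HadoopPipeline<In, Out> pipe = HadoopPipeline.fromTableToTable(conf, inTable, outTable);
     *   pipe.influx(inCodec)
     *       .map(firstMapper)
     *       .shuffle(intermediateCodec)
     *       .mapAndEfflux(lastMapper, outCodec);
     *
     * The first map runs as the Hadoop map phase and the map after the shuffle as the
     * reduce phase of one job; every further shuffle launches an additional job that
     * writes into a temporary table.
     */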
    /** @return a Pipeline that reads from HDFS, but writes to an HBase table. */
    public static <IN, EFF> HadoopPipeline<IN, EFF> fromHDFSToTable(Configuration configuration,
            Class<? extends FileInputFormat<ImmutableBytesWritable, ImmutableBytesWritable>> inputFormat,
            Path inputPath, Table<EFF> efflux) {
        return new HadoopPipeline<>(configuration, new HDFSInputConfigurer<IN>(inputFormat, inputPath), efflux,
                new TableAdmin(configuration, new HTableFactory(configuration)));
    }

    @Override
    public MappablePipeline<IN, EFF> influx(Codec<IN> c) {
        return new HadoopMappablePipeline<>(firstMapConfigurer, c);
    }

    /** Configure the map part of the job to run from a table, or from HDFS, as the case may be. */
    private interface MapConfigurer<MAP_IN> extends Closeable {
        <MAP_OUT> void configure(Job job, Codec<MAP_IN> mapSrcCodec, Mapper<MAP_IN, MAP_OUT> map,
                Codec<MAP_OUT> outCodec) throws IOException;
    }

    /** MapConfigurer for the case of table input. */
    private static class TableInputConfigurer<MAP_IN> implements MapConfigurer<MAP_IN> {
        final private Table<MAP_IN> src;

        TableInputConfigurer(Table<MAP_IN> src) {
            this.src = src;
        }

        @Override
        public <MAP_OUT> void configure(Job job, Codec<MAP_IN> mapSrcCodec, Mapper<MAP_IN, MAP_OUT> map,
                Codec<MAP_OUT> outCodec) throws IOException {
            logger.fine("Input table: " + src.getTableName());
            HadoopTableMapper<MAP_IN, MAP_OUT> hMapper
                    = new HadoopTableMapper<>(map, mapSrcCodec, outCodec, src.getFamilyName());
            writeObjectToConf(job.getConfiguration(), hMapper);

            Scan scan = HBaseCellSource.makeScan();
            scan.addFamily(src.getFamilyName().toByteArray());
            TableMapReduceUtil.initTableMapperJob(
                    src.getTableName(), // input table
                    scan, // Scan instance to control CF and attribute selection
                    DecoratorHadoopTableMapper.class, // mapper class
                    ImmutableBytesWritable.class, // mapper output key
                    ImmutableBytesWritable.class, // mapper output value
                    job);
        }

        @Override
        public void close() throws IOException {
            src.close();
        }
    }

    /** MapConfigurer for the case of HDFS input. */
    private static class HDFSInputConfigurer<MAP_IN> implements MapConfigurer<MAP_IN> {
        final private Class<? extends FileInputFormat<ImmutableBytesWritable, ImmutableBytesWritable>> inputFormat;
        final private Path inputPath;

        HDFSInputConfigurer(
                Class<? extends FileInputFormat<ImmutableBytesWritable, ImmutableBytesWritable>> inputFormat,
                Path inputPath) {
            this.inputFormat = inputFormat;
            this.inputPath = inputPath;
        }

        @Override
        public <MAP_OUT> void configure(Job job, Codec<MAP_IN> mapSrcCodec, Mapper<MAP_IN, MAP_OUT> map,
                Codec<MAP_OUT> outCodec) throws IOException {
            HadoopMapper<MAP_IN, MAP_OUT> hMapper = new HadoopMapper<>(map, mapSrcCodec, outCodec);
            writeObjectToConf(job.getConfiguration(), hMapper);

            job.setInputFormatClass(inputFormat);
            job.setMapperClass(DecoratorHadoopMapper.class);
            job.setMapOutputKeyClass(ImmutableBytesWritable.class);
            job.setMapOutputValueClass(ImmutableBytesWritable.class);
            FileInputFormat.addInputPath(job, inputPath);
        }

        @Override
        public void close() throws IOException {
            // Nothing to do.
        }
    }
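    /*
     * Note on the HDFS influx (an observation of HadoopMapper below, not a documented
     * contract): the supplied FileInputFormat must emit
     * (ImmutableBytesWritable, ImmutableBytesWritable) pairs; the key is used as both the
     * row key and the column key of the decoded cell, and the value as the cell contents.
     */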
    /**
     * Hadoop mapper for jobs that don't read from a table.
     * For table reading jobs, see {@link HadoopTableMapper}.
     */
    private static class HadoopMapper<I, E> extends org.apache.hadoop.mapreduce.Mapper<
                ImmutableBytesWritable, ImmutableBytesWritable, // KEYIN, VALUEIN
                ImmutableBytesWritable, ImmutableBytesWritable> // KEYOUT, VALUEOUT
            implements Serializable {
        private static final long serialVersionUID = 1L;

        final private Mapper<I, E> underlying;
        final private Codec<I> inputCodec;
        final private Codec<E> outputCodec;

        HadoopMapper(Mapper<I, E> underlying, Codec<I> inputCodec, Codec<E> outputCodec) {
            this.underlying = underlying;
            this.inputCodec = inputCodec;
            this.outputCodec = outputCodec;
        }

        @Override
        protected void map(ImmutableBytesWritable key, ImmutableBytesWritable value, Context context)
                throws IOException, InterruptedException {
            // The input key doubles as both row key and column key of the cell.
            Cell<I> cell = Cell.<I> make(ByteString.copyFrom(key.get()),
                    ByteString.copyFrom(key.get()),
                    ByteString.copyFrom(value.get()));
            I decoded = inputCodec.decode(cell);

            try (Sink<E> sink = Cells.encodeSink(
                    HadoopPipeline.<E, ImmutableBytesWritable, ImmutableBytesWritable> makeMapperSink(context),
                    outputCodec)) {
                // TODO: maybe use a specific OneShotIterable?
                underlying.map(decoded, new AdapterOneShotIterable<>(Arrays.asList(decoded)), sink);
            }
        }

        /** Overridden to escalate visibility. */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Nothing to do.
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            underlying.close();
        }
    }

    private static class DecoratorHadoopMapper<I, E> extends org.apache.hadoop.mapreduce.Mapper<
            ImmutableBytesWritable, ImmutableBytesWritable, // KEYIN, VALUEIN
            ImmutableBytesWritable, ImmutableBytesWritable> { // KEYOUT, VALUEOUT
        private HadoopMapper<I, E> decorated;

        @Override
        protected void map(ImmutableBytesWritable key, ImmutableBytesWritable value, Context context)
                throws IOException, InterruptedException {
            decorated.map(key, value, context);
        }

        @SuppressWarnings("unchecked") // Unavoidable, since class literals cannot be generically typed.
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            decorated = readObjectFromConf(context, HadoopMapper.class);
            decorated.setup(context);
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            decorated.cleanup(context);
        }
    }
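    /*
     * How the worker objects travel to the cluster (a summary of the mechanism implemented
     * by writeObjectToConf/readObjectFromConf further below): Hadoop instantiates the
     * decorator classes reflectively, so they cannot carry state of their own. The
     * serializable HadoopMapper/HadoopTableMapper/HadoopReducer instances are therefore
     * written into the job Configuration on the driver and read back in each task's setup():
     *
     *   writeObjectToConf(job.getConfiguration(), hMapper);           // driver side
     *   decorated = readObjectFromConf(context, HadoopMapper.class);  // task side
     */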
    /**
     * A special input stream that is aware of Hadoop's contexts.
     * Supports both mapper and reducer contexts. Used to deserialize context-dependent objects
     * across different machines.
     */
    static class HadoopContextObjectInputStream extends ObjectInputStream {
        // TaskAttemptContext is the most specific class in Hadoop's Context hierarchy that has no generic parameters.
        final private TaskAttemptContext context;

        HadoopContextObjectInputStream(InputStream decorated, TaskAttemptContext context) throws IOException {
            super(decorated);
            this.context = context;
        }

        Counter getCounter(String name) {
            return context.getCounter("cells", name);
        }
    }

    private static class IdentityMapper<T> implements Mapper<T, T> {
        private static final long serialVersionUID = 1L;

        @Override
        public void close() throws IOException {
        }

        @Override
        public void map(T first, OneShotIterable<T> row, Sink<T> sink) throws IOException, InterruptedException {
            for (T e : row) {
                sink.write(e);
            }
        }
    }

    private class HadoopMappablePipeline<I> implements MappablePipeline<I, EFF> {
        final private Codec<I> srcCodec;
        final private MapConfigurer<I> mapConfigurer;

        HadoopMappablePipeline(MapConfigurer<I> mapConfigurer, Codec<I> srcCodec) {
            this.mapConfigurer = mapConfigurer;
            this.srcCodec = srcCodec;
        }

        @Override
        public <E> ShuffleablePipeline<E, EFF> map(Mapper<I, E> m) {
            return new HadoopShuffleablePipelineAfterMap<>(mapConfigurer, srcCodec, m);
        }

        @Override
        public void mapAndEfflux(Mapper<I, EFF> m, Codec<EFF> codec) throws IOException, InterruptedException {
            run(mapConfigurer, srcCodec, m, codec, new IdentityMapper<EFF>(), codec, efflux);
        }
    }

    private class HadoopReducablePipeline<MAP_IN, MAP_OUT> implements MappablePipeline<MAP_OUT, EFF> {
        final private MapConfigurer<MAP_IN> mapConfigurer;
        final private Codec<MAP_IN> mapSrcCodec;
        final private Mapper<MAP_IN, MAP_OUT> map;
        final private Codec<MAP_OUT> reduceSrcCodec;

        HadoopReducablePipeline(MapConfigurer<MAP_IN> mapConfigurer, Codec<MAP_IN> mapSrcCodec,
                Mapper<MAP_IN, MAP_OUT> map, Codec<MAP_OUT> reduceSrcCodec) {
            this.mapConfigurer = mapConfigurer;
            this.mapSrcCodec = mapSrcCodec;
            this.map = map;
            this.reduceSrcCodec = reduceSrcCodec;
        }

        @Override
        public <E> ShuffleablePipeline<E, EFF> map(Mapper<MAP_OUT, E> m) {
            return new HadoopShuffleablePipelineAfterReduce<>(mapConfigurer, mapSrcCodec, map, reduceSrcCodec, m);
        }

        @Override
        public void mapAndEfflux(Mapper<MAP_OUT, EFF> m, Codec<EFF> codec) throws IOException, InterruptedException {
            run(mapConfigurer, mapSrcCodec, map, reduceSrcCodec, m, codec, efflux);
            mapConfigurer.close();
        }
    }

    private class HadoopShuffleablePipelineAfterMap<I, E> implements ShuffleablePipeline<E, EFF> {
        final private MapConfigurer<I> mapConfigurer;
        final private Codec<I> srcCodec;
        final private Mapper<I, E> mapper;

        HadoopShuffleablePipelineAfterMap(MapConfigurer<I> mapConfigurer, Codec<I> srcCodec, Mapper<I, E> mapper) {
            this.mapConfigurer = mapConfigurer;
            this.srcCodec = srcCodec;
            this.mapper = mapper;
        }

        @Override
        public MappablePipeline<E, EFF> shuffle(Codec<E> codec) throws IOException {
            return new HadoopReducablePipeline<>(mapConfigurer, srcCodec, mapper, codec);
        }
    }

    private class HadoopShuffleablePipelineAfterReduce<MAP_IN, MAP_OUT, E> implements ShuffleablePipeline<E, EFF> {
        final private MapConfigurer<MAP_IN> mapConfigurer;
        final private Codec<MAP_IN> mapSrcCodec;
        final private Mapper<MAP_IN, MAP_OUT> map;
        final private Codec<MAP_OUT> reduceSrcCodec;
        final private Mapper<MAP_OUT, E> reduce;

        HadoopShuffleablePipelineAfterReduce(MapConfigurer<MAP_IN> mapConfigurer, Codec<MAP_IN> mapSrcCodec,
                Mapper<MAP_IN, MAP_OUT> map, Codec<MAP_OUT> reduceSrcCodec, Mapper<MAP_OUT, E> reduce) {
            this.mapConfigurer = mapConfigurer;
            this.mapSrcCodec = mapSrcCodec;
            this.map = map;
            this.reduceSrcCodec = reduceSrcCodec;
            this.reduce = reduce;
        }
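        /*
         * What the shuffle below does, summarized from the code: it runs one complete
         * map/reduce job whose reduce output lands in a freshly created temporary table,
         * and the returned pipeline stage reads from that temporary table.
         */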
        @SuppressWarnings("resource") // target gets closed in TableInputConfigurer.close, at the end of the MR.
        // TODO: Is there a more robust way to enforce that the table gets closed?
        @Override
        public MappablePipeline<E, EFF> shuffle(Codec<E> codec) throws IOException, InterruptedException {
            Table<E> target = admin.createTemporaryTable(ByteString.copyFrom(fam));
            run(mapConfigurer, mapSrcCodec, map, reduceSrcCodec, reduce, codec, target);
            // This will delete temporary tables if needed.
            mapConfigurer.close();
            return new HadoopMappablePipeline<>(new TableInputConfigurer<>(target), codec);
        }
    }

    private <E, MAP_IN, MAP_OUT> void run(MapConfigurer<MAP_IN> mapConfigurer, Codec<MAP_IN> mapSrcCodec,
            Mapper<MAP_IN, MAP_OUT> map, Codec<MAP_OUT> reduceSrcCodec, Mapper<MAP_OUT, E> reduce, Codec<E> codec,
            Table<E> target) throws IOException, InterruptedException {
        // TODO: The map configuration is split into a separate object, but the reduce part isn't.
        // That's an odd asymmetry that should be fixed.
        Job job = Job.getInstance(baseConfiguration);

        mapConfigurer.configure(job, mapSrcCodec, map, reduceSrcCodec);

        HadoopReducer<MAP_OUT, E> hReducer = new HadoopReducer<>(reduce, reduceSrcCodec, codec);
        writeObjectToConf(job.getConfiguration(), hReducer);
        TableMapReduceUtil.initTableReducerJob(
                target.getTableName(), // output table
                DecoratorHadoopReducer.class, // reducer class
                job);
        logger.fine("Output table: " + target.getTableName());

        job.setGroupingComparatorClass(KeyGroupingComparator.class);
        job.setSortComparatorClass(KeySortingComparator.class);
        job.setPartitionerClass(KeyGroupingPartitioner.class);
        job.setJarByClass(getClass());

        // Submit the job, then poll for progress until the job is complete.
        try {
            job.waitForCompletion(true);
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("Loading of your job failed.", e);
        }
    }

    /** Loads the configured HadoopReducer and runs it. */
    private static class DecoratorHadoopReducer<I, E>
            extends TableReducer<ImmutableBytesWritable, ImmutableBytesWritable, ImmutableBytesWritable> {
        private HadoopReducer<I, E> decorated;

        @Override
        protected void reduce(ImmutableBytesWritable key, Iterable<ImmutableBytesWritable> values, Context context)
                throws IOException, InterruptedException {
            // Hadoop's values iterable misbehaves when iterated twice, so wrap it to enforce one-shot use.
            values = new AdapterOneShotIterable<>(values);
            decorated.reduce(key, values, context);
        }

        @SuppressWarnings("unchecked") // Unavoidable, since class literals cannot be generically typed.
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            decorated = readObjectFromConf(context, HadoopReducer.class);
            decorated.setup(context);
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            decorated.cleanup(context);
        }
    }

    private static class HadoopReducer<I, E>
            extends TableReducer<ImmutableBytesWritable, ImmutableBytesWritable, ImmutableBytesWritable>
            implements Serializable {
        static final private long serialVersionUID = 1L;

        final private Mapper<I, E> mapper;
        final private Codec<I> inCodec;
        final private Codec<E> outCodec;

        HadoopReducer(Mapper<I, E> mapper, Codec<I> inCodec, Codec<E> outCodec) {
            this.mapper = mapper;
            this.inCodec = inCodec;
            this.outCodec = outCodec;
        }

        /**
         * Format of input: key = len(rowKey) | rowKey | first colKey. value = len(colKey) | colKey | contents.
         * <p> Careful! values is iterable only once!
         */
        @Override
        protected void reduce(ImmutableBytesWritable key, Iterable<ImmutableBytesWritable> values, Context context)
                throws IOException, InterruptedException {
            // Extract rowKey from key.
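            // Worked example of the layout (hypothetical bytes): for rowKey "r1" and first
            // column key "c1", the incoming key is
            //     [0, 0, 0, 2] 'r' '1' 'c' '1'
            // i.e. a 4-byte big-endian length prefix (Ints.toByteArray), then the row key,
            // then the column key. Each value is laid out analogously as
            //     len(colKey) | colKey | cellContents.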
            ByteBuffer rawKey = ByteBuffer.wrap(key.get());
            byte[] lenBytes = new byte[Ints.BYTES];
            rawKey.get(lenBytes);
            int keyLen = Ints.fromByteArray(lenBytes);
            final ByteString rowKey = ByteString.copyFrom(rawKey, keyLen);

            final Iterable<Cell<I>> transformedRow = Iterables.transform(values,
                    new Function<ImmutableBytesWritable, Cell<I>>() {
                @Override
                public Cell<I> apply(ImmutableBytesWritable value) {
                    ByteBuffer rawContent = ByteBuffer.wrap(value.get());
                    byte[] colKeyLenBytes = new byte[Ints.BYTES];
                    rawContent.get(colKeyLenBytes);
                    int colKeyLen = Ints.fromByteArray(colKeyLenBytes);
                    ByteString colKey = ByteString.copyFrom(rawContent, colKeyLen);
                    ByteString cellContent = ByteString.copyFrom(rawContent);
                    return Cell.<I> make(rowKey, colKey, cellContent);
                }
            });

            final Iterable<Cell<I>> row = new Iterable<Cell<I>>() {
                @Override
                public Iterator<Cell<I>> iterator() {
                    return new FilteringIterator<>(transformedRow.iterator());
                }
            };

            runRow(Cells.encodeSink(makeSink(context), outCodec), Cells.decode(row, inCodec), mapper);
        }

        /** Writes each output cell as an HBase {@link Put} into the reducer's output table. */
        private CellSink<E> makeSink(final Context context) {
            return new CellSink<E>() {
                final private static long serialVersionUID = 1L;

                @Override
                public void close() throws IOException {
                    // Nothing to close.
                }

                @Override
                public void write(Cell<E> cell) throws IOException, InterruptedException {
                    byte[] rowKey = cell.getRowKey().toByteArray();
                    Put put = new Put(rowKey);
                    put.add(fam, cell.getColumnKey().toByteArray(), cell.getCellContents().toByteArray());
                    context.write(new ImmutableBytesWritable(rowKey), put);
                }
            };
        }

        /** Redefined to escalate visibility. */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Nothing to do.
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
            mapper.close();
        }
    }

    /**
     * Takes an Iterator and modifies it such that consecutive cells that are
     * equal are suppressed. Specifically, if two consecutive cells are equal to
     * each other, only the first appears in the FilteringIterator.
     *
     * <p>
     * The underlying iterator must never serve nulls.
     */
    private static class FilteringIterator<T> extends UnmodifiableIterator<Cell<T>> {
        final private Iterator<Cell<T>> underlying;
        /** The next element to be returned. If there is no next element that could be returned, null. */
        private Cell<T> next;

        FilteringIterator(Iterator<Cell<T>> underlying) {
            // Set the underlying data provider, and fetch the first value if present.
            this.underlying = underlying;
            if (underlying.hasNext()) {
                next = underlying.next();
            }
        }

        @Override
        public boolean hasNext() {
            return next != null;
        }

        @Override
        public Cell<T> next() {
            if (!hasNext()) { // As per the Iterator interface.
                throw new NoSuchElementException();
            }

            Cell<T> ret = next;
            // Move next to the next valid return value, or null if there is none.
            while (underlying.hasNext()) {
                Cell<T> cand = underlying.next();
                if (!cand.equals(next)) {
                    next = cand;
                    return ret;
                }
            }
            // We didn't return early, so there's nothing to return next time around.
            next = null;
            return ret;
        }
    }
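    /*
     * Behavior sketch of FilteringIterator, with hypothetical cells a and b:
     * the input sequence [a, a, b, b, a] is served as [a, b, a] -- only consecutive
     * duplicates are suppressed, not duplicates in general.
     */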
    /** Loads the configured HadoopTableMapper and runs it. */
    private static class DecoratorHadoopTableMapper<I, E>
            extends TableMapper<ImmutableBytesWritable, ImmutableBytesWritable> {
        private HadoopTableMapper<I, E> decorated;

        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context)
                throws IOException, InterruptedException {
            decorated.map(key, value, context);
        }

        @SuppressWarnings("unchecked") // Unavoidable, since class literals cannot be generically typed.
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            decorated = readObjectFromConf(context, HadoopTableMapper.class);
            decorated.setup(context);
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            decorated.cleanup(context);
        }
    }

    /** Read an object from conf under the key clazz.getName(), stored as the base64 encoding of its Java serialization. */
    private static <T> T readObjectFromConf(TaskAttemptContext context, Class<T> clazz) throws IOException {
        try {
            return (T) new HadoopContextObjectInputStream(new ByteArrayInputStream(base64().decode(
                    context.getConfiguration().getRaw(clazz.getName()))), context).readObject();
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("Couldn't load " + clazz
                    + ". You probably didn't ship the JAR properly to the server. See the README", e);
        }
    }

    /** Write obj into conf under the key obj.getClass().getName(), as the base64 encoding of its Java serialization. */
    private static <T> void writeObjectToConf(Configuration conf, T obj) throws IOException {
        // Serialize into a byte array.
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        new ObjectOutputStream(bos).writeObject(obj);
        byte[] serialized = bos.toByteArray();

        conf.set(obj.getClass().getName(), base64().encode(serialized));
    }

    private static class HadoopTableMapper<I, E> extends TableMapper<ImmutableBytesWritable, ImmutableBytesWritable>
            implements Serializable {
        private static final long serialVersionUID = 1L;

        private final Mapper<I, E> mapper;
        private final Codec<I> inCodec;
        private final Codec<E> outCodec;
        /** Do not modify. */
        private final byte[] family;

        // TODO: hang on to sink.
        HadoopTableMapper(Mapper<I, E> mapper, Codec<I> inCodec, Codec<E> outCodec, ByteString family) {
            this.mapper = mapper;
            this.inCodec = inCodec;
            this.outCodec = outCodec;
            this.family = family.toByteArray();
        }

        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context)
                throws IOException, InterruptedException {
            try (CellSink<E> cellSink = makeMapperSink(context)) {
                List<Cell<I>> cellRow = new ArrayList<>();
                for (Entry<byte[], byte[]> kv : value.getFamilyMap(family).entrySet()) {
                    cellRow.add(Cell.<I> make(
                            ByteString.copyFrom(key.get()),
                            ByteString.copyFrom(kv.getKey()),
                            ByteString.copyFrom(kv.getValue())));
                }
                runRow(Cells.encodeSink(cellSink, outCodec), Cells.decode(cellRow, inCodec), mapper);
            }
        }

        /** Re-implemented to escalate visibility. */
        @Override
        protected void setup(Context context) {
            // Nothing to do.
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            mapper.close();
        }
    }

    /** Format has to match {@link #readEmptyCell} below. */
    private static <E, KEYIN, KEYOUT> CellSink<E> makeMapperSink(
            final org.apache.hadoop.mapreduce.Mapper<KEYIN, KEYOUT, ImmutableBytesWritable, ImmutableBytesWritable>.Context context) {
        return new CellSink<E>() {
            final private static long serialVersionUID = 1L;

            @Override
            public void close() {
                // Nothing to close.
            }

            @Override
            public void write(Cell<E> cell) throws IOException, InterruptedException {
                // Format: key = rowKeyLen | rowKey | colKey.
                //         value = colKeyLen | colKey | contents.
                byte[] key = Bytes.concat(Ints.toByteArray(cell.getRowKey().size()),
                        cell.getRowKey().toByteArray(), cell.getColumnKey().toByteArray());
                byte[] value = Bytes.concat(Ints.toByteArray(cell.getColumnKey().size()),
                        cell.getColumnKey().toByteArray(), cell.getCellContents().toByteArray());
                context.write(new ImmutableBytesWritable(key), new ImmutableBytesWritable(value));
            }
        };
    }

    private static class KeyGroupingPartitioner extends Partitioner<ImmutableBytesWritable, ImmutableBytesWritable> {
        final private BinaryPartitioner<ImmutableBytesWritable> defaultPartitioner = new BinaryPartitioner<>();

        @Override
        public int getPartition(ImmutableBytesWritable key, ImmutableBytesWritable value, int parts) {
            return defaultPartitioner
                    .getPartition(new BytesWritable(readEmptyCell(key).getRowKey().toByteArray()), value, parts);
        }
    }

    private static class KeyGroupingComparator extends WritableComparator {
        KeyGroupingComparator() {
            super(ImmutableBytesWritable.class, true);
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            Cell<Void> l = readEmptyCell((ImmutableBytesWritable) a);
            Cell<Void> r = readEmptyCell((ImmutableBytesWritable) b);
            return l.getRowKey().asReadOnlyByteBuffer().compareTo(r.getRowKey().asReadOnlyByteBuffer());
        }
    }

    private static class KeySortingComparator extends WritableComparator {
        KeySortingComparator() {
            super(ImmutableBytesWritable.class, true);
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            Cell<Void> l = readEmptyCell((ImmutableBytesWritable) a);
            Cell<Void> r = readEmptyCell((ImmutableBytesWritable) b);
            return l.compareTo(r);
        }
    }

    /** Read a cell as written in the mapper output. Leaves the cell contents empty. Used for sorting. */
    private static Cell<Void> readEmptyCell(ImmutableBytesWritable rawWritable) {
        ByteBuffer raw = ByteBuffer.wrap(rawWritable.get());

        // Read the length prefix.
        byte[] rawLen = new byte[Ints.BYTES];
        raw.get(rawLen);
        int len = Ints.fromByteArray(rawLen);

        ByteString rowKey = ByteString.copyFrom(raw, len);
        ByteString colKey = ByteString.copyFrom(raw);
        return Cell.make(rowKey, colKey, ByteString.EMPTY);
    }

    /** Don't call for empty rows. */
    private static <I, E> void runRow(Sink<E> sink, Iterable<I> row, Mapper<I, E> mapper)
            throws IOException, InterruptedException {
        Iterator<I> iter = row.iterator();
        I first = iter.next();
        Iterable<I> gluedRow = Iterables.concat(Arrays.asList(first), new AdapterOneShotIterable<>(iter));
        mapper.map(first, new AdapterOneShotIterable<>(gluedRow), sink);
    }

    @Override
    public Source<EFF> lastEfflux() {
        throw new RuntimeException("Not implemented"); // TODO: Implement.
    }
}