package ch.unibe.scg.cells.hadoop;
import static com.google.common.io.BaseEncoding.base64;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.NoSuchElementException;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.partition.BinaryPartitioner;
import ch.unibe.scg.cells.AdapterOneShotIterable;
import ch.unibe.scg.cells.Cell;
import ch.unibe.scg.cells.CellSink;
import ch.unibe.scg.cells.Cells;
import ch.unibe.scg.cells.Codec;
import ch.unibe.scg.cells.Mapper;
import ch.unibe.scg.cells.OneShotIterable;
import ch.unibe.scg.cells.Pipeline;
import ch.unibe.scg.cells.Sink;
import ch.unibe.scg.cells.Source;
import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.UnmodifiableIterator;
import com.google.common.primitives.Bytes;
import com.google.common.primitives.Ints;
import com.google.protobuf.ByteString;
/**
 * The Hadoop version of a {@link Pipeline}. A pipeline reads cells from an influx
 * (an HBase table or files in HDFS), runs them through a chain of {@link Mapper}s
 * separated by shuffles, and writes the result to the efflux table.
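 *
 * <p>A minimal usage sketch; {@code conf}, the tables, and the codec and mapper
 * classes ({@code FooCodec}, {@code FooMapper}, ...) are hypothetical placeholders:
 *
 * <pre>{@code
 * HadoopPipeline<Foo, Baz> pipe
 *     = HadoopPipeline.fromTableToTable(conf, influxTable, effluxTable);
 * pipe.influx(new FooCodec())                         // decode influx cells to Foo
 *     .map(new FooMapper())                           // Mapper<Foo, Bar>
 *     .shuffle(new BarCodec())                        // regroup output by row key
 *     .mapAndEfflux(new BarMapper(), new BazCodec()); // Mapper<Bar, Baz> into efflux
 * }</pre>
 */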
public class HadoopPipeline<IN, EFF> implements Pipeline<IN, EFF> {
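/** Column family under which all cells are written, both to temporary tables and to the efflux. */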
final private static byte[] fam = ByteString.copyFromUtf8("f").toByteArray();
final static private Logger logger = Logger.getLogger(HadoopPipeline.class.getName());
final private Configuration baseConfiguration;
final private MapConfigurer<IN> firstMapConfigurer;
final private Table<EFF> efflux;
final private TableAdmin admin;
private HadoopPipeline(Configuration baseConfiguration, MapConfigurer<IN> firstMapConfigurer, Table<EFF> efflux,
TableAdmin admin) {
this.baseConfiguration = baseConfiguration;
this.firstMapConfigurer = firstMapConfigurer;
this.efflux = efflux;
this.admin = admin;
}
/** @return a Pipeline that reads from and writes to HBase tables, running its map/reduce jobs in the cluster. */
public static <IN, EFF> HadoopPipeline<IN, EFF> fromTableToTable(Configuration configuration,
Table<IN> influx, Table<EFF> efflux) {
return new HadoopPipeline<>(configuration,
new TableInputConfigurer<>(influx),
efflux,
new TableAdmin(configuration, new HTableFactory(configuration)));
}
/** @return a Pipeline that reads from HDFS, but writes to an HBase table. */
public static <IN, EFF> HadoopPipeline<IN, EFF> fromHDFSToTable(Configuration configuration,
Class<? extends FileInputFormat<ImmutableBytesWritable,
ImmutableBytesWritable>> inputFormat,
Path inputPath,
Table<EFF> efflux) {
return new HadoopPipeline<>(configuration,
new HDFSInputConfigurer<IN>(inputFormat, inputPath),
efflux,
new TableAdmin(configuration, new HTableFactory(configuration)));
}
@Override
public MappablePipeline<IN, EFF> influx(Codec<IN> c) {
return new HadoopMappablePipeline<>(firstMapConfigurer, c);
}
/** Configures the map part of the job to read from a table or from HDFS, as the case may be. */
private interface MapConfigurer<MAP_IN> extends Closeable {
<MAP_OUT> void configure(Job job, Codec<MAP_IN> mapSrcCodec, Mapper<MAP_IN, MAP_OUT> map,
Codec<MAP_OUT> outCodec) throws IOException;
}
/** MapConfigurer for the case of table input. */
private static class TableInputConfigurer<MAP_IN> implements MapConfigurer<MAP_IN> {
final private Table<MAP_IN> src;
TableInputConfigurer(Table<MAP_IN> src) {
this.src = src;
}
@Override
public <MAP_OUT> void configure(Job job, Codec<MAP_IN> mapSrcCodec,
Mapper<MAP_IN, MAP_OUT> map, Codec<MAP_OUT> outCodec) throws IOException {
logger.fine("Input table: " + src.getTableName());
HadoopTableMapper<MAP_IN, MAP_OUT> hMapper
= new HadoopTableMapper<>(map, mapSrcCodec, outCodec, src.getFamilyName());
writeObjectToConf(job.getConfiguration(), hMapper);
Scan scan = HBaseCellSource.makeScan();
scan.addFamily(src.getFamilyName().toByteArray());
TableMapReduceUtil.initTableMapperJob(src.getTableName(), // input table
scan, // Scan instance to control CF and attribute selection
DecoratorHadoopTableMapper.class, // mapper class
ImmutableBytesWritable.class, // mapper output key
ImmutableBytesWritable.class, // mapper output value
job);
}
@Override
public void close() throws IOException {
src.close();
}
}
/** MapConfigurer for the case of HDFS input. */
private static class HDFSInputConfigurer<MAP_IN> implements MapConfigurer<MAP_IN> {
final private Class<? extends FileInputFormat<ImmutableBytesWritable,
ImmutableBytesWritable>> inputFormat;
final private Path inputPath;
HDFSInputConfigurer(
Class<? extends FileInputFormat<ImmutableBytesWritable,
ImmutableBytesWritable>> inputFormat,
Path inputPath) {
this.inputFormat = inputFormat;
this.inputPath = inputPath;
}
@Override
public <MAP_OUT> void configure(Job job, Codec<MAP_IN> mapSrcCodec, Mapper<MAP_IN, MAP_OUT> map,
Codec<MAP_OUT> outCodec) throws IOException {
HadoopMapper<MAP_IN, MAP_OUT> hMapper = new HadoopMapper<>(map, mapSrcCodec, outCodec);
writeObjectToConf(job.getConfiguration(), hMapper);
job.setInputFormatClass(inputFormat);
job.setMapperClass(DecoratorHadoopMapper.class);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapOutputValueClass(ImmutableBytesWritable.class);
FileInputFormat.addInputPath(job, inputPath);
}
@Override
public void close() throws IOException {
// Nothing to do.
}
}
/** Hadoop mapper for jobs that don't read from a table. For table-reading jobs, see {@link HadoopTableMapper}. */
private static class HadoopMapper<I, E> extends org.apache.hadoop.mapreduce.Mapper<
ImmutableBytesWritable, ImmutableBytesWritable, // KEYIN, VALUEIN
ImmutableBytesWritable, ImmutableBytesWritable> // KEYOUT, VALUEOUT
implements Serializable {
private static final long serialVersionUID = 1L;
final private Mapper<I, E> underlying;
final private Codec<I> inputCodec;
final private Codec<E> outputCodec;
HadoopMapper(Mapper<I, E> underlying, Codec<I> inputCodec, Codec<E> outputCodec) {
this.underlying = underlying;
this.inputCodec = inputCodec;
this.outputCodec = outputCodec;
}
@Override
protected void map(ImmutableBytesWritable key, ImmutableBytesWritable value,
Context context) throws IOException, InterruptedException {
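// HDFS input has no column structure: reuse the record key as both row key and
// column key, and hand the mapper each record as its own single-cell row.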
Cell<I> cell = Cell.<I> make(ByteString.copyFrom(key.get()),
ByteString.copyFrom(key.get()),
ByteString.copyFrom(value.get()));
I decoded = inputCodec.decode(cell);
try (Sink<E> sink = Cells.encodeSink(
HadoopPipeline.<E, ImmutableBytesWritable, ImmutableBytesWritable> makeMapperSink(context),
outputCodec)) {
// TODO: maybe use a specific oneshotiterable?
underlying.map(decoded, new AdapterOneShotIterable<>(Arrays.asList(decoded)), sink);
}
}
/** Overridden to escalate visibility. */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
// Nothing to do.
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
underlying.close();
}
}
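/** Loads the configured {@link HadoopMapper} from the job configuration and runs it. */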
private static class DecoratorHadoopMapper<I, E> extends org.apache.hadoop.mapreduce.Mapper<
ImmutableBytesWritable, ImmutableBytesWritable, // KEYIN, VALUEIN
ImmutableBytesWritable, ImmutableBytesWritable> { //KEYOUT, VALUEOUT
private HadoopMapper<I, E> decorated;
@Override
protected void map(ImmutableBytesWritable key, ImmutableBytesWritable value,
Context context) throws IOException, InterruptedException {
decorated.map(key, value, context);
}
@SuppressWarnings("unchecked") // Unavoidable, since class literals cannot be generically typed.
@Override
protected void setup(Context context) throws IOException, InterruptedException {
decorated = readObjectFromConf(context, HadoopMapper.class);
decorated.setup(context);
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
decorated.cleanup(context);
}
}
/**
 * An {@link ObjectInputStream} that is aware of Hadoop's contexts.
 * Supports both mapper and reducer contexts. Used to deserialize context-dependent
 * objects across different machines.
 */
static class HadoopContextObjectInputStream extends ObjectInputStream {
// TaskAttemptContext is the nearest ancestor in Hadoop's Context hierarchy that has no generic parameters.
final private TaskAttemptContext context;
HadoopContextObjectInputStream(InputStream decorated, TaskAttemptContext context) throws IOException {
super(decorated);
this.context = context;
}
Counter getCounter(String name) {
return context.getCounter("cells", name);
}
}
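/**
 * Passes every cell of a row through unchanged. Used as the reduce phase when
 * {@code mapAndEfflux} follows a bare map, so the map output lands in the efflux as-is.
 */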
private static class IdentityMapper<T> implements Mapper<T, T> {
private static final long serialVersionUID = 1L;
@Override
public void close() throws IOException { }
@Override
public void map(T first, OneShotIterable<T> row, Sink<T> sink)
throws IOException, InterruptedException {
for (T e : row) {
sink.write(e);
}
}
}
private class HadoopMappablePipeline<I> implements MappablePipeline<I, EFF> {
final private Codec<I> srcCodec;
final private MapConfigurer<I> mapConfigurer;
HadoopMappablePipeline(MapConfigurer<I> mapConfigurer, Codec<I> srcCodec) {
this.mapConfigurer = mapConfigurer;
this.srcCodec = srcCodec;
}
@Override
public <E> ShuffleablePipeline<E, EFF> map(Mapper<I, E> m) {
return new HadoopShuffleablePipelineAfterMap<>(mapConfigurer, srcCodec, m);
}
@Override
public void mapAndEfflux(Mapper<I, EFF> m, Codec<EFF> codec)
throws IOException, InterruptedException {
run(mapConfigurer, srcCodec, m, codec, new IdentityMapper<EFF>(), codec, efflux);
}
}
private class HadoopReducablePipeline<MAP_IN, MAP_OUT> implements MappablePipeline<MAP_OUT, EFF> {
final private MapConfigurer<MAP_IN> mapConfigurer;
final private Codec<MAP_IN> mapSrcCodec;
final private Mapper<MAP_IN, MAP_OUT> map;
final private Codec<MAP_OUT> reduceSrcCodec;
HadoopReducablePipeline(MapConfigurer<MAP_IN> mapConfigurer, Codec<MAP_IN> mapSrcCodec,
Mapper<MAP_IN, MAP_OUT> map, Codec<MAP_OUT> reduceSrcCodec) {
this.mapConfigurer = mapConfigurer;
this.mapSrcCodec = mapSrcCodec;
this.map = map;
this.reduceSrcCodec = reduceSrcCodec;
}
@Override
public <E> ShuffleablePipeline<E, EFF> map(Mapper<MAP_OUT, E> m) {
return new HadoopShuffleablePipelineAfterReduce<>(mapConfigurer, mapSrcCodec, map, reduceSrcCodec, m);
}
@Override
public void mapAndEfflux(Mapper<MAP_OUT, EFF> m, Codec<EFF> codec) throws IOException, InterruptedException {
run(mapConfigurer, mapSrcCodec, map, reduceSrcCodec, m, codec, efflux);
mapConfigurer.close();
}
}
private class HadoopShuffleablePipelineAfterMap<I, E> implements ShuffleablePipeline<E, EFF> {
final private MapConfigurer<I> mapConfigurer;
final private Codec<I> srcCodec;
final private Mapper<I, E> mapper;
HadoopShuffleablePipelineAfterMap(MapConfigurer<I> mapConfigurer, Codec<I> srcCodec, Mapper<I, E> mapper) {
this.mapConfigurer = mapConfigurer;
this.srcCodec = srcCodec;
this.mapper = mapper;
}
@Override
public MappablePipeline<E, EFF> shuffle(Codec<E> codec) throws IOException {
return new HadoopReducablePipeline<>(mapConfigurer, srcCodec, mapper, codec);
}
}
private class HadoopShuffleablePipelineAfterReduce<MAP_IN, MAP_OUT, E> implements ShuffleablePipeline<E, EFF> {
final private MapConfigurer<MAP_IN> mapConfigurer;
final private Codec<MAP_IN> mapSrcCodec;
final private Mapper<MAP_IN, MAP_OUT> map;
final private Codec<MAP_OUT> reduceSrcCodec;
final private Mapper<MAP_OUT, E> reduce;
HadoopShuffleablePipelineAfterReduce(MapConfigurer<MAP_IN> mapConfigurer, Codec<MAP_IN> mapSrcCodec,
Mapper<MAP_IN, MAP_OUT> map, Codec<MAP_OUT> reduceSrcCodec,
Mapper<MAP_OUT, E> reduce) {
this.mapConfigurer = mapConfigurer;
this.mapSrcCodec = mapSrcCodec;
this.map = map;
this.reduceSrcCodec = reduceSrcCodec;
this.reduce = reduce;
}
@SuppressWarnings("resource") // target gets closed in TableInputConfigurer.close, at the end of the MR.
// TODO: Is there a more robust way to enforce that table gets closed?
@Override
public MappablePipeline<E, EFF> shuffle(Codec<E> codec) throws IOException, InterruptedException {
Table<E> target = admin.createTemporaryTable(ByteString.copyFrom(fam));
run(mapConfigurer, mapSrcCodec, map, reduceSrcCodec, reduce, codec, target);
// This will delete temporary tables if needed.
mapConfigurer.close();
return new HadoopMappablePipeline<>(new TableInputConfigurer<>(target), codec);
}
}
private <E, MAP_IN, MAP_OUT> void run(MapConfigurer<MAP_IN> mapConfigurer, Codec<MAP_IN> mapSrcCodec,
Mapper<MAP_IN, MAP_OUT> map, Codec<MAP_OUT> reduceSrcCodec,
Mapper<MAP_OUT, E> reduce, Codec<E> codec, Table<E> target)
throws IOException, InterruptedException {
// TODO: The map configuration is split into a separate object, but the reduce part isn't.
// That's a strange symmetry break that should be fixed.
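// Each round is one Hadoop job: `map` runs as the Hadoop map phase, and `reduce`
// (itself a Mapper over whole rows) runs as the Hadoop reduce phase, writing into `target`.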
Job job = Job.getInstance(baseConfiguration);
mapConfigurer.configure(job, mapSrcCodec, map, reduceSrcCodec);
HadoopReducer<MAP_OUT, E> hReducer = new HadoopReducer<>(reduce, reduceSrcCodec, codec);
writeObjectToConf(job.getConfiguration(), hReducer);
TableMapReduceUtil.initTableReducerJob(
target.getTableName(), // output table
DecoratorHadoopReducer.class, // reducer class
job);
logger.fine("Output table: " + target.getTableName());
job.setGroupingComparatorClass(KeyGroupingComparator.class);
job.setSortComparatorClass(KeySortingComparator.class);
job.setPartitionerClass(KeyGroupingPartitioner.class);
job.setJarByClass(getClass());
// Submit the job, then poll for progress until the job is complete
try {
job.waitForCompletion(true);
} catch (ClassNotFoundException e) {
throw new RuntimeException("Loading of your job failed. ", e);
}
}
/** Loads the configured HadoopReducer and runs it. */
private static class DecoratorHadoopReducer<I, E> extends TableReducer<ImmutableBytesWritable, ImmutableBytesWritable, ImmutableBytesWritable> {
private HadoopReducer<I, E> decorated;
@Override
protected void reduce(ImmutableBytesWritable key, Iterable<ImmutableBytesWritable> values, Context context)
throws IOException, InterruptedException {
values = new AdapterOneShotIterable<>(values); // values misbehaves when iterated twice, so wrap it to fail fast.
decorated.reduce(key, values, context);
}
@SuppressWarnings("unchecked") // Unavoidable, since class literals cannot be generically typed.
@Override
protected void setup(Context context) throws IOException, InterruptedException {
decorated = readObjectFromConf(context, HadoopReducer.class);
decorated.setup(context);
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
decorated.cleanup(context);
}
}
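/**
 * Runs the reduce phase of a round: decodes each shuffled row back into cells, feeds
 * the row to the wrapped {@link Mapper}, and writes its encoded output into the output table.
 */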
private static class HadoopReducer<I, E> extends TableReducer<ImmutableBytesWritable, ImmutableBytesWritable, ImmutableBytesWritable>
implements Serializable {
static final private long serialVersionUID = 1L;
final private Mapper<I, E> mapper;
final private Codec<I> inCodec;
final private Codec<E> outCodec;
HadoopReducer(Mapper<I, E> mapper, Codec<I> inCodec, Codec<E> outCodec) {
this.mapper = mapper;
this.inCodec = inCodec;
this.outCodec = outCodec;
}
/**
 * Format of the input: key = len(rowKey) | rowKey | first colKey;
 * value = len(colKey) | colKey | contents. Has to match {@link HadoopPipeline#makeMapperSink}.
 * <p>Careful! {@code values} is iterable only once!
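 *
 * <p>A worked example of the layout, with lengths as 4-byte big-endian ints (as produced
 * by {@code Ints.toByteArray}): a cell with row key "r", column key "c" and contents "x"
 * arrives as key = {@code 00 00 00 01 'r' 'c'} and value = {@code 00 00 00 01 'c' 'x'}.
 */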
@Override
protected void reduce(ImmutableBytesWritable key, Iterable<ImmutableBytesWritable> values, Context context)
throws IOException, InterruptedException {
// Extract rowKey from key.
ByteBuffer rawKey = ByteBuffer.wrap(key.get());
byte[] lenBytes = new byte[Ints.BYTES];
rawKey.get(lenBytes);
int keyLen = Ints.fromByteArray(lenBytes);
final ByteString rowKey = ByteString.copyFrom(rawKey, keyLen);
final Iterable<Cell<I>> transformedRow = Iterables.transform(values, new Function<ImmutableBytesWritable, Cell<I>>() {
@Override public Cell<I> apply(ImmutableBytesWritable value) {
ByteBuffer rawContent = ByteBuffer.wrap(value.get());
byte[] colKeyLenBytes = new byte[Ints.BYTES];
rawContent.get(colKeyLenBytes);
int colKeyLen = Ints.fromByteArray(colKeyLenBytes);
ByteString colKey = ByteString.copyFrom(rawContent, colKeyLen);
ByteString cellContent = ByteString.copyFrom(rawContent);
return Cell.<I> make(rowKey, colKey, cellContent);
}
});
final Iterable<Cell<I>> row = new Iterable<Cell<I>>() {
@Override public Iterator<Cell<I>> iterator() {
return new FilteringIterator<>(transformedRow.iterator());
}
};
runRow(Cells.encodeSink(makeSink(context), outCodec), Cells.decode(row, inCodec), mapper);
}
/** Writes each output cell as a {@link Put} into the output table. */
private CellSink<E> makeSink(final Context context) {
return new CellSink<E>() {
final private static long serialVersionUID = 1L;
@Override
public void close() throws IOException {
// Nothing to close.
}
@Override
public void write(Cell<E> cell) throws IOException, InterruptedException {
byte[] rowKey = cell.getRowKey().toByteArray();
Put put = new Put(rowKey);
put.add(fam, cell.getColumnKey().toByteArray(), cell.getCellContents().toByteArray());
context.write(new ImmutableBytesWritable(rowKey), put);
}
};
}
/** Overridden to escalate visibility. */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
// Nothing to do.
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
super.cleanup(context);
mapper.close();
}
}
/**
 * Takes an iterator and suppresses consecutive duplicate cells: if two or more
 * consecutive cells are equal, only the first is served. For example, the
 * sequence a, a, b, a is filtered to a, b, a.
 *
 * <p>Assumes the underlying iterator never serves nulls.
 */
private static class FilteringIterator<T> extends UnmodifiableIterator<Cell<T>> {
final private Iterator<Cell<T>> underlying;
/** The next element to be returned. If there is no next element that could be returned, null. */
private Cell<T> next;
FilteringIterator(Iterator<Cell<T>> underlying) {
// set the underlying data provider, get the first value if present
this.underlying = underlying;
if (underlying.hasNext()) {
next = underlying.next();
}
}
@Override
public boolean hasNext() {
return next != null;
}
@Override
public Cell<T> next() {
if (!hasNext()) {
// As per Iterator interface.
throw new NoSuchElementException();
}
Cell<T> ret = next;
// Move next to next valid return value, or null if there is none.
while (underlying.hasNext()) {
Cell<T> cand = underlying.next();
if (!cand.equals(next)) {
next = cand;
return ret;
}
}
// We didn't return early, so the underlying iterator is exhausted; there's nothing to return next time around.
next = null;
return ret;
}
}
/** Loads the configured HadoopTableMapper and runs it. */
private static class DecoratorHadoopTableMapper<I, E> extends TableMapper<ImmutableBytesWritable, ImmutableBytesWritable> {
private HadoopTableMapper<I, E> decorated;
@Override
protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
decorated.map(key, value, context);
}
@SuppressWarnings("unchecked") // Unavoidable, since class literals cannot be generically typed.
@Override
protected void setup(Context context) throws IOException, InterruptedException {
decorated = readObjectFromConf(context, HadoopTableMapper.class);
decorated.setup(context);
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
decorated.cleanup(context);
}
}
/** Reads an object back from the configuration, stored under the key {@code clazz.getName()} as the base64 encoding of its Java serialization. */
@SuppressWarnings("unchecked")
private static <T> T readObjectFromConf(TaskAttemptContext context, Class<T> clazz) throws IOException {
try (ObjectInputStream in = new HadoopContextObjectInputStream(new ByteArrayInputStream(
base64().decode(context.getConfiguration().getRaw(clazz.getName()))), context)) {
return (T) in.readObject();
} catch (ClassNotFoundException e) {
throw new RuntimeException("Couldn't load " + clazz
+ ". You probably didn't ship the JAR properly to the server. See the README", e);
}
}
/** Writes {@code obj} into the configuration under the key {@code obj.getClass().getName()}, as the base64 encoding of its Java serialization. The counterpart of {@link #readObjectFromConf}. */
private static <T> void writeObjectToConf(Configuration conf, T obj) throws IOException {
// Serialize into a byte array. The ObjectOutputStream buffers internally,
// so it must be closed before the bytes are read out.
ByteArrayOutputStream bos = new ByteArrayOutputStream();
try (ObjectOutputStream oos = new ObjectOutputStream(bos)) {
oos.writeObject(obj);
}
conf.set(obj.getClass().getName(), base64().encode(bos.toByteArray()));
}
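/**
 * Hadoop mapper for jobs that read from an HBase table. Assembles each row's cells
 * from the scanned {@link Result} and feeds them to the wrapped {@link Mapper}.
 * For HDFS-reading jobs, see {@link HadoopMapper}.
 */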
private static class HadoopTableMapper<I, E> extends TableMapper<ImmutableBytesWritable, ImmutableBytesWritable>
implements Serializable {
private static final long serialVersionUID = 1L;
private final Mapper<I, E> mapper;
private final Codec<I> inCodec;
private final Codec<E> outCodec;
/** Do not modify. */
private final byte[] family;
// TODO: hang on to sink.
HadoopTableMapper(Mapper<I, E> mapper, Codec<I> inCodec, Codec<E> outCodec, ByteString family) {
this.mapper = mapper;
this.inCodec = inCodec;
this.outCodec = outCodec;
this.family = family.toByteArray();
}
@Override
protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
try (CellSink<E> cellSink = makeMapperSink(context)) {
List<Cell<I>> cellRow = new ArrayList<>();
for (Entry<byte[], byte[]> kv : value.getFamilyMap(family).entrySet()) {
cellRow.add(Cell.<I> make(
ByteString.copyFrom(key.get()),
ByteString.copyFrom(kv.getKey()),
ByteString.copyFrom(kv.getValue())));
}
runRow(Cells.encodeSink(cellSink, outCodec), Cells.decode(cellRow, inCodec), mapper);
}
}
/** Overridden to escalate visibility. */
@Override
protected void setup(Context context) {
// Nothing to do.
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
mapper.close();
}
}
/** Format has to match {@link #readEmptyCell} below and the parsing in {@link HadoopReducer#reduce}. */
private static <E, KEYIN, KEYOUT> CellSink<E> makeMapperSink(
final org.apache.hadoop.mapreduce.Mapper<KEYIN, KEYOUT, ImmutableBytesWritable, ImmutableBytesWritable>.Context context) {
return new CellSink<E>() {
final private static long serialVersionUID = 1L;
@Override
public void close() {
// Nothing to close.
}
@Override
public void write(Cell<E> cell) throws IOException, InterruptedException {
// Format: key = rowKeyLen | rowKey | colKey.
// value = colKeyLen | colKey | contents
byte[] key = Bytes.concat(Ints.toByteArray(cell.getRowKey().size()),
cell.getRowKey().toByteArray(), cell.getColumnKey().toByteArray());
byte[] value = Bytes.concat(Ints.toByteArray(cell.getColumnKey().size()),
cell.getColumnKey().toByteArray(), cell.getCellContents().toByteArray());
context.write(new ImmutableBytesWritable(key), new ImmutableBytesWritable(value));
}
};
}
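/** Partitions mapper output by row key only, so that all cells of a row go to the same reducer. */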
private static class KeyGroupingPartitioner extends Partitioner<ImmutableBytesWritable, ImmutableBytesWritable> {
final private BinaryPartitioner<ImmutableBytesWritable> defaultPartitioner = new BinaryPartitioner<>();
@Override
public int getPartition(ImmutableBytesWritable key, ImmutableBytesWritable value, int parts) {
return defaultPartitioner
.getPartition(new BytesWritable(readEmptyCell(key).getRowKey().toByteArray()), value, parts);
}
}
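/** Groups reducer input by row key only, so that one reduce call sees an entire row. */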
private static class KeyGroupingComparator extends WritableComparator {
KeyGroupingComparator() {
super(ImmutableBytesWritable.class, true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
Cell<Void> l = readEmptyCell((ImmutableBytesWritable) a);
Cell<Void> r = readEmptyCell((ImmutableBytesWritable) b);
return l.getRowKey().asReadOnlyByteBuffer().compareTo(r.getRowKey().asReadOnlyByteBuffer());
}
}
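/** Sorts mapper output by the entire cell key, so that a row's cells arrive at the reducer in a deterministic order. */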
private static class KeySortingComparator extends WritableComparator {
KeySortingComparator() {
super(ImmutableBytesWritable.class, true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
Cell<Void> l = readEmptyCell((ImmutableBytesWritable) a);
Cell<Void> r = readEmptyCell((ImmutableBytesWritable) b);
return l.compareTo(r);
}
}
/** Reads a cell as written by the mapper output format, leaving the cell contents empty. Used for partitioning, grouping, and sorting. */
private static Cell<Void> readEmptyCell(ImmutableBytesWritable rawWritable) {
ByteBuffer raw = ByteBuffer.wrap(rawWritable.get());
// Read len
byte[] rawLen = new byte[Ints.BYTES];
raw.get(rawLen);
int len = Ints.fromByteArray(rawLen);
ByteString rowKey = ByteString.copyFrom(raw, len);
ByteString colKey = ByteString.copyFrom(raw);
return Cell.make(rowKey, colKey, ByteString.EMPTY);
}
/** Don't call for empty rows. */
private static <I, E> void runRow(Sink<E> sink, Iterable<I> row, Mapper<I, E> mapper)
throws IOException, InterruptedException {
Iterator<I> iter = row.iterator();
I first = iter.next();
// Glue the already-consumed first element back onto the rest of the row,
// so that the mapper sees the complete row alongside its first element.
Iterable<I> gluedRow = Iterables.concat(Arrays.asList(first), new AdapterOneShotIterable<>(iter));
mapper.map(first, new AdapterOneShotIterable<>(gluedRow), sink);
}
@Override
public Source<EFF> lastEfflux() {
throw new RuntimeException("Not implemented"); // TODO: Implement.
}
}