package com.skp.experiment.common.mapreduce;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.StatusReporter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper;
import org.apache.hadoop.util.ReflectionUtils;
/**
 * A re-implementation of Hadoop's {@code MultithreadedMapper}; most of the code
 * is adapted directly from Hadoop. Multiple threads run the {@code map} method
 * simultaneously over shared static state that is loaded once in {@code setup}.
 * Mapper implementations used with this class must be thread-safe.
 * Use this class with care: it is only appropriate when all of the following
 * conditions hold.
 * <ol>
 * <li>{@code setup} loads a large amount of data into memory.</li>
 * <li>The threads do not create any conflicts on that shared memory.</li>
 * <li>Each {@code map} invocation is CPU-intensive.</li>
 * </ol>
 * @author doyoung yoon
 *
 * @param <K1> input key type
 * @param <V1> input value type
 * @param <K2> output key type
 * @param <V2> output value type
 */
public class MultithreadedMapMapper<K1, V1, K2, V2> extends
    MultithreadedMapper<K1, V1, K2, V2> {

  // Fixed: the original passed MultithreadedMapper.class here, which tagged
  // every log line from this class with the wrong class name.
  private static final Log LOG = LogFactory.getLog(MultithreadedMapMapper.class);

  /** Mapper class configured for the job; resolved once in {@link #run(Context)}. */
  protected Class<? extends Mapper<K1, V1, K2, V2>> mapClass;
  /** The real task context. All shared reads and writes synchronize on it. */
  protected Context outer;
  /** Worker threads; created, started, and joined in {@link #run(Context)}. */
  protected List<MapRunner> runners;

  /**
   * Thread-safe view over the outer reader: each {@link #nextKeyValue()}
   * advances the shared outer context under a lock and deep-copies the current
   * key/value pair so the calling thread owns its own copy.
   */
  protected class SubMapRecordReader extends RecordReader<K1, V1> {
    private K1 key;
    private V1 value;
    private Configuration conf;

    @Override
    public void close() throws IOException {
      // Nothing to release; the outer context owns the real reader.
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
      // Progress is reported through the outer context, not per sub-reader.
      return 0;
    }

    @Override
    public void initialize(InputSplit split,
                           TaskAttemptContext context
                           ) throws IOException, InterruptedException {
      conf = context.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
      synchronized (outer) {
        if (!outer.nextKeyValue()) {
          return false;
        }
        // Deep-copy so other threads may advance the shared reader while this
        // thread is still processing its pair.
        key = ReflectionUtils.copy(outer.getConfiguration(),
                                   outer.getCurrentKey(), key);
        value = ReflectionUtils.copy(conf, outer.getCurrentValue(), value);
        return true;
      }
    }

    @Override // added: was missing in the original
    public K1 getCurrentKey() {
      return key;
    }

    @Override
    public V1 getCurrentValue() {
      return value;
    }
  }

  /** Serializes writes from all worker threads onto the outer context. */
  protected class SubMapRecordWriter extends RecordWriter<K2, V2> {
    @Override
    public void close(TaskAttemptContext context) throws IOException,
        InterruptedException {
      // The outer context owns the real writer; nothing to close here.
    }

    @Override
    public void write(K2 key, V2 value) throws IOException,
        InterruptedException {
      synchronized (outer) {
        outer.write(key, value);
      }
    }
  }

  /** Forwards all counter/status calls straight to the outer context. */
  protected class SubMapStatusReporter extends StatusReporter {
    @Override
    public Counter getCounter(Enum<?> name) {
      return outer.getCounter(name);
    }

    @Override
    public Counter getCounter(String group, String name) {
      return outer.getCounter(group, name);
    }

    @Override
    public void progress() {
      outer.progress();
    }

    @Override
    public void setStatus(String status) {
      outer.setStatus(status);
    }
  }

  /**
   * One worker thread: repeatedly pulls a key/value pair from its thread-safe
   * sub-context and invokes {@code map} on it. Any failure is captured and
   * exposed via {@link #getThrowable()} for {@link #run(Context)} to rethrow.
   */
  protected class MapRunner extends Thread {
    private Context subcontext;
    private Throwable throwable;

    MapRunner(Context context) throws IOException, InterruptedException {
      SubMapRecordReader reader = new SubMapRecordReader();
      subcontext = new Context(outer.getConfiguration(),
                               outer.getTaskAttemptID(),
                               reader,
                               new SubMapRecordWriter(),
                               context.getOutputCommitter(),
                               new SubMapStatusReporter(),
                               outer.getInputSplit());
      // Fixed: nothing else ever calls initialize() on the sub-reader, so its
      // conf field stayed null and ReflectionUtils.copy(conf, ...) would fail
      // on the first value copy. Hadoop's own MultithreadedMapper initializes
      // the sub-reader in the MapRunner constructor the same way.
      reader.initialize(outer.getInputSplit(), outer);
    }

    public Throwable getThrowable() {
      return throwable;
    }

    @Override
    public void run() {
      try {
        /* This differs from Hadoop's MultithreadedMapper implementation:
         * each thread only runs the map loop; setup/cleanup are executed
         * exactly once by run(Context). */
        while (subcontext.nextKeyValue()) {
          map(subcontext.getCurrentKey(), subcontext.getCurrentValue(),
              subcontext);
        }
      } catch (Throwable t) {
        // Captured (not swallowed) for the driving thread to rethrow after join().
        throwable = t;
      }
    }
  }

  /**
   * Run the application's maps using a thread pool. {@code setup} and
   * {@code cleanup} are invoked exactly once on the calling thread; only
   * {@code map} runs concurrently.
   *
   * @param context the task context supplied by the framework
   * @throws IOException if any worker thread failed with an IOException
   * @throws InterruptedException if joining a worker is interrupted or a
   *         worker failed with an InterruptedException
   */
  @Override
  public void run(Context context) throws IOException, InterruptedException {
    /* This differs from Hadoop's MultithreadedMapper: threads are spawned
     * only to run the map() method, not setup/cleanup. */
    setup(context);
    outer = context;
    int numberOfThreads = getNumberOfThreads(context);
    mapClass = getMapperClass(context);
    if (LOG.isDebugEnabled()) {
      // Fixed: the original called LOG.info(...) inside a debug-level guard.
      LOG.debug("Configuring multithread runner to use " + numberOfThreads
          + " threads");
    }
    runners = new ArrayList<MapRunner>(numberOfThreads);
    for (int i = 0; i < numberOfThreads; ++i) {
      MapRunner thread = new MapRunner(context);
      thread.start();
      runners.add(thread);
    }
    // Join each worker in order; rethrow the first captured failure with its
    // original type where the caller can catch it.
    for (MapRunner thread : runners) {
      thread.join();
      Throwable th = thread.getThrowable();
      if (th != null) {
        if (th instanceof IOException) {
          throw (IOException) th;
        } else if (th instanceof InterruptedException) {
          throw (InterruptedException) th;
        } else {
          throw new RuntimeException(th);
        }
      }
    }
    /* run cleanup only one time per task */
    cleanup(context);
  }
}