/*
* Copyright [2013-2014] PayPal Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ml.shifu.guagua.mapreduce;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Properties;
import ml.shifu.guagua.GuaguaConstants;
import ml.shifu.guagua.GuaguaRuntimeException;
import ml.shifu.guagua.GuaguaService;
import ml.shifu.guagua.hadoop.io.GuaguaInputSplit;
import ml.shifu.guagua.io.Bytable;
import ml.shifu.guagua.io.GuaguaFileSplit;
import ml.shifu.guagua.master.GuaguaMasterService;
import ml.shifu.guagua.util.Progressable;
import ml.shifu.guagua.worker.GuaguaWorkerService;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* {@link GuaguaMapper} is the Hadoop Mapper implementation for both guagua master and guagua workers.
*
* <p>
* Use <code>(GuaguaInputSplit) context.getInputSplit()</code> to check whether this task is guagua master or guagua
* worker.
*
* <p>
* {@link #guaguaService} is the interface for both guagua Master and Worker implementation. According to
* {@link #isMaster}, master service and worker service will be determined.
*
* <p>
 * Only a mapper, no reducer, is used for the guagua MapReduce implementation. In this mapper
 * {@link #run(org.apache.hadoop.mapreduce.Mapper.Context)} is overridden while
 * {@link #map(Object, Object, org.apache.hadoop.mapreduce.Mapper.Context)} is not, since we don't need to iterate
 * over the mapper's raw input.
*/
public class GuaguaMapper<MASTER_RESULT extends Bytable, WORKER_RESULT extends Bytable> extends
        Mapper<LongWritable, Text, Text, Text> {

    private static final Logger LOG = LoggerFactory.getLogger(GuaguaMapper.class);

    /**
     * Whether this mapper task plays the master role (true) or a worker role (false); decided in
     * {@link #setup(Context)} from the {@link GuaguaInputSplit}.
     */
    private boolean isMaster;

    /**
     * Service instance that runs the real guagua master or worker logic, chosen according to {@link #isMaster}.
     */
    private GuaguaService guaguaService;

    /**
     * Decides the master/worker role from the {@link GuaguaInputSplit}, creates the matching service
     * implementation, and for workers converts the Hadoop {@link FileSplit}s into {@link GuaguaFileSplit}s.
     * Finally initializes and starts the chosen service with the job configuration converted to
     * {@link Properties}.
     *
     * @param context
     *            the Hadoop mapper context; its input split must be a {@link GuaguaInputSplit}
     */
    @Override
    protected void setup(Context context) throws java.io.IOException, InterruptedException {
        // The split itself encodes whether this task is the master or one of the workers.
        GuaguaInputSplit inputSplit = (GuaguaInputSplit) context.getInputSplit();
        this.setMaster(inputSplit.isMaster());
        if(this.isMaster()) {
            context.setStatus("Master initializing ...");
            this.setGuaguaService(new GuaguaMasterService<MASTER_RESULT, WORKER_RESULT>());
        } else {
            context.setStatus("Worker initializing ...");
            this.setGuaguaService(new GuaguaWorkerService<MASTER_RESULT, WORKER_RESULT>());
            // Workers consume input data: wrap each Hadoop FileSplit into a GuaguaFileSplit
            // (path, start offset, length) so that guagua-core needs no Hadoop dependency.
            List<GuaguaFileSplit> splits = new LinkedList<GuaguaFileSplit>();
            for(int i = 0; i < inputSplit.getFileSplits().length; i++) {
                FileSplit fs = inputSplit.getFileSplits()[i];
                GuaguaFileSplit gfs = new GuaguaFileSplit(fs.getPath().toString(), fs.getStart(), fs.getLength());
                // The extensions array may be null or shorter than the splits array; guard both cases.
                if(inputSplit.getExtensions() != null && i < inputSplit.getExtensions().length) {
                    gfs.setExtension(inputSplit.getExtensions()[i]);
                }
                splits.add(gfs);
            }
            this.getGuaguaService().setSplits(splits);
        }
        Properties props = replaceConfToProps(context.getConfiguration());
        this.getGuaguaService().setAppId(context.getConfiguration().get(GuaguaMapReduceConstants.MAPRED_JOB_ID));
        this.getGuaguaService().setContainerId(
                context.getConfiguration().get(GuaguaMapReduceConstants.MAPRED_TASK_PARTITION));
        this.getGuaguaService().init(props);
        this.getGuaguaService().start();
    }

    /**
     * Copies every entry of the Hadoop {@link Configuration} into a plain {@link Properties} instance, because
     * guagua-core has no dependency on Hadoop and therefore cannot consume {@link Configuration} directly.
     *
     * @param configuration
     *            the Hadoop job configuration to copy
     * @return a {@link Properties} instance holding all configuration key/value pairs
     */
    private Properties replaceConfToProps(Configuration configuration) {
        Properties properties = new Properties();
        for(Entry<String, String> entry: configuration) {
            properties.put(entry.getKey(), entry.getValue());
            if(LOG.isDebugEnabled()) {
                // Only guagua's own settings are interesting enough to dump at debug level.
                if(entry.getKey().startsWith(GuaguaConstants.GUAGUA)) {
                    LOG.debug("{}:{}", entry.getKey(), entry.getValue());
                }
            }
        }
        return properties;
    }

    /**
     * Runs the guagua service chosen according to the {@link #isMaster} setting. Iteration and coordination
     * logic are included in the service's own run loop.
     *
     * <p>
     * {@link #cleanup(org.apache.hadoop.mapreduce.Mapper.Context)} is called in a finally block to make sure
     * resources can be cleaned.
     *
     * <p>
     * Guagua tries its best to update progress for each iteration, and the task status is also refreshed each
     * iteration in the Hadoop job web ui.
     */
    @Override
    public void run(final Context context) throws IOException, InterruptedException {
        // NOTE(review): failTask(...) unconditionally throws, so the two 'e = ...' assignments below are
        // never reached; 'e' is still null whenever the counter updates at the bottom execute. The counters
        // are therefore only incremented after a fully successful run (setup, service run and cleanup).
        Exception e = null;
        try {
            this.setup(context);
            final int iterations = context.getConfiguration().getInt(GuaguaConstants.GUAGUA_ITERATION_COUNT, -1);
            // The service calls back into this Progressable once per iteration so the Hadoop task can
            // report liveness, progress and status.
            this.getGuaguaService().run(new Progressable() {
                @Override
                public void progress(int iteration, int totalIteration, String status, boolean isLastUpdate,
                        boolean isKill) {
                    if(isKill) {
                        // The service requested this task be killed; failTask always throws, so the
                        // 'return' below is effectively unreachable.
                        failTask(null, context.getConfiguration());
                        return;
                    }
                    context.progress();
                    // Set currentIteration on GuaguaMRRecordReader to make sure it can update progress.
                    GuaguaMRRecordReader.setCurrentIteration(iteration);
                    // Advance the record reader once per iteration so Hadoop recomputes task progress.
                    try {
                        context.nextKeyValue();
                    } catch (IOException e) {
                        throw new GuaguaRuntimeException(e);
                    } catch (InterruptedException e) {
                        // Preserve the interrupt flag instead of swallowing the interruption.
                        Thread.currentThread().interrupt();
                    }
                    if(isLastUpdate) {
                        // NOTE(review): 'iterations' defaults to -1 when GUAGUA_ITERATION_COUNT is unset,
                        // which would log a negative percentage here — confirm the setting is always present.
                        LOG.info("Application progress: {}%.", (iteration * 100 / iterations));
                    }
                    // Status will be displayed in the Hadoop job ui.
                    if(status != null && status.length() != 0) {
                        context.setStatus(status);
                    }
                }
            });
        } catch (Throwable t) {
            LOG.error("Error in guagua main run method.", t);
            // failTask throws GuaguaFailTaskRuntimeException, so the assignment below is unreachable.
            failTask(t, context.getConfiguration());
            e = new GuaguaRuntimeException(t);
        } finally {
            try {
                this.cleanup(context);
            } catch (Throwable t) {
                // NOTE(review): throwing from the finally block via failTask masks any exception raised in
                // the try block above — a cleanup failure replaces the original cause.
                failTask(t, context.getConfiguration());
                e = new GuaguaRuntimeException(t);
            }
        }
        if(e == null && !this.isMaster) {
            // update worker done counters
            context.getCounter(GuaguaMapReduceConstants.GUAGUA_STATUS, GuaguaMapReduceConstants.DONE_WORKERS)
                    .increment(1L);
        }
        if(e == null && this.isMaster) {
            // update master done counters
            context.getCounter(GuaguaMapReduceConstants.GUAGUA_STATUS, GuaguaMapReduceConstants.MASTER_SUCCESS)
                    .increment(1);
        }
    }

    /**
     * In our cluster with hadoop-0.20.2-cdh3u4a, a runtime exception is thrown to Child but the mapper status
     * doesn't change to failed. We fail this task explicitly to make sure our fail-over can make the job
     * successful.
     *
     * <p>
     * This method never returns normally: it always throws {@link GuaguaFailTaskRuntimeException}.
     *
     * @param t
     *            the underlying cause; may be null (e.g. on a kill request)
     * @param conf
     *            the job configuration, used only to log the failing task id
     */
    private void failTask(Throwable t, Configuration conf) {
        LOG.error("failtask: Killing task: {} ", conf.get(GuaguaMapReduceConstants.MAPRED_TASK_ID));
        throw new GuaguaFailTaskRuntimeException("Fail task because of not heathy inside.", t);
    }

    /**
     * Stops the underlying guagua service to release its resources; invoked from the finally block of
     * {@link #run(org.apache.hadoop.mapreduce.Mapper.Context)}.
     */
    @Override
    protected void cleanup(org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, Text>.Context context)
            throws IOException, InterruptedException {
        this.getGuaguaService().stop();
    }

    public boolean isMaster() {
        return isMaster;
    }

    public void setMaster(boolean isMaster) {
        this.isMaster = isMaster;
    }

    public GuaguaService getGuaguaService() {
        return guaguaService;
    }

    public void setGuaguaService(GuaguaService guaguaService) {
        this.guaguaService = guaguaService;
    }
}