// GuaguaMapper.java example
//
// Explorer
// guagua-master
/*
 * Copyright [2013-2014] PayPal Software Foundation
 *  
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *  
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ml.shifu.guagua.mapreduce;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Properties;

import ml.shifu.guagua.GuaguaConstants;
import ml.shifu.guagua.GuaguaRuntimeException;
import ml.shifu.guagua.GuaguaService;
import ml.shifu.guagua.hadoop.io.GuaguaInputSplit;
import ml.shifu.guagua.io.Bytable;
import ml.shifu.guagua.io.GuaguaFileSplit;
import ml.shifu.guagua.master.GuaguaMasterService;
import ml.shifu.guagua.util.Progressable;
import ml.shifu.guagua.worker.GuaguaWorkerService;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link GuaguaMapper} is the Hadoop Mapper implementation for both guagua master and guagua workers.
 * 
 * <p>
 * <code>(GuaguaInputSplit) context.getInputSplit()</code> is used to check whether this task is the guagua master or a
 * guagua worker.
 * 
 * <p>
 * {@link #guaguaService} is the common interface for both the guagua master and worker implementations. Which one is
 * created is decided by {@link #isMaster}.
 * 
 * <p>
 * The guagua MapReduce implementation is mapper-only, there is no reducer. In this mapper
 * {@link #run(org.apache.hadoop.mapreduce.Mapper.Context)} is overridden while
 * {@link #map(Object, Object, org.apache.hadoop.mapreduce.Mapper.Context)} is not, since we don't need to iterate over
 * the raw mapper input.
 */
public class GuaguaMapper<MASTER_RESULT extends Bytable, WORKER_RESULT extends Bytable> extends
        Mapper<LongWritable, Text, Text, Text> {

    private static final Logger LOG = LoggerFactory.getLogger(GuaguaMapper.class);

    /**
     * Whether the mapper task is master.
     */
    private boolean isMaster;

    /**
     * Service instance to call real guagua master or guagua worker logic. Stays <code>null</code> until
     * {@link #setup(org.apache.hadoop.mapreduce.Mapper.Context)} has created it.
     */
    private GuaguaService guaguaService;

    /**
     * Creates the master or worker service according to the input split of this task, hands the worker its file
     * splits, then initializes and starts the service with the job configuration converted to {@link Properties}.
     * 
     * @param context
     *            the Hadoop mapper context; its input split is cast to {@link GuaguaInputSplit}
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        GuaguaInputSplit inputSplit = (GuaguaInputSplit) context.getInputSplit();
        this.setMaster(inputSplit.isMaster());
        if(this.isMaster()) {
            context.setStatus("Master initializing ...");
            this.setGuaguaService(new GuaguaMasterService<MASTER_RESULT, WORKER_RESULT>());
        } else {
            context.setStatus("Worker initializing ...");
            this.setGuaguaService(new GuaguaWorkerService<MASTER_RESULT, WORKER_RESULT>());
            this.getGuaguaService().setSplits(toGuaguaFileSplits(inputSplit));
        }

        Configuration conf = context.getConfiguration();
        Properties props = replaceConfToProps(conf);
        GuaguaService service = this.getGuaguaService();
        service.setAppId(conf.get(GuaguaMapReduceConstants.MAPRED_JOB_ID));
        service.setContainerId(conf.get(GuaguaMapReduceConstants.MAPRED_TASK_PARTITION));
        service.init(props);
        service.start();
    }

    /**
     * Converts the Hadoop file splits of a worker input split to hadoop-independent {@link GuaguaFileSplit}s. An
     * extension is attached to a split only when the extension array exists and has an entry at the same index.
     */
    private List<GuaguaFileSplit> toGuaguaFileSplits(GuaguaInputSplit inputSplit) {
        // Read the array once instead of calling the getter on every loop iteration.
        FileSplit[] fileSplits = inputSplit.getFileSplits();
        List<GuaguaFileSplit> splits = new LinkedList<GuaguaFileSplit>();
        for(int i = 0; i < fileSplits.length; i++) {
            FileSplit fs = fileSplits[i];
            GuaguaFileSplit gfs = new GuaguaFileSplit(fs.getPath().toString(), fs.getStart(), fs.getLength());
            if(inputSplit.getExtensions() != null && i < inputSplit.getExtensions().length) {
                gfs.setExtension(inputSplit.getExtensions()[i]);
            }
            splits.add(gfs);
        }
        return splits;
    }

    /**
     * We have to convert {@link Configuration} to {@link Properties} because guagua-core has no dependency on hadoop.
     * Entries whose key starts with {@link GuaguaConstants#GUAGUA} are additionally logged at debug level.
     */
    private Properties replaceConfToProps(Configuration configuration) {
        Properties properties = new Properties();
        boolean debugEnabled = LOG.isDebugEnabled();
        for(Entry<String, String> entry: configuration) {
            properties.put(entry.getKey(), entry.getValue());
            if(debugEnabled && entry.getKey().startsWith(GuaguaConstants.GUAGUA)) {
                LOG.debug("{}:{}", entry.getKey(), entry.getValue());
            }
        }
        return properties;
    }

    /**
     * Run guagua service according to the {@link #isMaster} setting. Iteration and coordination are included in the
     * service run.
     * 
     * <p>
     * {@link #cleanup(org.apache.hadoop.mapreduce.Mapper.Context)} is called in a finally block to make sure resources
     * can be cleaned. If both the run and the cleanup fail, the failure of the run is the one reported; the cleanup
     * failure is only logged so that it cannot hide the original cause.
     * 
     * <p>
     * Guagua tries its best to update progress for each iteration. The task status is also updated in each iteration
     * in the hadoop job web ui.
     * 
     * <p>
     * On success the done-worker counter (worker task) or the master-success counter (master task) is incremented. On
     * any failure the task is failed through a {@link GuaguaFailTaskRuntimeException}.
     */
    @Override
    public void run(final Context context) throws IOException, InterruptedException {
        Throwable failure = null;
        try {
            this.setup(context);
            final int iterations = context.getConfiguration().getInt(GuaguaConstants.GUAGUA_ITERATION_COUNT, -1);
            this.getGuaguaService().run(new Progressable() {

                @Override
                public void progress(int iteration, int totalIteration, String status, boolean isLastUpdate,
                        boolean isKill) {
                    if(isKill) {
                        failTask(null, context.getConfiguration());
                        return;
                    }
                    context.progress();
                    // set current iteration to GuaguaMRRecordReader to make sure it can update progress
                    GuaguaMRRecordReader.setCurrentIteration(iteration);
                    // update progress.
                    try {
                        context.nextKeyValue();
                    } catch (IOException ioe) {
                        throw new GuaguaRuntimeException(ioe);
                    } catch (InterruptedException ie) {
                        Thread.currentThread().interrupt();
                    }
                    // The iteration count defaults to -1 and may be configured as 0; skip the percentage in those
                    // cases instead of dividing by zero or logging a negative value.
                    if(isLastUpdate && iterations > 0) {
                        LOG.info("Application progress: {}%.", (iteration * 100 / iterations));
                    }

                    // Status will be displayed in Hadoop job ui.
                    if(status != null && status.length() != 0) {
                        context.setStatus(status);
                    }
                }
            });
        } catch (Throwable t) {
            LOG.error("Error in guagua main run method.", t);
            failure = t;
        } finally {
            try {
                this.cleanup(context);
            } catch (Throwable t) {
                if(failure == null) {
                    failure = t;
                } else {
                    // Keep the first failure as the reported cause; a cleanup error is secondary.
                    LOG.error("Error in guagua cleanup after a failed run.", t);
                }
            }
        }

        if(failure != null) {
            // failTask always throws, so the counters below are only updated for a successful task.
            failTask(failure, context.getConfiguration());
        }

        if(this.isMaster) {
            // update master done counters
            context.getCounter(GuaguaMapReduceConstants.GUAGUA_STATUS, GuaguaMapReduceConstants.MASTER_SUCCESS)
                    .increment(1);
        } else {
            // update worker done counters
            context.getCounter(GuaguaMapReduceConstants.GUAGUA_STATUS, GuaguaMapReduceConstants.DONE_WORKERS)
                    .increment(1L);
        }
    }

    /**
     * In our cluster with hadoop-0.20.2-cdh3u4a, runtime exception is thrown to Child but mapper status doesn't change
     * to failed. We fail this task to make sure our fail-over can make job successful.
     * 
     * <p>
     * This method never returns normally: it logs the task id and always throws.
     * 
     * @param t
     *            the cause of the failure, may be <code>null</code>
     * @param conf
     *            configuration used to look up the task id for logging
     * @throws GuaguaFailTaskRuntimeException
     *             always, wrapping <code>t</code>
     */
    private void failTask(Throwable t, Configuration conf) {
        LOG.error("failtask: Killing task: {} ", conf.get(GuaguaMapReduceConstants.MAPRED_TASK_ID));
        throw new GuaguaFailTaskRuntimeException("Fail task because of not heathy inside.", t);
    }

    /**
     * Stops the guagua service if one has been created. The service can still be <code>null</code> here when
     * {@link #setup(org.apache.hadoop.mapreduce.Mapper.Context)} failed before creating it.
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        GuaguaService service = this.getGuaguaService();
        if(service != null) {
            service.stop();
        }
    }

    public boolean isMaster() {
        return isMaster;
    }

    public void setMaster(boolean isMaster) {
        this.isMaster = isMaster;
    }

    public GuaguaService getGuaguaService() {
        return guaguaService;
    }

    public void setGuaguaService(GuaguaService guaguaService) {
        this.guaguaService = guaguaService;
    }

}