/* * Copyright [2013-2014] PayPal Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package ml.shifu.guagua.worker; import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.TreeMap; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import ml.shifu.guagua.io.Bytable; import ml.shifu.guagua.io.GuaguaFileSplit; import ml.shifu.guagua.io.GuaguaRecordReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * A high-effective implementation to load data and do computation. This is different with * {@link AbstractWorkerComputable}, only {@link #doCompute(Bytable, Bytable, WorkerContext)} for each record are * published to user. But the first iteration to load data is included in computation. * * <p> * Worker result should be updated in {@link #doCompute(Bytable, Bytable, WorkerContext)}, and which will also be * populated to Master when all records are processed in one iteration. * * <p> * To load data successfully, make sure {@link GuaguaRecordReader} is initialized firstly. 
in {@link #initRecordReader(GuaguaFileSplit)}:
 *
 * <pre>
 * this.setRecordReader(new GuaguaSequenceAsTextRecordReader());
 * this.getRecordReader().initialize(fileSplit);
 * </pre>
 *
 * or directly use other constructors:
 *
 * <pre>
 * this.setRecordReader(new GuaguaSequenceAsTextRecordReader(fileSplit));
 * </pre>
 *
 * <p>
 * After data is loaded in the first iteration, one can store the data into collections (memory or disk) to do later
 * iteration logic. But OOM issue should be taken care by users.
 *
 * @param <MASTER_RESULT>
 *            master result for computation in each iteration.
 * @param <WORKER_RESULT>
 *            worker result for computation in each iteration.
 * @param <KEY>
 *            key type for each record
 * @param <VALUE>
 *            value type for each record
 */
public abstract class AbstractCombineWorkerComputable<MASTER_RESULT extends Bytable, WORKER_RESULT extends Bytable, KEY extends Bytable, VALUE extends Bytable>
        implements WorkerComputable<MASTER_RESULT, WORKER_RESULT> {

    private static final Logger LOG = LoggerFactory.getLogger(AbstractCombineWorkerComputable.class);

    /** Ensures records are loaded exactly once per worker instance, even if the first iteration is entered twice. */
    private final AtomicBoolean isLoaded = new AtomicBoolean(false);

    /** Record reader; must be set by subclasses in {@link #initRecordReader(GuaguaFileSplit)}. */
    private GuaguaRecordReader<KEY, VALUE> recordReader;

    // By using map to store data into memory, please control memory by your self to avoid OOM in worker.
    // NOTE(review): duplicate keys overwrite earlier records, and a record reader that reuses its key/value
    // instances would collapse all records into a single entry — confirm the reader's semantics.
    private final Map<KEY, VALUE> dataMap;

    /** Builds a worker whose cached records are unordered (backed by a {@link HashMap}). */
    protected AbstractCombineWorkerComputable() {
        this(false);
    }

    /**
     * @param isOrder
     *            if true, cached records are kept sorted by key in a {@link TreeMap}; otherwise an unordered
     *            {@link HashMap} is used.
     */
    protected AbstractCombineWorkerComputable(boolean isOrder) {
        this.dataMap = isOrder ? new TreeMap<KEY, VALUE>() : new HashMap<KEY, VALUE>();
    }

    /*
     * (non-Javadoc)
     *
     * @see ml.shifu.guagua.worker.WorkerComputable#compute(ml.shifu.guagua.worker.WorkerContext)
     */
    @Override
    public WORKER_RESULT compute(WorkerContext<MASTER_RESULT, WORKER_RESULT> context) throws IOException {
        if(context.isFirstIteration()) {
            // compareAndSet guarantees the load-and-compute pass runs at most once.
            if(this.isLoaded.compareAndSet(false, true)) {
                loadAndCompute(context);
            }
        } else {
            computeOnCachedData(context);
        }
        return context.getWorkerResult();
    }

    /**
     * First-iteration pass: reads every record from the worker's file splits, runs
     * {@link #doCompute(Bytable, Bytable, WorkerContext)} on it and caches it into {@link #dataMap} for later
     * iterations.
     *
     * @throws IOException
     *             if reading any file split fails.
     * @throws IllegalStateException
     *             if no record at all was read from the splits.
     */
    private void loadAndCompute(WorkerContext<MASTER_RESULT, WORKER_RESULT> context) throws IOException {
        init(context);
        long start = System.nanoTime();
        preLoad(context);
        long count = 0;
        for(GuaguaFileSplit fileSplit: context.getFileSplits()) {
            LOG.info("Loading filesplit: {}", fileSplit);
            try {
                initRecordReader(fileSplit);
                while(getRecordReader().nextKeyValue()) {
                    KEY currentKey = getRecordReader().getCurrentKey();
                    VALUE currentValue = getRecordReader().getCurrentValue();
                    doCompute(currentKey, currentValue, context);
                    dataMap.put(currentKey, currentValue);
                    count += 1L;
                }
            } finally {
                // Always release the reader, even when computation on a record throws.
                if(getRecordReader() != null) {
                    getRecordReader().close();
                }
            }
        }
        if(count == 0L) {
            throw new IllegalStateException(
                    "Record count in such worker is zero, please check if any exceptions in your input data.");
        }
        postLoad(context);
        LOG.info("Load {} records.", count);
        LOG.info("Data loading time with first iteration computing:{}ms",
                TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
    }

    /**
     * Iterations after the first: replays {@link #doCompute(Bytable, Bytable, WorkerContext)} over every cached
     * record, logging the elapsed time even if a record's computation throws.
     */
    private void computeOnCachedData(WorkerContext<MASTER_RESULT, WORKER_RESULT> context) {
        long start = System.nanoTime();
        try {
            for(Map.Entry<KEY, VALUE> entry: dataMap.entrySet()) {
                doCompute(entry.getKey(), entry.getValue(), context);
            }
        } finally {
            LOG.info("Computation time for application {} container {} iteration {}: {}ms.", context.getAppId(),
                    context.getContainerId(), context.getCurrentIteration(),
                    TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
        }
    }

    /**
     * Do some pre work before loading data.
     */
    protected void preLoad(WorkerContext<MASTER_RESULT, WORKER_RESULT> workerContext) {
    }

    /**
     * Do some post work after loading data.
     */
    protected void postLoad(WorkerContext<MASTER_RESULT, WORKER_RESULT> workerContext) {
    }

    /**
     * Each {@link GuaguaFileSplit} must be initialized before loading data.
     */
    public abstract void initRecordReader(GuaguaFileSplit fileSplit) throws IOException;

    /**
     * Initialization work for the whole computation
     */
    public abstract void init(WorkerContext<MASTER_RESULT, WORKER_RESULT> workerContext);

    /**
     * Computation by each record, all update can be set to WORKER_RESULT by
     * {@code context.setCurrentWorkerResult(WORKER_RESULT)};
     */
    public abstract void doCompute(KEY currentKey, VALUE currentValue,
            WorkerContext<MASTER_RESULT, WORKER_RESULT> context);

    public GuaguaRecordReader<KEY, VALUE> getRecordReader() {
        return recordReader;
    }

    public void setRecordReader(GuaguaRecordReader<KEY, VALUE> recordReader) {
        this.recordReader = recordReader;
    }
}