/*
* Copyright [2013-2014] PayPal Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ml.shifu.guagua.worker;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import ml.shifu.guagua.io.Bytable;
import ml.shifu.guagua.io.GuaguaFileSplit;
import ml.shifu.guagua.io.GuaguaRecordReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A highly efficient implementation that loads data and does computation. Unlike
* {@link AbstractWorkerComputable}, only the per-record {@link #doCompute(Bytable, Bytable, WorkerContext)} is
* exposed to users, and loading the data is combined with the computation of the first iteration.
*
* <p>
* The worker result should be updated in {@link #doCompute(Bytable, Bytable, WorkerContext)}; it is sent to the
* master once all records have been processed in an iteration.
*
* <p>
* To load data successfully, make sure the {@link GuaguaRecordReader} is initialized first in
* {@link #initRecordReader(GuaguaFileSplit)}:
*
* <pre>
* this.setRecordReader(new GuaguaSequenceAsTextRecordReader());
* this.getRecordReader().initialize(fileSplit);
* </pre>
*
* or directly use other constructors:
*
* <pre>
* this.setRecordReader(new GuaguaSequenceAsTextRecordReader(fileSplit));
* </pre>
*
* <p>
* After data is loaded in the first iteration, one can store the data into collections (memory or disk) for the logic
* of later iterations. Users are responsible for avoiding OOM issues.
*
* @param <MASTER_RESULT>
* master result for computation in each iteration.
* @param <WORKER_RESULT>
* worker result for computation in each iteration.
* @param <KEY>
* key type for each record
* @param <VALUE>
* value type for each record
*/
public abstract class AbstractCombineWorkerComputable<MASTER_RESULT extends Bytable, WORKER_RESULT extends Bytable, KEY extends Bytable, VALUE extends Bytable>
        implements WorkerComputable<MASTER_RESULT, WORKER_RESULT> {

    private static final Logger LOG = LoggerFactory.getLogger(AbstractCombineWorkerComputable.class);

    /**
     * Flipped to {@code true} by the first {@link #compute(WorkerContext)} call made in the first iteration, so that
     * initialization and data loading run at most once per instance.
     */
    private final AtomicBoolean isLoaded = new AtomicBoolean(false);

    /**
     * Reader for the file split currently being loaded; expected to be set by
     * {@link #initRecordReader(GuaguaFileSplit)}.
     */
    private GuaguaRecordReader<KEY, VALUE> recordReader;

    /**
     * Records cached during the first iteration and replayed in every later iteration. Everything is kept in memory,
     * so callers must control the data size themselves to avoid OOM in the worker. Being a map, it keeps a single
     * value per distinct key: a record whose key equals an earlier one replaces it for later iterations.
     *
     * NOTE(review): if a record reader reuses its key/value instances between records, the cached entries would
     * alias each other - confirm against the reader implementation in use.
     */
    private final Map<KEY, VALUE> dataMap;

    /**
     * Creates a worker whose cached records are kept in a {@link HashMap} (no ordering guarantee).
     */
    protected AbstractCombineWorkerComputable() {
        this(false);
    }

    /**
     * Creates a worker and chooses the map used to cache loaded records.
     *
     * @param isOrder
     *            {@code true} to cache records in a {@link TreeMap} (iterated in key order), {@code false} to cache
     *            them in a {@link HashMap}
     */
    protected AbstractCombineWorkerComputable(boolean isOrder) {
        if(isOrder) {
            this.dataMap = new TreeMap<KEY, VALUE>();
        } else {
            this.dataMap = new HashMap<KEY, VALUE>();
        }
    }

    /**
     * Runs one iteration of worker computation.
     *
     * <p>
     * In the first iteration, the first invocation calls {@link #init(WorkerContext)} and
     * {@link #preLoad(WorkerContext)}, then reads every file split of the context; each record is handed to
     * {@link #doCompute(Bytable, Bytable, WorkerContext)} and cached, and finally {@link #postLoad(WorkerContext)} is
     * called. Further invocations during the first iteration do no loading and no computation. In any other
     * iteration, {@link #doCompute(Bytable, Bytable, WorkerContext)} is called for every cached record.
     *
     * @param context
     *            the worker context of the current iteration
     * @return the worker result held by {@code context} after the records have been processed
     * @throws IOException
     *             if reading or closing a record reader fails
     * @throws IllegalStateException
     *             if no record at all was read from the file splits
     * @see ml.shifu.guagua.worker.WorkerComputable#compute(ml.shifu.guagua.worker.WorkerContext)
     */
    @Override
    public WORKER_RESULT compute(WorkerContext<MASTER_RESULT, WORKER_RESULT> context) throws IOException {
        if(context.isFirstIteration()) {
            // compareAndSet guarantees that init and loading happen only once.
            if(this.isLoaded.compareAndSet(false, true)) {
                loadAndCompute(context);
            }
        } else {
            computeCachedRecords(context);
        }
        return context.getWorkerResult();
    }

    /**
     * Initializes the worker, then loads all file splits while computing on each record as it is read.
     */
    private void loadAndCompute(WorkerContext<MASTER_RESULT, WORKER_RESULT> context) throws IOException {
        init(context);
        long start = System.nanoTime();
        preLoad(context);

        long count = 0L;
        for(GuaguaFileSplit fileSplit: context.getFileSplits()) {
            LOG.info("Loading filesplit: {}", fileSplit);
            count += loadSplit(fileSplit, context);
        }

        if(count == 0L) {
            throw new IllegalStateException(
                    "Record account in such worker is zero, please check if any exceptions in your input data.");
        }

        postLoad(context);
        LOG.info("Load {} records.", count);
        LOG.info("Data loading time with first iteration computing:{}ms",
                TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
    }

    /**
     * Reads one file split, computing on and caching each record, and returns the number of records read.
     */
    private long loadSplit(GuaguaFileSplit fileSplit, WorkerContext<MASTER_RESULT, WORKER_RESULT> context)
            throws IOException {
        long loaded = 0L;
        boolean completed = false;
        try {
            initRecordReader(fileSplit);
            if(getRecordReader() == null) {
                // Same exception type as dereferencing the missing reader would raise, but with a usable message.
                throw new NullPointerException("Record reader is null after initRecordReader for file split "
                        + fileSplit + "; set it with setRecordReader in initRecordReader.");
            }
            while(getRecordReader().nextKeyValue()) {
                KEY currentKey = getRecordReader().getCurrentKey();
                VALUE currentValue = getRecordReader().getCurrentValue();
                doCompute(currentKey, currentValue, context);
                this.dataMap.put(currentKey, currentValue);
                loaded += 1L;
            }
            completed = true;
        } finally {
            // When loading already failed, a failure in close() must not replace the original exception.
            closeRecordReader(!completed);
        }
        return loaded;
    }

    /**
     * Closes the current record reader, if any. A close failure is propagated unless {@code swallowFailure} is set,
     * in which case it is only logged so that the exception already in flight stays visible to the caller.
     */
    private void closeRecordReader(boolean swallowFailure) throws IOException {
        GuaguaRecordReader<KEY, VALUE> reader = getRecordReader();
        if(reader == null) {
            return;
        }
        if(!swallowFailure) {
            reader.close();
            return;
        }
        try {
            reader.close();
        } catch (Exception e) {
            // Deliberately broad: any exception escaping here would mask the primary loading failure.
            LOG.warn("Failed to close record reader after a loading failure.", e);
        }
    }

    /**
     * Replays all cached records through {@link #doCompute(Bytable, Bytable, WorkerContext)} and logs the elapsed
     * time, even when the computation fails.
     */
    private void computeCachedRecords(WorkerContext<MASTER_RESULT, WORKER_RESULT> context) {
        long start = System.nanoTime();
        try {
            for(Map.Entry<KEY, VALUE> entry: this.dataMap.entrySet()) {
                doCompute(entry.getKey(), entry.getValue(), context);
            }
        } finally {
            LOG.info("Computation time for application {} container {} iteration {}: {}ms.", context.getAppId(),
                    context.getContainerId(), context.getCurrentIteration(),
                    TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
        }
    }

    /**
     * Do some pre work before loading data. Called once, after {@link #init(WorkerContext)} and before the first
     * file split is read. The default implementation does nothing.
     */
    protected void preLoad(WorkerContext<MASTER_RESULT, WORKER_RESULT> workerContext) {
    }

    /**
     * Do some post work after loading data. Called once, after all file splits have been read and only if at least
     * one record was loaded. The default implementation does nothing.
     */
    protected void postLoad(WorkerContext<MASTER_RESULT, WORKER_RESULT> workerContext) {
    }

    /**
     * Each {@link GuaguaFileSplit} must be initialized before loading data. Called once per file split; the
     * implementation is expected to make a reader for that split available through
     * {@link #setRecordReader(GuaguaRecordReader)}.
     *
     * @param fileSplit
     *            the file split about to be read
     * @throws IOException
     *             if the reader cannot be initialized
     */
    public abstract void initRecordReader(GuaguaFileSplit fileSplit) throws IOException;

    /**
     * Initialization work for the whole computation. Called once, before any data is loaded.
     */
    public abstract void init(WorkerContext<MASTER_RESULT, WORKER_RESULT> workerContext);

    /**
     * Computation by each record, all update can be set to WORKER_RESULT by
     * {@code context.setCurrentWorkerResult(WORKER_RESULT)};
     *
     * @param currentKey
     *            key of the record being processed
     * @param currentValue
     *            value of the record being processed
     * @param context
     *            the worker context of the current iteration
     */
    public abstract void doCompute(KEY currentKey, VALUE currentValue,
            WorkerContext<MASTER_RESULT, WORKER_RESULT> context);

    /**
     * @return the record reader currently set, or {@code null} if none has been set yet
     */
    public GuaguaRecordReader<KEY, VALUE> getRecordReader() {
        return recordReader;
    }

    /**
     * @param recordReader
     *            the record reader used to read the file split being loaded
     */
    public void setRecordReader(GuaguaRecordReader<KEY, VALUE> recordReader) {
        this.recordReader = recordReader;
    }
}