/*
* Copyright [2013-2014] PayPal Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ml.shifu.guagua.worker;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.TimeUnit;
import ml.shifu.guagua.BasicCoordinator;
import ml.shifu.guagua.GuaguaConstants;
import ml.shifu.guagua.coordinator.zk.GuaguaZooKeeper.Filter;
import ml.shifu.guagua.io.Bytable;
import ml.shifu.guagua.util.StringUtils;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooDefs.Ids;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* {@link AbstractWorkerCoordinator} has some common implementation for both async and sync worker coordinator.
*
* <p>
* Common functions include: znodes cleaning up, fail-over support and others.
*
* @param <MASTER_RESULT>
* master result for computation in each iteration.
* @param <WORKER_RESULT>
* worker result for computation in each iteration.
*/
public abstract class AbstractWorkerCoordinator<MASTER_RESULT extends Bytable, WORKER_RESULT extends Bytable> extends
        BasicCoordinator<MASTER_RESULT, WORKER_RESULT> implements WorkerInterceptor<MASTER_RESULT, WORKER_RESULT> {

    private static final Logger LOG = LoggerFactory.getLogger(AbstractWorkerCoordinator.class);

    /**
     * Logs the start of an iteration together with the container id and application id of this worker.
     * 
     * @param context
     *            worker context providing the current iteration, container id and app id.
     */
    @Override
    public void preIteration(WorkerContext<MASTER_RESULT, WORKER_RESULT> context) {
        LOG.info("Start itertion {} with container id {} and app id {}.", context.getCurrentIteration(),
                context.getContainerId(), context.getAppId());
    }

    /**
     * Runs at the end of the application. When the zookeeper cleanup property
     * ({@link GuaguaConstants#GUAGUA_ZK_CLEANUP_ENABLE}) resolves to {@code true}, the worker znode of the previous
     * iteration is deleted and a persistent znode for the current iteration is created. {@code close()} is always
     * invoked afterwards, whether or not cleanup is enabled and whether or not the znode operations succeed.
     * 
     * @param context
     *            worker context providing properties, app id, container id and current iteration.
     */
    @Override
    public void postApplication(final WorkerContext<MASTER_RESULT, WORKER_RESULT> context) {
        new BasicCoordinatorCommand() {
            @Override
            public void doExecute() throws KeeperException, InterruptedException {
                try {
                    // If cleaning up zk znodes costs too much running time, one can disable the zk cleanup flag.
                    // In that case make sure to clean the znodes manually after the application finishes.
                    String zkCleanUpEnabled = StringUtils.get(
                            context.getProps().getProperty(GuaguaConstants.GUAGUA_ZK_CLEANUP_ENABLE),
                            GuaguaConstants.GUAGUA_ZK_DEFAULT_CLEANUP_VALUE);
                    // parseBoolean is a null-safe, case-insensitive comparison against "true".
                    if(Boolean.parseBoolean(zkCleanUpEnabled)) {
                        String appId = context.getAppId();
                        String containerId = context.getContainerId();
                        int currentIteration = context.getCurrentIteration();

                        // delete the worker znode of the previous iteration
                        String previousWorkerNode = getCurrentWorkerNode(appId, containerId, currentIteration - 1)
                                .toString();
                        try {
                            // NOTE(review): -1 presumably means "any version" as in the ZooKeeper API - confirm
                            // against GuaguaZooKeeper#deleteExt.
                            getZooKeeper().deleteExt(previousWorkerNode, -1, false);
                        } catch (KeeperException.NoNodeException e) {
                            // A missing node is tolerated; the warning is only emitted for a fraction of the
                            // occurrences (when nanoTime happens to be a multiple of 20) to limit log volume.
                            if(System.nanoTime() % 20 == 0) {
                                LOG.warn("No such node:{}", previousWorkerNode);
                            }
                        }

                        // create last worker znode to notify master of the done state of this worker.
                        String appWorkerNode = getCurrentWorkerNode(appId, containerId, currentIteration).toString();
                        getZooKeeper()
                                .createExt(appWorkerNode, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT, false);
                    }
                } finally {
                    close();
                }
            }
        }.execute();
    }

    /**
     * Reads the master result bytes from zookeeper and, if any bytes are present, deserializes them and stores the
     * result in the context as last master result. Nothing is done for the initial step
     * ({@link GuaguaConstants#GUAGUA_INIT_STEP}).
     * 
     * <p>
     * A {@code null} or empty payload leaves the context untouched and is logged with size 0.
     * 
     * @param context
     *            worker context which receives the deserialized master result.
     * @param appMasterNode
     *            znode path passed to {@code getBytesFromZNode} as master node.
     * @param appMasterSplitNode
     *            znode path passed to {@code getBytesFromZNode} as master split node.
     * @throws KeeperException
     *             if reading from zookeeper fails.
     * @throws InterruptedException
     *             if the thread is interrupted while reading from zookeeper.
     */
    protected void setMasterResult(final WorkerContext<MASTER_RESULT, WORKER_RESULT> context,
            final String appMasterNode, final String appMasterSplitNode) throws KeeperException, InterruptedException {
        if(context.getCurrentIteration() == GuaguaConstants.GUAGUA_INIT_STEP) {
            return;
        }

        final long start = System.nanoTime();
        byte[] data = getBytesFromZNode(appMasterNode, appMasterSplitNode);
        // data may be null (the check below already allows for it), so never dereference it unguarded.
        final int dataSize = (data == null) ? 0 : data.length;
        if(dataSize > 0) {
            MASTER_RESULT lastMasterResult = getMasterSerializer().bytesToObject(data,
                    context.getMasterResultClassName());
            context.setLastMasterResult(lastMasterResult);
        }
        LOG.info("Master result size is {} and read master result run time time {}ms", dataSize,
                TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
    }

    /**
     * Coordinator command used for fail-over: it lists the children of the master base znode, picks the largest
     * numeric child name and sets it as current iteration of the worker context.
     * 
     * <p>
     * If the master base znode does not exist or has no (remaining) children, the context is left unchanged.
     */
    protected class FailOverCoordinatorCommand extends BasicCoordinatorCommand {

        private final WorkerContext<MASTER_RESULT, WORKER_RESULT> context;

        /**
         * @param context
         *            worker context whose current iteration is updated by {@link #doExecute()}.
         */
        public FailOverCoordinatorCommand(WorkerContext<MASTER_RESULT, WORKER_RESULT> context) {
            this.context = context;
        }

        /**
         * Looks up the iteration znodes below the master base node and restores the current iteration from the one
         * with the greatest number.
         * 
         * @throws KeeperException
         *             on zookeeper errors other than a missing master base node.
         * @throws InterruptedException
         *             if the thread is interrupted while talking to zookeeper.
         */
        @Override
        public void doExecute() throws KeeperException, InterruptedException {
            String masterBaseNode = getMasterBaseNode(context.getAppId()).toString();
            List<String> masterIterations = null;
            try {
                masterIterations = getZooKeeper().getChildrenExt(masterBaseNode, false, false, false, new Filter() {
                    @Override
                    public boolean filter(String path) {
                        // Returns true for names which are not integers.
                        // NOTE(review): presumably 'true' means the child is excluded from the result - confirm
                        // against GuaguaZooKeeper.Filter.
                        try {
                            Integer.parseInt(path);
                            return false;
                        } catch (Exception e) {
                            return true;
                        }
                    }
                });
            } catch (KeeperException.NoNodeException e) {
                LOG.warn("No such node:{}", masterBaseNode);
            }

            if(masterIterations == null || masterIterations.isEmpty()) {
                return;
            }

            // sort numerically, string order would put "10" before "9"
            Collections.sort(masterIterations, new Comparator<String>() {
                @Override
                public int compare(String o1, String o2) {
                    return Integer.valueOf(o1).compareTo(Integer.valueOf(o2));
                }
            });
            LOG.info("DEBUG: master children:{}", masterIterations);
            try {
                int restartedIteration = Integer.parseInt(masterIterations.get(masterIterations.size() - 1));
                context.setCurrentIteration(restartedIteration);
                LOG.info("Container {} restarted at: {} step.", context.getContainerId(), restartedIteration);
            } catch (NumberFormatException e) {
                // fall back to the initial step if the last child name is not a valid integer
                context.setCurrentIteration(GuaguaConstants.GUAGUA_INIT_STEP);
            }
        }
    }
}