/*
* Copyright [2013-2014] PayPal Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ml.shifu.guagua.worker;
import java.util.concurrent.TimeUnit;
import ml.shifu.guagua.GuaguaConstants;
import ml.shifu.guagua.io.Bytable;
import ml.shifu.guagua.master.MasterComputable;
import ml.shifu.guagua.master.SyncMasterCoordinator;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooDefs.Ids;
import org.apache.zookeeper.data.Stat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* {@link SyncWorkerCoordinator} is used as a worker barrier for each iteration.
*
* <p>
* For each iteration, {@link SyncWorkerCoordinator} will wait until master's signal.
*
* <p>
* To start a new iteration, {@link SyncMasterCoordinator} writes a znode for each iteration like
* '/_guagua/job_201312041304_189025/master/{currentIteration}' with the {@link MasterComputable} result as its data.
* {@link SyncWorkerCoordinator} keeps checking whether that znode exists and, once it does, starts a new iteration.
*
* <p>
* Worker result will be written into each worker iteration znode for master to get.
*
* @param <MASTER_RESULT>
* master result for computation in each iteration.
* @param <WORKER_RESULT>
* worker result for computation in each iteration.
*/
public class SyncWorkerCoordinator<MASTER_RESULT extends Bytable, WORKER_RESULT extends Bytable> extends
        AbstractWorkerCoordinator<MASTER_RESULT, WORKER_RESULT> {

    private static final Logger LOG = LoggerFactory.getLogger(SyncWorkerCoordinator.class);

    /**
     * Prepares the worker side znode hierarchy and blocks until the master znode of the current iteration exists.
     *
     * <p>
     * Steps, in order: initialize the coordinator from the context properties, run the fail-over command, create the
     * root, application, worker base, worker iteration base and worker znodes (an already existing znode is only
     * logged), then wait for the master znode. If the current iteration is not
     * {@link GuaguaConstants#GUAGUA_INIT_STEP}, the master znode paths are handed to {@code setMasterResult}.
     *
     * @param context
     *            worker context providing application id, container id, current iteration and properties.
     */
    @Override
    public void preApplication(final WorkerContext<MASTER_RESULT, WORKER_RESULT> context) {
        // initialize zookeeper and other props
        initialize(context.getProps());
        new FailOverCoordinatorCommand(context).execute();

        new BasicCoordinatorCommand() {
            @Override
            public void doExecute() throws KeeperException, InterruptedException {
                String appId = context.getAppId();
                int currentIteration = context.getCurrentIteration();
                String containerId = context.getContainerId();
                final String appMasterNode = getCurrentMasterNode(appId, currentIteration).toString();

                // create worker init znodes from the root down to this worker's iteration znode.
                createPersistentNode(getRootNode().toString(), true);
                createPersistentNode(getAppNode(appId).toString(), true);
                createPersistentNode(getWorkerBaseNode(appId).toString(), true);
                createPersistentNode(getWorkerBaseNode(appId, currentIteration).toString(), true);
                // the worker's own znode is created without a preceding existence check.
                createPersistentNode(getCurrentWorkerNode(appId, containerId, currentIteration).toString(), false);

                // check whether master is ok to start iterations.
                waitForMasterNode(appMasterNode);

                if(context.getCurrentIteration() != GuaguaConstants.GUAGUA_INIT_STEP) {
                    final String appMasterSplitNode = getCurrentMasterSplitNode(appId, currentIteration).toString();
                    setMasterResult(context, appMasterNode, appMasterSplitNode);
                }
                LOG.info("Master initilization is done.");
            }
        }.execute();
    }

    /**
     * Publishes the worker result of the current iteration and blocks until the master znode of that iteration
     * exists.
     *
     * <p>
     * Steps, in order: serialize the worker result and write it to the worker iteration znode, delete the worker
     * znode of the previous iteration (when the current iteration is at least 1), wait for the master znode, log the
     * waiting time and hand the master znode paths to {@code setMasterResult}.
     *
     * @param context
     *            worker context providing application id, container id, current iteration and the worker result.
     */
    @Override
    public void postIteration(final WorkerContext<MASTER_RESULT, WORKER_RESULT> context) {
        new BasicCoordinatorCommand() {
            @Override
            public void doExecute() throws KeeperException, InterruptedException {
                String appId = context.getAppId();
                String containerId = context.getContainerId();
                int currentIteration = context.getCurrentIteration();
                final String appMasterNode = getCurrentMasterNode(appId, currentIteration).toString();
                String appWorkerNode = getCurrentWorkerNode(appId, containerId, currentIteration).toString();
                final String appWorkerSplitNode = getCurrentWorkerSplitNode(appId, containerId, currentIteration)
                        .toString();

                // write the worker result into the worker iteration znode; the znode is created as PERSISTENT.
                boolean isSplit = false;
                try {
                    byte[] bytes = getWorkerSerializer().objectToBytes(context.getWorkerResult());
                    isSplit = setBytesToZNode(appWorkerNode, appWorkerSplitNode, bytes, CreateMode.PERSISTENT);
                } catch (KeeperException.NodeExistsException e) {
                    LOG.warn("Has such node:{}", appWorkerNode);
                }

                // remove the znode of the previous iteration, it is no longer needed
                if(context.getCurrentIteration() >= 1) {
                    String znode = getWorkerNode(appId, containerId, currentIteration - 1).toString();
                    try {
                        getZooKeeper().deleteExt(znode, -1, false);
                        // NOTE(review): isSplit is the outcome of the write for the CURRENT iteration, yet it gates
                        // deletion of the PREVIOUS iteration's split znode - confirm this is intended.
                        if(isSplit) {
                            znode = getCurrentWorkerSplitNode(appId, containerId, currentIteration - 1).toString();
                            getZooKeeper().deleteExt(znode, -1, true);
                        }
                    } catch (KeeperException.NoNodeException e) {
                        // to avoid log flood
                        if(System.nanoTime() % 20 == 0) {
                            LOG.warn("No such node:{}", znode);
                        }
                    }
                }

                long start = System.nanoTime();
                waitForMasterNode(appMasterNode);
                LOG.info("Application {} container {} iteration {} waiting ends with {}ms execution time.",
                        context.getAppId(), context.getContainerId(), context.getCurrentIteration(),
                        TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));

                String appMasterSplitNode = getCurrentMasterSplitNode(appId, currentIteration).toString();
                setMasterResult(context, appMasterNode, appMasterSplitNode);
                LOG.info("Master computation is done.");
            }
        }.execute();
    }

    /**
     * Creates {@code znode} as a PERSISTENT znode without data; an already existing znode is logged, not treated as
     * an error.
     *
     * @param znode
     *            full path of the znode to create.
     * @param checkExistence
     *            if {@code true}, the znode is only created when an existence check reports it as missing; if
     *            {@code false}, creation is attempted directly.
     * @throws KeeperException
     *             any ZooKeeper failure other than {@link KeeperException.NodeExistsException}.
     * @throws InterruptedException
     *             if the ZooKeeper call is interrupted.
     */
    private void createPersistentNode(String znode, boolean checkExistence) throws KeeperException,
            InterruptedException {
        try {
            if(checkExistence) {
                Stat stat = getZooKeeper().exists(znode, false);
                if(stat != null) {
                    return;
                }
            }
            getZooKeeper().createExt(znode, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT, false);
        } catch (KeeperException.NodeExistsException e) {
            // another party created the znode between the check and the create call.
            LOG.warn("Has such node:{}", znode);
        }
    }

    /**
     * Runs a {@link RetryCoordinatorCommand} that succeeds once {@code masterNode} exists.
     *
     * @param masterNode
     *            full path of the master znode to wait for.
     * @throws KeeperException
     *             any ZooKeeper failure other than {@link KeeperException.NoNodeException}.
     * @throws InterruptedException
     *             if waiting is interrupted.
     */
    private void waitForMasterNode(final String masterNode) throws KeeperException, InterruptedException {
        new RetryCoordinatorCommand(isFixedTime(), getSleepTime()) {
            @Override
            public boolean retryExecution() throws KeeperException, InterruptedException {
                try {
                    return getZooKeeper().exists(masterNode, false) != null;
                } catch (KeeperException.NoNodeException e) {
                    // to avoid log flood
                    if(System.nanoTime() % 10 == 0) {
                        LOG.warn("No such node:{}", masterNode);
                    }
                    return false;
                }
            }
        }.execute();
    }
}