package org.shanbo.feluca.distribute.launch;
import gnu.trove.set.hash.TIntHashSet;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.recipes.barriers.DistributedBarrier;
import org.apache.curator.framework.recipes.barriers.DistributedDoubleBarrier;
import org.apache.zookeeper.KeeperException;
import org.shanbo.feluca.common.Constants;
import org.shanbo.feluca.data2.DataEntry;
import org.shanbo.feluca.data2.Vector;
import org.shanbo.feluca.distribute.model.horizon.MModelClient;
import org.shanbo.feluca.distribute.model.horizon.MModelLocal;
import org.shanbo.feluca.distribute.model.horizon.MModelServer;
import org.shanbo.feluca.distribute.model.vertical.FloatReducerClient;
import org.shanbo.feluca.distribute.model.vertical.ReduceServer;
import org.shanbo.feluca.paddle.AlgoDeployConf;
import org.shanbo.feluca.paddle.GlobalConfig;
import org.shanbo.feluca.util.Config;
import org.shanbo.feluca.util.JSONUtil;
import org.shanbo.feluca.util.ZKUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public abstract class LoopingBase implements Runnable{
Logger log ;
CuratorFramework zkClient ;
DistributedDoubleBarrier loopBarrier;
DistributedBarrier startBarrier;
DistributedBarrier finishBarrier;
protected GlobalConfig conf;
protected int loops;
protected int looping;
protected int shardId;
protected boolean useSyncModel;
//data & computation
protected DataEntry dataEntry; //auto close;
protected ReduceServer reduceServer;
protected FloatReducerClient reducerClient;
//server & startingGun(with one of the server)
protected MModelLocal local;
protected MModelClient modelClient;
protected MModelServer modelServer;
StartingGun2 startingGun; //one and only one with a job
public static void distinctIds(TIntHashSet idSet, Vector v){
for(int i = 0; i < v.getSize(); i ++){
idSet.add(v.getFId(i));
}
}
public LoopingBase(GlobalConfig conf) throws Exception{
log = LoggerFactory.getLogger(this.getClass());
init(conf);
boolean inRam = new Boolean(Config.get().get("dataEntry.inRam", "false"));
dataEntry = DataEntry.createDataEntry(Constants.Base.getWorkerRepository() + Constants.Base.DATA_DIR + "/" + conf.getDataName(),
"\\.v\\." + this.shardId + "\\.dat", inRam);
}
/**
*
* @throws Exception
*/
private void init(GlobalConfig conf) throws Exception{
this.conf = conf;
zkClient = ZKUtils.newClient();
loopBarrier = new DistributedDoubleBarrier(zkClient,
Constants.Algorithm.ZK_ALGO_CHROOT + "/" + conf.getAlgorithmName() + Constants.Algorithm.ZK_WAITING_PATH,
conf.getWorkers().size());
startBarrier = new DistributedBarrier(zkClient, Constants.Algorithm.ZK_ALGO_CHROOT + "/" + conf.getAlgorithmName() + "/start");
finishBarrier = new DistributedBarrier(zkClient, Constants.Algorithm.ZK_ALGO_CHROOT + "/" + conf.getAlgorithmName() + "/finish");
local = new MModelLocal();
shardId = conf.getShardId();
loops = conf.getAlgorithmConf().getInteger(Constants.Algorithm.LOOPS);
useSyncModel = JSONUtil.getConf(conf.getAlgorithmConf(), Constants.Algorithm.OPEN_MODEL_SERVER, false);
AlgoDeployConf deployConf = conf.getDeployConf();
//data server and client can be separated from a worker-node.
//
if (deployConf.isReduceServer()){
reduceServer = new ReduceServer(conf.getWorkerName(), conf.getWorkers().size(), conf.getAlgorithmName());
}
if (deployConf.isStartingGun()){
startingGun = new StartingGun2(conf.getAlgorithmName(), conf.getReduceServers().size(), conf.getWorkers().size());
}
modelServer = new MModelServer(conf.getWorkerName(), conf.getAlgorithmName(), local);
modelClient = new MModelClient(conf.getWorkers(), shardId, local);
reducerClient = new FloatReducerClient(conf.getReduceServers(), shardId);
}
private void openDataInput() throws Exception{
dataEntry.reOpen();
}
/**
* initial your members, you may need to create PartialModel for Client
* @throws Exception
*/
protected void startup() throws Exception{}
/**
* release your resources
* @throws Exception
*/
protected void cleanup() throws Exception{}
public final void run(){
try{
if (reduceServer!= null){
reduceServer.start();
}
if (modelServer != null){
modelServer.start();
}
zkClient.start();
ZKUtils.createIfNotExist(zkClient, Constants.Algorithm.ZK_ALGO_CHROOT + "/" + conf.getAlgorithmName() + "/start");
ZKUtils.createIfNotExist(zkClient, Constants.Algorithm.ZK_ALGO_CHROOT + "/" + conf.getAlgorithmName() + "/finish");
startup();
if (startingGun!= null){//only one will be started
startingGun.startAndWait(); //wait for all servers started
System.out.println("startingGun.started");
}
startBarrier.waitOnBarrier();//wait until start signal(reduceServers & modelServers all started) then start loop watching
System.out.println("loop inside");
reducerClient.connect();//connecting; algorithms always use reducer instead of syncModel
modelClient.connect(); //
for(looping = 0 ; looping < loops && earlyStop() == false;looping++){
System.out.print("loop--:----(" + looping);
loopBarrier.enter();
System.out.println(")");
openDataInput();
computeLoop();
loopBarrier.leave();
}
modelClient.close(); //
reducerClient.close();
if (startingGun != null){ //do cleanup() first
startingGun.setFinish(); //tell all workers to finish job
}
cleanup();
finishBarrier.waitOnBarrier();
}catch (Exception e) {
log.error( " exception during running " ,e);
}
finally {
try {
closeAll();
} catch (Exception e) {
log.error( " close error " ,e);
}
}
}
/**
* todo
* @throws InterruptedException
* @throws KeeperException
*/
private void closeAll() throws Exception{
if (reduceServer!= null){
reduceServer.stop();
}
if (modelServer!= null){
modelServer.stop();
}
if (startingGun!= null){
startingGun .close();
}
zkClient.close();
}
/**
* early jump from the loops
* @return
*/
protected boolean earlyStop(){return false;}
protected abstract void computeLoop()throws Exception;
}