/*
 * Copyright [2013-2014] PayPal Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ml.shifu.guagua;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import ml.shifu.guagua.coordinator.zk.GuaguaZooKeeper;
import ml.shifu.guagua.io.Bytable;
import ml.shifu.guagua.io.Serializer;
import ml.shifu.guagua.util.NumberFormatUtils;

import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.Code;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.Watcher.Event.EventType;
import org.apache.zookeeper.Watcher.Event.KeeperState;
import org.apache.zookeeper.ZooDefs.Ids;
import org.apache.zookeeper.data.Stat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link BasicCoordinator} is a basic implementation for both SyncWorkerCoordinator and SyncMasterCoordinator:
 * <ul>
 * <li>1. A {@link Watcher} to monitor zookeeper znodes.</li>
 * <li>2. Help functions to construct master znodes and worker znodes.</li>
 * <li>3. Basic functions to convert object to bytes or bytes to object.</li>
 * <li>4. Heart beat used to avoid ZooKeeperSessionExpiredException.</li>
 * </ul>
 * 
 * @param <MASTER_RESULT>
 *            master result for computation in each iteration.
 * @param <WORKER_RESULT>
 *            worker result for computation in each iteration.
 */
public class BasicCoordinator<MASTER_RESULT extends Bytable, WORKER_RESULT extends Bytable> implements Watcher {

    private static final Logger LOG = LoggerFactory.getLogger(BasicCoordinator.class);

    /**
     * Common zookeeper implementation to operate znode.
     */
    private GuaguaZooKeeper zooKeeper;

    /**
     * Wait to connect zookeeper server successfully.
     */
    private final CountDownLatch zkConnLatch = new CountDownLatch(1);

    /**
     * Default waiting time to check master or worker progress from zookeeper servers.
     */
    protected static final int WAIT_SLOT_MILLS = 300;

    /**
     * Waiting time to check master or worker progress from zookeeper servers.
     */
    private long sleepTime = WAIT_SLOT_MILLS;

    /**
     * Fixed-time waiting or each time increasing the waiting time.
     */
    private boolean isFixedTime = true;

    /**
     * {@link #masterSerializer} is used to serialize and de-serialize master results.
     */
    private Serializer<MASTER_RESULT> masterSerializer;

    /**
     * {@link #workerSerializer} is used to serialize and de-serialize worker results.
     */
    private Serializer<WORKER_RESULT> workerSerializer;

    /**
     * Heartbeat thread instance to send heart beat to zookeeper servers.
     */
    private HeartBeat heartBeat;

    /**
     * Heart beat checking time.
     */
    private static final long HEART_BEAT_SLEEP_TIME = 15 * 1000L;

    /**
     * Default number of threads used to read or write split results in parallel.
     */
    private static final int DEFAULT_RESULT_THREAD_NUMBER = 8;

    /**
     * Zookeeper has default heartbeat info, but sometimes failed, set a switch for that.
     */
    private boolean zkHeartBeatEnabled = false;

    /**
     * Create a thread pool to save master result or deserialize master result from zookeeper in parallel, only for
     * case if size of results over 1MB which is limitation per zk znode.
     */
    private ExecutorService threadPool;

    public BasicCoordinator() {
    }

    /**
     * @return path of the znode under which all master iteration znodes of {@code appId} are stored.
     */
    protected StringBuilder getMasterBaseNode(final String appId) {
        return new StringBuilder(50).append(getAppNode(appId)).append(GuaguaConstants.ZOOKEEPER_SEPARATOR)
                .append(GuaguaConstants.GUAGUA_ZK_MASTER_NODE);
    }

    /**
     * @return path of the master znode for the given {@code iteration}.
     */
    protected StringBuilder getMasterNode(final String appId, final int iteration) {
        return new StringBuilder(50).append(getMasterBaseNode(appId)).append(GuaguaConstants.ZOOKEEPER_SEPARATOR)
                .append(iteration);
    }

    /**
     * @return path of the master znode for the current {@code iteration}; same as
     *         {@link #getMasterNode(String, int)}.
     */
    protected StringBuilder getCurrentMasterNode(final String appId, final int iteration) {
        return getMasterNode(appId, iteration);
    }

    /**
     * @return path of the znode whose children hold the parts of a master result that is split because of its size.
     */
    protected StringBuilder getCurrentMasterSplitNode(final String appId, final int iteration) {
        return new StringBuilder(50).append(getMasterBaseNode(appId)).append(GuaguaConstants.ZOOKEEPER_SEPARATOR)
                .append(GuaguaConstants.GUAGUA_ZK_SPLIT_NODE).append(GuaguaConstants.ZOOKEEPER_SEPARATOR)
                .append(iteration);
    }

    /**
     * @return path of the master znode for {@code iteration - 1}.
     */
    protected StringBuilder getLastMasterNode(final String appId, final int iteration) {
        return getMasterNode(appId, iteration - 1);
    }

    /**
     * @return path of the guagua root znode.
     */
    protected StringBuilder getRootNode() {
        return new StringBuilder(10).append(GuaguaConstants.ZOOKEEPER_SEPARATOR).append(
                GuaguaConstants.GUAGUA_ZK_ROOT_NODE);
    }

    /**
     * @return path of the znode under which master election znodes of {@code appId} are stored.
     */
    protected StringBuilder getBaseMasterElectionNode(final String appId) {
        return new StringBuilder(20).append(getAppNode(appId)).append(GuaguaConstants.ZOOKEEPER_SEPARATOR)
                .append(GuaguaConstants.GUAGUA_MASTER_ELECTION);
    }

    /**
     * @return path of the master election znode for the given {@code sessionId}.
     */
    protected StringBuilder getMasterElectionNode(final String appId, final long sessionId) {
        return new StringBuilder(40).append(getBaseMasterElectionNode(appId))
                .append(GuaguaConstants.ZOOKEEPER_SEPARATOR).append(sessionId);
    }

    /**
     * @return path of the znode of application {@code appId}, directly under the root znode.
     */
    protected StringBuilder getAppNode(final String appId) {
        return new StringBuilder(20).append(getRootNode()).append(GuaguaConstants.ZOOKEEPER_SEPARATOR).append(appId);
    }

    /**
     * @return path of the znode under which all worker iteration znodes of {@code appId} are stored.
     */
    protected StringBuilder getWorkerBaseNode(final String appId) {
        return new StringBuilder(50).append(getAppNode(appId)).append(GuaguaConstants.ZOOKEEPER_SEPARATOR)
                .append(GuaguaConstants.GUAGUA_ZK_WORKERS_NODE);
    }

    /**
     * @return path of the znode under which all worker znodes of the given {@code iteration} are stored.
     */
    protected StringBuilder getWorkerBaseNode(final String appId, final int iteration) {
        return new StringBuilder(50).append(getWorkerBaseNode(appId)).append(GuaguaConstants.ZOOKEEPER_SEPARATOR)
                .append(iteration);
    }

    /**
     * @return path of the znode of worker {@code containerId} in the given {@code iteration}.
     */
    protected StringBuilder getWorkerNode(final String appId, final String containerId, final int iteration) {
        return new StringBuilder(50).append(getWorkerBaseNode(appId, iteration))
                .append(GuaguaConstants.ZOOKEEPER_SEPARATOR).append(containerId);
    }

    /**
     * @return path of the worker znode for the current {@code iteration}; same as
     *         {@link #getWorkerNode(String, String, int)}.
     */
    protected StringBuilder getCurrentWorkerNode(final String appId, final String containerId, final int iteration) {
        return getWorkerNode(appId, containerId, iteration);
    }

    /**
     * @return path of the znode whose children hold the parts of a worker result that is split because of its size.
     */
    protected StringBuilder getCurrentWorkerSplitNode(final String appId, final String containerId,
            final int iteration) {
        return new StringBuilder(50).append(getAppNode(appId)).append(GuaguaConstants.ZOOKEEPER_SEPARATOR)
                .append(GuaguaConstants.GUAGUA_ZK_WORKERS_NODE).append(GuaguaConstants.ZOOKEEPER_SEPARATOR)
                .append(GuaguaConstants.GUAGUA_ZK_SPLIT_NODE).append(GuaguaConstants.ZOOKEEPER_SEPARATOR)
                .append(iteration).append(GuaguaConstants.ZOOKEEPER_SEPARATOR).append(containerId);
    }

    /**
     * @return path of the znode of worker {@code containerId} for {@code iteration - 1}.
     */
    protected StringBuilder getLastWorkerNode(String appId, String containerId, int iteration) {
        return getWorkerNode(appId, containerId, iteration - 1);
    }

    /**
     * Coordinator initialization: reads the heartbeat switch, connects to zookeeper, configures the polling sleep
     * time and creates the thread pool used for split results.
     * 
     * @param props
     *            configuration properties.
     */
    protected void initialize(Properties props) {
        this.zkHeartBeatEnabled = Boolean.TRUE.toString().equalsIgnoreCase(
                props.getProperty(GuaguaConstants.GUAGUA_ZK_HEARTBEAT_ENABLED, Boolean.FALSE.toString()));

        checkAndSetZooKeeper(props);

        setSleepTime(NumberFormatUtils.getLong(props.getProperty(GuaguaConstants.GUAGUA_COORDINATOR_SLEEP_UNIT),
                WAIT_SLOT_MILLS));
        setFixedTime(Boolean.TRUE.toString().equalsIgnoreCase(
                props.getProperty(GuaguaConstants.GUAGUA_COORDINATOR_FIXED_SLEEP_ENABLE,
                        GuaguaConstants.GUAGUA_COORDINATOR_FIXED_SLEEP)));

        // Parse like every other numeric setting in this class; a fixed thread pool rejects non-positive sizes, so
        // fall back to the default instead of failing the whole coordinator on a bad value.
        int threadNumber = NumberFormatUtils.getInt(props.getProperty("guagua.master.result.thread.number"),
                DEFAULT_RESULT_THREAD_NUMBER);
        if(threadNumber <= 0) {
            LOG.warn("Invalid 'guagua.master.result.thread.number' {}, use default {}.", threadNumber,
                    DEFAULT_RESULT_THREAD_NUMBER);
            threadNumber = DEFAULT_RESULT_THREAD_NUMBER;
        }
        this.threadPool = Executors.newFixedThreadPool(threadNumber);
    }

    /**
     * Set up connection with given zookeeper settings (only if no zookeeper instance is set yet) and start the
     * heartbeat thread when heartbeat is enabled.
     * 
     * @param props
     *            configuration properties.
     * @throws GuaguaRuntimeException
     *             if zookeeper servers are not configured, the connection cannot be created, or the current thread
     *             is interrupted while waiting for the connection.
     */
    protected void checkAndSetZooKeeper(Properties props) {
        if(getZooKeeper() == null) {
            try {
                String zkServers = props.getProperty(GuaguaConstants.GUAGUA_ZK_SERVERS);
                if(zkServers == null || zkServers.length() == 0) {
                    throw new GuaguaRuntimeException("Not set 'guagua.zk.servers'. Should be set for coordination.");
                }
                int sessionTimeout = NumberFormatUtils.getInt(
                        props.getProperty(GuaguaConstants.GUAGUA_ZK_SESSION_TIMEOUT),
                        GuaguaConstants.GUAGUA_ZK_SESSON_DEFAULT_TIMEOUT);
                int maxRetryAttempts = NumberFormatUtils.getInt(
                        props.getProperty(GuaguaConstants.GUAGUA_ZK_MAX_ATTEMPTS),
                        GuaguaConstants.GUAGUA_ZK_DEFAULT_MAX_ATTEMPTS);
                int retryWaitMsecs = NumberFormatUtils.getInt(
                        props.getProperty(GuaguaConstants.GUAGUA_ZK_RETRY_WAIT_MILLS),
                        GuaguaConstants.GUAGUA_ZK_DEFAULT_RETRY_WAIT_MILLS);

                setZooKeeper(new GuaguaZooKeeper(zkServers, sessionTimeout, maxRetryAttempts, retryWaitMsecs, this));
                // wait to connect successful to zookeeper.
                this.getZkConnLatch().await();
            } catch (IOException e) {
                throw new GuaguaRuntimeException(e);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new GuaguaRuntimeException(e);
            }
        }

        // heartbeat; remember to stop it, see close()
        if(this.zkHeartBeatEnabled) {
            startHeartbeat();
        }
    }

    /**
     * Close resources like heartbeat thread, zookeeper and thread pool. Each resource is released even if releasing
     * a previous one fails.
     * 
     * @throws InterruptedException
     *             if the current thread is interrupted while waiting for a resource to stop.
     */
    protected void close() throws InterruptedException {
        try {
            if(this.zkHeartBeatEnabled) {
                stopHeartBeat();
            }
        } finally {
            try {
                if(getZooKeeper() != null) {
                    getZooKeeper().close();
                }
            } finally {
                shutdownThreadPool();
            }
        }
    }

    /**
     * Shut down the thread pool if it was created by {@link #initialize(Properties)}.
     */
    private void shutdownThreadPool() throws InterruptedException {
        if(this.threadPool != null) {
            this.threadPool.shutdownNow();
            this.threadPool.awaitTermination(2, TimeUnit.SECONDS);
        }
    }

    /**
     * Create and start the daemon heartbeat thread. Does nothing if a heartbeat thread is already running.
     */
    protected void startHeartbeat() {
        if(this.heartBeat != null && this.heartBeat.isAlive()) {
            return;
        }
        this.heartBeat = new HeartBeat();
        this.heartBeat.setDaemon(true);
        this.heartBeat.setName("ZooKeeper HeartBeat");
        // without start() the thread never runs and no heartbeat is ever sent
        this.heartBeat.start();
    }

    /**
     * Stop the heartbeat thread and wait for it to finish; should be invoked in postApplication. Does nothing if the
     * heartbeat thread was never created.
     * 
     * @throws InterruptedException
     *             if the current thread is interrupted while waiting for the heartbeat thread to finish.
     */
    protected void stopHeartBeat() throws InterruptedException {
        if(this.heartBeat == null) {
            return;
        }
        this.heartBeat.setFollow(false);
        this.heartBeat.interrupt();
        this.heartBeat.join(HEART_BEAT_SLEEP_TIME + 1000);
    }

    /**
     * Handle zookeeper events: releases the connection latch once the client is connected; all events with a path
     * are only logged at debug level.
     */
    @Override
    public void process(final WatchedEvent event) {
        LOG.debug("process: Got a new event, path = {}, type = {}, state = {}", event.getPath(), event.getType(),
                event.getState());

        if((event.getPath() == null) && (event.getType() == EventType.None)) {
            if(event.getState() == KeeperState.SyncConnected) {
                LOG.info("process: Asynchronous connection complete.");
                this.getZkConnLatch().countDown();
            } else {
                LOG.warn("process: Got unknown null path event {}.", event);
            }
        }
    }

    /**
     * Set bytes to znode, if bytes is over zookeeper data limit(1MB), use children znodes of {@code splitZnode} to
     * store each part and create {@code znode} with empty data afterwards.
     * 
     * @param znode
     *            znode to create; holds the data directly if it fits into one znode.
     * @param splitZnode
     *            parent znode for the parts if data has to be split.
     * @param bytes
     *            data to store.
     * @param createNode
     *            create mode used for {@code znode} and for the part znodes.
     * @return if result is split.
     * @throws KeeperException
     *             if a zookeeper operation issued by the calling thread fails.
     * @throws InterruptedException
     *             if the current thread is interrupted; in that case {@code znode} is NOT created, so readers never
     *             see a partially written result.
     * @throws RuntimeException
     *             if storing one of the parts fails.
     */
    protected boolean setBytesToZNode(String znode, String splitZnode, byte[] bytes, CreateMode createNode)
            throws KeeperException, InterruptedException {
        LOG.debug("bytes length:{}", bytes.length);
        final int zkDataLimit = GuaguaConstants.GUAGUA_ZK_DATA_LIMIT;

        if(bytes.length <= zkDataLimit) {
            getZooKeeper().createExt(znode, bytes, Ids.OPEN_ACL_UNSAFE, createNode, false);
            return false;
        }

        // TODO don't recursively create split znode to avoid too many requests to zk servers.
        getZooKeeper().createExt(splitZnode, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT, true);

        int childrenSize = (bytes.length % zkDataLimit == 0) ? (bytes.length / zkDataLimit)
                : (bytes.length / zkDataLimit) + 1;

        CompletionService<Integer> completionService = new ExecutorCompletionService<Integer>(this.threadPool);
        for(int i = 0; i < childrenSize; i++) {
            // every part is a full chunk except possibly the last one
            int partLength = Math.min(zkDataLimit, bytes.length - i * zkDataLimit);
            completionService.submit(new SaveResultToZookeeper(bytes, i, partLength, zkDataLimit, splitZnode,
                    createNode));
        }

        // Wait for all parts. An interrupt is propagated on purpose: swallowing it here would end the loop without
        // waiting and then publish the marker znode for an incomplete result.
        for(int finished = 0; finished < childrenSize; finished++) {
            try {
                completionService.take().get();
            } catch (ExecutionException e) {
                throw new RuntimeException(e);
            }
        }

        getZooKeeper().createExt(znode, null, Ids.OPEN_ACL_UNSAFE, createNode, false);
        return true;
    }

    /**
     * Task to store one part of a split result into a child znode named by the part index.
     */
    public class SaveResultToZookeeper implements Callable<Integer> {

        private final byte[] rawBytes;
        private final int index;
        private final int currentLen;
        private final int zkDataLimit;
        private final String splitZnode;
        private final CreateMode createNode;

        /**
         * @param rawBytes
         *            the whole result.
         * @param index
         *            index of the part, the part starts at offset {@code index * zkDataLimit}.
         * @param currentLen
         *            length of the part.
         * @param zkDataLimit
         *            max size of one part.
         * @param splitZnode
         *            parent znode of the part znode.
         * @param createNode
         *            create mode of the part znode.
         */
        public SaveResultToZookeeper(byte[] rawBytes, int index, int currentLen, int zkDataLimit, String splitZnode,
                CreateMode createNode) {
            this.rawBytes = rawBytes;
            this.index = index;
            this.currentLen = currentLen;
            this.zkDataLimit = zkDataLimit;
            this.splitZnode = splitZnode;
            this.createNode = createNode;
        }

        /**
         * Copy the part out of the whole result and store it.
         * 
         * @return index of the stored part.
         */
        @Override
        public Integer call() throws Exception {
            byte[] currentBytes = new byte[currentLen];
            System.arraycopy(rawBytes, index * zkDataLimit, currentBytes, 0, currentBytes.length);
            getZooKeeper().createExt(splitZnode + GuaguaConstants.ZOOKEEPER_SEPARATOR + index, currentBytes,
                    Ids.OPEN_ACL_UNSAFE, createNode, false);
            return index;
        }
    }

    /**
     * This is reverse method to {@link #setBytesToZNode(String, String, byte[], CreateMode)}. Firstly get data from
     * {@code znode}. If data is empty, get data from children of {@code splitZnode} and concatenate them in
     * children order.
     * 
     * @param znode
     *            znode holding the data if it is not split.
     * @param splitZnode
     *            parent znode of the parts if data is split.
     * @return the data, or {@code null} if {@code znode} has no data and {@code splitZnode} has no children.
     * @throws KeeperException
     *             if a zookeeper operation issued by the calling thread fails.
     * @throws InterruptedException
     *             if the current thread is interrupted; no partial data is returned in that case.
     * @throws RuntimeException
     *             if reading one of the parts fails.
     */
    protected byte[] getBytesFromZNode(String znode, String splitZnode) throws KeeperException,
            InterruptedException {
        byte[] data = getZooKeeper().getData(znode, null, null);
        if(data != null) {
            return data;
        }

        final List<String> children = getZooKeeper().getChildrenExt(splitZnode, false, true,
                new ChildrenComparator());
        if(children == null || children.size() == 0) {
            return null;
        }

        CompletionService<BytesPair> completionService = new ExecutorCompletionService<BytesPair>(this.threadPool);
        for(int i = 0; i < children.size(); i++) {
            completionService.submit(new GetSplitBytes(getZooKeeper(), i, children.get(i)));
        }

        // Collect all parts. An interrupt is propagated on purpose: swallowing it here would return a truncated
        // result that cannot be told apart from a valid one.
        List<BytesPair> bytesPairList = new ArrayList<BytesPair>(children.size());
        int wholeLength = 0;
        for(int received = 0; received < children.size(); received++) {
            try {
                BytesPair bp = completionService.take().get();
                // a part znode may have no data, the copy loop below skips such parts as well
                if(bp.bytes != null) {
                    wholeLength += bp.bytes.length;
                }
                bytesPairList.add(bp);
            } catch (ExecutionException e) {
                throw new RuntimeException(e);
            }
        }

        // parts complete in arbitrary order, restore the order of children
        Collections.sort(bytesPairList, new Comparator<BytesPair>() {
            @Override
            public int compare(BytesPair o1, BytesPair o2) {
                return Integer.valueOf(o1.index).compareTo(Integer.valueOf(o2.index));
            }
        });

        byte[] results = new byte[wholeLength];
        int currentLength = 0;
        for(BytesPair bytesPair: bytesPairList) {
            byte[] currentBytes = bytesPair.bytes;
            if(currentBytes != null) {
                System.arraycopy(currentBytes, 0, results, currentLength, currentBytes.length);
                currentLength += currentBytes.length;
            }
        }
        LOG.debug("znode results.length:{}", results.length);
        return results;
    }

    /**
     * Task to read the data of one part znode.
     */
    public static class GetSplitBytes implements Callable<BytesPair> {

        private final GuaguaZooKeeper zookeeper;
        private final int index;
        private final String znode;

        /**
         * @param zookeeper
         *            zookeeper client used to read the data.
         * @param index
         *            position of the part in the whole result.
         * @param znode
         *            znode to read.
         */
        public GetSplitBytes(GuaguaZooKeeper zookeeper, int index, String znode) {
            this.zookeeper = zookeeper;
            this.index = index;
            this.znode = znode;
        }

        /**
         * @return data of the znode together with the index of the part.
         */
        @Override
        public BytesPair call() throws Exception {
            byte[] data = zookeeper.getData(znode, null, null);
            return new BytesPair(index, data);
        }
    }

    /**
     * One part of a split result: position of the part and its data (may be {@code null}).
     */
    private static class BytesPair {

        public int index;
        public byte[] bytes;

        public BytesPair(int index, byte[] bytes) {
            super();
            this.index = index;
            this.bytes = bytes;
        }
    }

    /**
     * Compare int by string inputs.
     */
    private static class ChildrenComparator implements Comparator<String>, Serializable {

        private static final long serialVersionUID = 7871289234100249905L;

        @Override
        public int compare(String s1, String s2) {
            return Integer.valueOf(s1).compareTo(Integer.valueOf(s2));
        }
    }

    public GuaguaZooKeeper getZooKeeper() {
        return zooKeeper;
    }

    public void setZooKeeper(GuaguaZooKeeper zooKeeper) {
        this.zooKeeper = zooKeeper;
    }

    public long getSleepTime() {
        return sleepTime;
    }

    public void setSleepTime(long sleepTime) {
        this.sleepTime = sleepTime;
    }

    public boolean isFixedTime() {
        return isFixedTime;
    }

    public void setFixedTime(boolean isFixedTime) {
        this.isFixedTime = isFixedTime;
    }

    public Serializer<WORKER_RESULT> getWorkerSerializer() {
        return workerSerializer;
    }

    public void setWorkerSerializer(Serializer<WORKER_RESULT> workerSerializer) {
        this.workerSerializer = workerSerializer;
    }

    public Serializer<MASTER_RESULT> getMasterSerializer() {
        return masterSerializer;
    }

    public void setMasterSerializer(Serializer<MASTER_RESULT> masterSerializer) {
        this.masterSerializer = masterSerializer;
    }

    public CountDownLatch getZkConnLatch() {
        return zkConnLatch;
    }

    /**
     * A heartbeat thread to avoid zookeeper session time out: periodically checks existence of the root znode until
     * it is told to stop or is interrupted.
     */
    private class HeartBeat extends Thread {

        private volatile boolean follow = true;

        @Override
        public void run() {
            while(isFollow()) {
                try {
                    Thread.sleep(HEART_BEAT_SLEEP_TIME);
                    LOG.debug("DEBUG: Heartbeat.");
                    Stat exists = getZooKeeper().exists(getRootNode().toString(), false);
                    LOG.debug("DEBUG: Heartbeat {}", exists);
                } catch (InterruptedException e) {
                    // Keep the interrupt state and stop: looping on with the flag set would make every following
                    // sleep fail immediately and spin.
                    Thread.currentThread().interrupt();
                    return;
                } catch (KeeperException.SessionExpiredException e) {
                    throw new GuaguaRuntimeException(e);
                } catch (KeeperException e) {
                    // sample the log output to avoid flooding the log with the same message
                    if(System.nanoTime() % 20 == 0) {
                        LOG.info("Heartbeat zookeeper exception, can be ignored.");
                    }
                }
            }
        }

        public boolean isFollow() {
            return follow;
        }

        public void setFollow(boolean follow) {
            this.follow = follow;
        }
    }

    /**
     * {@link CoordinatorCommand} is used for consistent process of zookeeper coordination.
     */
    public static interface CoordinatorCommand {
        /**
         * Command method.
         */
        void execute();
    }

    /**
     * {@link BasicCoordinatorCommand} is to process exceptions for zookeeper operations.
     */
    public static abstract class BasicCoordinatorCommand implements CoordinatorCommand {

        /**
         * Run {@link #doExecute()}: an interrupt is transferred to the interrupt state of the calling thread, any
         * other exception is wrapped into {@link GuaguaRuntimeException}.
         */
        @Override
        public void execute() {
            try {
                doExecute();
            } catch (InterruptedException e) {
                // transfer interrupt state to caller thread.
                Thread.currentThread().interrupt();
            } catch (Exception e) {
                throw new GuaguaRuntimeException(e);
            }
        }

        /**
         * Real method to do coordinator operation.
         */
        public abstract void doExecute() throws Exception, InterruptedException;
    }

    /**
     * {@link RetryCoordinatorCommand} is used to wrap retry logic. {@link RetryCoordinatorCommand#retryExecution()}
     * will be retried by a fixed sleeping time or an increasing time.
     */
    public abstract static class RetryCoordinatorCommand extends BasicCoordinatorCommand {

        private final long sleepUnitTime;
        private final boolean isFixedTime;
        private final long startTime;

        /**
         * @param isFixedTime
         *            {@code true} to sleep {@code sleepUnitTime} before every attempt, {@code false} to sleep
         *            {@code attempt * sleepUnitTime}.
         * @param sleepUnitTime
         *            sleep unit in milliseconds.
         */
        public RetryCoordinatorCommand(boolean isFixedTime, long sleepUnitTime) {
            this.isFixedTime = isFixedTime;
            this.sleepUnitTime = sleepUnitTime;
            this.startTime = System.currentTimeMillis();
        }

        /**
         * Sleep, then call {@link #retryExecution()}; repeat until it returns {@code true}.
         */
        @Override
        public void doExecute() throws Exception, InterruptedException {
            int attempt = 0;
            do {
                ++attempt;
                long sleepMillis = this.isFixedTime ? this.sleepUnitTime : attempt * this.sleepUnitTime;
                Thread.sleep(sleepMillis);
                if(retryExecution()) {
                    return;
                }
            } while(attempt < Integer.MAX_VALUE);
        }

        /**
         * One attempt of the operation.
         * 
         * @return {@code true} if the operation is done and no more attempts are needed.
         */
        public abstract boolean retryExecution() throws Exception, InterruptedException;

        /**
         * @return milliseconds elapsed since this command was created.
         */
        public long getElapsedTime() {
            return System.currentTimeMillis() - this.startTime;
        }

        /**
         * Return true for {@link #retryExecution()} if all workers are completed, or if {@code minWorkersTimeout}
         * is elapsed and at least {@code (int) (workers * minWorkersRatio)} workers are completed. For 10 or fewer
         * workers the ratio is always treated as 1.
         */
        protected boolean isTerminated(int workersCompleted, int workers, double minWorkersRatio,
                long minWorkersTimeout) {
            if(workers <= 10) {
                minWorkersRatio = 1d;
            }
            LOG.debug("DEBUG: workersCompleted={}, workers={}, minWorkersRatio={}, minWorkersTimeout={}",
                    workersCompleted, workers, minWorkersRatio, minWorkersTimeout);
            return workers == workersCompleted
                    || (getElapsedTime() >= minWorkersTimeout && workersCompleted >= (int) (workers * minWorkersRatio));
        }
    }
}