/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.corona;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Restarts cluster nodes in batches. Nodes queued via {@link #add} are
 * marked ready for restart, at most {@code restartBatch} per
 * {@code restartInterval}, by the background thread in {@link #run}.
 * {@link #checkStatus} is polled for a node and, when that node is ready,
 * removes it from the node manager and tells the caller to restart it.
 */
public class CoronaNodeRestarter extends Thread {
  /** Class logger */
  public static final Log LOG =
      LogFactory.getLog(CoronaNodeRestarter.class);

  /** Restart state of a queued node. */
  enum RestartStatus {
    INITIAL,
    READY,
    DONE,
  }

  /** A queued node together with its restart state. */
  public static class NodeRestartInfo {
    ClusterNode node;
    RestartStatus status;

    NodeRestartInfo(ClusterNode node) {
      this.node = node;
      status = RestartStatus.INITIAL;
    }
  }

  /** Configuration. */
  private final CoronaConf conf;
  /** Node manager used to remove nodes that are about to restart. */
  private final NodeManager nodeManager;
  /** The interval at which each batch will be set to be restarted. */
  private final long restartInterval;
  /** All the nodes will be set to be restarted in batches of this size. */
  private int restartBatch;
  /** Nodes currently queued for restart, guarded by its own monitor. */
  private final List<NodeRestartInfo> workingList;

  public CoronaNodeRestarter(CoronaConf conf, NodeManager nodeManager) {
    this.conf = conf;
    this.nodeManager = nodeManager;
    this.restartBatch = 1000;
    this.restartInterval = conf.getCoronaNodeRestartInterval();
    workingList = new ArrayList<NodeRestartInfo>();
  }

  /**
   * Checks whether the given node is ready to restart. If it is, the node
   * is removed from the node manager and marked DONE.
   *
   * @param nodeToCheck the node reported in the heartbeat
   * @return true if the node should restart itself
   */
  public boolean checkStatus(ClusterNodeInfo nodeToCheck) {
    synchronized (workingList) {
      for (NodeRestartInfo workingNode : workingList) {
        if (workingNode.node.getName().toString().equals(
            nodeToCheck.getName().toString())) {
          if (workingNode.status == RestartStatus.READY) {
            // nodeManager.deleteNode is used instead of
            // ClusterManager.nodeTimeout due to potential deadlock,
            // since that one calls delete()
            nodeManager.deleteNode(workingNode.node);
            workingNode.status = RestartStatus.DONE;
            LOG.info("Notify " + nodeToCheck.getName().toString() +
                " to restart");
            return true;
          }
        }
      }
    }
    return false;
  }

  /**
   * Removes the given node from the working list, if present.
   *
   * @param nodeToDelete the name of the node to remove
   */
  public void delete(String nodeToDelete) {
    synchronized (workingList) {
      Iterator<NodeRestartInfo> workingIt = workingList.iterator();
      while (workingIt.hasNext()) {
        NodeRestartInfo tmpnode = workingIt.next();
        if (tmpnode.node.getName().toString().equals(nodeToDelete)) {
          LOG.info("Delete " + nodeToDelete + " from the working list");
          workingIt.remove();
          break;
        }
      }
    }
  }

  /**
   * Queues nodes for restart and wakes up the restarter thread.
   *
   * @param nodesToKill the nodes to restart
   * @param forceFlag if true, drop any previously queued nodes first
   * @param batchSize the number of nodes to mark ready per interval
   */
  public void add(List<ClusterNode> nodesToKill, boolean forceFlag,
      int batchSize) {
    synchronized (workingList) {
      this.restartBatch = batchSize;
      if (forceFlag) {
        workingList.clear();
      }
      for (ClusterNode node : nodesToKill) {
        workingList.add(new NodeRestartInfo(node));
      }
      // The interrupt doubles as the wake-up signal for both wait() and
      // sleep() in run().
      this.interrupt();
    }
  }

  @Override
  public void run() {
    while (true) {
      synchronized (workingList) {
        try {
          if (workingList.isEmpty()) {
            workingList.wait();
          }
        } catch (InterruptedException e) {
          // Interrupted by add(): fall through and process the new nodes.
        }
        // Mark at most restartBatch nodes as ready for restart.
        int currentIndex = 0;
        int changed = 0;
        while (currentIndex < workingList.size() && changed < restartBatch) {
          NodeRestartInfo workingNode = workingList.get(currentIndex);
          if (workingNode.status == RestartStatus.INITIAL) {
            workingNode.status = RestartStatus.READY;
            LOG.info("set " + workingNode.node.getName().toString() +
                " to be ready for restart");
            changed++;
          }
          currentIndex++;
        }
        // Clean up the nodes that have already been notified.
        Iterator<NodeRestartInfo> workingIt = workingList.iterator();
        while (workingIt.hasNext()) {
          NodeRestartInfo tmpnode = workingIt.next();
          if (tmpnode.status == RestartStatus.DONE) {
            workingIt.remove();
          }
        }
      }
      try {
        Thread.sleep(restartInterval);
      } catch (InterruptedException e) {
        // Interrupted by add(): start the next batch immediately.
      }
    }
  }
}
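
/*
 * Usage sketch (an assumption, not part of the original source): the
 * ClusterManager side would own a single CoronaNodeRestarter, start it
 * once, queue nodes from an admin request, and poll checkStatus() on each
 * node heartbeat. The names clusterManagerConf, restartCandidates and
 * heartbeatNode below are hypothetical placeholders.
 *
 *   CoronaNodeRestarter restarter =
 *       new CoronaNodeRestarter(clusterManagerConf, nodeManager);
 *   restarter.setDaemon(true);
 *   restarter.start();
 *
 *   // Admin asks for a rolling restart, 100 nodes per interval.
 *   restarter.add(restartCandidates, false, 100);
 *
 *   // In the heartbeat handler: tell the node to restart when ready.
 *   if (restarter.checkStatus(heartbeatNode)) {
 *     // respond to the heartbeat with a restart directive
 *   }
 */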