/*
 * ProActive Parallel Suite(TM):
 * The Open Source library for parallel and distributed
 * Workflows & Scheduling, Orchestration, Cloud Automation
 * and Big Data Analysis on Enterprise Grids & Clouds.
 *
 * Copyright (c) 2007 - 2017 ActiveEon
 * Contact: contact@activeeon.com
 *
 * This library is free software: you can redistribute it and/or
 * modify it under the terms of the GNU Affero General Public License
 * as published by the Free Software Foundation: version 3 of
 * the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * If needed, contact us to obtain a release under GPL Version 2 or 3
 * or a different license than the AGPL.
 */
package org.ow2.proactive.resourcemanager.nodesource.infrastructure;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

import org.objectweb.proactive.core.node.Node;
import org.ow2.proactive.resourcemanager.exception.RMException;
import org.ow2.proactive.resourcemanager.nodesource.common.Configurable;
import org.ow2.proactive.utils.FileToBytesConverter;


/**
 * Abstract infrastructure Manager implementation based on hosts list file.
 * <p>
 * The hosts file is parsed by {@link #readHosts(File)} into a map of "free" hosts, each
 * associated with the number of nodes to deploy on it. {@link #acquireNode()} takes one
 * free host out of that map and deploys on it through
 * {@link #startNodeImpl(InetAddress, int, List)}, retrying on failure. When the last
 * registered node of a host is removed, the host is put back into the free hosts map.
 * <p>
 * Concrete subclasses provide {@link #startNodeImpl(InetAddress, int, List)} and
 * {@link #killNodeImpl(Node, InetAddress)}.
 */
public abstract class HostsFileBasedInfrastructureManager extends InfrastructureManager {

    /** Default value, in milliseconds, of {@link #nodeTimeOut}. */
    public static final int DEFAULT_NODE_TIMEOUT = 60 * 1000;

    /** Default value of {@link #maxDeploymentFailure}. */
    public static final int DEFAULT_NODE_DEPLOYMENT_FAILURE_THRESHOLD = 5;

    /** Default value, in milliseconds, of {@link #waitBetweenDeploymentFailures}. */
    public static final long DEFAULT_WAIT_TIME_BETWEEN_NODE_DEPLOYMENT_FAILURES = 5000;

    @Configurable(fileBrowser = true, description = "Absolute path of the file containing\nthe list of remote hosts")
    protected File hostsList;

    @Configurable(description = "in ms. After this timeout expired\nthe node is considered to be lost")
    protected int nodeTimeOut = HostsFileBasedInfrastructureManager.DEFAULT_NODE_TIMEOUT;

    @Configurable(description = "Maximum number of failed attempt to deploy on \na host before discarding it")
    protected int maxDeploymentFailure = HostsFileBasedInfrastructureManager.DEFAULT_NODE_DEPLOYMENT_FAILURE_THRESHOLD;

    @Configurable(description = "Milliseconds to wait after each failed attempt to deploy on \na host")
    protected long waitBetweenDeploymentFailures = HostsFileBasedInfrastructureManager.DEFAULT_WAIT_TIME_BETWEEN_NODE_DEPLOYMENT_FAILURES;

    /**
     * Map of free hosts with the number of nodes to deploy on each host.
     */
    private ConcurrentHashMap<InetAddress, Integer> freeHosts = new ConcurrentHashMap<>();

    /**
     * Nodes for which {@link #notifyAcquiredNode(Node)} has been run, keyed by node name
     * and mapped to the address of the host they run on.
     */
    private Hashtable<String, InetAddress> registeredNodes = new Hashtable<>();

    /**
     * Number of nodes previously removed, per host.
     */
    final ConcurrentHashMap<InetAddress, AtomicInteger> removedNodes = new ConcurrentHashMap<>();

    /**
     * Per deploying node URL, a flag set to {@code true} when the deploying node has been
     * declared lost. Used to notify the control loop of the deploying node timeout.
     */
    protected ConcurrentHashMap<String, Boolean> pnTimeout = new ConcurrentHashMap<>();

    /**
     * Starts an acquisition on every available host, by calling {@link #acquireNode()}
     * until no free host is left.
     */
    @Override
    public void acquireAllNodes() {
        while (!freeHosts.isEmpty()) {
            acquireNode();
        }
    }

    /**
     * Takes one host out of the free hosts map and asynchronously deploys its nodes on it.
     * <p>
     * The deployment itself runs through {@code nodeSource.executeInParallel}; failures are
     * logged and not propagated. If no free host is available the call only logs a message.
     */
    @Override
    public void acquireNode() {
        Iterator<Map.Entry<InetAddress, Integer>> iterator = freeHosts.entrySet().iterator();
        // Ask the iterator itself rather than size(): next() can then never be called
        // on an exhausted iterator.
        if (!iterator.hasNext()) {
            logger.info("Attempting to acquire nodes while all hosts are already deployed.");
            return;
        }

        final Map.Entry<InetAddress, Integer> tmpEntry = iterator.next();
        iterator.remove();
        final InetAddress tmpHost = tmpEntry.getKey();
        final int nbNodes = tmpEntry.getValue();

        logger.info("Acquiring a new node. #freeHosts:" + freeHosts.size() + " #registered: " +
                    registeredNodes.size());

        this.nodeSource.executeInParallel(new Runnable() {
            @Override
            public void run() {
                try {
                    startNodeImplWithRetries(tmpHost, nbNodes, maxDeploymentFailure);
                    logger.debug("Node acquisition ended. #freeHosts:" + freeHosts.size() + " #registered: " +
                                 registeredNodes.size());
                } catch (Exception e) {
                    String description = "Could not acquire node on host " + tmpHost +
                                         ". NS's state refreshed regarding last checked exception: #freeHosts:" +
                                         freeHosts.size() + " #registered: " + registeredNodes.size();
                    logger.error(description, e);
                }
            }
        });
    }

    /**
     * Configures the infrastructure.
     * <ul>
     * <li>parameters[0] = hosts list file content (as a {@code byte[]})</li>
     * <li>parameters[1] = timeout of the node deployment</li>
     * <li>parameters[2] = max deployment failure</li>
     * <li>parameters[3] = wait time between failures</li>
     * </ul>
     * Numeric parameters that cannot be parsed fall back to their default value, with a
     * warning.
     *
     * @param parameters the configuration values, in the order described above
     * @throws IllegalArgumentException if fewer than four parameters are given, or if the
     *                                  hosts file content cannot be written or parsed
     */
    @Override
    protected void configure(Object... parameters) {
        if (parameters == null || parameters.length < 4) {
            throw new IllegalArgumentException("Not enough parameter provided to the infrastructure.");
        }

        int index = 0;

        try {
            byte[] bytes = (byte[]) parameters[index++];
            this.hostsList = File.createTempFile("hosts", "list");
            try {
                FileToBytesConverter.convertByteArrayToFile(bytes, this.hostsList);
                readHosts(this.hostsList);
            } finally {
                // The temporary file is only a parsing vehicle: remove it even when
                // the conversion or the parsing fails.
                this.hostsList.delete();
            }
        } catch (Exception e) {
            throw new IllegalArgumentException("Could not read hosts file", e);
        }

        try {
            this.nodeTimeOut = Integer.parseInt(parameters[index++].toString());
        } catch (NumberFormatException e) {
            logger.warn("Number format exception occurred at ns configuration, default acq timeout value set: " +
                        HostsFileBasedInfrastructureManager.DEFAULT_NODE_TIMEOUT + "ms");
            this.nodeTimeOut = HostsFileBasedInfrastructureManager.DEFAULT_NODE_TIMEOUT;
        }

        try {
            this.maxDeploymentFailure = Integer.parseInt(parameters[index++].toString());
        } catch (NumberFormatException e) {
            logger.warn("Number format exception occurred at ns configuration, default attemp value set: " +
                        HostsFileBasedInfrastructureManager.DEFAULT_NODE_DEPLOYMENT_FAILURE_THRESHOLD);
            this.maxDeploymentFailure = HostsFileBasedInfrastructureManager.DEFAULT_NODE_DEPLOYMENT_FAILURE_THRESHOLD;
        }

        try {
            // The field is a long: parse it as such so that values above
            // Integer.MAX_VALUE are accepted instead of silently replaced by the default.
            this.waitBetweenDeploymentFailures = Long.parseLong(parameters[index++].toString());
        } catch (NumberFormatException e) {
            logger.warn("Number format exception occurred at ns configuration, default wait time between failures value set: " +
                        HostsFileBasedInfrastructureManager.DEFAULT_WAIT_TIME_BETWEEN_NODE_DEPLOYMENT_FAILURES);
            this.waitBetweenDeploymentFailures = HostsFileBasedInfrastructureManager.DEFAULT_WAIT_TIME_BETWEEN_NODE_DEPLOYMENT_FAILURES;
        }
    }

    /**
     * Internal host file parser.
     * <p>
     * File format:
     * one host per line, optionally followed by whitespace and an integer describing the
     * maximum number of runtimes (1 if not specified, not a number, or lower than 1).
     * Blank lines are skipped. Example:
     * <pre>
     * example.com
     * example.org 5
     * example.net 3
     * </pre>
     * Each resolved host is added to the free hosts map; a host listed several times keeps
     * the count of its first occurrence.
     *
     * @param f the file from which hosts names are to be extracted
     * @throws IOException      if the file cannot be read
     * @throws RuntimeException if a host name cannot be resolved
     */
    protected void readHosts(File f) throws IOException {
        try (BufferedReader in = new BufferedReader(new FileReader(f))) {
            String line;
            while ((line = in.readLine()) != null) {
                String trimmed = line.trim();
                if (trimmed.isEmpty()) {
                    continue;
                }

                // Split on any whitespace run so that tabs, repeated spaces or a leading
                // indent do not yield empty tokens.
                String[] elts = trimmed.split("\\s+");

                int num = 1;
                if (elts.length > 1) {
                    try {
                        num = Integer.parseInt(elts[1]);
                        if (num < 1) {
                            throw new IllegalArgumentException("Cannot launch less than one runtime per host.");
                        }
                    } catch (IllegalArgumentException e) {
                        // Covers NumberFormatException too: fall back to one node.
                        logger.warn("Error while parsing hosts file: " + e.getMessage(), e);
                        num = 1;
                    }
                }

                String host = elts[0];
                try {
                    InetAddress addr = InetAddress.getByName(host);
                    this.freeHosts.putIfAbsent(addr, num);
                } catch (UnknownHostException ex) {
                    throw new RuntimeException("Unknown host: " + host, ex);
                }
            }
        }
    }

    /**
     * This method is called by Infrastructure Manager in case of a deploying node removal.
     * We take advantage of it to flag the deploying node as timed out, so that the remote
     * process control loop can detect the removal (see {@link #anyTimedOut(List)}).
     *
     * @param pnURL the URL of the lost deploying node
     */
    @Override
    protected void notifyDeployingNodeLost(String pnURL) {
        this.pnTimeout.put(pnURL, Boolean.TRUE);
    }

    /**
     * Parent IM notifies about a new node registration: the node is recorded together with
     * the address of its host.
     *
     * @param node the newly registered node
     */
    @Override
    protected void notifyAcquiredNode(Node node) throws RMException {
        String nodeName = node.getNodeInformation().getName();
        this.registeredNodes.put(nodeName, node.getVMInformation().getInetAddress());
        if (logger.isDebugEnabled()) {
            logger.debug("New expected node registered: #freeHosts:" + freeHosts.size() + " #registered: " +
                         registeredNodes.size());
        }
    }

    /**
     * {@inheritDoc}
     * <p>
     * The node is unregistered and counted as removed for its host. When no registered
     * node is left on that host, {@link #killNodeImpl(Node, InetAddress)} is called and
     * the host is put back into the free hosts map with the number of removed nodes.
     * A node unknown to this infrastructure manager is only reported in the logs.
     */
    @Override
    public void removeNode(Node node) {
        String nodeName = node.getNodeInformation().getName();
        InetAddress host = registeredNodes.remove(nodeName);

        if (host == null) {
            logger.error("Node " + nodeName + " is not known as a node belonging to this infrastructure manager");
            return;
        }

        logger.debug("Removing node " + node.getNodeInformation().getURL() + " from " +
                     this.getClass().getSimpleName());

        // remember the node removed
        removedNodes.putIfAbsent(host, new AtomicInteger(0));
        removedNodes.get(host).incrementAndGet();

        // in case all nodes relative to this host were removed kill the JVM
        if (!registeredNodes.containsValue(host)) {
            try {
                this.killNodeImpl(node, host);
            } catch (Exception e) {
                logger.trace("An exception occurred during node removal", e);
            }
            // the host becomes available again, for as many nodes as were removed from it
            freeHosts.putIfAbsent(host, removedNodes.remove(host).intValue());
        }

        logger.info("Node " + nodeName + " removed. #freeHosts:" + freeHosts.size() + " #registered nodes: " +
                    registeredNodes.size());
    }

    /**
     * Re-registers a node that reconnects, provided removed nodes are still accounted for
     * on its host; the removed nodes counter of the host is decremented accordingly.
     *
     * @param node the node that reconnected
     */
    @Override
    public void onDownNodeReconnection(Node node) {
        InetAddress host = node.getNodeInformation().getVMInformation().getInetAddress();

        // Yes, this method may experience race conditions
        // like most of the other methods of this class...
        // See https://github.com/ow2-proactive/scheduling/issues/2811
        AtomicInteger nbNodesRemoved = removedNodes.get(host);
        if (nbNodesRemoved != null) {
            nbNodesRemoved.decrementAndGet();
            registeredNodes.put(node.getNodeInformation().getName(), host);
        }
    }

    /**
     * Tells whether at least one of the given deploying nodes has been flagged as lost.
     * URLs that are not tracked are considered as not timed out.
     *
     * @param nodesUrl the deploying node URLs to check
     * @return {@code true} if any of the URLs is flagged as timed out
     */
    protected boolean anyTimedOut(List<String> nodesUrl) {
        for (String nodeUrl : nodesUrl) {
            // get() returns null for an untracked URL: compare instead of unboxing.
            if (Boolean.TRUE.equals(pnTimeout.get(nodeUrl))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Stops tracking the timeout flag of the given deploying nodes.
     *
     * @param nodesUrl the deploying node URLs to forget
     */
    protected void removeTimeouts(List<String> nodesUrl) {
        for (String nodeUrl : nodesUrl) {
            pnTimeout.remove(nodeUrl);
        }
    }

    /**
     * Starts tracking the given deploying nodes, initially flagged as not timed out.
     *
     * @param nodesUrl the deploying node URLs to track
     */
    protected void addTimeouts(List<String> nodesUrl) {
        for (String pnUrl : nodesUrl) {
            this.pnTimeout.put(pnUrl, false);
        }
    }

    /**
     * Deploys nodes on a host, retrying after each failure.
     * <p>
     * After a failed attempt the deploying nodes created by that attempt are removed and
     * the thread waits {@link #waitBetweenDeploymentFailures} milliseconds before trying
     * again.
     *
     * @param host    the host on which the nodes are deployed
     * @param nbNodes number of nodes to deploy
     * @param retries number of additional attempts allowed after a failure; {@code -1}
     *                retries indefinitely
     * @throws RMException if the deployment still fails once the retries are exhausted
     */
    protected void startNodeImplWithRetries(final InetAddress host, final int nbNodes, int retries)
            throws RMException {
        while (true) {
            final List<String> depNodeURLs = new ArrayList<>(nbNodes);
            try {
                startNodeImpl(host, nbNodes, depNodeURLs);
                return;
            } catch (Exception e) {
                logger.warn("Failed nodes deployment in host : " + host + ", retries left : " + retries);
                if (isInfiniteRetries(retries) || retries > 0) {
                    removeNodes(depNodeURLs);
                    waitPeriodBeforeRetry();
                    retries = getRetriesLeft(retries);
                } else {
                    logger.error("Tries threshold reached for host " + host +
                                 ". This host is not part of the deployment process anymore.");
                    throw e;
                }
            }
        }
    }

    /**
     * @param retries the remaining retries
     * @return {@code true} if the value stands for "retry indefinitely"
     */
    private boolean isInfiniteRetries(int retries) {
        return retries == -1;
    }

    /**
     * Sleeps for {@link #waitBetweenDeploymentFailures} milliseconds. An interruption ends
     * the wait early and restores the interrupted status of the thread.
     */
    private void waitPeriodBeforeRetry() {
        try {
            Thread.sleep(waitBetweenDeploymentFailures);
        } catch (InterruptedException e1) {
            // NOTE(review): once the flag is restored, later sleeps return immediately,
            // so remaining retries run without delay - confirm this is intended.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * @param retries the remaining retries
     * @return the remaining retries once an attempt has been consumed; values that are not
     *         strictly positive are returned unchanged
     */
    private int getRetriesLeft(int retries) {
        return (retries > 0) ? retries - 1 : retries;
    }

    /**
     * Removes the given deploying nodes through {@code internalRemoveDeployingNode}.
     *
     * @param depNodeURLs URLs of the deploying nodes to remove
     */
    private void removeNodes(List<String> depNodeURLs) {
        for (String node : depNodeURLs) {
            internalRemoveDeployingNode(node);
        }
    }

    /**
     * Launch the node on the host passed as parameter
     * @param host The host on which one the node will be started
     * @param nbNodes number of nodes to deploy
     * @param depNodeURLs list of deploying or lost nodes urls created
     * @throws RMException If the node hasn't been started. Very important to take care of that
     * in implementations to keep the infrastructure in a coherent state.
     */
    protected abstract void startNodeImpl(InetAddress host, int nbNodes, List<String> depNodeURLs)
            throws RMException;

    /**
     * Kills the node passed as parameter
     * @param node The node to kill
     * @param host the address recorded for this node when it was registered
     * @throws RMException if a problem occurred while removing
     */
    protected abstract void killNodeImpl(Node node, InetAddress host) throws RMException;
}