/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.mapred; import java.io.IOException; import java.net.InetSocketAddress; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.mapred.protocal.FairSchedulerProtocol; import org.apache.hadoop.mapreduce.TaskType; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.util.StringUtils; /** * Moves slots between two MapReduce clusters which runs TaskTrackers on the * same set of machines */ public class HourGlass implements Runnable { static { Configuration.addDefaultResource("hour-glass.xml"); } public final static String SERVERS_KEY = "mapred.hourglass.fairscheduler.servers"; public final static String WEIGHTS_KEY = "mapred.hourglass.fairscheduler.weights"; public final static String MAX_MAP_KEY = "mapred.hourglass.map.tasks.maximum"; public final static String MAX_REDUCE_KEY = "mapred.hourglass.reduce.tasks.maximum"; public final static String CPU_MAP_KEY = "mapred.hourglass.cpus.to.maptasks"; public final static String CPU_REDUCE_KEY = "mapred.hourglass.cpus.to.reducetasks"; public final static String INTERVAL_KEY = "mapred.hourglass.update.interval"; public final static String SHARE_THRESHOLD_KEY = "mapred.hourglass.share.threshold"; // The if the share is lower than this threshold, the cluster gets 0 slots float shareThreshold = 0.01F; public static Log LOG = LogFactory.getLog(HourGlass.class); long updateInterval = 10000L; volatile boolean running = true; Configuration conf; Cluster clusters[] = new Cluster[2]; // Stores the initial maximum slot limit loaded from the conf int defaultMaxMapSlots; int defaultMaxReduceSlots; // Stores the initial #CPU to maximum slot limit loaded from the conf Map<Integer, Integer> defaultCpuToMaxMapSlots = null; Map<Integer, Integer> defaultCpuToMaxReduceSlots = null; final static TaskType MAP_AND_REDUCE[] = new TaskType[] {TaskType.MAP, TaskType.REDUCE}; public HourGlass(Configuration conf) throws IOException { this.conf = conf; defaultMaxMapSlots = conf.getInt(MAX_MAP_KEY, Integer.MAX_VALUE); defaultMaxReduceSlots = conf.getInt(MAX_REDUCE_KEY, Integer.MAX_VALUE); defaultCpuToMaxMapSlots = loadCpuToMaxSlots(conf, TaskType.MAP); defaultCpuToMaxReduceSlots = loadCpuToMaxSlots(conf, TaskType.REDUCE); shareThreshold = conf.getFloat(SHARE_THRESHOLD_KEY, shareThreshold); try { String config; config = conf.get(SERVERS_KEY); String addresses[] = config.replaceAll("\\s", "").split(","); config = conf.get(WEIGHTS_KEY); double weights[] = new double[2]; String str[] = config.replaceAll("\\s", "").split(","); weights[0] = Double.parseDouble(str[0]); weights[1] = Double.parseDouble(str[1]); if (weights[0] < 0 || weights[1] < 0 || (weights[0] == 0 && weights[1] == 0)) { throw new IOException(); } clusters[0] = new Cluster(addresses[0], weights[0], conf); clusters[1] = new Cluster(addresses[1], weights[1], conf); } catch (Exception e) { String msg = "Must assign exactly two server addresses and " + "the corresponding positive weights in hour-glass.xml"; LOG.error(msg); throw new IOException(msg); } updateInterval = conf.getLong(WEIGHTS_KEY, updateInterval); } public Map<Integer, Integer> loadCpuToMaxSlots( Configuration conf, TaskType type) { String config = type == TaskType.MAP ? conf.get(CPU_MAP_KEY) : conf.get(CPU_REDUCE_KEY); Map<Integer, Integer> defaultCpuToMaxSlots = new HashMap<Integer, Integer>(); if (config != null) { for (String s : config.replaceAll("\\s", "").split(",")) { String pair[] = s.split(":"); int cpus = Integer.parseInt(pair[0]); int tasks = Integer.parseInt(pair[1]); LOG.info(String.format( "Number of CPUs to tasks. %s CPU : %s %s", cpus, tasks, type)); defaultCpuToMaxSlots.put(cpus, tasks); } } return defaultCpuToMaxSlots; } /** * Hold the states of one MapReduce cluster */ static class Cluster { FairSchedulerProtocol client; Map<String, TaskTrackerStatus> taskTrackers = new HashMap<String, TaskTrackerStatus>(); String address; double weight; // Higher weight will get more share int runnableMaps; // Runnable maps on the cluster int runnableReduces; // Runnable reduces on the cluster double targetMapShare; // The share of maps to achieve double targetReduceShare; // The share of reduces to achieve Cluster(String address, double weight, Configuration conf) throws IOException { this.client = createClient(address, conf); this.weight = weight; this.address = address; } /** * Obtain the cluster information from RPC * @throws IOException */ void updateClusterStatus() throws IOException { taskTrackers.clear(); for (TaskTrackerStatus status : client.getTaskTrackerStatus()) { String host = NetUtils.normalizeHostName(status.getHost()); taskTrackers.put(host, status); } runnableMaps = client.getRunnableTasks(TaskType.MAP); runnableReduces = client.getRunnableTasks(TaskType.REDUCE); LOG.info(String.format("Update cluster status. " + "cluster:%s runnableMaps:%s runnableReduces:%s", address, runnableMaps, runnableReduces)); } /** * Set the maximum slot of a tasktracker * @param tracker The status of the tasktracker to set * @param type The type of the task to set * @param slots The number of slots to set * @throws IOException */ void setFSMaxSlots(TaskTrackerStatus tracker, TaskType type, int slots) throws IOException { client.setFSMaxSlots(tracker.getTrackerName(), type, slots); } /** * Obtain the maximum slots of a tasktracker of one cluster * @param status The status of the tasktracker * @param type The type of the task * @return The number of slots of the type on the TT * @throws IOException */ int getMaxSlots(TaskTrackerStatus status, TaskType type) throws IOException { return client.getMaxSlots(status, type); } } /** * Update the task share of the clusters * @param clusters Two clusters with tasktrackers shares same nodes */ static private void updateShares(Cluster clusters[]) { assert(clusters.length == 2); if (clusters[0].runnableMaps == 0 && clusters[0].runnableMaps == 0 && clusters[1].runnableReduces == 0 && clusters[1].runnableReduces == 0) { // Do nothing if both clusters are empty return; } // Update target task shares using runnable tasks and weight if (!(clusters[0].runnableMaps == 0 && clusters[1].runnableMaps == 0)) { clusters[0].targetMapShare = clusters[0].runnableMaps * clusters[0].weight / (clusters[0].runnableMaps * clusters[0].weight + clusters[1].runnableMaps * clusters[1].weight); clusters[1].targetMapShare = 1 - clusters[0].targetMapShare; } if (!(clusters[0].runnableReduces == 0 && clusters[1].runnableReduces == 0)) { clusters[0].targetReduceShare = clusters[0].runnableReduces * clusters[0].weight / (clusters[0].runnableReduces * clusters[0].weight + clusters[1].runnableReduces * clusters[1].weight); clusters[1].targetReduceShare = 1 - clusters[0].targetReduceShare; } for (int i = 0; i < 2; ++i) { LOG.info(String.format("Update Shares. " + "cluster%s:%s runnableMaps:%s runnableReduces:%s " + "weight:%s targetMapShare:%s targetReduceShare:%s", i, clusters[i].address, clusters[i].weight, clusters[i].runnableMaps, clusters[i].runnableReduces, clusters[i].targetMapShare, clusters[i].targetReduceShare)); } } /** * Keep moving slots between two clusters according to their runnable tasks. * These clusters are assumed to run tasktrackers on the same set of machines */ @Override public void run() { long lastUpdate = -1L; // Start balancing the clusters while (running) { try { Thread.sleep(updateInterval / 10); long now = JobTracker.getClock().getTime(); if (now - lastUpdate > updateInterval) { lastUpdate = now; doMoveSlots(clusters); } } catch (Exception e) { LOG.error("Exception while balancing cluster.", e); } } } /** * Move slots on each tasktracker between two clusters such that their share * of the slots meets the target share. * @param clusters Two clusters * @throws IOException */ private void doMoveSlots(Cluster clusters[]) throws IOException { // Obtain the new status of the clusters clusters[0].updateClusterStatus(); clusters[1].updateClusterStatus(); // Compute the target shares of the clusters updateShares(clusters); TaskTrackerStatus taskTrackers[] = new TaskTrackerStatus[2]; int currentTasks[] = new int[2]; int maxTasks[] = new int[2]; int occupiedTasks[] = new int[2]; double targetShares[] = new double[2]; Set<String> allTaskTrackers = new HashSet<String>(); allTaskTrackers.addAll(clusters[0].taskTrackers.keySet()); allTaskTrackers.addAll(clusters[1].taskTrackers.keySet()); // Set the slots for each TaskTracker to achieve the target share for (String host : allTaskTrackers) { boolean inBothClusters = true; for (int i = 0; i < 2; ++i) { // Check if the host is in both clusters if (!clusters[i].taskTrackers.containsKey(host)) { inBothClusters = false; LOG.warn(String.format( "%s is in cluster%s:%s but not int cluster%s:%s", 1 - i, clusters[1 - i].address, i, clusters[i].address)); TaskTrackerStatus status = clusters[1 - i].taskTrackers.get(host); // If it is only in one cluster, this cluster gets all slots for (TaskType type : MAP_AND_REDUCE) { int totalSlots = getTotalSlots(status, type); int maxSlots = clusters[1 - i].getMaxSlots(status, type); if (maxSlots < totalSlots) { clusters[1 - i].setFSMaxSlots(status, type, totalSlots); } } } } if (!inBothClusters) { continue; } // Both the clusters have this host taskTrackers[0] = clusters[0].taskTrackers.get(host); taskTrackers[1] = clusters[1].taskTrackers.get(host); for (TaskType type : MAP_AND_REDUCE) { int totalSlots = getTotalSlots(taskTrackers[0], type); // Compute the free slots from occupiedTasks and maxTasks maxTasks[0] = clusters[0].getMaxSlots(taskTrackers[0], type); maxTasks[1] = clusters[1].getMaxSlots(taskTrackers[1], type); if (type == TaskType.MAP) { occupiedTasks[0] = taskTrackers[0].countOccupiedMapSlots(); occupiedTasks[1] = taskTrackers[1].countOccupiedMapSlots(); targetShares[0] = clusters[0].targetMapShare; targetShares[1] = clusters[1].targetMapShare; } else { occupiedTasks[0] = taskTrackers[0].countOccupiedReduceSlots(); occupiedTasks[1] = taskTrackers[1].countOccupiedReduceSlots(); targetShares[0] = clusters[0].targetReduceShare; targetShares[1] = clusters[1].targetReduceShare; } currentTasks[0] = Math.max(occupiedTasks[0], maxTasks[0]); currentTasks[1] = Math.max(occupiedTasks[1], maxTasks[1]); int freeSlots = totalSlots - currentTasks[0] - currentTasks[1]; // Determine where the slots should flow int dst = (maxTasks[0] / (0.01 + targetShares[0]) < maxTasks[1] / (0.01 + targetShares[1])) ? 0 : 1; int src = 1 - dst; // If there are free slots, give them to the destination cluster if (freeSlots > 0) { clusters[dst].setFSMaxSlots( taskTrackers[dst], type, maxTasks[dst] + freeSlots); LOG.info(String.format("Increase %s %s for cluster%s on %s. " + "maxTasks%s:%s maxTasks%s:%s " + "occupiedTasks%s:%s occupiedTasks%s:%s", freeSlots, type, dst, host, dst, maxTasks[dst], src, maxTasks[src], dst, occupiedTasks[dst], src, occupiedTasks[src])); } // Release more slots from source cluster if necessary int targetSlots = targetShares[src] > shareThreshold ? (int)Math.ceil(totalSlots * targetShares[src]) : 0; if (maxTasks[src] > targetSlots) { clusters[src].setFSMaxSlots(taskTrackers[src], type, targetSlots); LOG.info(String.format("Release %s %s for cluster%s on %s. " + "maxTasks%s:%s maxTasks%s:%s " + "occupiedTasks%s:%s occupiedTasks%s:%s", maxTasks[src] - targetSlots, type, src, host, src, maxTasks[src], dst, maxTasks[dst], src, occupiedTasks[src], dst, occupiedTasks[dst])); } } } } /** * Stop the loop that balancing the cluster */ public void stop() { running = false; } /** * Obtain the two clusters combined total slots of a tasktracker */ private int getTotalSlots( TaskTrackerStatus status, TaskType type) { Map<Integer, Integer> defaultCpuToMaxSlots = (type == TaskType.MAP) ? defaultCpuToMaxMapSlots : defaultCpuToMaxReduceSlots; int cpus = status.getResourceStatus().getNumProcessors(); Integer slots = defaultCpuToMaxSlots.get(cpus); if (slots == null) { slots = (type == TaskType.MAP) ? defaultMaxMapSlots : defaultMaxReduceSlots; } int taskTrackerSlots = (type == TaskType.MAP) ? status.getMaxMapSlots() : status.getMaxReduceSlots(); return Math.min(slots, taskTrackerSlots); } /** * Create a FariScheduler RPC client * @param target The host:port of the RPC server * @param conf The configuration * @return The FairScheduler client * @throws IOException */ private static FairSchedulerProtocol createClient( String target, Configuration conf) throws IOException { InetSocketAddress addr = NetUtils.createSocketAddr(target); UserGroupInformation ugi = UserGroupInformation.getCurrentUGI(); LOG.info("Connecting to " + addr); return (FairSchedulerProtocol) RPC.getProxy(FairSchedulerProtocol.class, FairSchedulerProtocol.versionID, addr, ugi, conf, NetUtils.getSocketFactory(conf, FairSchedulerProtocol.class)); } /** * Start the HourGlass process */ public static void main(String argv[]) { StringUtils.startupShutdownMessage(HourGlass.class, argv, LOG); try { HourGlass hourGlass = new HourGlass(new Configuration()); hourGlass.run(); } catch (Throwable e) { LOG.fatal(StringUtils.stringifyException(e)); System.exit(-1); } } }