/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.corona; import java.io.IOException; import java.util.Arrays; import java.util.EnumMap; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.codehaus.jackson.JsonNode; import org.codehaus.jackson.JsonParseException; import org.codehaus.jackson.map.JsonMappingException; import org.codehaus.jackson.map.ObjectMapper; /** * Utility class for corona configuration. */ public class CoronaConf extends Configuration { /** Logger. */ public static final Log LOG = LogFactory.getLog(CoronaConf.class); /** The includes file. */ public static final String HOSTS_FILE = "cm.hosts"; /** The excludes file. */ public static final String EXCLUDE_HOSTS_FILE = "cm.hosts.exclude"; /** * The name of the file which will contain the CM's state when it goes for * an upgrade. */ public static final String CM_STATE_FILE = "cm.state"; /** The RPC address of the Cluster Manager. */ public static final String CM_ADDRESS = "cm.server.address"; /** * This boolean property is used to fix whether compression would be used * while saving the CM state or not. While debugging, it is preferable * that this should be false. */ public static final String CM_COMPRESS_STATE = "cm.compress.state"; /** The HTTP UI address for the Cluster Manager. */ public static final String CM_HTTP_ADDRESS = "cm.server.http.address"; /** The RPC address of the Proxy Job Tracker. */ public static final String PROXY_JOB_TRACKER_ADDRESS = "corona.proxy.job.tracker.rpcaddr"; /** The Thrift address of the Proxy Job Tracker. */ public static final String PROXY_JOB_TRACKER_THRIFT_ADDRESS = "corona.proxy.job.tracker.thriftaddr"; /** The interval after which a cluster node is timed out. */ public static final String NODE_EXPIRY_INTERVAL = "cm.node.expiryinterval"; /** Allow unconfigured pools? */ public static final String CONFIGURED_POOLS_ONLY = "cm.configured.pools.only"; /** * The number of sessions that flag node for failed connections after which * the node is considered bad. */ public static final String NODE_MAX_FAILED_CONNECTIONS = "cm.node.max.failed.connections"; /** * The number of failed connections to a node after which a session flags the * node as bad. */ public static final String NODE_MAX_FAILED_CONNECTIONS_SESSION = "cm.node.max.failed.connections.session"; /** * The number of sessions that flag node for failures after which the node is * considered bad. */ public static final String NODE_MAX_FAILURES = "cm.node.max.failures"; /** * The number of failures on a node after which a session flags the node as * bad. */ public static final String NODE_MAX_FAILURES_SESSION = "cm.node.max.failures.session"; /** The interval after which a session is timed out. */ public static final String SESSION_EXPIRY_INTERVAL = "cm.session.expiryinterval"; public static final String CM_NOTIFIER_THREAD_COUNT = "cm.notifier.numnotifiers"; /** How often a notifier thread will poll its queue of tasks. */ public static final String NOTIFIER_POLL_INTERVAL = "cm.notifier.pollinterval"; /** The retry interval factor for a notifier. */ public static final String NOTIFIER_RETRY_INTERVAL_FACTOR = "cm.notifier.retry.interval.factor"; /** The retry interval start for a notifier. */ public static final String NOTIFIER_RETRY_INTERVAL_START = "cm.notifier.retry.interval.start"; /** The max retries for a notifier. */ public static final String NOTIFIER_RETRY_MAX = "cm.notifier.retry.max"; /** JSON configuration specifying the CPU->Resource allocation. */ public static final String CPU_TO_RESOURCE_PARTITIONING = "cm.cpu.to.resource.partitioning"; /** Timeout. */ public static final String CM_SOTIMEOUT = "cm.server.sotimeout"; /** Minimum free memory on a node before scheduling on it. */ public static final String NODE_RESERVED_MEMORY_MB = "cm.node.reserved.memory.mb"; /** Minimum free disk on a node before scheduling on it. */ public static final String NODE_RESERVED_DISK_GB = "cm.node.reserved.disk.gb"; /** Log directory for sessions. */ public static final String SESSIONS_LOG_ROOT = "corona.sessions.log.dir"; /** Maximum number of retired sessions to keep in memory. */ public static final String MAX_RETIRED_SESSIONS = "cm.sessions.num.retired"; // these are left in the mapred.fairscheduler namespace to make sure they are // compatible with the current fairscheduler. client can be expected to send jobs // to corona and/or classic hadoop with same configuration public static final String IMPLICIT_POOL_PROPERTY = "mapred.fairscheduler.poolnameproperty"; /** * In the format of <pool group>.<pool> (i.e. ads.nonsla) * Specifies a default pool group PoolGroupManager.DEFAULT_POOL_GROUP if * the pool group is not specified. * i.e. ads_nonsla -> defaultpoolgroup.ads_nonsla */ public static final String EXPLICIT_POOL_PROPERTY = "mapred.fairscheduler.pool"; /** Where the general config file is stored. */ public static final String CONFIG_FILE_PROPERTY = "cm.config.file"; /** Default general config file location */ public static final String DEFAULT_CONFIG_FILE = "corona.xml"; /** Where the pools config file is stored. */ public static final String POOLS_CONFIG_FILE_PROPERTY = "cm.pools.config.file"; /** * Default pools config file location (same as general config file * by default). */ public static final String DEFAULT_POOLS_CONFIG_FILE = "corona.xml"; /** * Property for specifying the number of ms to wait between pools config * generation (if specified). */ public static final String POOLS_RELOAD_PERIOD_MS_PROPERTY = "cm.pools.reload.period.ms"; /** * Property for specifying the number of ms to wait between pools config * generation (if specified). */ public static final String CONFIG_RELOAD_PERIOD_MS_PROPERTY = "cm.config.reload.period.ms"; /** Class to generate the pools config */ public static final String POOLS_CONFIG_DOCUMENT_GENERATOR_PROPERTY = "cm.pools.config.document.generator"; /** number of task trackers restarted in one batch */ public static final String CORONA_NODE_RESTART_BATCH = "corona.node.restart.batch"; /** interval for restarting task trackers batches */ public static final String CORONA_NODE_RESTART_INTERVAL = "corona.node.restart.interval"; /** The max time CM will wait for JT heartbeat to be in sync */ public static final String CM_HEARTBEAT_DELAY_MAX = "cm.heartbeat.delay.max"; private Map<Integer, Map<ResourceType, Integer>> cachedCpuToResourcePartitioning = null; public CoronaConf(Configuration conf) { super(conf); } public int getCMSoTimeout() { return getInt(CM_SOTIMEOUT, 60*1000); } public String getClusterManagerAddress() { return get(CM_ADDRESS, "localhost:8888"); } public String getClusterManagerHttpAddress() { return get(CM_HTTP_ADDRESS, "localhost:0"); } public String getProxyJobTrackerAddress() { return get(PROXY_JOB_TRACKER_ADDRESS , "localhost:50035"); } public String getProxyJobTrackerThriftAddress() { return get(PROXY_JOB_TRACKER_THRIFT_ADDRESS, "localhost:50036"); } public static String getClusterManagerAddress(Configuration conf) { return conf.get(CM_ADDRESS, "localhost:8888"); } public int getNodeExpiryInterval() { return getInt(NODE_EXPIRY_INTERVAL, 10 * 60 * 1000); } public String getSessionsLogDir() { return get(SESSIONS_LOG_ROOT, "/tmp/history"); } public int getNumRetiredSessions() { return getInt(MAX_RETIRED_SESSIONS, 1000); } public int getMaxSessionsPerDir() { return getInt("corona.history.max.per.dir", 1000); } public long getLogDirRotationInterval() { return getLong("corona.history.roll.period", 60L * 60 * 1000); } public int getSessionExpiryInterval() { int val = getInt(SESSION_EXPIRY_INTERVAL, 0); if (val != 0) return val; // if the session expiry interval is not specified then we compute // one based on the exponential backoff intervals of the session // notification retries val = getNotifierRetryIntervalStart(); int factor = getNotifierRetryIntervalFactor(); for(int i=1; i<getNotifierRetryMax(); i++) { val += val*factor; } return val; } public int getNotifierPollInterval() { return getInt(NOTIFIER_POLL_INTERVAL, 1000); } public int getNotifierRetryIntervalFactor() { return getInt(NOTIFIER_RETRY_INTERVAL_FACTOR, 4); } public int getNotifierRetryIntervalStart() { return getInt(NOTIFIER_RETRY_INTERVAL_START, 5000); } public int getNotifierRetryMax() { return getInt(NOTIFIER_RETRY_MAX, 5); } /** * Get and cache the cpu to resource partitioning for this object. * * @return Mapping of cpu to resources (cached) */ public Map<Integer, Map<ResourceType, Integer>> getCpuToResourcePartitioning() { if (cachedCpuToResourcePartitioning == null) { cachedCpuToResourcePartitioning = getUncachedCpuToResourcePartitioning(this); } return cachedCpuToResourcePartitioning; } /** * Determine the cpu to resource partitioning for a configuration * * @param conf Configuration with the cpu to resource partitioning * @return Mapping of cpu to resources */ public static Map<Integer, Map<ResourceType, Integer>> getUncachedCpuToResourcePartitioning(Configuration conf) { String jsonStr = conf.get(CPU_TO_RESOURCE_PARTITIONING, ""); Map<Integer, Map<ResourceType, Integer>> ret = new HashMap<Integer, Map<ResourceType, Integer>>(); try { ObjectMapper mapper = new ObjectMapper(); JsonNode rootNode = mapper.readValue(jsonStr, JsonNode.class); Iterator<String> iter = rootNode.getFieldNames(); while (iter.hasNext()) { String field = iter.next(); Integer numCpu = Integer.parseInt(field); if ((numCpu < 0) || (numCpu > 64)) { throw new RuntimeException( "Number of CPUs: " + numCpu + " is not in range 0-64"); } JsonNode val = rootNode.get(field); if (!val.isObject()) { throw new RuntimeException( "Resource Partitioning: " + val.toString() + " is not a object"); } Map<ResourceType, Integer> resourcePartition = null; Iterator<String> valIter = val.getFieldNames(); while (valIter.hasNext()) { String resourceTypeString = valIter.next(); JsonNode resourceVal = val.get(resourceTypeString); int resourceSlots = 0; if (!resourceVal.isInt() || ((resourceSlots = resourceVal.getIntValue()) < 0) || resourceSlots > 64) { throw new RuntimeException( "Resource Partition value: " + resourceVal.toString() + " is not a valid number"); } if (resourcePartition == null) { resourcePartition = new EnumMap<ResourceType, Integer>(ResourceType.class); } try { ResourceType resourceType = ResourceType.valueOf(resourceTypeString); resourcePartition.put(resourceType, new Integer(resourceSlots)); } catch (IllegalArgumentException e) { throw new IllegalArgumentException( "Cannot correctly parse resource type " + resourceTypeString + ", must be one of " + Arrays.toString(ResourceType.values())); } } if (resourcePartition != null) { ret.put(numCpu, resourcePartition); } } return ret; } catch (JsonParseException e) { LOG.error(jsonStr + " is not a valid value for option: " + CPU_TO_RESOURCE_PARTITIONING); throw new RuntimeException(e); } catch (JsonMappingException e) { LOG.error(jsonStr + " is not a valid value for option: " + CPU_TO_RESOURCE_PARTITIONING); throw new RuntimeException(e); } catch (IOException e) { LOG.error(jsonStr + " is not a valid value for option: " + CPU_TO_RESOURCE_PARTITIONING); throw new RuntimeException(e); } } /** * Get the pool info. In order to support previous behavior, a single pool * name is accepted. * @return Pool info, using a default pool group if the * explicit pool can not be found */ public PoolInfo getPoolInfo() { String poolNameProperty = get(IMPLICIT_POOL_PROPERTY, "user.name"); String explicitPool = get(EXPLICIT_POOL_PROPERTY, get(poolNameProperty, "")).trim(); String[] poolInfoSplitString = explicitPool.split("[.]"); if (poolInfoSplitString != null && poolInfoSplitString.length == 2) { return new PoolInfo(poolInfoSplitString[0], poolInfoSplitString[1]); } else if (!explicitPool.isEmpty()) { return new PoolInfo(PoolGroupManager.DEFAULT_POOL_GROUP, explicitPool); } else { return PoolGroupManager.DEFAULT_POOL_INFO; } } public int getNodeReservedMemoryMB() { return getInt(NODE_RESERVED_MEMORY_MB, 0); } public int getNodeReservedDiskGB() { return getInt(NODE_RESERVED_DISK_GB, 0); } /** * @return The number of sessions that report too many failed connections in * order to blacklist a node. */ public int getMaxFailedConnections() { return getInt(NODE_MAX_FAILED_CONNECTIONS, 20); } /** * @return The number of failed connections to a node encountered by a session * in order for it to count towards blacklisting the node. */ public int getMaxFailedConnectionsPerSession() { return getInt(NODE_MAX_FAILED_CONNECTIONS_SESSION, 1); } /** * @return The number of sessions that report too many failures in order to * blacklist a node. */ public int getMaxFailures() { return getInt(NODE_MAX_FAILURES, 40); } /** * @return The number of failures encountered by a session in order for it to * count towards blacklisting the node. */ public int getMaxFailuresPerSession() { return getInt(NODE_MAX_FAILURES_SESSION, 5); } public String getHostsFile() { return get(HOSTS_FILE, ""); } public String getExcludesFile() { return get(EXCLUDE_HOSTS_FILE, ""); } /** * Get the address of the file used to save the state of the ClusterManager * when it goes down for an upgrade * * @return A String, containing the address of the file used to save the * ClusterManager state. */ public String getCMStateFile() { return get(CM_STATE_FILE, "cm.state"); } /** * Return the flag which indicates if we will be using compression while * saving the ClusterManager state. * * @return A boolean, which is true if we are going to use compression while * saving the CM state. */ public boolean getCMCompressStateFlag() { return getBoolean(CM_COMPRESS_STATE, false); } public int getCMNotifierThreadCount() { return getInt(CM_NOTIFIER_THREAD_COUNT, 17); } /** * Get the general config file location * * @return General config file location (default if not set) */ public String getConfigFile() { return get(CONFIG_FILE_PROPERTY, DEFAULT_CONFIG_FILE); } /** * Get the pools config file location * * @return Pools config file location (default if not set) */ public String getPoolsConfigFile() { return get(POOLS_CONFIG_FILE_PROPERTY, DEFAULT_POOLS_CONFIG_FILE); } /** * Only allow configured pools? * * @return True if only configured pools is allowed, false otherwise */ public boolean onlyAllowConfiguredPools() { return getBoolean(CONFIGURED_POOLS_ONLY, false); } /** * Get the milliseconds to wait between trying to generate pools config * * @return Milliseconds to wait between trying to generate pools config */ public long getPoolsReloadPeriodMs() { // Default of 5 minutes return getLong(POOLS_RELOAD_PERIOD_MS_PROPERTY, 5 * 60000); } /** * Get the milliseconds to wait between reloading config files * * @return Milliseconds to wait between reloading config files */ public long getConfigReloadPeriodMs() { // Default of 1 minute return getLong(CONFIG_RELOAD_PERIOD_MS_PROPERTY, 60000); } /** * Get the pools config document generator class * * @return Null if not specified, otherwise the generator class. */ public Class<?> getPoolsConfigDocumentGeneratorClass() { return getClass(POOLS_CONFIG_DOCUMENT_GENERATOR_PROPERTY, null); } public int getCoronaNodeRestartBatch() { return getInt(CORONA_NODE_RESTART_BATCH, 1000); } public long getCoronaNodeRestartInterval() { return getLong(CORONA_NODE_RESTART_INTERVAL, 1800000L); } public long getCMHeartbeatDelayMax() { return getLong(CM_HEARTBEAT_DELAY_MAX, 600000); } }