/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.backend.hadoop.executionengine.tez.util; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.JobSubmissionFiles; import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.split.JobSplitWriter; import org.apache.hadoop.yarn.api.records.LocalResource; import org.apache.hadoop.yarn.api.records.LocalResourceType; import org.apache.hadoop.yarn.api.records.LocalResourceVisibility; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.util.ConverterUtils; import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler; import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRConfiguration; import org.apache.pig.classification.InterfaceAudience; import org.apache.tez.dag.api.TezConfiguration; import org.apache.tez.dag.api.Vertex; import org.apache.tez.dag.library.vertexmanager.ShuffleVertexManager; import org.apache.tez.mapreduce.hadoop.DeprecatedKeys; import org.apache.tez.mapreduce.hadoop.InputSplitInfo; import org.apache.tez.mapreduce.hadoop.InputSplitInfoDisk; import org.apache.tez.mapreduce.hadoop.InputSplitInfoMem; @InterfaceAudience.Private public class MRToTezHelper { private static final Log LOG = LogFactory.getLog(MRToTezHelper.class); private static final String JOB_SPLIT_RESOURCE_NAME = MRJobConfig.JOB_SPLIT; private static final String JOB_SPLIT_METAINFO_RESOURCE_NAME = MRJobConfig.JOB_SPLIT_METAINFO; private static Map<String, String> mrAMParamToTezAMParamMap = new HashMap<String, String>(); private static Map<String, String> mrMapParamToTezVertexParamMap = new HashMap<String, String>(); private static Map<String, String> mrReduceParamToTezVertexParamMap = new HashMap<String, String>(); private static List<String> mrSettingsToRetain = new ArrayList<String>(); private static List<String> mrSettingsToRemove = new ArrayList<String>(); private MRToTezHelper() { } static { populateMRToTezParamsMap(); populateMRSettingsToRetain(); populateMRSettingsToRemove(); } private static void populateMRToTezParamsMap() { //AM settings mrAMParamToTezAMParamMap.put(MRJobConfig.MR_AM_VMEM_MB, TezConfiguration.TEZ_AM_RESOURCE_MEMORY_MB); mrAMParamToTezAMParamMap.put(MRJobConfig.MR_AM_CPU_VCORES, TezConfiguration.TEZ_AM_RESOURCE_CPU_VCORES); mrAMParamToTezAMParamMap.put(MRJobConfig.MR_AM_MAX_ATTEMPTS, TezConfiguration.TEZ_AM_MAX_APP_ATTEMPTS); mrAMParamToTezAMParamMap.put(MRConfiguration.JOB_CREDENTIALS_BINARY, TezConfiguration.TEZ_CREDENTIALS_PATH); mrAMParamToTezAMParamMap.put(MRJobConfig.JOB_CANCEL_DELEGATION_TOKEN, TezConfiguration.TEZ_CANCEL_DELEGATION_TOKENS_ON_COMPLETION); //Map settings mrMapParamToTezVertexParamMap.put(MRJobConfig.MAP_MAX_ATTEMPTS, TezConfiguration.TEZ_AM_TASK_MAX_FAILED_ATTEMPTS); mrMapParamToTezVertexParamMap.put(MRJobConfig.MAP_SPECULATIVE, TezConfiguration.TEZ_AM_SPECULATION_ENABLED); mrMapParamToTezVertexParamMap.put(MRJobConfig.MAP_LOG_LEVEL, TezConfiguration.TEZ_TASK_LOG_LEVEL); //TezConfiguration.TEZ_AM_VERTEX_MAX_TASK_CONCURRENCY TEZ-2914 in Tez 0.8 mrMapParamToTezVertexParamMap.put("mapreduce.job.running.map.limit", "tez.am.vertex.max-task-concurrency"); //TezConfiguration.TEZ_TASK_PROGRESS_STUCK_INTERVAL_MS TEZ-808 in Tez 0.8 mrMapParamToTezVertexParamMap.put(MRJobConfig.TASK_TIMEOUT, "tez.am.progress.stuck.interval-ms"); //Reduce settings mrReduceParamToTezVertexParamMap.put(MRJobConfig.REDUCE_MAX_ATTEMPTS, TezConfiguration.TEZ_AM_TASK_MAX_FAILED_ATTEMPTS); mrReduceParamToTezVertexParamMap.put(MRJobConfig.REDUCE_SPECULATIVE, TezConfiguration.TEZ_AM_SPECULATION_ENABLED); mrReduceParamToTezVertexParamMap.put(MRJobConfig.REDUCE_LOG_LEVEL, TezConfiguration.TEZ_TASK_LOG_LEVEL); mrReduceParamToTezVertexParamMap.put("mapreduce.job.running.reduce.limit", "tez.am.vertex.max-task-concurrency"); mrReduceParamToTezVertexParamMap.put(MRJobConfig.TASK_TIMEOUT, "tez.am.progress.stuck.interval-ms"); } private static void populateMRSettingsToRetain() { // FileInputFormat mrSettingsToRetain.add(FileInputFormat.INPUT_DIR); mrSettingsToRetain.add(FileInputFormat.SPLIT_MAXSIZE); mrSettingsToRetain.add(FileInputFormat.SPLIT_MINSIZE); mrSettingsToRetain.add(FileInputFormat.PATHFILTER_CLASS); mrSettingsToRetain.add(FileInputFormat.NUM_INPUT_FILES); mrSettingsToRetain.add(FileInputFormat.INPUT_DIR_RECURSIVE); // FileOutputFormat mrSettingsToRetain.add(MRConfiguration.OUTPUT_BASENAME); mrSettingsToRetain.add(FileOutputFormat.COMPRESS); mrSettingsToRetain.add(FileOutputFormat.COMPRESS_CODEC); mrSettingsToRetain.add(FileOutputFormat.COMPRESS_TYPE); mrSettingsToRetain.add(FileOutputFormat.OUTDIR); mrSettingsToRetain.add(FileOutputCommitter.SUCCESSFUL_JOB_OUTPUT_DIR_MARKER); } private static void populateMRSettingsToRemove() { // FileInputFormat.listStatus() on a task can cause job failure when run from Oozie mrSettingsToRemove.add(MRJobConfig.MAPREDUCE_JOB_CREDENTIALS_BINARY); mrSettingsToRemove.add(MRJobConfig.CACHE_ARCHIVES); mrSettingsToRemove.add(MRJobConfig.CACHE_ARCHIVES_SIZES); mrSettingsToRemove.add(MRJobConfig.CACHE_ARCHIVES_TIMESTAMPS); mrSettingsToRemove.add(MRJobConfig.CACHE_ARCHIVES_VISIBILITIES); mrSettingsToRemove.add(MRJobConfig.CACHE_FILES); mrSettingsToRemove.add(MRJobConfig.CACHE_FILES_SIZES); mrSettingsToRemove.add(MRJobConfig.CACHE_FILE_TIMESTAMPS); mrSettingsToRemove.add(MRJobConfig.CACHE_FILE_VISIBILITIES); mrSettingsToRemove.add(MRJobConfig.CLASSPATH_FILES); } private static void removeUnwantedSettings(Configuration tezConf, boolean isAMConf) { // It is good to clean up as much of the unapplicable settings as possible. // Tez has configs set on multiple places AM, DAG, Vertex, VertexManager // Plugin, Tasks (Processor, Edge, every input and output, combiner) // If conf size is bigger, it places heavy pressurce on AM memory and is // inefficient while sending over RPC to tasks for (String mrSetting : mrSettingsToRemove) { tezConf.unset(mrSetting); } Iterator<Entry<String, String>> iter = new Configuration(tezConf).iterator(); while (iter.hasNext()) { String key = iter.next().getKey(); if (!isAMConf) { // Keep the setting in AM conf to be able to connect back to the // Oozie launcher job and look at the parameter values passed, // but get rid of for others if (key.startsWith("oozie.")) { tezConf.unset(key); continue; } } if (key.startsWith("yarn.nodemanager")) { tezConf.unset(key); } else if (key.startsWith("mapreduce.jobhistory")) { tezConf.unset(key); } else if (key.startsWith("mapreduce.jobtracker")) { tezConf.unset(key); } else if (key.startsWith("mapreduce.tasktracker")) { tezConf.unset(key); } } } public static void translateMRSettingsForTezAM(TezConfiguration dagAMConf) { convertMRToTezConf(dagAMConf, dagAMConf, DeprecatedKeys.getMRToDAGParamMap()); convertMRToTezConf(dagAMConf, dagAMConf, mrAMParamToTezAMParamMap); String env = dagAMConf.get(MRJobConfig.MR_AM_ADMIN_USER_ENV); if (dagAMConf.get(MRJobConfig.MR_AM_ENV) != null) { env = (env == null) ? dagAMConf.get(MRJobConfig.MR_AM_ENV) : env + "," + dagAMConf.get(MRJobConfig.MR_AM_ENV); } if (env != null) { dagAMConf.setIfUnset(TezConfiguration.TEZ_AM_LAUNCH_ENV, env); } dagAMConf.setIfUnset(TezConfiguration.TEZ_AM_LAUNCH_CMD_OPTS, org.apache.tez.mapreduce.hadoop.MRHelpers .getJavaOptsForMRAM(dagAMConf)); String queueName = dagAMConf.get(JobContext.QUEUE_NAME, YarnConfiguration.DEFAULT_QUEUE_NAME); dagAMConf.setIfUnset(TezConfiguration.TEZ_QUEUE_NAME, queueName); dagAMConf.setIfUnset(TezConfiguration.TEZ_AM_VIEW_ACLS, dagAMConf.get(MRJobConfig.JOB_ACL_VIEW_JOB, MRJobConfig.DEFAULT_JOB_ACL_VIEW_JOB)); dagAMConf.setIfUnset(TezConfiguration.TEZ_AM_MODIFY_ACLS, dagAMConf.get(MRJobConfig.JOB_ACL_MODIFY_JOB, MRJobConfig.DEFAULT_JOB_ACL_MODIFY_JOB)); // Hardcoding at AM level instead of setting per vertex till TEZ-2710 is available dagAMConf.setIfUnset(TezConfiguration.TEZ_TASK_SCALE_MEMORY_RESERVE_FRACTION, "0.5"); removeUnwantedSettings(dagAMConf, true); } /** * Set config with Scope.Vertex in TezConfiguration on the vertex * * @param vertex Vertex on which config is to be set * @param isMapVertex Whether map or reduce vertex. i.e root or intermediate/leaf vertex * @param conf Config that contains the tez or equivalent mapreduce settings. */ public static void setVertexConfig(Vertex vertex, boolean isMapVertex, Configuration conf) { Map<String, String> configMapping = isMapVertex ? mrMapParamToTezVertexParamMap : mrReduceParamToTezVertexParamMap; for (Entry<String, String> dep : configMapping.entrySet()) { String value = conf.get(dep.getValue(), conf.get(dep.getKey())); if (value != null) { vertex.setConf(dep.getValue(), value); LOG.debug("Setting " + dep.getValue() + " to " + value + " for the vertex " + vertex.getName()); } } } /** * Process the mapreduce configuration settings and * - copy as is the still required ones (like those used by FileInputFormat/FileOutputFormat) * - convert and set equivalent tez runtime settings * - handle compression related settings * * @param tezConf Configuration on which the mapreduce settings will have to be transferred * @param mrConf Configuration that contains mapreduce settings */ public static void processMRSettings(Configuration tezConf, Configuration mrConf) { for (String mrSetting : mrSettingsToRetain) { if (mrConf.get(mrSetting) != null) { tezConf.set(mrSetting, mrConf.get(mrSetting)); } } JobControlCompiler.configureCompression(tezConf); convertMRToTezConf(tezConf, mrConf, DeprecatedKeys.getMRToTezRuntimeParamMap()); removeUnwantedSettings(tezConf, false); // ShuffleVertexManager Plugin settings // DeprecatedKeys.getMRToTezRuntimeParamMap() only translates min and not max String slowStartFraction = mrConf.get(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART); if (slowStartFraction != null) { tezConf.setIfUnset(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_MIN_SRC_FRACTION, slowStartFraction); tezConf.setIfUnset(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_MAX_SRC_FRACTION, slowStartFraction); } } /** * Convert MR settings to Tez settings and set on conf. * * @param tezConf Configuration on which MR equivalent Tez settings should be set * @param mrConf Configuration that contains MR settings * @param mrToTezConfigMapping Mapping of MR config to equivalent Tez config */ private static void convertMRToTezConf(Configuration tezConf, Configuration mrConf, Map<String, String> mrToTezConfigMapping) { for (Entry<String, String> dep : mrToTezConfigMapping.entrySet()) { if (mrConf.get(dep.getKey()) != null) { if (tezConf.get(dep.getValue()) == null) { if (LOG.isDebugEnabled()) { LOG.debug("Setting " + dep.getValue() + " to " + mrConf.get(dep.getKey()) + " from MR setting " + dep.getKey()); } tezConf.set(dep.getValue(), mrConf.get(dep.getKey())); } tezConf.unset(dep.getKey()); } } } /** * Write input splits (job.split and job.splitmetainfo) to disk */ public static InputSplitInfoDisk writeInputSplitInfoToDisk( InputSplitInfoMem infoMem, Path inputSplitsDir, JobConf jobConf, FileSystem fs) throws IOException, InterruptedException { InputSplit[] splits = infoMem.getNewFormatSplits(); JobSplitWriter.createSplitFiles(inputSplitsDir, jobConf, fs, splits); return new InputSplitInfoDisk( JobSubmissionFiles.getJobSplitFile(inputSplitsDir), JobSubmissionFiles.getJobSplitMetaFile(inputSplitsDir), splits.length, infoMem.getTaskLocationHints(), jobConf.getCredentials()); } /** * Exact copy of private method from from org.apache.tez.mapreduce.hadoop.MRInputHelpers * * Update provided localResources collection with the required local * resources needed by MapReduce tasks with respect to Input splits. * * @param fs Filesystem instance to access status of splits related files * @param inputSplitInfo Information on location of split files * @param localResources LocalResources collection to be updated * @throws IOException */ public static void updateLocalResourcesForInputSplits( FileSystem fs, InputSplitInfo inputSplitInfo, Map<String, LocalResource> localResources) throws IOException { if (localResources.containsKey(JOB_SPLIT_RESOURCE_NAME)) { throw new RuntimeException("LocalResources already contains a" + " resource named " + JOB_SPLIT_RESOURCE_NAME); } if (localResources.containsKey(JOB_SPLIT_METAINFO_RESOURCE_NAME)) { throw new RuntimeException("LocalResources already contains a" + " resource named " + JOB_SPLIT_METAINFO_RESOURCE_NAME); } FileStatus splitFileStatus = fs.getFileStatus(inputSplitInfo.getSplitsFile()); FileStatus metaInfoFileStatus = fs.getFileStatus(inputSplitInfo.getSplitsMetaInfoFile()); localResources.put(JOB_SPLIT_RESOURCE_NAME, LocalResource.newInstance( ConverterUtils.getYarnUrlFromPath(inputSplitInfo.getSplitsFile()), LocalResourceType.FILE, LocalResourceVisibility.APPLICATION, splitFileStatus.getLen(), splitFileStatus.getModificationTime())); localResources.put(JOB_SPLIT_METAINFO_RESOURCE_NAME, LocalResource.newInstance( ConverterUtils.getYarnUrlFromPath( inputSplitInfo.getSplitsMetaInfoFile()), LocalResourceType.FILE, LocalResourceVisibility.APPLICATION, metaInfoFileStatus.getLen(), metaInfoFileStatus.getModificationTime())); } }