package com.skp.experiment.common;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.skp.experiment.common.parameter.DefaultOptionCreator;

/**
 * Helper utilities for HDFS paths and Hadoop cluster status.
 * @author doyoung
 */
public class HadoopClusterUtil {
  private static final Logger log = LoggerFactory.getLogger(HadoopClusterUtil.class);

  public static int CLUSTER_SIZE_DEV = 10;
  public static int CLUSTER_SIZE_PROD = 30;
  public static int MAP_TASKS_PER_NODE = 7;
  public static int REDUCER_TASKS_PER_NODE = 7;
  public static long DEFAULT_INPUT_SPLIT_SIZE = 1024L * 1024L * 64L;       // 64 MB
  public static long PHYSICAL_MEMORY_LIMIT = 14L * 1024L * 1024L * 1024L;  // 14 GB
  public static long DEFAULT_HEAP_SIZE = 2L * 1024L * 1024L * 1024L;       // 2 GB

  /** Fetches the current {@link ClusterStatus} via the (deprecated) old-API JobClient. */
  @SuppressWarnings("deprecation")
  public static ClusterStatus getClusterStatus(Configuration conf) throws IOException {
    return new JobClient(new JobConf(conf, HadoopClusterUtil.class)).getClusterStatus();
  }

  /** Returns the number of live task trackers in the cluster. */
  public static int getNumberOfTaskTrackers(Configuration conf) throws IOException {
    int numTaskTrackers = getClusterStatus(conf).getTaskTrackers();
    log.info("Task Trackers Num: {}", numTaskTrackers);
    return numTaskTrackers;
  }

  /** Returns the cluster-wide maximum number of concurrent map tasks. */
  public static int getMaxMapTasks(Configuration conf) throws IOException {
    int maxMapTasks = getClusterStatus(conf).getMaxMapTasks();
    log.info("Max Map Tasks On This Cluster: {}", maxMapTasks);
    return maxMapTasks;
  }

  /** Returns the total size in bytes of the given HDFS path, using a default Configuration. */
  public static long getHdfsPathSize(Path input) throws IOException {
    return getHdfsPathSize(new Configuration(), input);
  }

  /** Returns the total size in bytes of the given HDFS path. */
  public static long getHdfsPathSize(Configuration conf, Path input) throws IOException {
    long size = FileSystem.get(conf).getContentSummary(input).getLength();
    log.info("HDFS Path Size: {}\t{}", input.toString(), size);
    return size;
  }

  /**
   * Returns the split size that spreads the input evenly over the cluster's
   * maximum number of concurrent map tasks (one split per map slot).
   */
  public static long getMinInputSplitSizeMax(Configuration conf, Path input) throws IOException {
    long pathSize = getHdfsPathSize(conf, input);
    long minSplitSize = (long) Math.ceil(pathSize / (double) getMaxMapTasks(conf));
    log.info("HDFS Min Split Size: {}\t{}", input.toString(), minSplitSize);
    return minSplitSize;
  }

  /**
   * Returns the split size that spreads the input evenly over the task
   * trackers (one split per node).
   */
  public static long getMinInputSplitSizeMin(Configuration conf, Path input) throws IOException {
    int taskTrackerNums = getNumberOfTaskTrackers(conf);
    long pathSize = getHdfsPathSize(conf, input);
    long minSplitSize = (long) Math.ceil(pathSize / (double) taskTrackerNums);
    log.info("HDFS Min Split Size: {}\t{}", input.toString(), minSplitSize);
    return minSplitSize;
  }

  /**
   * Returns the smallest power-of-two block size such that one block per
   * task tracker covers the whole input; defaults to 64 MB if none is found.
   */
  public static long getMaxBlockSize(Configuration conf, Path input) throws IOException {
    long blockSize = DEFAULT_INPUT_SPLIT_SIZE;
    long inputSize = getHdfsPathSize(conf, input);
    int taskTrackerNum = getNumberOfTaskTrackers(conf);
    for (int pow = 1; pow < 40; pow++) {
      long currentBlockSize = (long) Math.pow(2, pow);
      if (currentBlockSize * taskTrackerNum >= inputSize) {
        blockSize = currentBlockSize;
        break;
      }
    }
    log.info("BlockSize: {}\t{}", input.toString(), blockSize);
    return blockSize;
  }
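  /*
   * Usage sketch (hypothetical driver code, not part of this class): size a
   * job's input splits from the cluster shape before submitting. The input
   * path below is made up for illustration; "mapred.min.split.size" is the
   * old-API property these sizing helpers are meant to feed.
   *
   *   Configuration conf = new Configuration();
   *   Path input = new Path("/data/input");  // hypothetical input path
   *   // One split per map slot keeps every slot busy exactly once:
   *   long splitSize = HadoopClusterUtil.getMinInputSplitSizeMax(conf, input);
   *   conf.setLong("mapred.min.split.size", splitSize);
   */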
  /** Deletes all "part*" files and directories directly under the given directory. */
  public static void deletePartFiles(Configuration conf, Path dir) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] files = fs.globStatus(new Path(dir.toString() + "/part*"));
    for (FileStatus file : files) {
      fs.delete(file.getPath(), true);
    }
  }

  /** Writes the given string to the given HDFS path, followed by a newline. */
  public static void writeToHdfs(Configuration conf, Path output, String outputString) throws IOException {
    writeToHdfs(conf, output, outputString, true);
  }

  /** Writes the given string to the given HDFS path, overwriting any existing file. */
  public static void writeToHdfs(Configuration conf, Path output, String outputString, boolean newline)
      throws IOException {
    FSDataOutputStream out = null;
    try {
      FileSystem fs = FileSystem.get(conf);
      out = fs.create(fs.makeQualified(output));
      if (newline) {
        out.writeBytes(outputString + DefaultOptionCreator.NEWLINE);
      } else {
        out.writeBytes(outputString);
      }
    } finally {
      if (out != null) {
        out.close();
      }
    }
  }

  /** Renames (moves) an HDFS path. */
  public static void renamePath(Configuration conf, Path input, Path output) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    fs.rename(input, output);
  }

  /**
   * Derives a "part-m-NNNNN" / "part-r-NNNNN" style file name from the
   * task attempt id ("mapred.task.id") in the given configuration.
   */
  public static String getAttemptId(Configuration conf) throws IllegalArgumentException {
    if (conf == null) {
      throw new NullPointerException("conf is null");
    }
    String taskId = conf.get("mapred.task.id");
    if (taskId == null) {
      throw new IllegalArgumentException("Configuration does not contain the property mapred.task.id");
    }
    // Attempt ids look like "attempt_<jobid>_<taskid>_<m|r>_<part>_<attempt>".
    String[] parts = taskId.split("_");
    if (parts.length != 6 || !parts[0].equals("attempt")
        || (!"m".equals(parts[3]) && !"r".equals(parts[3]))) {
      throw new IllegalArgumentException("TaskAttemptId string : " + taskId + " is not properly formed");
    }
    return "part-" + parts[3] + "-" + parts[4];
  }
}
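/*
 * Usage sketch (hypothetical task-side code, for illustration only):
 * getAttemptId() turns the framework-supplied "mapred.task.id" into a
 * per-task file name, which pairs naturally with writeToHdfs() for writing
 * side output without tasks clobbering each other. The output directory
 * below is made up.
 *
 *   // Inside a running map or reduce task, conf carries mapred.task.id:
 *   String partName = HadoopClusterUtil.getAttemptId(conf);   // e.g. "part-m-000001"
 *   Path sideFile = new Path("/tmp/side-output", partName);   // hypothetical dir
 *   HadoopClusterUtil.writeToHdfs(conf, sideFile, "per-task summary line");
 */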