package com.linkedin.camus.sweeper;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.joda.time.DateTimeZone;

import com.linkedin.camus.sweeper.mapreduce.CamusSweeperJob;
import com.linkedin.camus.sweeper.utils.PriorityExecutor;
import com.linkedin.camus.sweeper.utils.PriorityExecutor.Important;
import com.linkedin.camus.sweeper.utils.Utils;


/**
 * Sweeps Camus output: finds topic directories under the source dir, asks the configured
 * {@link CamusSweeperPlanner} for sweep jobs, and runs each one as a MapReduce job that rolls
 * the input files up into the destination dir.
 */
public class CamusSweeper extends Configured implements Tool {
  protected static final String DEFAULT_NUM_THREADS = "5";
  protected static final String CAMUS_SWEEPER_PRIORITY_LIST = "camus.sweeper.priority.list";
  private static final String MAX_FILES = "max.files";
  private static final int DEFAULT_MAX_FILES = 24;
  private static final String REDUCER_COUNT = "reducer.count";
  private static final int DEFAULT_REDUCER_COUNT = 45;
  private static final String MAPRED_MIN_SPLIT_SIZE = "mapred.min.split.size";
  private static final String MAPRED_MAX_SPLIT_SIZE = "mapred.max.split.size";
  private static final String TMP_PATH = "tmp.path";
  static final String INPUT_PATHS = "input.paths";
  static final String DEST_PATH = "dest.path";

  protected List<SweeperError> errorMessages;
  protected List<Job> runningJobs;
  protected Properties props;
  protected FileSystem fileSystem;
  protected ExecutorService executorService;
  protected FsPermission perm = new FsPermission(FsAction.ALL, FsAction.READ_EXECUTE, FsAction.READ_EXECUTE);
  protected String destSubdir;
  protected String sourceSubdir;

  private static Logger log = Logger.getLogger(CamusSweeper.class);

  protected CamusSweeperPlanner planner;
  protected Map<String, Integer> priorityTopics = new HashMap<String, Integer>();

  public CamusSweeper() {
    props = new Properties();
  }

  public CamusSweeper(Properties props) {
    this.props = props;
    init();
  }

  private void init() {
    for (String str : props.getProperty(CAMUS_SWEEPER_PRIORITY_LIST, "").split(",")) {
      String[] tokens = str.split("=");
      String topic = tokens[0];
      int priority = tokens.length > 1 ? Integer.parseInt(tokens[1]) : 1;
      priorityTopics.put(topic, priority);
    }
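    // These lists are shared with collector tasks running on the executor's worker threads, so keep them synchronized.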
    this.errorMessages = Collections.synchronizedList(new ArrayList<SweeperError>());
    DateTimeZone.setDefault(DateTimeZone.forID(props.getProperty("default.timezone")));
    this.runningJobs = Collections.synchronizedList(new ArrayList<Job>());

    sourceSubdir = props.getProperty("camus.sweeper.source.subdir");
    destSubdir = props.getProperty("camus.sweeper.dest.subdir");

    try {
      planner = ((CamusSweeperPlanner) Class.forName(props.getProperty("camus.sweeper.planner.class")).newInstance())
          .setPropertiesLogger(props, log);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  public void cancel() throws Exception {
    executorService.shutdownNow();
    for (Job hadoopJob : runningJobs) {
      if (!hadoopJob.isComplete()) {
        try {
          hadoopJob.killJob();
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }
  }

  public Map<FileStatus, String> findAllTopics(Path input, PathFilter filter, String topicSubdir, FileSystem fs)
      throws IOException {
    Map<FileStatus, String> topics = new HashMap<FileStatus, String>();
    for (FileStatus f : fs.listStatus(input)) {
      // skipping first level, in search of the topic subdir
      findAllTopics(f.getPath(), filter, topicSubdir, "", fs, topics);
    }
    return topics;
  }

  private void findAllTopics(Path input, PathFilter filter, String topicSubdir, String topicNameSpace, FileSystem fs,
      Map<FileStatus, String> topics) throws IOException {
    for (FileStatus f : fs.listStatus(input)) {
      if (f.isDir()) {
        String topicFullName =
            (topicNameSpace.isEmpty() ? "" : topicNameSpace + ".") + f.getPath().getParent().getName();
        if (f.getPath().getName().equals(topicSubdir) && filter.accept(f.getPath().getParent())) {
          topics.put(fs.getFileStatus(f.getPath().getParent()), topicFullName);
        } else {
          findAllTopics(f.getPath(), filter, topicSubdir, topicFullName, fs, topics);
        }
      }
    }
  }

  protected void createExecutorService() {
    int numThreads = Integer.parseInt(props.getProperty("num.threads", DEFAULT_NUM_THREADS));
    executorService = new PriorityExecutor(numThreads);
  }

  public void run() throws Exception {
    log.info("Starting kafka sweeper");
    createExecutorService();

    String fromLocation = props.getProperty("camus.sweeper.source.dir");
    String destLocation = props.getProperty("camus.sweeper.dest.dir", "");
    String tmpLocation = props.getProperty("camus.sweeper.tmp.dir", "");

    if (destLocation.isEmpty())
      destLocation = fromLocation;

    if (tmpLocation.isEmpty())
      tmpLocation = "/tmp";
    props.setProperty("camus.sweeper.tmp.dir", tmpLocation);

    log.info("fromLocation: " + fromLocation);
    log.info("destLocation: " + destLocation);

    List<String> blacklist = Utils.getStringList(props, "camus.sweeper.blacklist");
    List<String> whitelist = Utils.getStringList(props, "camus.sweeper.whitelist");

    Configuration conf = new Configuration();
    for (Entry<Object, Object> pair : props.entrySet()) {
      String key = (String) pair.getKey();
      conf.set(key, (String) pair.getValue());
    }

    this.fileSystem = FileSystem.get(conf);

    Path tmpPath = new Path(tmpLocation);
    if (!fileSystem.exists(tmpPath)) {
      fileSystem.mkdirs(tmpPath, perm);
      log.info("Created tmpPath " + tmpPath + " with permissions " + perm + " and umask " + getUmask(conf));
      if (!fileSystem.getFileStatus(tmpPath).getPermission().equals(perm)) {
        log.error(String.format("Wrong permission for %s. Expects %s, actual %s", tmpPath, perm,
            fileSystem.getFileStatus(tmpPath).getPermission()));
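        // Re-apply the intended permission in case the effective umask stripped bits during mkdirs.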
        fileSystem.setPermission(tmpPath, perm);
      }
      String user = UserGroupInformation.getCurrentUser().getUserName();
      fileSystem.setOwner(tmpPath, user, user);
    }

    Path fromLocationPath = new Path(fromLocation);
    Map<FileStatus, String> topics =
        findAllTopics(fromLocationPath,
            new WhiteBlackListPathFilter(whitelist, blacklist, fileSystem.getFileStatus(fromLocationPath).getPath()),
            sourceSubdir, fileSystem);

    for (FileStatus topic : topics.keySet()) {
      String topicFullName = topics.get(topic);
      log.info("Processing topic " + topicFullName);
      Path destinationPath = new Path(destLocation + "/" + topics.get(topic).replace(".", "/") + "/" + destSubdir);
      try {
        runCollectorForTopicDir(fileSystem, topicFullName, new Path(topic.getPath(), sourceSubdir), destinationPath);
      } catch (Exception e) {
        System.err.println("unable to process " + topicFullName + " skipping...");
        e.printStackTrace();
      }
    }

    log.info("Shutting down priority executor");
    executorService.shutdown();
    while (!executorService.isTerminated()) {
      executorService.awaitTermination(30, TimeUnit.SECONDS);
    }

    log.info("Shutting down");

    if (!errorMessages.isEmpty()) {
      for (SweeperError error : errorMessages) {
        System.err.println("Error occurred in " + error.getTopic() + " at " + error.getInputPath().toString()
            + " message " + error.getException().getMessage());
        error.e.printStackTrace();
      }
      throw new RuntimeException("Sweeper Failed");
    }
  }

  private static String getUmask(Configuration conf) {
    if (conf.get(FsPermission.UMASK_LABEL) != null && conf.get(FsPermission.DEPRECATED_UMASK_LABEL) != null) {
      log.warn(String.format("Both umask labels exist: %s=%s, %s=%s", FsPermission.UMASK_LABEL,
          conf.get(FsPermission.UMASK_LABEL), FsPermission.DEPRECATED_UMASK_LABEL,
          conf.get(FsPermission.DEPRECATED_UMASK_LABEL)));
      return conf.get(FsPermission.UMASK_LABEL);
    } else if (conf.get(FsPermission.UMASK_LABEL) != null) {
      log.info(String.format("umask set: %s=%s", FsPermission.UMASK_LABEL, conf.get(FsPermission.UMASK_LABEL)));
      return conf.get(FsPermission.UMASK_LABEL);
    } else if (conf.get(FsPermission.DEPRECATED_UMASK_LABEL) != null) {
      log.info(String.format("umask set: %s=%s", FsPermission.DEPRECATED_UMASK_LABEL,
          conf.get(FsPermission.DEPRECATED_UMASK_LABEL)));
      return conf.get(FsPermission.DEPRECATED_UMASK_LABEL);
    } else {
      log.info("umask unset");
      return "undefined";
    }
  }

  protected void runCollectorForTopicDir(FileSystem fs, String topic, Path topicSourceDir, Path topicDestDir)
      throws Exception {
    log.info("Running collector for topic " + topic + " source:" + topicSourceDir + " dest:" + topicDestDir);

    ArrayList<Future<?>> tasksToComplete = new ArrayList<Future<?>>();
    List<Properties> jobPropsList = planner.createSweeperJobProps(topic, topicSourceDir, topicDestDir, fs);
    for (Properties jobProps : jobPropsList) {
      tasksToComplete.add(runCollector(jobProps, topic));
    }

    log.info("Finishing processing for topic " + topic);
  }

  protected Future<?> runCollector(Properties props, String topic) {
    String jobName = topic + "-" + UUID.randomUUID().toString();
    props.put("tmp.path",
        props.getProperty("camus.sweeper.tmp.dir") + "/" + jobName + "_" + System.currentTimeMillis());

    // Store the override as a String: these Properties are later copied into the Hadoop Configuration,
    // which casts every value to String.
    if (props.containsKey("reduce.count.override." + topic))
      props.setProperty(REDUCER_COUNT,
          Integer.toString(Integer.parseInt(props.getProperty("reduce.count.override." + topic))));
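    // Each sweep is submitted as a prioritized task; priority comes from camus.sweeper.priority.list,
    // and topics not listed there default to priority 0.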
    log.info("Processing " + props.get("input.paths"));
    return executorService.submit(new KafkaCollectorRunner(jobName, props, errorMessages, topic));
  }

  public class KafkaCollectorRunner implements Runnable, Important {
    protected Properties props;
    protected String name;
    protected List<SweeperError> errorQueue;
    protected String topic;
    protected int priority;

    public KafkaCollectorRunner(String name, Properties props, List<SweeperError> errorQueue, String topic) {
      this.name = name;
      this.props = props;
      this.errorQueue = errorQueue;
      this.topic = topic;
      priority = priorityTopics.containsKey(topic) ? priorityTopics.get(topic) : 0;
    }

    public void run() {
      KafkaCollector collector = null;
      try {
        log.info("Starting runner for " + name);
        collector = new KafkaCollector(props, name, topic);
        log.info("Waiting until input for job " + name + " is ready. Input directories: "
            + props.getProperty("input.paths"));
        if (!planner.waitUntilReadyToProcess(props, fileSystem)) {
          throw new JobCancelledException("Job has been cancelled by planner while waiting for input to be ready.");
        }
        log.info("Running " + name + " for input " + props.getProperty("input.paths"));
        collector.run();
      } catch (Throwable e) { // Sometimes the error is a Throwable, e.g. java.lang.NoClassDefFoundError
        e.printStackTrace();
        log.error("Failed for " + name + ", job: " + (collector == null ? null : collector.getJob()) + " failed for "
            + props.getProperty("input.paths") + " Exception: " + e.getLocalizedMessage());
        errorQueue.add(new SweeperError(name, props.get("input.paths").toString(), e));
      }
    }

    @Override
    public int getPriority() {
      return priority;
    }
  }

  protected class KafkaCollector {
    protected static final String TARGET_FILE_SIZE = "camus.sweeper.target.file.size";
    protected static final long TARGET_FILE_SIZE_DEFAULT = 1536L * 1024L * 1024L;

    protected long targetFileSize;
    protected final String jobName;
    protected final Properties props;
    protected final String topicName;
    protected final Path[] inputPaths;
    protected final Path tmpPath;
    protected final Path outputPath;
    protected final FileSystem fs;
    protected Job job;

    public KafkaCollector(Properties props, String jobName, String topicName) throws IOException {
      this.jobName = jobName;
      this.props = props;
      this.topicName = topicName;
      this.targetFileSize = props.containsKey(TARGET_FILE_SIZE)
          ? Long.parseLong(props.getProperty(TARGET_FILE_SIZE)) : TARGET_FILE_SIZE_DEFAULT;
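      // Build the Hadoop job and copy every sweeper property into its Configuration so the M/R tasks can read them.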
      job = new Job(getConf());
      job.setJarByClass(CamusSweeper.class);
      job.setJobName(jobName);

      for (Entry<Object, Object> pair : props.entrySet()) {
        String key = (String) pair.getKey();
        job.getConfiguration().set(key, (String) pair.getValue());
      }

      this.fs = FileSystem.get(job.getConfiguration());
      this.inputPaths = getInputPaths();
      this.tmpPath = new Path(job.getConfiguration().get(TMP_PATH));
      this.outputPath = new Path(job.getConfiguration().get(DEST_PATH));
      addInputAndOutputPathsToFileInputFormat();
    }

    private void addInputAndOutputPathsToFileInputFormat() throws IOException {
      for (Path path : inputPaths) {
        FileInputFormat.addInputPath(job, path);
      }
      FileOutputFormat.setOutputPath(job, tmpPath);
    }

    private Path[] getInputPaths() {
      List<String> strPaths = Utils.getStringList(props, INPUT_PATHS);
      Path[] inputPaths = new Path[strPaths.size()];
      for (int i = 0; i < strPaths.size(); i++)
        inputPaths[i] = new Path(strPaths.get(i));
      return inputPaths;
    }

    public void run() throws Exception {
      job.getConfiguration().set("mapred.compress.map.output", "true");
      ((CamusSweeperJob) Class.forName(props.getProperty("camus.sweeper.io.configurer.class")).newInstance())
          .setLogger(log).configureJob(topicName, job);
      setNumOfReducersAndSplitSizes();
      submitMrJob();
      moveTmpPathToOutputPath();
    }

    protected void moveTmpPathToOutputPath() throws IOException {
      Path oldPath = null;
      if (fs.exists(outputPath)) {
        oldPath = new Path("/tmp", "_old_" + job.getJobID());
        moveExistingContentInOutputPathToOldPath(oldPath);
      }

      log.info("Moving " + tmpPath + " to " + outputPath);
      mkdirs(fs, outputPath.getParent(), perm, job.getConfiguration());
      if (!fs.rename(tmpPath, outputPath)) {
        if (oldPath != null) {
          // Restore the previous output if the new output could not be moved into place.
          fs.rename(oldPath, outputPath);
        }
        fs.delete(tmpPath, true);
        throw new RuntimeException("Error: cannot rename " + tmpPath + " to " + outputPath);
      }

      deleteOldPath(oldPath);
    }

    private void deleteOldPath(Path oldPath) throws IOException {
      if (oldPath != null && fs.exists(oldPath)) {
        log.info("Deleting " + oldPath);
        fs.delete(oldPath, true);
      }
    }

    private void moveExistingContentInOutputPathToOldPath(Path oldPath) throws IOException {
      log.info("Path " + outputPath + " exists. Overwriting. Existing content will be moved to " + oldPath);
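      // The old output is parked under /tmp so it can be restored if the final rename of tmpPath fails.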
      if (!fs.rename(outputPath, oldPath)) {
        fs.delete(tmpPath, true);
        throw new RuntimeException("Error: cannot rename " + outputPath + " to " + oldPath);
      }
    }

    protected void setNumOfReducersAndSplitSizes() throws IOException {
      long inputSize = getInputSize();
      int maxFiles = job.getConfiguration().getInt(MAX_FILES, DEFAULT_MAX_FILES);
      int numTasks = Math.min((int) (inputSize / targetFileSize) + 1, maxFiles);
      if (job.getNumReduceTasks() != 0) {
        determineAndSetNumOfReducers(numTasks);
      } else {
        setSplitSizes(inputSize / numTasks);
      }
    }

    private void setSplitSizes(long targetSplitSize) {
      log.info("Setting target split size " + targetSplitSize);
      job.getConfiguration().setLong(MAPRED_MAX_SPLIT_SIZE, targetSplitSize);
      job.getConfiguration().setLong(MAPRED_MIN_SPLIT_SIZE, targetSplitSize);
    }

    private void determineAndSetNumOfReducers(int numTasks) {
      int numReducers;
      if (job.getConfiguration().get(REDUCER_COUNT) != null) {
        numReducers = job.getConfiguration().getInt(REDUCER_COUNT, DEFAULT_REDUCER_COUNT);
      } else {
        numReducers = numTasks;
      }
      job.setNumReduceTasks(numReducers);
    }

    private long getInputSize() throws IOException {
      long inputSize = 0;
      for (Path p : inputPaths) {
        log.info("inputPath: " + p.toString() + ", size=" + fs.getContentSummary(p).getLength());
        inputSize += fs.getContentSummary(p).getLength();
      }
      return inputSize;
    }

    protected void submitMrJob() throws IOException, InterruptedException, ClassNotFoundException {
      job.submit();
      runningJobs.add(job);
      log.info("job running for: " + job.getJobName() + ", url: " + job.getTrackingURL());
      job.waitForCompletion(false);
      if (!job.isSuccessful()) {
        throw new RuntimeException("hadoop job failed.");
      }
    }

    protected String getJobName() {
      return jobName;
    }

    protected String getTopicName() {
      return topicName;
    }

    protected Job getJob() {
      return job;
    }

    protected Properties getProps() {
      return this.props;
    }
  }

  protected void mkdirs(FileSystem fs, Path path, FsPermission perm, Configuration conf) throws IOException {
    if (!fs.exists(path.getParent()))
      mkdirs(fs, path.getParent(), perm, conf);

    String msg = "Creating " + path + " with permissions " + perm + " and umask " + getUmask(conf);
    if (!fs.exists(path)) {
      log.info(msg);
    }
    if (!fs.mkdirs(path, perm)) {
      msg = msg + " failed";
      log.error(msg);
      throw new IOException(msg);
    }
    if (!fs.getFileStatus(path).getPermission().equals(perm)) {
      log.error(String.format("Wrong permission for %s. Expects %s, actual %s", path, perm,
          fs.getFileStatus(path).getPermission()));
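      // As with the tmp dir, force the expected permission if the effective umask produced a different one.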
      fs.setPermission(path, perm);
    }
  }

  public static class WhiteBlackListPathFilter implements PathFilter {
    private Pattern whitelist;
    private Pattern blacklist;
    private int rootLength;

    public WhiteBlackListPathFilter(Collection<String> whitelist, Collection<String> blacklist, Path qualRootDir) {
      if (whitelist.isEmpty())
        this.whitelist = Pattern.compile(".*"); // whitelist everything
      else
        this.whitelist = compileMultiPattern(whitelist);

      if (blacklist.isEmpty())
        this.blacklist = Pattern.compile("a^"); // blacklist nothing
      else
        this.blacklist = compileMultiPattern(blacklist);

      log.info("whitelist: " + this.whitelist.toString());
      log.info("blacklist: " + this.blacklist.toString());

      this.rootLength = qualRootDir.toString().length() + 1;
    }

    @Override
    public boolean accept(Path path) {
      String name = path.getName();
      String fullName = path.toString().substring(rootLength).replaceAll("/", ".");
      return whitelist.matcher(fullName).matches()
          && !(blacklist.matcher(fullName).matches() || name.startsWith(".") || name.startsWith("_"));
    }

    private Pattern compileMultiPattern(Collection<String> list) {
      String patternStr = "(";
      for (String str : list) {
        patternStr += str + "|";
      }
      patternStr = patternStr.substring(0, patternStr.length() - 1) + ")";
      return Pattern.compile(patternStr);
    }
  }

  protected static class SweeperError {
    protected final String topic;
    protected final String input;
    protected final Throwable e;

    public SweeperError(String topic, String input, Throwable e) {
      this.topic = topic;
      this.input = input;
      this.e = e;
    }

    public String getTopic() {
      return topic;
    }

    public String getInputPath() {
      return input;
    }

    public Throwable getException() {
      return e;
    }
  }

  @SuppressWarnings("static-access")
  public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption("p", true, "properties filename from the classpath");
    options.addOption("P", true, "external properties filename");
    options.addOption(OptionBuilder.withArgName("property=value").hasArgs(2).withValueSeparator()
        .withDescription("use value for given property").create("D"));

    CommandLineParser parser = new PosixParser();
    CommandLine cmd = parser.parse(options, args);

    if (!(cmd.hasOption('p') || cmd.hasOption('P'))) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp("CamusSweeper.java", options);
      return 1;
    }

    if (cmd.hasOption('p'))
      props.load(this.getClass().getResourceAsStream(cmd.getOptionValue('p')));

    if (cmd.hasOption('P')) {
      File file = new File(cmd.getOptionValue('P'));
      FileInputStream fStream = new FileInputStream(file);
      props.load(fStream);
    }

    props.putAll(cmd.getOptionProperties("D"));

    init();
    run();
    return 0;
  }

  public static void main(String args[]) throws Exception {
    CamusSweeper job = new CamusSweeper();
    ToolRunner.run(job, args);
  }
}