package com.linkedin.camus.sweeper; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.List; import java.util.Map; import java.util.Properties; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.PosixParser; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.log4j.Logger; import org.joda.time.DateTime; import org.joda.time.format.DateTimeFormatter; import com.linkedin.camus.sweeper.CamusSweeper.WhiteBlackListPathFilter; import com.linkedin.camus.sweeper.utils.DateUtils; import com.linkedin.camus.sweeper.utils.Utils; /** * * Responsible for cleaning out the daily files based on the retention value set in the config * */ public class CamusCleaner extends Configured implements Tool { public static final String SIMULATE = "camus.sweeper.clean.simulate"; public static final String FORCE = "camus.sweeper.clean.force"; public static final String RETENTION_TOPIC_PREFIX = "camus.sweeper.clean.retention.days.topic."; public static final String OUTPUT_DAILY_FORMAT_STR = "YYYY/MM/dd"; public static final String FAIL_JOB_IF_FOLDER_NOT_QUALIFY_FOR_DELETION = "fail.job.if.folder.not.qualify.for.deletion"; public static final String DELETE_FOLDERS_NOT_QUALIFY_FOR_DELETION = "delete.folders.not.qualify.for.deletion"; private DateUtils dUtils; private DateTimeFormatter outputDailyFormat; private DateTimeFormatter outputMonthFormat; private DateTimeFormatter outputYearFormat; private final Properties props; private Path sourcePath; private String sourceSubDir; private String destSubDir; private FileSystem fs; private boolean simulate = false; private boolean force = false; private static Logger log = Logger.getLogger(CamusCleaner.class); private boolean someFoldersNotQualifyForDeletion = false; private boolean failJobIfFolderNotQualifyForDeletion = false; private boolean deleteFoldersNotQualifyForDeletion = true; public CamusCleaner() { this.props = new Properties(); } public CamusCleaner(Properties props) { this.props = props; dUtils = new DateUtils(props); outputDailyFormat = dUtils.getDateTimeFormatter(OUTPUT_DAILY_FORMAT_STR); outputMonthFormat = dUtils.getDateTimeFormatter("YYYY/MM"); outputYearFormat = dUtils.getDateTimeFormatter("YYYY"); sourceSubDir = props.getProperty("camus.sweeper.source.subdir"); destSubDir = props.getProperty("camus.sweeper.dest.subdir", ""); } public static void main(String args[]) throws Exception { CamusCleaner job = new CamusCleaner(); ToolRunner.run(job, args); } public void run() throws Exception { log.info("Starting the Camus - Daily Cleaner"); Configuration conf = new Configuration(); fs = FileSystem.get(conf); List<String> blacklist = Utils.getStringList(props, "camus.sweeper.blacklist"); List<String> whitelist = Utils.getStringList(props, "camus.sweeper.whitelist"); // usually means hourly, but fromLocation can be daily subdir since the same // code is used for daily retention String fromLocation = (String) props.getProperty("camus.sweeper.source.dir"); sourcePath = fs.getFileStatus(new Path(fromLocation)).getPath(); log.debug("Path : " + sourcePath); simulate = Boolean.parseBoolean(props.getProperty(SIMULATE, "false")); force = Boolean.parseBoolean(props.getProperty(FORCE, "false")); this.failJobIfFolderNotQualifyForDeletion = Boolean.parseBoolean(this.props.getProperty(FAIL_JOB_IF_FOLDER_NOT_QUALIFY_FOR_DELETION, "false")); this.deleteFoldersNotQualifyForDeletion = Boolean.parseBoolean(this.props.getProperty(DELETE_FOLDERS_NOT_QUALIFY_FOR_DELETION, "true")); // Topic-specific retention Map<String, String> map = Utils.getMapByPrefix(props, RETENTION_TOPIC_PREFIX); int regularRetention = Integer.parseInt((String) props.getProperty("camus.sweeper.clean.retention.days.global", "-1")); if (regularRetention != -1) log.info("Global retention set to " + regularRetention); else log.info("Global retention set to infinity, will not delete unspecified topics"); WhiteBlackListPathFilter filter = new WhiteBlackListPathFilter(whitelist, blacklist, sourcePath); Map<FileStatus, String> topics = new CamusSweeper().findAllTopics(sourcePath, filter, sourceSubDir, fs); for (FileStatus status : topics.keySet()) { String name = status.getPath().getName(); if (name.startsWith(".") || name.startsWith("_")) { continue; } String fullname = topics.get(status); int topicRetention = map.containsKey(fullname) ? Integer.parseInt(map.get(fullname)) : regularRetention; enforceRetention(fullname, status, sourceSubDir, destSubDir, topicRetention); } if (this.someFoldersNotQualifyForDeletion && this.failJobIfFolderNotQualifyForDeletion) { throw new RuntimeException("Cannot delete some folders. See LOG errors for details."); } } private void enforceRetention(String topicName, FileStatus topicDir, String topicSourceSubdir, String topicDestSubdir, int numDays) throws Exception { log.info("Running retention for " + topicName + " using " + numDays + " days"); if (numDays != -1) { DateTime time = new DateTime(dUtils.zone); DateTime daysAgo = time.minusDays(numDays); Path sourceDailyGlob = new Path(topicDir.getPath() + "/" + topicSourceSubdir + "/*/*/*"); for (FileStatus f : fs.globStatus(sourceDailyGlob)) { DateTime dirDateTime = outputDailyFormat.parseDateTime( f.getPath().toString().substring(f.getPath().toString().length() - OUTPUT_DAILY_FORMAT_STR.length())); if (dirDateTime.isBefore(daysAgo)) { if (!(force || topicDestSubdir.isEmpty())) { Path destPath = new Path(topicDir.getPath(), topicDestSubdir + "/" + dirDateTime.toString(outputDailyFormat)); if (!fs.exists(destPath)) { someFoldersNotQualifyForDeletion = true; log.error(String.format("Cannot delete folder %s, since rollup folder %s doesn't exist.", f.getPath(), destPath)); continue; } else { FileStatus dest = fs.getFileStatus(destPath); boolean canDelete = true; for (FileStatus sourceFile : fs.listStatus(f.getPath())) { if (dest.getModificationTime() < sourceFile.getModificationTime()) { someFoldersNotQualifyForDeletion = true; log.error(String.format( "Cannot delete folder %s, since the timestamp of %s (%d) is later than the timestamp of rollup folder %s (%d)", f.getPath(), sourceFile.getPath(), sourceFile.getModificationTime(), destPath, dest.getModificationTime())); canDelete = false; } } if (!canDelete && !this.deleteFoldersNotQualifyForDeletion) { continue; } } } deleteFileDir(fs, f.getPath()); } } } Path sourceMonthlyGlob = new Path(topicDir.getPath() + "/" + topicSourceSubdir + "/*/*"); for (FileStatus f : fs.globStatus(sourceMonthlyGlob)) { if (fs.listStatus(f.getPath()).length == 0) deleteFileDir(fs, f.getPath()); } Path sourceYearGlob = new Path(topicDir.getPath() + "/" + topicSourceSubdir + "/*"); for (FileStatus f : fs.globStatus(sourceYearGlob)) { if (fs.listStatus(f.getPath()).length == 0) deleteFileDir(fs, f.getPath()); } Path sourceWithSubdir = new Path(topicDir.getPath() + "/" + topicSourceSubdir); if (fs.listStatus(sourceWithSubdir).length == 0) deleteFileDir(fs, sourceWithSubdir); if (fs.listStatus(topicDir.getPath()).length == 0) deleteFileDir(fs, topicDir.getPath()); } private void deleteFileDir(FileSystem fs, Path deletePath) throws IOException { if (!simulate) { log.info("Deleting " + deletePath); if (fs.delete(deletePath, true)) { return; } else { throw new IOException("Path " + deletePath + " couldn't be deleted."); } } else { log.info("Simulating delete " + deletePath); } } public int run(String[] args) throws Exception { Options options = new Options(); options.addOption("p", true, "properties filename from the classpath"); options.addOption("P", true, "external properties filename"); options.addOption(OptionBuilder.withArgName("property=value").hasArgs(2).withValueSeparator() .withDescription("use value for given property").create("D")); CommandLineParser parser = new PosixParser(); CommandLine cmd = parser.parse(options, args); if (!(cmd.hasOption('p') || cmd.hasOption('P'))) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("CamusJob.java", options); return 1; } if (cmd.hasOption('p')) props.load(ClassLoader.getSystemClassLoader().getResourceAsStream(cmd.getOptionValue('p'))); if (cmd.hasOption('P')) { String pathname = cmd.getOptionValue('P'); InputStream fStream; if (pathname.startsWith("hdfs:")) { Path pt = new Path(pathname); FileSystem fs = FileSystem.get(new Configuration()); fStream = fs.open(pt); } else { File file = new File(pathname); fStream = new FileInputStream(file); } props.load(fStream); fStream.close(); } props.putAll(cmd.getOptionProperties("D")); dUtils = new DateUtils(props); outputDailyFormat = dUtils.getDateTimeFormatter(OUTPUT_DAILY_FORMAT_STR); outputMonthFormat = dUtils.getDateTimeFormatter("YYYY/MM"); outputYearFormat = dUtils.getDateTimeFormatter("YYYY"); run(); return 0; } }