/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package gobblin.compaction.mapreduce;

import static gobblin.compaction.dataset.Dataset.DatasetState.COMPACTION_COMPLETE;
import static gobblin.compaction.dataset.Dataset.DatasetState.GIVEN_UP;
import static gobblin.compaction.dataset.Dataset.DatasetState.UNVERIFIED;
import static gobblin.compaction.dataset.Dataset.DatasetState.VERIFIED;
import static gobblin.compaction.mapreduce.MRCompactorJobRunner.Status.ABORTED;
import static gobblin.compaction.mapreduce.MRCompactorJobRunner.Status.COMMITTED;

import java.io.IOException;
import java.net.URI;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import java.lang.reflect.InvocationTargetException;

import org.joda.time.DateTime;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.joda.time.DateTimeZone;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Functions;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.io.Closer;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;

import gobblin.compaction.Compactor;
import gobblin.compaction.listeners.CompactorCompletionListener;
import gobblin.compaction.listeners.CompactorCompletionListenerFactory;
import gobblin.compaction.listeners.CompactorListener;
import gobblin.compaction.dataset.Dataset;
import gobblin.compaction.dataset.DatasetsFinder;
import gobblin.compaction.dataset.TimeBasedSubDirDatasetsFinder;
import gobblin.compaction.event.CompactionSlaEventHelper;
import gobblin.compaction.verify.DataCompletenessVerifier;
import gobblin.compaction.verify.DataCompletenessVerifier.Results;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.metrics.GobblinMetrics;
import gobblin.metrics.Tag;
import gobblin.metrics.event.EventSubmitter;
import gobblin.util.ClassAliasResolver;
import gobblin.util.DatasetFilterUtils;
import gobblin.util.ExecutorsUtils;
import gobblin.util.HadoopUtils;
import gobblin.util.ClusterNameTags;
import gobblin.util.FileListUtils;
import gobblin.util.recordcount.CompactionRecordCountProvider;
import gobblin.util.recordcount.IngestionRecordCountProvider;
import gobblin.util.reflection.GobblinConstructorUtils;


/**
 * MapReduce-based {@link gobblin.compaction.Compactor}. Compaction will run on each qualified {@link Dataset}
 * under {@link #COMPACTION_INPUT_DIR}.
 *
 * @author Ziyang Liu
 */
public class MRCompactor implements Compactor {

  private static final Logger LOG = LoggerFactory.getLogger(MRCompactor.class);

  public static final String COMPACTION_PREFIX = "compaction.";

  /**
   * Basic compaction properties.
   */
  public static final String COMPACTION_THREAD_POOL_SIZE = COMPACTION_PREFIX + "thread.pool.size";
  public static final int DEFAULT_COMPACTION_THREAD_POOL_SIZE = 30;
  public static final String COMPACTION_INPUT_DIR = COMPACTION_PREFIX + "input.dir";

  // The subdir name of input dataset paths, e.g., "hourly" in "/data/input/PasswordChangeEvent/hourly/2015/09/06".
  public static final String COMPACTION_INPUT_SUBDIR = COMPACTION_PREFIX + "input.subdir";
  public static final String DEFAULT_COMPACTION_INPUT_SUBDIR = "hourly";
  public static final String COMPACTION_DEST_DIR = COMPACTION_PREFIX + "dest.dir";

  // The subdir name of output dataset paths, e.g., "daily" in "/data/input/PasswordChangeEvent/daily/2015/09/06".
  public static final String COMPACTION_DEST_SUBDIR = COMPACTION_PREFIX + "dest.subdir";
  public static final String DEFAULT_COMPACTION_DEST_SUBDIR = "daily";

  // The output dir for the compaction MR job, which will be moved to the final output dir for data publishing.
  public static final String COMPACTION_TMP_DEST_DIR = COMPACTION_PREFIX + "tmp.dest.dir";
  public static final String DEFAULT_COMPACTION_TMP_DEST_DIR = "/tmp/gobblin-compaction";

  public static final String COMPACTION_JOB_DIR = COMPACTION_PREFIX + "tmp.job.dir";
  public static final String COMPACTION_LATE_DIR_SUFFIX = "_late";
  public static final String COMPACTION_BLACKLIST = COMPACTION_PREFIX + "blacklist";
  public static final String COMPACTION_WHITELIST = COMPACTION_PREFIX + "whitelist";
  public static final String COMPACTION_HIGH_PRIORITY_TOPICS = COMPACTION_PREFIX + "high.priority.topics";
  public static final String COMPACTION_NORMAL_PRIORITY_TOPICS = COMPACTION_PREFIX + "normal.priority.topics";
  public static final String COMPACTION_JOB_RUNNER_CLASS = COMPACTION_PREFIX + "job.runner.class";
  public static final String DEFAULT_COMPACTION_JOB_RUNNER_CLASS =
      "gobblin.compaction.mapreduce.avro.MRCompactorAvroKeyDedupJobRunner";
  public static final String COMPACTION_TIMEZONE = COMPACTION_PREFIX + "timezone";
  public static final String DEFAULT_COMPACTION_TIMEZONE = ConfigurationKeys.PST_TIMEZONE_NAME;
  public static final String COMPACTION_FILE_SYSTEM_URI = COMPACTION_PREFIX + "file.system.uri";
  public static final String COMPACTION_MR_JOB_TIMEOUT_MINUTES = COMPACTION_PREFIX + "mr.job.timeout.minutes";
  public static final long DEFAULT_COMPACTION_MR_JOB_TIMEOUT_MINUTES = Long.MAX_VALUE;
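
  // Illustrative only: a minimal property set for an hourly-to-daily compaction, mirroring the example paths used
  // in the comments above. The topic name and directory values are hypothetical, not part of this class.
  //
  //   compaction.input.dir=/data/input
  //   compaction.input.subdir=hourly
  //   compaction.dest.dir=/data/output
  //   compaction.dest.subdir=daily
  //   compaction.tmp.dest.dir=/tmp/gobblin-compaction
  //
  // With these values, hourly input such as /data/input/PasswordChangeEvent/hourly/2015/09/06 would map to a
  // daily destination like /data/output/PasswordChangeEvent/daily/2015/09/06.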

  // Dataset finder to find datasets for compaction.
  public static final String COMPACTION_DATASETS_FINDER = COMPACTION_PREFIX + "datasets.finder";
  public static final String DEFAULT_COMPACTION_DATASETS_FINDER = TimeBasedSubDirDatasetsFinder.class.getName();

  // Rename source directories as a compaction-complete indication.
  // Compaction jobs using this completion mode can't share input sources.
  public static final String COMPACTION_RENAME_SOURCE_DIR_ENABLED = COMPACTION_PREFIX + "rename.source.dir.enabled";
  public static final boolean DEFAULT_COMPACTION_RENAME_SOURCE_DIR_ENABLED = false;
  public static final String COMPACTION_RENAME_SOURCE_DIR_SUFFIX = "_COMPLETE";

  // The provider that provides event counts for the compaction input files.
  public static final String COMPACTION_INPUT_RECORD_COUNT_PROVIDER = COMPACTION_PREFIX + "input.record.count.provider";
  public static final String DEFAULT_COMPACTION_INPUT_RECORD_COUNT_PROVIDER =
      IngestionRecordCountProvider.class.getName();

  // The provider that provides event counts for the compaction output files.
  public static final String COMPACTION_OUTPUT_RECORD_COUNT_PROVIDER =
      COMPACTION_PREFIX + "output.record.count.provider";
  public static final String DEFAULT_COMPACTION_OUTPUT_RECORD_COUNT_PROVIDER =
      CompactionRecordCountProvider.class.getName();

  // If a dataset has already been compacted and new (late) data is found, whether to recompact this dataset.
  public static final String COMPACTION_RECOMPACT_FROM_INPUT_FOR_LATE_DATA =
      COMPACTION_PREFIX + "recompact.from.input.for.late.data";
  public static final boolean DEFAULT_COMPACTION_RECOMPACT_FROM_INPUT_FOR_LATE_DATA = false;

  // The threshold of new (late) data that will trigger recompaction per dataset.
  // It follows the pattern DATASET_NAME_REGEX:THRESHOLD;DATASET_NAME_REGEX:THRESHOLD, e.g., A.*,B.*:0.2; C.*,D.*:0.3.
  // Dataset names that match A.* or B.* will have threshold 0.2. Dataset names that match C.* or D.* will have
  // threshold 0.3.
  public static final String COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET =
      COMPACTION_PREFIX + "latedata.threshold.for.recompact.per.topic";
  public static final double DEFAULT_COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET = 1.0;

  // The threshold of new (late) files that will trigger compaction per dataset.
  // The trigger is based on the file numbers in the late output directory.
  public static final String COMPACTION_LATEDATA_THRESHOLD_FILE_NUM = COMPACTION_PREFIX + "latedata.threshold.file.num";
  public static final int DEFAULT_COMPACTION_LATEDATA_THRESHOLD_FILE_NUM = 1000;
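
  // Illustrative only: example values for the late-data triggers declared above. The dataset name regexes are
  // hypothetical and simply restate the documented pattern; the file-number value shown is the default.
  //
  //   compaction.latedata.threshold.for.recompact.per.topic=A.*,B.*:0.2;C.*,D.*:0.3
  //   compaction.latedata.threshold.file.num=1000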

  // The threshold of new (late) files that will trigger compaction per dataset.
  // The trigger is based on how long the file has been in the late output directory.
  public static final String COMPACTION_LATEDATA_THRESHOLD_DURATION = COMPACTION_PREFIX + "latedata.threshold.duration";
  public static final String DEFAULT_COMPACTION_LATEDATA_THRESHOLD_DURATION = "24h";

  public static final String COMPACTION_RECOMPACT_CONDITION = COMPACTION_PREFIX + "recompact.condition";
  public static final String DEFAULT_COMPACTION_RECOMPACT_CONDITION = "RecompactBasedOnRatio";
  public static final String COMPACTION_RECOMPACT_COMBINE_CONDITIONS = COMPACTION_PREFIX + "recompact.combine.conditions";
  public static final String COMPACTION_RECOMPACT_COMBINE_CONDITIONS_OPERATION =
      COMPACTION_PREFIX + "recompact.combine.conditions.operation";
  public static final String DEFAULT_COMPACTION_RECOMPACT_COMBINE_CONDITIONS_OPERATION = "or";
  public static final String COMPACTION_COMPLETE_LISTERNER = COMPACTION_PREFIX + "complete.listener";
  public static final String DEFAULT_COMPACTION_COMPLETE_LISTERNER = "SimpleCompactorCompletionHook";

  // Whether the input data for the compaction is deduplicated.
  public static final String COMPACTION_INPUT_DEDUPLICATED = COMPACTION_PREFIX + "input.deduplicated";
  public static final boolean DEFAULT_COMPACTION_INPUT_DEDUPLICATED = false;

  // Whether the output of the compaction should be deduplicated.
  public static final String COMPACTION_OUTPUT_DEDUPLICATED = COMPACTION_PREFIX + "output.deduplicated";
  public static final boolean DEFAULT_COMPACTION_OUTPUT_DEDUPLICATED = true;

  public static final String COMPACTION_COMPLETENESS_VERIFICATION_PREFIX =
      COMPACTION_PREFIX + "completeness.verification.";
  public static final String COMPACTION_RECOMPACT_FROM_DEST_PATHS = COMPACTION_PREFIX + "recompact.from.dest.paths";
  public static final String COMPACTION_RECOMPACT_ALL_DATA = COMPACTION_PREFIX + "recompact.all.data";
  public static final boolean DEFAULT_COMPACTION_RECOMPACT_FROM_DEST_PATHS = false;
  public static final boolean DEFAULT_COMPACTION_RECOMPACT_ALL_DATA = true;

  /**
   * Configuration properties related to data completeness verification.
   */
  public static final String COMPACTION_COMPLETENESS_VERIFICATION_BLACKLIST =
      COMPACTION_COMPLETENESS_VERIFICATION_PREFIX + "blacklist";
  public static final String COMPACTION_COMPLETENESS_VERIFICATION_WHITELIST =
      COMPACTION_COMPLETENESS_VERIFICATION_PREFIX + "whitelist";
  public static final String COMPACTION_VERIFICATION_TIMEOUT_MINUTES =
      COMPACTION_COMPLETENESS_VERIFICATION_PREFIX + "timeout.minutes";
  public static final long DEFAULT_COMPACTION_VERIFICATION_TIMEOUT_MINUTES = 30;
  public static final String COMPACTION_COMPLETENESS_VERIFICATION_ENABLED =
      COMPACTION_COMPLETENESS_VERIFICATION_PREFIX + "enabled";
  public static final boolean DEFAULT_COMPACTION_COMPLETENESS_VERIFICATION_ENABLED = false;

  // Number of datasets to be passed to DataCompletenessVerifier together. By passing multiple datasets together,
  // some costs in DataCompletenessVerifier (e.g., submitting a SQL query) can be amortized.
  public static final String COMPACTION_COMPLETENESS_VERIFICATION_NUM_DATASETS_VERIFIED_TOGETHER =
      COMPACTION_COMPLETENESS_VERIFICATION_PREFIX + "num.datasets.verified.together";
  public static final int DEFAULT_COMPACTION_COMPLETENESS_VERIFICATION_NUM_DATASETS_VERIFIED_TOGETHER = 10;

  // Whether to compact and publish a dataset if its completeness cannot be verified.
  public static final String COMPACTION_COMPLETENESS_VERIFICATION_PUBLISH_DATA_IF_CANNOT_VERIFY =
      COMPACTION_COMPLETENESS_VERIFICATION_PREFIX + "publish.data.if.cannot.verify";
  public static final boolean DEFAULT_COMPACTION_COMPLETENESS_VERIFICATION_PUBLISH_DATA_IF_CANNOT_VERIFY = false;

  /**
   * Compaction configuration properties used internally.
   */
  public static final String COMPACTION_SHOULD_DEDUPLICATE = COMPACTION_PREFIX + "should.deduplicate";
  public static final String COMPACTION_JOB_DEST_PARTITION = COMPACTION_PREFIX + "job.dest.partition";
  public static final String COMPACTION_ENABLE_SUCCESS_FILE =
      COMPACTION_PREFIX + "fileoutputcommitter.marksuccessfuljobs";
  public static final String COMPACTION_JOB_LATE_DATA_MOVEMENT_TASK = COMPACTION_PREFIX + "job.late.data.movement.task";
  public static final String COMPACTION_JOB_LATE_DATA_FILES = COMPACTION_PREFIX + "job.late.data.files";
  public static final String COMPACTION_COMPLETE_FILE_NAME = "_COMPACTION_COMPLETE";
  public static final String COMPACTION_LATE_FILES_DIRECTORY = "late";
  public static final String COMPACTION_JARS = COMPACTION_PREFIX + "jars";
  public static final String COMPACTION_JAR_SUBDIR = "_gobblin_compaction_jars";
  public static final String COMPACTION_TRACKING_EVENTS_NAMESPACE = COMPACTION_PREFIX + "tracking.events";
  public static final String COMPACTION_INPUT_PATH_TIME = COMPACTION_PREFIX + "input.path.time";

  private static final long COMPACTION_JOB_WAIT_INTERVAL_SECONDS = 10;
  private static final Map<Dataset, Job> RUNNING_MR_JOBS = Maps.newConcurrentMap();

  private final State state;
  private final List<? extends Tag<?>> tags;
  private final Configuration conf;
  private final String tmpOutputDir;
  private final FileSystem fs;
  private final JobRunnerExecutor jobExecutor;
  private final Set<Dataset> datasets;
  private final Map<Dataset, MRCompactorJobRunner> jobRunnables;
  private final Closer closer;
  private final Optional<DataCompletenessVerifier> verifier;
  private final Stopwatch stopwatch;
  private final GobblinMetrics gobblinMetrics;
  private final EventSubmitter eventSubmitter;
  private final Optional<CompactorListener> compactorListener;
  private final DateTime initializeTime;
  private final long dataVerifTimeoutMinutes;
  private final long compactionTimeoutMinutes;
  private final boolean shouldVerifDataCompl;
  private final boolean shouldPublishDataIfCannotVerifyCompl;
  private final CompactorCompletionListener compactionCompleteListener;
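
  // A minimal usage sketch (illustrative only; the property values are hypothetical and error handling is omitted):
  //
  //   Properties props = new Properties();
  //   props.setProperty(ConfigurationKeys.JOB_NAME_KEY, "MyCompactionJob");
  //   props.setProperty(MRCompactor.COMPACTION_INPUT_DIR, "/data/input");
  //   props.setProperty(MRCompactor.COMPACTION_DEST_DIR, "/data/output");
  //   Compactor compactor =
  //       new MRCompactor(props, ImmutableList.<Tag<?>> of(), Optional.<CompactorListener> absent());
  //   compactor.compact();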
  public MRCompactor(Properties props, List<? extends Tag<?>> tags, Optional<CompactorListener> compactorListener)
      throws IOException {
    this.state = new State();
    this.state.addAll(props);
    this.initializeTime = getCurrentTime();
    this.tags = tags;
    this.conf = HadoopUtils.getConfFromState(this.state);
    this.tmpOutputDir = getTmpOutputDir();
    this.fs = getFileSystem();
    this.datasets = getDatasetsFinder().findDistinctDatasets();
    this.jobExecutor = createJobExecutor();
    this.jobRunnables = Maps.newConcurrentMap();
    this.closer = Closer.create();
    this.stopwatch = Stopwatch.createStarted();
    this.gobblinMetrics = initializeMetrics();
    this.eventSubmitter = new EventSubmitter.Builder(
        GobblinMetrics.get(this.state.getProp(ConfigurationKeys.JOB_NAME_KEY)).getMetricContext(),
        MRCompactor.COMPACTION_TRACKING_EVENTS_NAMESPACE).build();
    this.compactorListener = compactorListener;
    this.dataVerifTimeoutMinutes = getDataVerifTimeoutMinutes();
    this.compactionTimeoutMinutes = getCompactionTimeoutMinutes();
    this.shouldVerifDataCompl = shouldVerifyDataCompleteness();
    this.compactionCompleteListener = getCompactionCompleteListener();
    this.verifier = this.shouldVerifDataCompl
        ? Optional.of(this.closer.register(new DataCompletenessVerifier(this.state)))
        : Optional.<DataCompletenessVerifier> absent();
    this.shouldPublishDataIfCannotVerifyCompl = shouldPublishDataIfCannotVerifyCompl();
  }

  public DateTime getInitializeTime() {
    return this.initializeTime;
  }

  private String getTmpOutputDir() {
    return this.state.getProp(COMPACTION_TMP_DEST_DIR, DEFAULT_COMPACTION_TMP_DEST_DIR);
  }

  private FileSystem getFileSystem() throws IOException {
    if (this.state.contains(COMPACTION_FILE_SYSTEM_URI)) {
      URI uri = URI.create(this.state.getProp(COMPACTION_FILE_SYSTEM_URI));
      return FileSystem.get(uri, this.conf);
    }
    return FileSystem.get(this.conf);
  }

  private DatasetsFinder getDatasetsFinder() {
    try {
      return (DatasetsFinder) Class
          .forName(this.state.getProp(COMPACTION_DATASETS_FINDER, DEFAULT_COMPACTION_DATASETS_FINDER))
          .getConstructor(State.class).newInstance(this.state);
    } catch (Exception e) {
      throw new RuntimeException("Failed to initialize DatasetsFinder.", e);
    }
  }

  private DateTime getCurrentTime() {
    DateTimeZone timeZone = DateTimeZone
        .forID(this.state.getProp(MRCompactor.COMPACTION_TIMEZONE, MRCompactor.DEFAULT_COMPACTION_TIMEZONE));
    return new DateTime(timeZone);
  }

  private JobRunnerExecutor createJobExecutor() {
    int threadPoolSize = getThreadPoolSize();
    BlockingQueue<Runnable> queue = new PriorityBlockingQueue<>();
    return new JobRunnerExecutor(threadPoolSize, threadPoolSize, Long.MAX_VALUE, TimeUnit.NANOSECONDS, queue);
  }

  private int getThreadPoolSize() {
    return this.state.getPropAsInt(COMPACTION_THREAD_POOL_SIZE, DEFAULT_COMPACTION_THREAD_POOL_SIZE);
  }

  private GobblinMetrics initializeMetrics() {
    ImmutableList.Builder<Tag<?>> tags = ImmutableList.builder();
    tags.addAll(this.tags);
    tags.addAll(Tag.fromMap(ClusterNameTags.getClusterNameTags()));
    GobblinMetrics gobblinMetrics =
        GobblinMetrics.get(this.state.getProp(ConfigurationKeys.JOB_NAME_KEY), null, tags.build());
    gobblinMetrics.startMetricReporting(this.state.getProperties());
    return gobblinMetrics;
  }

  @Override
  public void compact() throws IOException {
    try {
      copyDependencyJarsToHdfs();
      processDatasets();
      throwExceptionsIfAnyDatasetCompactionFailed();
      onCompactionCompletion();
    } catch (Throwable t) {
      // This throwable is logged here before being propagated. Otherwise, if another throwable is thrown
      // in the finally-block, this throwable may be suppressed.
      LOG.error("Caught throwable during compaction", t);
      throw Throwables.propagate(t);
    } finally {
      try {
        shutdownExecutors();
        this.closer.close();
      } finally {
        deleteDependencyJars();
        this.gobblinMetrics.stopMetricsReporting();
      }
    }
  }

  private CompactorCompletionListener getCompactionCompleteListener() {
    ClassAliasResolver<CompactorCompletionListenerFactory> classAliasResolver =
        new ClassAliasResolver<>(CompactorCompletionListenerFactory.class);
    String listenerName = this.state.getProp(MRCompactor.COMPACTION_COMPLETE_LISTERNER,
        MRCompactor.DEFAULT_COMPACTION_COMPLETE_LISTERNER);
    try {
      CompactorCompletionListenerFactory factory = GobblinConstructorUtils.invokeFirstConstructor(
          classAliasResolver.resolveClass(listenerName), ImmutableList.of());
      return factory.createCompactorCompactionListener(this.state);
    } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException | InstantiationException
        | ClassNotFoundException e) {
      throw new IllegalArgumentException(e);
    }
  }

  private void onCompactionCompletion() {
    this.compactionCompleteListener.onCompactionCompletion(this);
  }

  /**
   * Copy dependency jars from local fs to HDFS.
   */
  private void copyDependencyJarsToHdfs() throws IOException {
    if (!this.state.contains(ConfigurationKeys.JOB_JAR_FILES_KEY)) {
      return;
    }
    LocalFileSystem lfs = FileSystem.getLocal(this.conf);
    Path tmpJarFileDir = new Path(this.tmpOutputDir, "_gobblin_compaction_jars");
    this.state.setProp(COMPACTION_JARS, tmpJarFileDir.toString());
    this.fs.delete(tmpJarFileDir, true);
    for (String jarFile : this.state.getPropAsList(ConfigurationKeys.JOB_JAR_FILES_KEY)) {
      for (FileStatus status : lfs.globStatus(new Path(jarFile))) {
        Path tmpJarFile = new Path(this.fs.makeQualified(tmpJarFileDir), status.getPath().getName());
        this.fs.copyFromLocalFile(status.getPath(), tmpJarFile);
        LOG.info(String.format("%s will be added to classpath", tmpJarFile));
      }
    }
  }

  /**
   * Delete dependency jars from HDFS when the job is done.
   */
  private void deleteDependencyJars() throws IllegalArgumentException, IOException {
    if (this.state.contains(COMPACTION_JARS)) {
      this.fs.delete(new Path(this.state.getProp(COMPACTION_JARS)), true);
    }
  }

  private void processDatasets() {
    createJobPropsForDatasets();
    processCompactionJobs();
  }

  /**
   * Create compaction job properties for {@link Dataset}s.
   */
  private void createJobPropsForDatasets() {
    final Set<Dataset> datasetsWithProps = Sets.newHashSet();
    for (Dataset dataset : this.datasets) {
      datasetsWithProps.addAll(createJobPropsForDataset(dataset));
    }
    this.datasets.clear();
    this.datasets.addAll(datasetsWithProps);
  }

  /**
   * Create compaction job properties for the given {@link Dataset}, which does not yet have job props.
   * The entries in {@link #datasets} are then updated based on the results of creating job props for them.
   */
  private List<Dataset> createJobPropsForDataset(Dataset dataset) {
    LOG.info("Creating compaction jobs for dataset " + dataset + " with priority " + dataset.priority());
    final MRCompactorJobPropCreator jobPropCreator = getJobPropCreator(dataset);
    List<Dataset> datasetsWithProps;
    try {
      datasetsWithProps = jobPropCreator.createJobProps();
    } catch (Throwable t) {
      // If a throwable is caught when creating job properties for a dataset, skip the dataset and add the throwable
      // to it.
      datasetsWithProps = ImmutableList.<Dataset> of(jobPropCreator.createFailedJobProps(t));
    }
    return datasetsWithProps;
  }

  /**
   * Get an instance of {@link MRCompactorJobPropCreator}.
   */
  MRCompactorJobPropCreator getJobPropCreator(Dataset dataset) {
    try {
      return new MRCompactorJobPropCreator.Builder().withDataset(dataset).withFileSystem(this.fs)
          .withState(this.state).build();
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  public Set<Dataset> getDatasets() {
    return this.datasets;
  }

  private void processCompactionJobs() {
    if (this.shouldVerifDataCompl) {
      verifyDataCompleteness();
    } else {
      setAllDatasetStatesToVerified();
    }
    this.submitCompactionJobsAndWaitForCompletion();
  }

  private boolean shouldVerifyDataCompleteness() {
    return this.state.getPropAsBoolean(COMPACTION_COMPLETENESS_VERIFICATION_ENABLED,
        DEFAULT_COMPACTION_COMPLETENESS_VERIFICATION_ENABLED);
  }

  private void verifyDataCompleteness() {
    List<Pattern> blacklist =
        DatasetFilterUtils.getPatternList(this.state, COMPACTION_COMPLETENESS_VERIFICATION_BLACKLIST);
    List<Pattern> whitelist =
        DatasetFilterUtils.getPatternList(this.state, COMPACTION_COMPLETENESS_VERIFICATION_WHITELIST);
    int numDatasetsVerifiedTogether = getNumDatasetsVerifiedTogether();
    List<Dataset> datasetsToBeVerified = Lists.newArrayList();
    for (Dataset dataset : this.datasets) {
      if (dataset.state() != UNVERIFIED) {
        continue;
      }
      if (shouldVerifyCompletenessForDataset(dataset, blacklist, whitelist)) {
        datasetsToBeVerified.add(dataset);
        if (datasetsToBeVerified.size() >= numDatasetsVerifiedTogether) {
          ListenableFuture<Results> future = this.verifier.get().verify(datasetsToBeVerified);
          addCallback(datasetsToBeVerified, future);
          datasetsToBeVerified = Lists.newArrayList();
        }
      } else {
        dataset.setState(VERIFIED);
      }
    }
    if (!datasetsToBeVerified.isEmpty()) {
      ListenableFuture<Results> future = this.verifier.get().verify(datasetsToBeVerified);
      addCallback(datasetsToBeVerified, future);
    }
  }

  /**
   * A {@link Dataset} should be verified if it is not already compacted, and it satisfies the blacklist and whitelist.
   */
  private boolean shouldVerifyCompletenessForDataset(Dataset dataset, List<Pattern> blacklist,
      List<Pattern> whitelist) {
    boolean renamingRequired = this.state.getPropAsBoolean(COMPACTION_RENAME_SOURCE_DIR_ENABLED,
        DEFAULT_COMPACTION_RENAME_SOURCE_DIR_ENABLED);
    LOG.info("Should verify completeness with renaming source dir : " + renamingRequired);
    return !datasetAlreadyCompacted(this.fs, dataset, renamingRequired)
        && DatasetFilterUtils.survived(dataset.getName(), blacklist, whitelist);
  }

  /**
   * Get all the renamed directories under the given paths.
   * These are the deepest-level directories whose names end with {@link MRCompactor#COMPACTION_RENAME_SOURCE_DIR_SUFFIX}.
   * Each directory must contain at least one file, so empty directories are excluded from the result.
   */
  public static Set<Path> getDeepestLevelRenamedDirsWithFileExistence(FileSystem fs, Set<Path> paths)
      throws IOException {
    Set<Path> renamedDirs = Sets.newHashSet();
    for (FileStatus fileStatus : FileListUtils.listFilesRecursively(fs, paths)) {
      if (fileStatus.getPath().getParent().toString().endsWith(MRCompactor.COMPACTION_RENAME_SOURCE_DIR_SUFFIX)) {
        renamedDirs.add(fileStatus.getPath().getParent());
      }
    }
    return renamedDirs;
  }

  /**
   * Get all the unrenamed directories under the given paths.
   * These are the deepest-level directories whose names do not end with
   * {@link MRCompactor#COMPACTION_RENAME_SOURCE_DIR_SUFFIX}.
   * Each directory must contain at least one file, so empty directories are excluded from the result.
   */
  public static Set<Path> getDeepestLevelUnrenamedDirsWithFileExistence(FileSystem fs, Set<Path> paths)
      throws IOException {
    Set<Path> unrenamed = Sets.newHashSet();
    for (FileStatus fileStatus : FileListUtils.listFilesRecursively(fs, paths)) {
      if (!fileStatus.getPath().getParent().toString().endsWith(MRCompactor.COMPACTION_RENAME_SOURCE_DIR_SUFFIX)) {
        unrenamed.add(fileStatus.getPath().getParent());
      }
    }
    return unrenamed;
  }

  /**
   * Rename all the source directories for a specific dataset.
   */
  public static void renameSourceDirAsCompactionComplete(FileSystem fs, Dataset dataset) {
    try {
      for (Path path : dataset.getRenamePaths()) {
        Path newPath = new Path(path.getParent(), path.getName() + MRCompactor.COMPACTION_RENAME_SOURCE_DIR_SUFFIX);
        LOG.info("[{}] Renaming {} to {}", dataset.getDatasetName(), path, newPath);
        fs.rename(path, newPath);
      }
    } catch (Exception e) {
      LOG.error("Rename input path failed", e);
    }
  }

  /**
   * A {@link Dataset} is considered already compacted if either condition is true:
   * 1) When the completion file strategy is used, compaction completion means there is a file named
   *    {@link MRCompactor#COMPACTION_COMPLETE_FILE_NAME} in its {@link Dataset#outputPath()}.
   * 2) When the rename source directory strategy is used, compaction completion means the source directories
   *    {@link Dataset#inputPaths()} contain at least one directory which has been renamed to something with
   *    {@link MRCompactor#COMPACTION_RENAME_SOURCE_DIR_SUFFIX}.
   */
  public static boolean datasetAlreadyCompacted(FileSystem fs, Dataset dataset, boolean renameSourceEnable) {
    if (renameSourceEnable) {
      return checkAlreadyCompactedBasedOnSourceDirName(fs, dataset);
    } else {
      return checkAlreadyCompactedBasedOnCompletionFile(fs, dataset);
    }
  }
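
  // Illustrative only: the two completion markers checked above, shown on hypothetical paths that follow the
  // hourly/daily layout used in the comments at the top of this class.
  //
  //   Rename-source-dir strategy:  /data/input/PasswordChangeEvent/hourly/2015/09/06_COMPLETE
  //       (an input directory carrying COMPACTION_RENAME_SOURCE_DIR_SUFFIX marks the dataset as compacted)
  //   Completion-file strategy:    /data/input/PasswordChangeEvent/daily/2015/09/06/_COMPACTION_COMPLETE
  //       (a COMPACTION_COMPLETE_FILE_NAME file in the dataset's output path marks it as compacted)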

  /**
   * When the rename source directory strategy is used, compaction completion means the source directories
   * {@link Dataset#inputPaths()} contain at least one directory which has been renamed to something with
   * {@link MRCompactor#COMPACTION_RENAME_SOURCE_DIR_SUFFIX}.
   */
  private static boolean checkAlreadyCompactedBasedOnSourceDirName(FileSystem fs, Dataset dataset) {
    try {
      Set<Path> renamedDirs = getDeepestLevelRenamedDirsWithFileExistence(fs, dataset.inputPaths());
      return !renamedDirs.isEmpty();
    } catch (IOException e) {
      LOG.error("Failed to get deepest directories from source", e);
      return false;
    }
  }

  /**
   * When the completion file strategy is used, compaction completion means there is a file named
   * {@link MRCompactor#COMPACTION_COMPLETE_FILE_NAME} in its {@link Dataset#outputPath()}.
   */
  private static boolean checkAlreadyCompactedBasedOnCompletionFile(FileSystem fs, Dataset dataset) {
    Path filePath = new Path(dataset.outputPath(), MRCompactor.COMPACTION_COMPLETE_FILE_NAME);
    try {
      return fs.exists(filePath);
    } catch (IOException e) {
      LOG.error("Failed to verify the existence of file " + filePath, e);
      return false;
    }
  }

  public static long readCompactionTimestamp(FileSystem fs, Path compactionOutputPath) throws IOException {
    Path completionFilePath = new Path(compactionOutputPath, COMPACTION_COMPLETE_FILE_NAME);
    try (FSDataInputStream completionFileStream = fs.open(completionFilePath)) {
      return completionFileStream.readLong();
    }
  }

  private void addCallback(final List<Dataset> datasetsToBeVerified, ListenableFuture<Results> future) {
    Futures.addCallback(future, new FutureCallback<Results>() {

      /**
       * On success, resubmit verification for the {@link Dataset}s that should be resubmitted
       * (i.e., verification didn't pass and it didn't time out).
       */
      @Override
      public void onSuccess(Results results) {
        List<Dataset> datasetsToBeVerifiedAgain = Lists.newArrayList();
        for (Results.Result result : results) {
          Optional<MRCompactorJobRunner> jobRunner =
              Optional.fromNullable(MRCompactor.this.jobRunnables.get(result.dataset()));
          switch (result.status()) {
            case PASSED:
              LOG.info("Completeness verification for dataset " + result.dataset() + " passed.");
              submitVerificationSuccessSlaEvent(result);
              result.dataset().setState(VERIFIED);
              if (jobRunner.isPresent()) {
                jobRunner.get().proceed();
              }
              break;
            case FAILED:
              if (shouldGiveUpVerification()) {
                LOG.info("Completeness verification for dataset " + result.dataset() + " has timed out.");
                // Submit a failure SLA event here (the original code submitted a verification-success event,
                // which is inconsistent with giving up on the dataset).
                submitFailureSlaEvent(result.dataset(),
                    CompactionSlaEventHelper.COMPLETION_VERIFICATION_FAILED_EVENT_NAME);
                result.dataset().setState(GIVEN_UP);
                result.dataset().addThrowable(new RuntimeException(
                    String.format("Completeness verification for dataset %s failed or timed out.", result.dataset())));
              } else {
                LOG.info("Completeness verification for dataset " + result.dataset() + " failed. Will verify again.");
                datasetsToBeVerifiedAgain.add(result.dataset());
              }
              break;
            default:
              throw new IllegalStateException("Unrecognized result status: " + result.status());
          }
        }
        if (!datasetsToBeVerifiedAgain.isEmpty()) {
          ListenableFuture<Results> future2 = MRCompactor.this.verifier.get().verify(datasetsToBeVerifiedAgain);
          addCallback(datasetsToBeVerifiedAgain, future2);
        }
      }

      /**
       * On failure, resubmit verification for all {@link Dataset}s, unless timed out.
       */
      @Override
      public void onFailure(Throwable t) {
        LOG.error("Failed to verify completeness for the following datasets: " + datasetsToBeVerified, t);

        if (shouldGiveUpVerification()) {
          for (Dataset dataset : datasetsToBeVerified) {
            LOG.warn(String.format("Completeness verification for dataset %s has timed out.", dataset));
            submitFailureSlaEvent(dataset, CompactionSlaEventHelper.COMPLETION_VERIFICATION_FAILED_EVENT_NAME);
            dataset.setState(GIVEN_UP);
            dataset.addThrowable(new RuntimeException(
                String.format("Completeness verification for dataset %s failed or timed out.", dataset)));
          }
        } else {
          ListenableFuture<Results> future2 = MRCompactor.this.verifier.get().verify(datasetsToBeVerified);
          addCallback(datasetsToBeVerified, future2);
        }
      }
    });
  }

  /**
   * Get the number of {@link Dataset}s to be verified together. This allows multiple {@link Dataset}s
   * to share the same verification job, e.g., share the same query.
   */
  private int getNumDatasetsVerifiedTogether() {
    return this.state.getPropAsInt(COMPACTION_COMPLETENESS_VERIFICATION_NUM_DATASETS_VERIFIED_TOGETHER,
        DEFAULT_COMPACTION_COMPLETENESS_VERIFICATION_NUM_DATASETS_VERIFIED_TOGETHER);
  }

  private void setAllDatasetStatesToVerified() {
    for (Dataset dataset : this.datasets) {
      dataset.compareAndSetState(UNVERIFIED, VERIFIED);
    }
  }

  /**
   * Data completeness verification of a dataset should be given up once the timeout has elapsed.
   */
  private boolean shouldGiveUpVerification() {
    return this.stopwatch.elapsed(TimeUnit.MINUTES) >= this.dataVerifTimeoutMinutes;
  }

  private boolean shouldPublishDataIfCannotVerifyCompl() {
    return this.state.getPropAsBoolean(COMPACTION_COMPLETENESS_VERIFICATION_PUBLISH_DATA_IF_CANNOT_VERIFY,
        DEFAULT_COMPACTION_COMPLETENESS_VERIFICATION_PUBLISH_DATA_IF_CANNOT_VERIFY);
  }

  private void submitCompactionJobsAndWaitForCompletion() {
    LOG.info("Submitting compaction jobs. Number of datasets: " + this.datasets.size());

    boolean allDatasetsCompleted = false;
    while (!allDatasetsCompleted) {
      allDatasetsCompleted = true;

      for (Dataset dataset : this.datasets) {
        MRCompactorJobRunner jobRunner = MRCompactor.this.jobRunnables.get(dataset);
        if (dataset.state() == VERIFIED || dataset.state() == UNVERIFIED) {
          allDatasetsCompleted = false;
          // Run compaction for a dataset, if it is not already running or completed.
          if (jobRunner == null || jobRunner.status() == ABORTED) {
            runCompactionForDataset(dataset, dataset.state() == VERIFIED);
          }
        } else if (dataset.state() == GIVEN_UP) {
          if (this.shouldPublishDataIfCannotVerifyCompl) {
            allDatasetsCompleted = false;
            if (jobRunner == null || jobRunner.status() == ABORTED) {
              runCompactionForDataset(dataset, true);
            } else {
              jobRunner.proceed();
            }
          } else {
            if (jobRunner != null) {
              jobRunner.abort();
            }
          }
        }
      }

      if (this.stopwatch.elapsed(TimeUnit.MINUTES) >= this.compactionTimeoutMinutes) {
        // Compaction timed out. Kill all running compaction jobs.
        LOG.error("Compaction timed-out. Killing all running jobs");
        for (MRCompactorJobRunner jobRunner : MRCompactor.this.jobRunnables.values()) {
          jobRunner.abort();
        }
        break;
      }

      // Sleep for a few seconds before another round.
      try {
        Thread.sleep(TimeUnit.SECONDS.toMillis(COMPACTION_JOB_WAIT_INTERVAL_SECONDS));
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new RuntimeException("Interrupted while waiting", e);
      }
    }
  }

  /**
   * Run compaction job for a {@link Dataset}.
   *
   * @param dataset The input {@link Dataset} to run compaction for.
   * @param proceed Whether the compaction job is permitted to publish data. If data completeness verification
   *                is enabled and the state of the dataset is UNVERIFIED, 'proceed' should be set to false.
   *                Otherwise it should be set to true.
   */
  private void runCompactionForDataset(Dataset dataset, boolean proceed) {
    LOG.info("Running compaction for dataset " + dataset);
    try {
      MRCompactorJobRunner jobRunner = getMRCompactorJobRunner(dataset);
      this.jobRunnables.put(dataset, jobRunner);
      if (proceed) {
        jobRunner.proceed();
      }
      this.jobExecutor.execute(jobRunner);
    } catch (Throwable t) {
      dataset.skip(t);
    }
  }

  /**
   * Get an instance of {@link MRCompactorJobRunner}.
   */
  private MRCompactorJobRunner getMRCompactorJobRunner(Dataset dataset) {
    try {
      @SuppressWarnings("unchecked")
      Class<? extends MRCompactorJobRunner> cls = (Class<? extends MRCompactorJobRunner>) Class
          .forName(this.state.getProp(COMPACTION_JOB_RUNNER_CLASS, DEFAULT_COMPACTION_JOB_RUNNER_CLASS));
      return cls.getDeclaredConstructor(Dataset.class, FileSystem.class).newInstance(dataset, this.fs);
    } catch (Exception e) {
      throw new RuntimeException("Cannot instantiate MRCompactorJobRunner", e);
    }
  }

  /**
   * Keep track of running MR jobs, so if the compaction is cancelled, the MR jobs can be killed.
   */
  public static void addRunningHadoopJob(Dataset dataset, Job job) {
    MRCompactor.RUNNING_MR_JOBS.put(dataset, job);
  }

  private long getCompactionTimeoutMinutes() {
    return this.state.getPropAsLong(COMPACTION_MR_JOB_TIMEOUT_MINUTES, DEFAULT_COMPACTION_MR_JOB_TIMEOUT_MINUTES);
  }

  private long getDataVerifTimeoutMinutes() {
    return this.state.getPropAsLong(COMPACTION_VERIFICATION_TIMEOUT_MINUTES,
        DEFAULT_COMPACTION_VERIFICATION_TIMEOUT_MINUTES);
  }

  private void throwExceptionsIfAnyDatasetCompactionFailed() {
    Set<Dataset> datasetsWithThrowables = getDatasetsWithThrowables();
    int numDatasetsWithThrowables = 0;
    for (Dataset dataset : datasetsWithThrowables) {
      numDatasetsWithThrowables++;
      for (Throwable t : dataset.throwables()) {
        LOG.error("Error processing dataset " + dataset, t);
        submitFailureSlaEvent(dataset, CompactionSlaEventHelper.COMPACTION_FAILED_EVENT_NAME);
      }
    }
    if (numDatasetsWithThrowables > 0) {
      throw new RuntimeException(String.format("Failed to process %d datasets.", numDatasetsWithThrowables));
    }
  }

  /**
   * Return all {@link Dataset}s where a {@link Throwable} is thrown from the compaction job.
   */
  private Set<Dataset> getDatasetsWithThrowables() {
    Set<Dataset> datasetsWithThrowables = Sets.newHashSet();
    for (Dataset dataset : this.datasets) {
      if (!dataset.throwables().isEmpty()) {
        datasetsWithThrowables.add(dataset);
      }
    }
    return datasetsWithThrowables;
  }

  private void shutdownExecutors() {
    LOG.info("Shutting down Executors");
    ExecutorsUtils.shutdownExecutorService(this.jobExecutor, Optional.of(LOG));
  }

  @Override
  public void cancel() throws IOException {
    try {
      for (Map.Entry<Dataset, Job> entry : MRCompactor.RUNNING_MR_JOBS.entrySet()) {
        Job hadoopJob = entry.getValue();
        if (!hadoopJob.isComplete()) {
          LOG.info(String.format("Killing hadoop job %s for dataset %s", hadoopJob.getJobID(), entry.getKey()));
          hadoopJob.killJob();
        }
      }
    } finally {
      try {
        ExecutorsUtils.shutdownExecutorService(this.jobExecutor, Optional.of(LOG), 0, TimeUnit.NANOSECONDS);
      } finally {
        if (this.verifier.isPresent()) {
          this.verifier.get().closeNow();
        }
      }
    }
  }

  public static void modifyDatasetStateToRecompact(Dataset dataset) {
    // Modify the dataset for recompaction.
    LOG.info("{} changes to recompact mode", dataset.getDatasetName());
    State recompactState = new State();
    recompactState.setProp(MRCompactor.COMPACTION_RECOMPACT_FROM_DEST_PATHS, Boolean.TRUE);
    recompactState.setProp(MRCompactor.COMPACTION_JOB_LATE_DATA_MOVEMENT_TASK, Boolean.FALSE);
    dataset.modifyDatasetForRecompact(recompactState);
    dataset.setState(VERIFIED);
  }

  /**
   * A subclass of {@link ThreadPoolExecutor} that runs compaction jobs and performs necessary steps
   * after each compaction job finishes.
   */
  private class JobRunnerExecutor extends ThreadPoolExecutor {

    public JobRunnerExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
        BlockingQueue<Runnable> workQueue) {
      super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue);
    }

    /**
     * When a compaction job for a {@link Dataset} finishes, if it successfully published the data (t == null
     * && jobRunner.status() == {@link MRCompactorJobRunner.Status#COMMITTED}), or if it
     * threw any {@link Throwable} (t != null), mark the {@link Dataset} as
     * {@link Dataset.DatasetState#COMPACTION_COMPLETE}.
     * If the job failed to publish the data because the input data was not complete, reduce the priority of
     * the {@link Dataset}. A new compaction job will be submitted later with a lower priority.
     */
    @Override
    protected void afterExecute(Runnable r, Throwable t) {
      Preconditions.checkArgument(r instanceof MRCompactorJobRunner,
          String.format("Runnable expected to be instance of %s, actual %s",
              MRCompactorJobRunner.class.getSimpleName(), r.getClass().getSimpleName()));
      MRCompactorJobRunner jobRunner = (MRCompactorJobRunner) r;
      MRCompactor.this.jobRunnables.remove(jobRunner.getDataset());

      if (t == null) {
        if (jobRunner.status() == COMMITTED) {
          if (jobRunner.getDataset().needToRecompact()) {
            modifyDatasetStateToRecompact(jobRunner.getDataset());
          } else {
            // Set the dataset status to COMPACTION_COMPLETE if compaction is successful.
            jobRunner.getDataset().setState(COMPACTION_COMPLETE);
          }
          if (MRCompactor.this.compactorListener.isPresent()) {
            try {
              MRCompactor.this.compactorListener.get().onDatasetCompactionCompletion(jobRunner.getDataset());
            } catch (Exception e) {
              t = e;
            }
          }
        } else if (jobRunner.getDataset().state() == GIVEN_UP
            && !MRCompactor.this.shouldPublishDataIfCannotVerifyCompl) {
          // The compaction job of this dataset has aborted, and data completeness verification has given up.
          // This dataset will not be compacted.
          LOG.info(String.format("Dataset %s will not be compacted, since data completeness cannot be verified",
              jobRunner.getDataset()));
          jobRunner.getDataset().setState(COMPACTION_COMPLETE);
        } else {
          // The compaction job of this dataset has aborted because data completeness is not verified.
          // Reduce priority and try again.
          jobRunner.getDataset().reducePriority();
        }
      }

      if (t != null) {
        // The compaction job of this dataset has failed with a throwable.
        afterExecuteWithThrowable(jobRunner, t);
      }
    }

    private void afterExecuteWithThrowable(MRCompactorJobRunner jobRunner, Throwable t) {
      jobRunner.getDataset().skip(t);
    }
  }

  /**
   * Submit an event when completeness verification is successful.
   */
  private void submitVerificationSuccessSlaEvent(Results.Result result) {
    try {
      CompactionSlaEventHelper.getEventSubmitterBuilder(result.dataset(), Optional.<Job> absent(), this.fs)
          .eventSubmitter(this.eventSubmitter)
          .eventName(CompactionSlaEventHelper.COMPLETION_VERIFICATION_SUCCESS_EVENT_NAME)
          .additionalMetadata(Maps.transformValues(result.verificationContext(), Functions.toStringFunction()))
          .build().submit();
    } catch (Throwable t) {
      LOG.warn("Failed to submit verification success event: " + t, t);
    }
  }

  /**
   * Submit a failure SLA event.
   */
  private void submitFailureSlaEvent(Dataset dataset, String eventName) {
    try {
      CompactionSlaEventHelper.getEventSubmitterBuilder(dataset, Optional.<Job> absent(), this.fs)
          .eventSubmitter(this.eventSubmitter).eventName(eventName).build().submit();
    } catch (Throwable t) {
      LOG.warn("Failed to submit failure SLA event: " + t, t);
    }
  }
}