/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.data.management.retention.policy; import java.io.IOException; import java.util.Collection; import java.util.List; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import com.google.common.base.Optional; import com.google.common.base.Predicate; import com.google.common.collect.Collections2; import com.google.common.collect.Lists; import gobblin.annotation.Alpha; import gobblin.data.management.version.FileSystemDatasetVersion; import gobblin.util.FileListUtils; /** * An abstract {@link RetentionPolicy} for {@link gobblin.data.management.retention.dataset.RawDataset}. * * This class embeds another {@link RetentionPolicy}. In {@link #listDeletableVersions(List)} it applies the * embedded {@link RetentionPolicy}'s predicate, as well as {@link #listQualifiedRawFileSystemDatasetVersions(Collection)}. */ @Alpha public abstract class RawDatasetRetentionPolicy implements RetentionPolicy<FileSystemDatasetVersion> { private final FileSystem fs; private final Class<? extends FileSystemDatasetVersion> versionClass; private final RetentionPolicy<FileSystemDatasetVersion> embeddedRetentionPolicy; public RawDatasetRetentionPolicy(FileSystem fs, Class<? extends FileSystemDatasetVersion> versionClass, RetentionPolicy<FileSystemDatasetVersion> retentionPolicy) { this.fs = fs; this.versionClass = versionClass; this.embeddedRetentionPolicy = retentionPolicy; } @Override public Class<? extends FileSystemDatasetVersion> versionClass() { return this.versionClass; } @Override public Collection<FileSystemDatasetVersion> listDeletableVersions(List<FileSystemDatasetVersion> allVersions) { Collection<FileSystemDatasetVersion> deletableVersions = this.embeddedRetentionPolicy.listDeletableVersions(allVersions); return listQualifiedRawFileSystemDatasetVersions(deletableVersions); } /** * A raw dataset version is qualified to be deleted, iff the corresponding refined paths exist, and the latest * mod time of all files is in the raw dataset is earlier than the latest mod time of all files in the refined paths. */ protected Collection<FileSystemDatasetVersion> listQualifiedRawFileSystemDatasetVersions(Collection<FileSystemDatasetVersion> allVersions) { return Lists.newArrayList(Collections2.filter(allVersions, new Predicate<FileSystemDatasetVersion>() { @Override public boolean apply(FileSystemDatasetVersion version) { Iterable<Path> refinedDatasetPaths = getRefinedDatasetPaths(version); try { Optional<Long> latestRawDatasetModTime = getLatestModTime(version.getPaths()); Optional<Long> latestRefinedDatasetModTime = getLatestModTime(refinedDatasetPaths); return latestRawDatasetModTime.isPresent() && latestRefinedDatasetModTime.isPresent() && latestRawDatasetModTime.get() <= latestRefinedDatasetModTime.get(); } catch (IOException e) { throw new RuntimeException("Failed to get modification time", e); } } })); } private Optional<Long> getLatestModTime(Iterable<Path> paths) throws IOException { long latestModTime = Long.MIN_VALUE; for (FileStatus status : FileListUtils.listMostNestedPathRecursively(this.fs, paths)) { latestModTime = Math.max(latestModTime, status.getModificationTime()); } return latestModTime == Long.MIN_VALUE ? Optional.<Long> absent() : Optional.of(latestModTime); } /** * Get the corresponding refined paths for a raw dataset version. For example, a raw dataset version * can be a file containing un-deduplicated records, whose corresponding refined dataset path is a file * containing the corresponding deduplicated records. */ protected abstract Iterable<Path> getRefinedDatasetPaths(FileSystemDatasetVersion version); }