/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.data.management.copy;

import gobblin.commit.CommitStep;
import gobblin.data.management.copy.entities.PrePublishStep;
import gobblin.data.management.dataset.DatasetUtils;
import gobblin.dataset.FileSystemDataset;
import gobblin.util.PathUtils;
import gobblin.util.FileListUtils;
import gobblin.util.commit.DeleteFileCommitStep;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;


/**
 * Implementation of {@link CopyableDataset} that creates a {@link CopyableFile} for every file that is a descendant
 * of the root directory.
 */
public class RecursiveCopyableDataset implements CopyableDataset, FileSystemDataset {

  private static final String CONFIG_PREFIX = CopyConfiguration.COPY_PREFIX + ".recursive";

  /**
   * Like -update in distcp. Will update files that are different between source and target, and skip files already
   * in target.
   */
  public static final String UPDATE_KEY = CONFIG_PREFIX + ".update";
  /** Like -delete in distcp. Will delete files in target that don't exist in source. */
  public static final String DELETE_KEY = CONFIG_PREFIX + ".delete";
  /** If true, will delete newly empty directories up to the dataset root. */
  public static final String DELETE_EMPTY_DIRECTORIES_KEY = CONFIG_PREFIX + ".deleteEmptyDirectories";

  private final Path rootPath;
  private final FileSystem fs;

  private final PathFilter pathFilter;
  // Glob used to find this dataset
  private final Path glob;
  private final CopyableFileFilter copyableFileFilter;
  private final boolean update;
  private final boolean delete;
  private final boolean deleteEmptyDirectories;
  private final Properties properties;

  public RecursiveCopyableDataset(final FileSystem fs, Path rootPath, Properties properties, Path glob) {
    this.rootPath = PathUtils.getPathWithoutSchemeAndAuthority(rootPath);
    this.fs = fs;

    this.pathFilter = DatasetUtils.instantiatePathFilter(properties);
    this.copyableFileFilter = DatasetUtils.instantiateCopyableFileFilter(properties);
    this.glob = glob;

    this.update = Boolean.parseBoolean(properties.getProperty(UPDATE_KEY));
    this.delete = Boolean.parseBoolean(properties.getProperty(DELETE_KEY));
    this.deleteEmptyDirectories = Boolean.parseBoolean(properties.getProperty(DELETE_EMPTY_DIRECTORIES_KEY));

    this.properties = properties;
  }

  @Override
  public Collection<? extends CopyEntity> getCopyableFiles(FileSystem targetFs, CopyConfiguration configuration)
      throws IOException {

    Path nonGlobSearchPath = PathUtils.deepestNonGlobPath(this.glob);
    Path targetPath = new Path(configuration.getPublishDir(),
        PathUtils.relativizePath(this.rootPath, nonGlobSearchPath));

    Map<Path, FileStatus> filesInSource =
        createPathMap(getFilesAtPath(this.fs, this.rootPath, this.pathFilter), this.rootPath);
    Map<Path, FileStatus> filesInTarget =
        createPathMap(getFilesAtPath(targetFs, targetPath, this.pathFilter), targetPath);

    List<Path> toCopy = Lists.newArrayList();
    Map<Path, FileStatus> toDelete = Maps.newHashMap();
    boolean requiresUpdate = false;

    // Diff source against target: new files are always copied, files present in both are re-copied only if they
    // differ, which requires update mode.
    for (Map.Entry<Path, FileStatus> entry : filesInSource.entrySet()) {
      FileStatus statusInTarget = filesInTarget.remove(entry.getKey());
      if (statusInTarget != null) {
        // in both
        if (!sameFile(filesInSource.get(entry.getKey()), statusInTarget)) {
          toCopy.add(entry.getKey());
          toDelete.put(entry.getKey(), statusInTarget);
          requiresUpdate = true;
        }
      } else {
        toCopy.add(entry.getKey());
      }
    }

    if (!this.update && requiresUpdate) {
      throw new IOException("Some files need to be copied but they already exist in the destination. "
          + "Aborting because not running in update mode.");
    }

    // In delete mode, also remove files that exist only in the target.
    if (this.delete) {
      toDelete.putAll(filesInTarget);
    }

    List<CopyEntity> copyEntities = Lists.newArrayList();
    List<CopyableFile> copyableFiles = Lists.newArrayList();
    for (Path path : toCopy) {
      FileStatus file = filesInSource.get(path);
      Path filePathRelativeToSearchPath = PathUtils.relativizePath(file.getPath(), nonGlobSearchPath);
      Path thisTargetPath = new Path(configuration.getPublishDir(), filePathRelativeToSearchPath);
      copyableFiles.add(CopyableFile.fromOriginAndDestination(this.fs, file, thisTargetPath, configuration)
          .fileSet(datasetURN())
          .ancestorsOwnerAndPermission(CopyableFile.resolveReplicatedOwnerAndPermissionsRecursively(this.fs,
              file.getPath().getParent(), nonGlobSearchPath, configuration))
          .build());
    }
    copyEntities.addAll(this.copyableFileFilter.filter(this.fs, targetFs, copyableFiles));

    // Deletions in the target are performed as a pre-publish commit step.
    if (!toDelete.isEmpty()) {
      CommitStep step = new DeleteFileCommitStep(targetFs, toDelete.values(), this.properties,
          this.deleteEmptyDirectories ? Optional.of(targetPath) : Optional.<Path>absent());
      copyEntities.add(new PrePublishStep(datasetURN(), Maps.<String, String>newHashMap(), step, 1));
    }

    return copyEntities;
  }

  @VisibleForTesting
  protected List<FileStatus> getFilesAtPath(FileSystem fs, Path path, PathFilter fileFilter) throws IOException {
    // Returns an empty list if the path does not exist.
    try {
      return FileListUtils.listFilesRecursively(fs, path, fileFilter);
    } catch (FileNotFoundException fnfe) {
      return Lists.newArrayList();
    }
  }

  @Override
  public Path datasetRoot() {
    return this.rootPath;
  }

  @Override
  public String datasetURN() {
    return datasetRoot().toString();
  }

  private Map<Path, FileStatus> createPathMap(List<FileStatus> files, Path prefix) {
    Map<Path, FileStatus> map = Maps.newHashMap();
    for (FileStatus status : files) {
      map.put(PathUtils.relativizePath(status.getPath(), prefix), status);
    }
    return map;
  }

  // Files are considered the same if they have the same length and the target is at least as new as the source.
  private static boolean sameFile(FileStatus fileInSource, FileStatus fileInTarget) {
    return fileInTarget.getLen() == fileInSource.getLen()
        && fileInSource.getModificationTime() <= fileInTarget.getModificationTime();
  }
}
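/*
 * Usage sketch (illustrative only): shows how the configuration keys declared above might be set and how the
 * dataset could be asked for its copy entities. The variables sourceFs, targetFs, copyConfiguration and the
 * paths below are placeholders assumed to be supplied by the caller; CopyConfiguration construction is omitted.
 *
 *   Properties props = new Properties();
 *   props.setProperty(RecursiveCopyableDataset.UPDATE_KEY, "true");
 *   props.setProperty(RecursiveCopyableDataset.DELETE_KEY, "true");
 *   props.setProperty(RecursiveCopyableDataset.DELETE_EMPTY_DIRECTORIES_KEY, "true");
 *
 *   RecursiveCopyableDataset dataset =
 *       new RecursiveCopyableDataset(sourceFs, new Path("/data/tracking"), props, new Path("/data/tracking/*"));
 *   Collection<? extends CopyEntity> entities = dataset.getCopyableFiles(targetFs, copyConfiguration);
 */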