/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools;
import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
/**
 * This class provides the basic functionality to sync two FileSystems based on
 * the snapshot diff report. More specifically, it assumes the following setup:
 * 1. Both the source and the target FileSystem must be DistributedFileSystem.
 * 2. Two snapshots (e.g., s1 and s2) have been created on the source FS.
 *    The diff between these two snapshots will be copied to the target FS.
 * 3. The target has the same snapshot s1. No changes have been made on the
 *    target since s1, so all the files/directories in the target are identical
 *    to those in source.s1.
 */
class DistCpSync {
static boolean sync(DistCpOptions inputOptions, Configuration conf)
throws IOException {
List<Path> sourcePaths = inputOptions.getSourcePaths();
if (sourcePaths.size() != 1) {
// we only support one source dir which must be a snapshottable directory
throw new IllegalArgumentException(sourcePaths.size()
+ " source paths are provided");
}
final Path sourceDir = sourcePaths.get(0);
final Path targetDir = inputOptions.getTargetPath();
final FileSystem sfs = sourceDir.getFileSystem(conf);
final FileSystem tfs = targetDir.getFileSystem(conf);
// currently we require both the source and the target file system are
// DistributedFileSystem.
if (!(sfs instanceof DistributedFileSystem) ||
!(tfs instanceof DistributedFileSystem)) {
throw new IllegalArgumentException("The FileSystems needs to" +
" be DistributedFileSystem for using snapshot-diff-based distcp");
}
final DistributedFileSystem sourceFs = (DistributedFileSystem) sfs;
final DistributedFileSystem targetFs= (DistributedFileSystem) tfs;
// make sure targetFS has no change between from and the current states
if (!checkNoChange(inputOptions, targetFs, targetDir)) {
// set the source path using the snapshot path
inputOptions.setSourcePaths(Arrays.asList(getSourceSnapshotPath(sourceDir,
inputOptions.getToSnapshot())));
return false;
}
Path tmpDir = null;
try {
tmpDir = createTargetTmpDir(targetFs, targetDir);
DiffInfo[] diffs = getDiffs(inputOptions, sourceFs, sourceDir, targetDir);
if (diffs == null) {
return false;
}
// do the real sync work: deletion and rename
syncDiff(diffs, targetFs, tmpDir);
return true;
} catch (Exception e) {
DistCp.LOG.warn("Failed to use snapshot diff for distcp", e);
return false;
} finally {
deleteTargetTmpDir(targetFs, tmpDir);
// TODO: since we have tmp directory, we can support "undo" with failures
// set the source path using the snapshot path
inputOptions.setSourcePaths(Arrays.asList(getSourceSnapshotPath(sourceDir,
inputOptions.getToSnapshot())));
}
}
private static String getSnapshotName(String name) {
return Path.CUR_DIR.equals(name) ? "" : name;
}
private static Path getSourceSnapshotPath(Path sourceDir, String snapshotName) {
if (Path.CUR_DIR.equals(snapshotName)) {
return sourceDir;
} else {
return new Path(sourceDir,
HdfsConstants.DOT_SNAPSHOT_DIR + Path.SEPARATOR + snapshotName);
}
}
private static Path createTargetTmpDir(DistributedFileSystem targetFs,
Path targetDir) throws IOException {
final Path tmp = new Path(targetDir,
DistCpConstants.HDFS_DISTCP_DIFF_DIRECTORY_NAME + DistCp.rand.nextInt());
if (!targetFs.mkdirs(tmp)) {
throw new IOException("The tmp directory " + tmp + " already exists");
}
return tmp;
}
private static void deleteTargetTmpDir(DistributedFileSystem targetFs,
Path tmpDir) {
try {
if (tmpDir != null) {
targetFs.delete(tmpDir, true);
}
} catch (IOException e) {
DistCp.LOG.error("Unable to cleanup tmp dir: " + tmpDir, e);
}
}
/**
* Compute the snapshot diff on the given file system. Return true if the diff
* is empty, i.e., no changes have happened in the FS.
*/
private static boolean checkNoChange(DistCpOptions inputOptions,
DistributedFileSystem fs, Path path) {
try {
SnapshotDiffReport targetDiff =
fs.getSnapshotDiffReport(path, inputOptions.getFromSnapshot(), "");
if (!targetDiff.getDiffList().isEmpty()) {
DistCp.LOG.warn("The target has been modified since snapshot "
+ inputOptions.getFromSnapshot());
return false;
} else {
return true;
}
} catch (IOException e) {
DistCp.LOG.warn("Failed to compute snapshot diff on " + path, e);
}
return false;
}
@VisibleForTesting
static DiffInfo[] getDiffs(DistCpOptions inputOptions,
DistributedFileSystem fs, Path sourceDir, Path targetDir) {
try {
final String from = getSnapshotName(inputOptions.getFromSnapshot());
final String to = getSnapshotName(inputOptions.getToSnapshot());
SnapshotDiffReport sourceDiff = fs.getSnapshotDiffReport(sourceDir,
from, to);
return DiffInfo.getDiffs(sourceDiff, targetDir);
} catch (IOException e) {
DistCp.LOG.warn("Failed to compute snapshot diff on " + sourceDir, e);
}
return null;
}
private static void syncDiff(DiffInfo[] diffs,
DistributedFileSystem targetFs, Path tmpDir) throws IOException {
moveToTmpDir(diffs, targetFs, tmpDir);
moveToTarget(diffs, targetFs);
}
/**
* Move all the source files that should be renamed or deleted to the tmp
* directory.
*/
private static void moveToTmpDir(DiffInfo[] diffs,
DistributedFileSystem targetFs, Path tmpDir) throws IOException {
// sort the diffs based on their source paths to make sure the files and
// subdirs are moved before moving their parents/ancestors.
Arrays.sort(diffs, DiffInfo.sourceComparator);
Random random = new Random();
for (DiffInfo diff : diffs) {
Path tmpTarget = new Path(tmpDir, diff.source.getName());
while (targetFs.exists(tmpTarget)) {
tmpTarget = new Path(tmpDir, diff.source.getName() + random.nextInt());
}
diff.setTmp(tmpTarget);
targetFs.rename(diff.source, tmpTarget);
}
}
/**
* Finish the rename operations: move all the intermediate files/directories
* from the tmp dir to the final targets.
*/
private static void moveToTarget(DiffInfo[] diffs,
DistributedFileSystem targetFs) throws IOException {
// sort the diffs based on their target paths to make sure the parent
// directories are created first.
Arrays.sort(diffs, DiffInfo.targetComparator);
for (DiffInfo diff : diffs) {
if (diff.target != null) {
if (!targetFs.exists(diff.target.getParent())) {
targetFs.mkdirs(diff.target.getParent());
}
targetFs.rename(diff.getTmp(), diff.target);
}
}
}
}