/*
* Eoulsan development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public License version 2.1 or
* later and CeCILL-C. This should be distributed with the code.
* If you do not have a copy, see:
*
* http://www.gnu.org/licenses/lgpl-2.1.txt
* http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
*
* Copyright for this code is held jointly by the Genomic platform
* of the Institut de Biologie de l'École normale supérieure and
* the individual authors. These should be listed in @author doc
* comments.
*
* For more information on the Eoulsan project and its aims,
* or to join the Eoulsan Google group, visit the home page
* at:
*
* http://outils.genomique.biologie.ens.fr/eoulsan
*
*/
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package fr.ens.biologie.genomique.eoulsan.modules.mgmt.hadoop;
import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
import java.io.BufferedReader;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.Stack;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FsShell;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.InvalidInputException;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileRecordReader;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.Globals;
/**
* A Map-reduce program to recursively copy directories between different
* file-systems.
* @since 1.0
* @author The hadoop project
*/
@SuppressWarnings("deprecation")
public class DistCp implements Tool {
/* Default Charset. */
private static final Charset CHARSET =
Charset.forName(Globals.DEFAULT_FILE_ENCODING);
private static final String NAME = "distcp";
private static final String usage = NAME
+ " [OPTIONS] <srcurl>* <desturl>" + "\n\nOPTIONS:"
+ "\n-p[rbugp] Preserve status"
+ "\n r: replication number"
+ "\n b: block size"
+ "\n u: user" + "\n g: group"
+ "\n p: permission"
+ "\n -p alone is equivalent to -prbugp"
+ "\n-i Ignore failures"
+ "\n-log <logdir> Write logs to <logdir>"
+ "\n-m <num_maps> Maximum number of simultaneous copies"
+ "\n-overwrite Overwrite destination"
+ "\n-update Overwrite if src size different from dst size"
+ "\n-f <urilist_uri> Use list at <urilist_uri> as src list"
+ "\n-filelimit <n> Limit the total number of files to be <= n"
+ "\n-sizelimit <n> Limit the total size to be <= n bytes"
+ "\n-delete Delete the files existing in the dst but not in src"
+ "\n-mapredSslConf <f> Filename of SSL configuration for mapper task"
+
"\n\nNOTE 1: if -overwrite or -update are set, each source URI is "
+ "\n interpreted as an isomorphic update to an existing directory."
+ "\nFor example:" + "\nhadoop " + NAME
+ " -p -update \"hdfs://A:8020/user/foo/bar\" "
+ "\"hdfs://B:8020/user/foo/baz\"\n"
+ "\n would update all descendants of 'baz' also in 'bar'; it would "
+ "\n *not* update /user/foo/baz/bar" +
"\n\nNOTE 2: The parameter <n> in -filelimit and -sizelimit can be "
+ "\n specified with symbolic representation. For examples,"
+ "\n 1230k = 1230 * 1024 = 1259520"
+ "\n 891g = 891 * 1024^3 = 956703965184" +
"\n";
private static final long BYTES_PER_MAP = 256 * 1024 * 1024;
private static final int MAX_MAPS_PER_NODE = 20;
private static final int SYNC_FILE_MAX = 10;
enum Counter {
COPY, SKIP, FAIL, BYTESCOPIED, BYTESEXPECTED
}
enum Options {
DELETE("-delete", NAME + ".delete"),
FILE_LIMIT("-filelimit", NAME + ".limit.file"),
SIZE_LIMIT("-sizelimit", NAME + ".limit.size"),
IGNORE_READ_FAILURES("-i", NAME + ".ignore.read.failures"),
PRESERVE_STATUS("-p", NAME + ".preserve.status"),
OVERWRITE("-overwrite", NAME + ".overwrite.always"), UPDATE("-update", NAME
+ ".overwrite.ifnewer");
final String cmd, propertyname;
Options(final String cmd, final String propertyname) {
this.cmd = cmd;
this.propertyname = propertyname;
}
private long parseLong(final String[] args, final int offset) {
if (offset == args.length) {
throw new IllegalArgumentException("<n> not specified in " + this.cmd);
}
long n = StringUtils.TraditionalBinaryPrefix.string2long(args[offset]);
if (n <= 0) {
throw new IllegalArgumentException("n = " + n + " <= 0 in " + this.cmd);
}
return n;
}
}
enum FileAttribute {
BLOCK_SIZE, REPLICATION, USER, GROUP, PERMISSION;
final char symbol;
FileAttribute() {
this.symbol = toString().toLowerCase().charAt(0);
}
static EnumSet<FileAttribute> parse(final String s) {
if (s == null || s.length() == 0) {
return EnumSet.allOf(FileAttribute.class);
}
EnumSet<FileAttribute> set = EnumSet.noneOf(FileAttribute.class);
FileAttribute[] attributes = values();
for (char c : s.toCharArray()) {
int i = 0;
for (; i < attributes.length && c != attributes[i].symbol; i++) {
}
if (i < attributes.length) {
if (!set.contains(attributes[i])) {
set.add(attributes[i]);
} else {
throw new IllegalArgumentException("There are more than one '"
+ attributes[i].symbol + "' in " + s);
}
} else {
throw new IllegalArgumentException(
"'" + c + "' in " + s + " is undefined.");
}
}
return set;
}
}
static final String TMP_DIR_LABEL = NAME + ".tmp.dir";
static final String DST_DIR_LABEL = NAME + ".dest.path";
static final String JOB_DIR_LABEL = NAME + ".job.dir";
static final String MAX_MAPS_LABEL = NAME + ".max.map.tasks";
static final String SRC_LIST_LABEL = NAME + ".src.list";
static final String SRC_COUNT_LABEL = NAME + ".src.count";
static final String TOTAL_SIZE_LABEL = NAME + ".total.size";
static final String DST_DIR_LIST_LABEL = NAME + ".dst.dir.list";
static final String BYTES_PER_MAP_LABEL = NAME + ".bytes.per.map";
static final String PRESERVE_STATUS_LABEL =
Options.PRESERVE_STATUS.propertyname + ".value";
private JobConf conf;
@Override
public void setConf(final Configuration conf) {
if (conf instanceof JobConf) {
this.conf = (JobConf) conf;
} else {
this.conf = new JobConf(conf);
}
}
@Override
public Configuration getConf() {
return this.conf;
}
public DistCp(final Configuration conf) {
setConf(conf);
}
/**
* An input/output pair of filenames.
*/
static class FilePair implements Writable {
FileStatus input = new FileStatus();
String output;
FilePair() {
}
FilePair(final FileStatus input, final String output) {
this.input = input;
this.output = output;
}
@Override
public void readFields(final DataInput in) throws IOException {
this.input.readFields(in);
this.output = Text.readString(in);
}
@Override
public void write(final DataOutput out) throws IOException {
this.input.write(out);
Text.writeString(out, this.output);
}
@Override
public String toString() {
return this.input + " : " + this.output;
}
}
/**
* InputFormat of a distcp job responsible for generating splits of the src
* file list.
*/
static class CopyInputFormat implements InputFormat<Text, Text> {
/**
* Produce splits such that each is no greater than the quotient of the
* total size and the number of splits requested.
* @param job The handle to the JobConf object
* @param numSplits Number of splits requested
*/
@Override
public InputSplit[] getSplits(final JobConf job, final int numSplits)
throws IOException {
int cnfiles = job.getInt(SRC_COUNT_LABEL, -1);
long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1);
String srcfilelist = job.get(SRC_LIST_LABEL, "");
if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) {
throw new RuntimeException("Invalid metadata: #files("
+ cnfiles + ") total_size(" + cbsize + ") listuri(" + srcfilelist
+ ")");
}
Path src = new Path(srcfilelist);
FileSystem fs = src.getFileSystem(job);
FileStatus srcst = fs.getFileStatus(src);
ArrayList<FileSplit> splits = new ArrayList<>(numSplits);
LongWritable key = new LongWritable();
FilePair value = new FilePair();
final long targetsize = cbsize / numSplits;
long pos = 0L;
long last = 0L;
long acc = 0L;
long cbrem = srcst.getLen();
SequenceFile.Reader sl = null;
try {
sl = new SequenceFile.Reader(fs, src, job);
for (; sl.next(key, value); last = sl.getPosition()) {
// if adding this split would put this split past the target size,
// cut the last split and put this next file in the next split.
if (acc + key.get() > targetsize && acc != 0) {
long splitsize = last - pos;
splits.add(new FileSplit(src, pos, splitsize, (String[]) null));
cbrem -= splitsize;
pos = last;
acc = 0L;
}
acc += key.get();
}
} finally {
checkAndClose(sl);
}
if (cbrem != 0) {
splits.add(new FileSplit(src, pos, cbrem, (String[]) null));
}
return splits.toArray(new FileSplit[splits.size()]);
}
/**
* Returns a reader for this split of the src file list.
*/
@Override
public RecordReader<Text, Text> getRecordReader(final InputSplit split,
final JobConf job, final Reporter reporter) throws IOException {
return new SequenceFileRecordReader<>(job, (FileSplit) split);
}
}
/**
* FSCopyFilesMapper: The mapper for copying files between FileSystems.
*/
static class CopyFilesMapper
implements Mapper<LongWritable, FilePair, WritableComparable<?>, Text> {
// config
private int sizeBuf = 128 * 1024;
private FileSystem destFileSys = null;
private boolean ignoreReadFailures;
private boolean preserve_status;
private EnumSet<FileAttribute> preseved;
private boolean overwrite;
private boolean update;
private Path destPath = null;
private byte[] buffer = null;
private JobConf job;
// stats
private int failcount = 0;
private int skipcount = 0;
private int copycount = 0;
private String getCountString() {
return "Copied: "
+ this.copycount + " Skipped: " + this.skipcount + " Failed: "
+ this.failcount;
}
private void updateStatus(final Reporter reporter) {
reporter.setStatus(getCountString());
}
/**
* Return true if dst should be replaced by src and the update flag is set.
* Right now, this merely checks that the src and dst len are not equal.
* This should be improved on once modification times, CRCs, etc. can be
* meaningful in this context.
* @throws IOException
*/
private boolean needsUpdate(final FileStatus srcstatus,
final FileSystem dstfs, final Path dstpath) throws IOException {
return this.update
&& !sameFile(srcstatus.getPath().getFileSystem(this.job), srcstatus,
dstfs, dstpath);
}
private FSDataOutputStream create(final Path f, final Reporter reporter,
final FileStatus srcstat) throws IOException {
if (this.destFileSys.exists(f)) {
this.destFileSys.delete(f, false);
}
if (!this.preserve_status) {
return this.destFileSys.create(f, true, this.sizeBuf, reporter);
}
FsPermission permission = this.preseved.contains(FileAttribute.PERMISSION)
? srcstat.getPermission() : null;
short replication = this.preseved.contains(FileAttribute.REPLICATION)
? srcstat.getReplication() : this.destFileSys.getDefaultReplication();
long blockSize = this.preseved.contains(FileAttribute.BLOCK_SIZE)
? srcstat.getBlockSize() : this.destFileSys.getDefaultBlockSize();
return this.destFileSys.create(f, permission, true, this.sizeBuf,
replication, blockSize, reporter);
}
/**
* Copy a file to a destination.
* @param srcstat src path and metadata
* @param relativedst dst path
* @param reporter Hadoop reporter
*/
private void copy(final FileStatus srcstat, final Path relativedst,
final OutputCollector<WritableComparable<?>, Text> outc,
final Reporter reporter) throws IOException {
Path absdst = new Path(this.destPath, relativedst);
int totfiles = this.job.getInt(SRC_COUNT_LABEL, -1);
assert totfiles >= 0 : "Invalid file count " + totfiles;
// if a directory, ensure created even if empty
if (srcstat.isDir()) {
if (this.destFileSys.exists(absdst)) {
if (!this.destFileSys.getFileStatus(absdst).isDir()) {
throw new IOException(
"Failed to mkdirs: " + absdst + " is a file.");
}
} else if (!this.destFileSys.mkdirs(absdst)) {
throw new IOException("Failed to mkdirs " + absdst);
}
// TODO: when modification times can be set, directories should be
// emitted to reducers so they might be preserved. Also, mkdirs does
// not currently return an error when the directory already exists;
// if this changes, all directory work might as well be done in reduce
return;
}
if (this.destFileSys.exists(absdst)
&& !this.overwrite
&& !needsUpdate(srcstat, this.destFileSys, absdst)) {
outc.collect(null, new Text("SKIP: " + srcstat.getPath()));
++this.skipcount;
reporter.incrCounter(Counter.SKIP, 1);
updateStatus(reporter);
return;
}
Path tmpfile = new Path(this.job.get(TMP_DIR_LABEL), relativedst);
long cbcopied = 0L;
FSDataInputStream in = null;
FSDataOutputStream out = null;
try {
// open src file
in = srcstat.getPath().getFileSystem(this.job).open(srcstat.getPath());
reporter.incrCounter(Counter.BYTESEXPECTED, srcstat.getLen());
// open tmp file
out = create(tmpfile, reporter, srcstat);
// copy file
for (int cbread; (cbread = in.read(this.buffer)) >= 0;) {
out.write(this.buffer, 0, cbread);
cbcopied += cbread;
reporter.setStatus(
String.format("%.2f ", cbcopied * 100.0 / srcstat.getLen())
+ absdst + " [ " + StringUtils.humanReadableInt(cbcopied)
+ " / " + StringUtils.humanReadableInt(srcstat.getLen())
+ " ]");
}
} finally {
checkAndClose(in);
checkAndClose(out);
}
if (cbcopied != srcstat.getLen()) {
throw new IOException("File size not matched: copied "
+ bytesString(cbcopied) + " to tmpfile (=" + tmpfile
+ ") but expected " + bytesString(srcstat.getLen()) + " from "
+ srcstat.getPath());
} else {
if (totfiles == 1) {
// Copying a single file; use dst path provided by user as destination
// rather than destination directory, if a file
Path dstparent = absdst.getParent();
if (!(this.destFileSys.exists(dstparent)
&& this.destFileSys.getFileStatus(dstparent).isDir())) {
absdst = dstparent;
}
}
if (this.destFileSys.exists(absdst)
&& this.destFileSys.getFileStatus(absdst).isDir()) {
throw new IOException(absdst + " is a directory");
}
if (!this.destFileSys.mkdirs(absdst.getParent())) {
throw new IOException(
"Failed to create parent dir: " + absdst.getParent());
}
rename(tmpfile, absdst);
FileStatus dststat = this.destFileSys.getFileStatus(absdst);
if (dststat.getLen() != srcstat.getLen()) {
this.destFileSys.delete(absdst, false);
throw new IOException("File size not matched: copied "
+ bytesString(dststat.getLen()) + " to dst (=" + absdst
+ ") but expected " + bytesString(srcstat.getLen()) + " from "
+ srcstat.getPath());
}
updatePermissions(srcstat, dststat);
}
// report at least once for each file
++this.copycount;
reporter.incrCounter(Counter.BYTESCOPIED, cbcopied);
reporter.incrCounter(Counter.COPY, 1);
updateStatus(reporter);
}
/** rename tmp to dst, delete dst if already exists */
private void rename(final Path tmp, final Path dst) throws IOException {
try {
if (this.destFileSys.exists(dst)) {
this.destFileSys.delete(dst, true);
}
if (!this.destFileSys.rename(tmp, dst)) {
throw new IOException();
}
} catch (IOException cause) {
throw (IOException) new IOException("Fail to rename tmp file (="
+ tmp + ") to destination file (=" + dst + ")").initCause(cause);
}
}
private void updatePermissions(final FileStatus src, final FileStatus dst)
throws IOException {
if (this.preserve_status) {
DistCp.updatePermissions(src, dst, this.preseved, this.destFileSys);
}
}
static String bytesString(final long b) {
return b + " bytes (" + StringUtils.humanReadableInt(b) + ")";
}
/**
* Mapper configuration. Extracts source and destination file system, as
* well as top-level paths on source and destination directories. Gets the
* named file systems, to be used later in map.
*/
@Override
public void configure(final JobConf job) {
this.destPath = new Path(job.get(DST_DIR_LABEL, "/"));
try {
this.destFileSys = this.destPath.getFileSystem(job);
} catch (IOException ex) {
throw new RuntimeException("Unable to get the named file system.", ex);
}
this.sizeBuf = job.getInt("copy.buf.size", 128 * 1024);
this.buffer = new byte[this.sizeBuf];
this.ignoreReadFailures =
job.getBoolean(Options.IGNORE_READ_FAILURES.propertyname, false);
this.preserve_status =
job.getBoolean(Options.PRESERVE_STATUS.propertyname, false);
if (this.preserve_status) {
this.preseved = FileAttribute.parse(job.get(PRESERVE_STATUS_LABEL));
}
this.update = job.getBoolean(Options.UPDATE.propertyname, false);
this.overwrite =
!this.update && job.getBoolean(Options.OVERWRITE.propertyname, false);
this.job = job;
}
/**
* Map method. Copies one file from source file system to destination.
* @param key src len
* @param value FilePair (FileStatus src, Path dst)
* @param out Log of failed copies
* @param reporter Hadoop reporter
*/
@Override
public void map(final LongWritable key, final FilePair value,
final OutputCollector<WritableComparable<?>, Text> out,
final Reporter reporter) throws IOException {
final FileStatus srcstat = value.input;
final Path relativedst = new Path(value.output);
try {
copy(srcstat, relativedst, out, reporter);
} catch (IOException e) {
++this.failcount;
reporter.incrCounter(Counter.FAIL, 1);
updateStatus(reporter);
final String sfailure =
"FAIL " + relativedst + " : " + StringUtils.stringifyException(e);
out.collect(null, new Text(sfailure));
getLogger().info(sfailure);
try {
for (int i = 0; i < 3; ++i) {
try {
final Path tmp =
new Path(this.job.get(TMP_DIR_LABEL), relativedst);
if (this.destFileSys.delete(tmp, true)) {
break;
}
} catch (Throwable ex) {
// ignore, we are just cleaning up
getLogger()
.fine("Ignoring cleanup exception: " + ex.getMessage());
}
// update status, so we don't get timed out
updateStatus(reporter);
Thread.sleep(3 * 1000);
}
} catch (InterruptedException inte) {
throw (IOException) new IOException().initCause(inte);
}
} finally {
updateStatus(reporter);
}
}
@Override
public void close() throws IOException {
if (0 == this.failcount || this.ignoreReadFailures) {
return;
}
throw new IOException(getCountString());
}
}
private static List<Path> fetchFileList(final Configuration conf,
final Path srcList) throws IOException {
List<Path> result = new ArrayList<>();
FileSystem fs = srcList.getFileSystem(conf);
BufferedReader input = null;
try {
input =
new BufferedReader(new InputStreamReader(fs.open(srcList), CHARSET));
String line = input.readLine();
while (line != null) {
result.add(new Path(line));
line = input.readLine();
}
} finally {
checkAndClose(input);
}
return result;
}
@Deprecated
public static void copy(final Configuration conf, final String srcPath,
final String destPath, final Path logPath, final boolean srcAsList,
final boolean ignoreReadFailures) throws IOException {
final Path src = new Path(srcPath);
List<Path> tmp = new ArrayList<>();
if (srcAsList) {
tmp.addAll(fetchFileList(conf, src));
} else {
tmp.add(src);
}
EnumSet<Options> flags = ignoreReadFailures
? EnumSet.of(Options.IGNORE_READ_FAILURES)
: EnumSet.noneOf(Options.class);
final Path dst = new Path(destPath);
copy(conf, new Arguments(tmp, dst, logPath, flags, null, Long.MAX_VALUE,
Long.MAX_VALUE, null));
}
/** Sanity check for srcPath */
private static void checkSrcPath(final Configuration conf,
final List<Path> srcPaths) throws IOException {
List<IOException> rslt = new ArrayList<>();
for (Path p : srcPaths) {
FileSystem fs = p.getFileSystem(conf);
if (!fs.exists(p)) {
rslt.add(new IOException("Input source " + p + " does not exist."));
}
}
if (!rslt.isEmpty()) {
throw new InvalidInputException(rslt);
}
}
/**
* Driver to copy srcPath to destPath depending on required protocol.
* @param args arguments
*/
static void copy(final Configuration conf, final Arguments args)
throws IOException {
getLogger().info("srcPaths=" + args.srcs);
getLogger().info("destPath=" + args.dst);
checkSrcPath(conf, args.srcs);
JobConf job = createJobConf(conf);
if (args.preservedAttributes != null) {
job.set(PRESERVE_STATUS_LABEL, args.preservedAttributes);
}
if (args.mapredSslConf != null) {
job.set("dfs.client.https.keystore.resource", args.mapredSslConf);
}
// Initialize the mapper
try {
setup(conf, job, args);
JobClient.runJob(job);
finalize(conf, job, args.dst, args.preservedAttributes);
} finally {
// delete tmp
fullyDelete(job.get(TMP_DIR_LABEL), job);
// delete jobDirectory
fullyDelete(job.get(JOB_DIR_LABEL), job);
}
}
private static void updatePermissions(final FileStatus src,
final FileStatus dst, final EnumSet<FileAttribute> preseved,
final FileSystem destFileSys) throws IOException {
String owner = null;
String group = null;
if (preseved.contains(FileAttribute.USER)
&& !src.getOwner().equals(dst.getOwner())) {
owner = src.getOwner();
}
if (preseved.contains(FileAttribute.GROUP)
&& !src.getGroup().equals(dst.getGroup())) {
group = src.getGroup();
}
if (owner != null || group != null) {
destFileSys.setOwner(dst.getPath(), owner, group);
}
if (preseved.contains(FileAttribute.PERMISSION)
&& !src.getPermission().equals(dst.getPermission())) {
destFileSys.setPermission(dst.getPath(), src.getPermission());
}
}
static private void finalize(final Configuration conf, final JobConf jobconf,
final Path destPath, final String presevedAttributes) throws IOException {
if (presevedAttributes == null) {
return;
}
EnumSet<FileAttribute> preseved = FileAttribute.parse(presevedAttributes);
if (!preseved.contains(FileAttribute.USER)
&& !preseved.contains(FileAttribute.GROUP)
&& !preseved.contains(FileAttribute.PERMISSION)) {
return;
}
FileSystem dstfs = destPath.getFileSystem(conf);
Path dstdirlist = new Path(jobconf.get(DST_DIR_LIST_LABEL));
SequenceFile.Reader in = null;
try {
in = new SequenceFile.Reader(dstdirlist.getFileSystem(jobconf),
dstdirlist, jobconf);
Text dsttext = new Text();
FilePair pair = new FilePair();
for (; in.next(dsttext, pair);) {
Path absdst = new Path(destPath, pair.output);
updatePermissions(pair.input, dstfs.getFileStatus(absdst), preseved,
dstfs);
}
} finally {
checkAndClose(in);
}
}
static private class Arguments {
final List<Path> srcs;
final Path dst;
final Path log;
final EnumSet<Options> flags;
final String preservedAttributes;
final long filelimit;
final long sizelimit;
final String mapredSslConf;
/**
* Arguments for distcp
* @param srcs List of source paths
* @param dst Destination path
* @param log Log output directory
* @param flags Command-line flags
* @param preservedAttributes Preserved attributes
* @param filelimit File limit
* @param sizelimit Size limit
*/
Arguments(final List<Path> srcs, final Path dst, final Path log,
final EnumSet<Options> flags, final String preservedAttributes,
final long filelimit, final long sizelimit,
final String mapredSslConf) {
this.srcs = srcs;
this.dst = dst;
this.log = log;
this.flags = flags;
this.preservedAttributes = preservedAttributes;
this.filelimit = filelimit;
this.sizelimit = sizelimit;
this.mapredSslConf = mapredSslConf;
// if (LOG.isTraceEnabled()) {
// LOG.trace("this = " + this);
// }
}
static Arguments valueOf(final String[] args, final Configuration conf)
throws IOException {
List<Path> srcs = new ArrayList<>();
Path dst = null;
Path log = null;
EnumSet<Options> flags = EnumSet.noneOf(Options.class);
String presevedAttributes = null;
String mapredSslConf = null;
long filelimit = Long.MAX_VALUE;
long sizelimit = Long.MAX_VALUE;
for (int idx = 0; idx < args.length; idx++) {
Options[] opt = Options.values();
int i = 0;
for (; i < opt.length && !args[idx].startsWith(opt[i].cmd); i++) {
}
if (i < opt.length) {
flags.add(opt[i]);
if (opt[i] == Options.PRESERVE_STATUS) {
presevedAttributes = args[idx].substring(2);
FileAttribute.parse(presevedAttributes); // validation
} else if (opt[i] == Options.FILE_LIMIT) {
filelimit = Options.FILE_LIMIT.parseLong(args, ++idx);
} else if (opt[i] == Options.SIZE_LIMIT) {
sizelimit = Options.SIZE_LIMIT.parseLong(args, ++idx);
}
} else if ("-f".equals(args[idx])) {
if (++idx == args.length) {
throw new IllegalArgumentException(
"urilist_uri not specified in -f");
}
srcs.addAll(fetchFileList(conf, new Path(args[idx])));
} else if ("-log".equals(args[idx])) {
if (++idx == args.length) {
throw new IllegalArgumentException("logdir not specified in -log");
}
log = new Path(args[idx]);
} else if ("-mapredSslConf".equals(args[idx])) {
if (++idx == args.length) {
throw new IllegalArgumentException(
"ssl conf file not specified in -mapredSslConf");
}
mapredSslConf = args[idx];
} else if ("-m".equals(args[idx])) {
if (++idx == args.length) {
throw new IllegalArgumentException("num_maps not specified in -m");
}
try {
conf.setInt(MAX_MAPS_LABEL, Integer.parseInt(args[idx]));
} catch (NumberFormatException e) {
throw new IllegalArgumentException(
"Invalid argument to -m: " + args[idx]);
}
} else if ('-' == args[idx].codePointAt(0)) {
throw new IllegalArgumentException("Invalid switch " + args[idx]);
} else if (idx == args.length - 1) {
dst = new Path(args[idx]);
} else {
srcs.add(new Path(args[idx]));
}
}
// mandatory command-line parameters
if (srcs.isEmpty() || dst == null) {
throw new IllegalArgumentException(
"Missing " + (dst == null ? "dst path" : "src"));
}
// incompatible command-line flags
final boolean isOverwrite = flags.contains(Options.OVERWRITE);
final boolean isUpdate = flags.contains(Options.UPDATE);
final boolean isDelete = flags.contains(Options.DELETE);
if (isOverwrite && isUpdate) {
throw new IllegalArgumentException("Conflicting overwrite policies");
}
if (isDelete && !isOverwrite && !isUpdate) {
throw new IllegalArgumentException(Options.DELETE.cmd
+ " must be specified with " + Options.OVERWRITE + " or "
+ Options.UPDATE + ".");
}
return new Arguments(srcs, dst, log, flags, presevedAttributes, filelimit,
sizelimit, mapredSslConf);
}
/** {@inheritDoc} */
@Override
public String toString() {
return getClass().getName()
+ "{" + "\n srcs = " + this.srcs + "\n dst = " + this.dst
+ "\n log = " + this.log + "\n flags = " + this.flags
+ "\n preservedAttributes = " + this.preservedAttributes
+ "\n filelimit = " + this.filelimit + "\n sizelimit = "
+ this.sizelimit + "\n mapredSslConf = " + this.mapredSslConf
+ "\n}";
}
}
/**
* This is the main driver for recursively copying directories across file
* systems. It takes at least two cmdline parameters. A source URL and a
* destination URL. It then essentially does an "ls -lR" on the source URL,
* and writes the output in a round-robin manner to all the map input files.
* The mapper actually copies the files allotted to it. The reduce is empty.
*/
@Override
public int run(final String[] args) {
try {
copy(this.conf, Arguments.valueOf(args, this.conf));
return 0;
} catch (IllegalArgumentException e) {
System.err.println(StringUtils.stringifyException(e) + "\n" + usage);
ToolRunner.printGenericCommandUsage(System.err);
return -1;
} catch (DuplicationException e) {
System.err.println(StringUtils.stringifyException(e));
return DuplicationException.ERROR_CODE;
} catch (RemoteException e) {
final IOException unwrapped =
e.unwrapRemoteException(FileNotFoundException.class,
AccessControlException.class, QuotaExceededException.class);
System.err.println(StringUtils.stringifyException(unwrapped));
return -3;
} catch (Exception e) {
System.err.println("With failures, global counters are inaccurate; "
+ "consider running with -i");
System.err.println("Copy failed: " + StringUtils.stringifyException(e));
return -999;
}
}
/**
* This is the main driver for recursively copying directories across file
* systems. It takes at least two cmdline parameters. A source URL and a
* destination URL. It then essentially does an "ls -lR" on the source URL,
* and writes the output in a round-robin manner to all the map input files.
* The mapper actually copies the files allotted to it. The reduce is empty.
* @throws EoulsanException if an error occurs
*/
public void runWithException(final String[] args) throws EoulsanException {
try {
copy(this.conf, Arguments.valueOf(args, this.conf));
} catch (IllegalArgumentException e) {
throw new EoulsanException(
StringUtils.stringifyException(e) + "\n" + usage);
} catch (DuplicationException e) {
throw new EoulsanException(StringUtils.stringifyException(e));
} catch (RemoteException e) {
final IOException unwrapped =
e.unwrapRemoteException(FileNotFoundException.class,
AccessControlException.class, QuotaExceededException.class);
throw new EoulsanException(StringUtils.stringifyException(unwrapped));
} catch (Exception e) {
throw new EoulsanException(
"Copy failed: " + StringUtils.stringifyException(e));
}
}
/**
* Make a path relative with respect to a root path. absPath is always assumed
* to descend from root. Otherwise returned path is null.
*/
static String makeRelative(final Path root, final Path absPath) {
if (!absPath.isAbsolute()) {
throw new IllegalArgumentException(
"!absPath.isAbsolute(), absPath=" + absPath);
}
String p = absPath.toUri().getPath();
StringTokenizer pathTokens = new StringTokenizer(p, "/");
for (StringTokenizer rootTokens =
new StringTokenizer(root.toUri().getPath(), "/"); rootTokens
.hasMoreTokens();) {
if (!rootTokens.nextToken().equals(pathTokens.nextToken())) {
return null;
}
}
StringBuilder sb = new StringBuilder();
for (; pathTokens.hasMoreTokens();) {
sb.append(pathTokens.nextToken());
if (pathTokens.hasMoreTokens()) {
sb.append(Path.SEPARATOR);
}
}
return sb.length() == 0 ? "." : sb.toString();
}
/**
* Calculate how many maps to run. Number of maps is bounded by a minimum of
* the cumulative size of the copy / (distcp.bytes.per.map, default
* BYTES_PER_MAP or -m on the command line) and at most (distcp.max.map.tasks,
* default MAX_MAPS_PER_NODE * nodes in the cluster).
* @param totalBytes Count of total bytes for job
* @param job The job to configure
*/
private static void setMapCount(final long totalBytes, final JobConf job)
throws IOException {
int numMaps =
(int) (totalBytes / job.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP));
numMaps = Math.min(numMaps, job.getInt(MAX_MAPS_LABEL, MAX_MAPS_PER_NODE
* new JobClient(job).getClusterStatus().getTaskTrackers()));
job.setNumMapTasks(Math.max(numMaps, 1));
}
/** Fully delete dir */
static void fullyDelete(final String dir, final Configuration conf)
throws IOException {
if (dir != null) {
Path tmp = new Path(dir);
boolean success = tmp.getFileSystem(conf).delete(tmp, true);
if (!success) {
getLogger().warning("Could not fully delete " + tmp);
}
}
}
// Job configuration
private static JobConf createJobConf(final Configuration conf) {
JobConf jobconf = new JobConf(conf, DistCp.class);
jobconf.setJobName(NAME);
// turn off speculative execution, because DFS doesn't handle
// multiple writers to the same file.
jobconf.setMapSpeculativeExecution(false);
jobconf.setInputFormat(CopyInputFormat.class);
jobconf.setOutputKeyClass(Text.class);
jobconf.setOutputValueClass(Text.class);
jobconf.setMapperClass(CopyFilesMapper.class);
jobconf.setNumReduceTasks(0);
return jobconf;
}
private static final Random RANDOM = new Random();
public static String getRandomId() {
return Integer.toString(RANDOM.nextInt(Integer.MAX_VALUE), 36);
}
/**
* Initialize DFSCopyFileMapper specific job-configuration.
* @param conf : The dfs/mapred configuration.
* @param jobConf : The handle to the jobConf object to be initialized.
* @param args Arguments
*/
private static void setup(final Configuration conf, final JobConf jobConf,
final Arguments args) throws IOException {
jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());
// set boolean values
final boolean update = args.flags.contains(Options.UPDATE);
final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE);
jobConf.setBoolean(Options.UPDATE.propertyname, update);
jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite);
jobConf.setBoolean(Options.IGNORE_READ_FAILURES.propertyname,
args.flags.contains(Options.IGNORE_READ_FAILURES));
jobConf.setBoolean(Options.PRESERVE_STATUS.propertyname,
args.flags.contains(Options.PRESERVE_STATUS));
final String randomId = getRandomId();
JobClient jClient = new JobClient(jobConf);
Path jobDirectory = new Path(jClient.getSystemDir(), NAME + "_" + randomId);
jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());
long maxBytesPerMap = conf.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP);
FileSystem dstfs = args.dst.getFileSystem(conf);
boolean dstExists = dstfs.exists(args.dst);
boolean dstIsDir = false;
if (dstExists) {
dstIsDir = dstfs.getFileStatus(args.dst).isDir();
}
// default logPath
Path logPath = args.log;
if (logPath == null) {
String filename = "_distcp_logs_" + randomId;
if (!dstExists || !dstIsDir) {
Path parent = args.dst.getParent();
if (null == parent) {
// If dst is '/' on S3, it might not exist yet, but dst.getParent()
// will return null. In this case, use '/' as its own parent to
// prevent
// NPE errors below.
parent = args.dst;
}
if (!dstfs.exists(parent)) {
dstfs.mkdirs(parent);
}
logPath = new Path(parent, filename);
} else {
logPath = new Path(args.dst, filename);
}
}
FileOutputFormat.setOutputPath(jobConf, logPath);
// create src list, dst list
FileSystem jobfs = jobDirectory.getFileSystem(jobConf);
Path srcfilelist = new Path(jobDirectory, "_distcp_src_files");
jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
SequenceFile.Writer src_writer = SequenceFile.createWriter(jobfs, jobConf,
srcfilelist, LongWritable.class, FilePair.class,
SequenceFile.CompressionType.NONE);
Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files");
SequenceFile.Writer dst_writer = SequenceFile.createWriter(jobfs, jobConf,
dstfilelist, Text.class, Text.class, SequenceFile.CompressionType.NONE);
Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs");
jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
SequenceFile.Writer dir_writer =
SequenceFile.createWriter(jobfs, jobConf, dstdirlist, Text.class,
FilePair.class, SequenceFile.CompressionType.NONE);
// handle the case where the destination directory doesn't exist
// and we've only a single src directory OR we're updating/overwriting
// the contents of the destination directory.
final boolean special =
(args.srcs.size() == 1 && !dstExists) || update || overwrite;
int srcCount = 0, cnsyncf = 0, dirsyn = 0;
long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
try {
for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext();) {
final Path src = srcItr.next();
FileSystem srcfs = src.getFileSystem(conf);
FileStatus srcfilestat = srcfs.getFileStatus(src);
Path root = special && srcfilestat.isDir() ? src : src.getParent();
if (srcfilestat.isDir()) {
++srcCount;
}
Stack<FileStatus> pathstack = new Stack<>();
for (pathstack.push(srcfilestat); !pathstack.empty();) {
FileStatus cur = pathstack.pop();
FileStatus[] children = srcfs.listStatus(cur.getPath());
for (int i = 0; i < children.length; i++) {
boolean skipfile = false;
final FileStatus child = children[i];
final String dst = makeRelative(root, child.getPath());
++srcCount;
if (child.isDir()) {
pathstack.push(child);
} else {
// skip file if the src and the dst files are the same.
skipfile = update
&& sameFile(srcfs, child, dstfs, new Path(args.dst, dst));
// skip file if it exceed file limit or size limit
skipfile |= fileCount == args.filelimit
|| byteCount + child.getLen() > args.sizelimit;
if (!skipfile) {
++fileCount;
byteCount += child.getLen();
// if (LOG.isTraceEnabled()) {
// LOG.trace("adding file " + child.getPath());
// }
++cnsyncf;
cbsyncs += child.getLen();
if (cnsyncf > SYNC_FILE_MAX || cbsyncs > maxBytesPerMap) {
src_writer.sync();
dst_writer.sync();
cnsyncf = 0;
cbsyncs = 0L;
}
}
}
if (!skipfile) {
src_writer.append(
new LongWritable(child.isDir() ? 0 : child.getLen()),
new FilePair(child, dst));
}
dst_writer.append(new Text(dst),
new Text(child.getPath().toString()));
}
if (cur.isDir()) {
String dst = makeRelative(root, cur.getPath());
dir_writer.append(new Text(dst), new FilePair(cur, dst));
if (++dirsyn > SYNC_FILE_MAX) {
dirsyn = 0;
dir_writer.sync();
}
}
}
}
} finally {
checkAndClose(src_writer);
checkAndClose(dst_writer);
checkAndClose(dir_writer);
}
FileStatus dststatus = null;
try {
dststatus = dstfs.getFileStatus(args.dst);
} catch (FileNotFoundException fnfe) {
getLogger().info(args.dst + " does not exist.");
}
// create dest path dir if copying > 1 file
if (dststatus == null) {
if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
throw new IOException("Failed to create" + args.dst);
}
}
final Path sorted = new Path(jobDirectory, "_distcp_sorted");
checkDuplication(jobfs, dstfilelist, sorted, conf);
if (dststatus != null && args.flags.contains(Options.DELETE)) {
deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf,
conf);
}
Path tmpDir =
new Path((dstExists && !dstIsDir) || (!dstExists && srcCount == 1)
? args.dst.getParent() : args.dst, "_distcp_tmp_" + randomId);
jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());
// Explicitly create the tmpDir to ensure that it can be cleaned
// up by fullyDelete() later.
tmpDir.getFileSystem(conf).mkdirs(tmpDir);
getLogger().info("srcCount=" + srcCount);
jobConf.setInt(SRC_COUNT_LABEL, srcCount);
jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
setMapCount(byteCount, jobConf);
}
/**
* Check whether the contents of src and dst are the same. Return false if
* dstpath does not exist If the files have different sizes, return false. If
* the files have the same sizes, the file checksums will be compared. When
* file checksum is not supported in any of file systems, two files are
* considered as the same if they have the same size.
*/
static private boolean sameFile(final FileSystem srcfs,
final FileStatus srcstatus, final FileSystem dstfs, final Path dstpath)
throws IOException {
FileStatus dststatus;
try {
dststatus = dstfs.getFileStatus(dstpath);
} catch (FileNotFoundException fnfe) {
return false;
}
// same length?
if (srcstatus.getLen() != dststatus.getLen()) {
return false;
}
// get src checksum
final FileChecksum srccs;
try {
srccs = srcfs.getFileChecksum(srcstatus.getPath());
} catch (FileNotFoundException fnfe) {
/*
* Two possible cases: (1) src existed once but was deleted between the
* time period that srcstatus was obtained and the try block above. (2)
* srcfs does not support file checksum and (incorrectly) throws FNFE,
* e.g. some previous versions of HftpFileSystem. For case (1), it is okay
* to return true since src was already deleted. For case (2), true should
* be returned.
*/
return true;
}
// compare checksums
try {
final FileChecksum dstcs = dstfs.getFileChecksum(dststatus.getPath());
// return true if checksum is not supported
// (i.e. some of the checksums is null)
return srccs == null || dstcs == null || srccs.equals(dstcs);
} catch (FileNotFoundException fnfe) {
return false;
}
}
/** Delete the dst files/dirs which do not exist in src */
static private void deleteNonexisting(final FileSystem dstfs,
final FileStatus dstroot, final Path dstsorted, final FileSystem jobfs,
final Path jobdir, final JobConf jobconf, final Configuration conf)
throws IOException {
if (!dstroot.isDir()) {
throw new IOException("dst must be a directory when option "
+ Options.DELETE.cmd + " is set, but dst (= " + dstroot.getPath()
+ ") is not a directory.");
}
// write dst lsr results
final Path dstlsr = new Path(jobdir, "_distcp_dst_lsr");
final SequenceFile.Writer writer =
SequenceFile.createWriter(jobfs, jobconf, dstlsr, Text.class,
dstroot.getClass(), SequenceFile.CompressionType.NONE);
try {
// do lsr to get all file statuses in dstroot
final Stack<FileStatus> lsrstack = new Stack<>();
for (lsrstack.push(dstroot); !lsrstack.isEmpty();) {
final FileStatus status = lsrstack.pop();
if (status.isDir()) {
for (FileStatus child : dstfs.listStatus(status.getPath())) {
String relative = makeRelative(dstroot.getPath(), child.getPath());
writer.append(new Text(relative), child);
lsrstack.push(child);
}
}
}
} finally {
checkAndClose(writer);
}
// sort lsr results
final Path sortedlsr = new Path(jobdir, "_distcp_dst_lsr_sorted");
SequenceFile.Sorter sorter = new SequenceFile.Sorter(jobfs,
new Text.Comparator(), Text.class, FileStatus.class, jobconf);
sorter.sort(dstlsr, sortedlsr);
// compare lsr list and dst list
SequenceFile.Reader lsrin = null;
SequenceFile.Reader dstin = null;
try {
lsrin = new SequenceFile.Reader(jobfs, sortedlsr, jobconf);
dstin = new SequenceFile.Reader(jobfs, dstsorted, jobconf);
// compare sorted lsr list and sorted dst list
final Text lsrpath = new Text();
final FileStatus lsrstatus = new FileStatus();
final Text dstpath = new Text();
final Text dstfrom = new Text();
final FsShell shell = new FsShell(conf);
final String[] shellargs = {"-rmr", null};
boolean hasnext = dstin.next(dstpath, dstfrom);
for (; lsrin.next(lsrpath, lsrstatus);) {
int dst_cmp_lsr = dstpath.compareTo(lsrpath);
for (; hasnext && dst_cmp_lsr < 0;) {
hasnext = dstin.next(dstpath, dstfrom);
dst_cmp_lsr = dstpath.compareTo(lsrpath);
}
if (dst_cmp_lsr == 0) {
// lsrpath exists in dst, skip it
hasnext = dstin.next(dstpath, dstfrom);
} else {
// lsrpath does not exist, delete it
String s = new Path(dstroot.getPath(), lsrpath.toString()).toString();
if (shellargs[1] == null || !isAncestorPath(shellargs[1], s)) {
shellargs[1] = s;
int r = 0;
try {
r = shell.run(shellargs);
} catch (Exception e) {
throw new IOException("Exception from shell.", e);
}
if (r != 0) {
throw new IOException("\""
+ shellargs[0] + " " + shellargs[1]
+ "\" returns non-zero value " + r);
}
}
}
}
} finally {
checkAndClose(lsrin);
checkAndClose(dstin);
}
}
// is x an ancestor path of y?
static private boolean isAncestorPath(final String x, final String y) {
if (!y.startsWith(x)) {
return false;
}
final int len = x.length();
return y.length() == len || y.charAt(len) == Path.SEPARATOR_CHAR;
}
/** Check whether the file list have duplication. */
static private void checkDuplication(final FileSystem fs, final Path file,
final Path sorted, final Configuration conf) throws IOException {
SequenceFile.Reader in = null;
try {
SequenceFile.Sorter sorter = new SequenceFile.Sorter(fs,
new Text.Comparator(), Text.class, Text.class, conf);
sorter.sort(file, sorted);
in = new SequenceFile.Reader(fs, sorted, conf);
Text prevdst = null, curdst = new Text();
Text prevsrc = null, cursrc = new Text();
for (; in.next(curdst, cursrc);) {
if (prevdst != null && curdst.equals(prevdst)) {
throw new DuplicationException(
"Invalid input, there are duplicated files in the sources: "
+ prevsrc + ", " + cursrc);
}
prevdst = curdst;
curdst = new Text();
prevsrc = cursrc;
cursrc = new Text();
}
} finally {
checkAndClose(in);
}
}
static boolean checkAndClose(final java.io.Closeable io) {
if (io != null) {
try {
io.close();
} catch (IOException ioe) {
getLogger().warning(StringUtils.stringifyException(ioe));
return false;
}
}
return true;
}
/** An exception class for duplicated source files. */
public static class DuplicationException extends IOException {
private static final long serialVersionUID = 1L;
/** Error code for this exception */
public static final int ERROR_CODE = -2;
DuplicationException(final String message) {
super(message);
}
}
}