/*
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kitesdk.morphline.hadoop.core;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineCompilationException;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.base.AbstractCommand;

import com.typesafe.config.Config;

/**
 * Command for transferring HDFS files, for example to help with centralized configuration file
 * management. On startup, the command downloads zero or more files or directory trees from HDFS to
 * the local file system.
 */
public final class DownloadHdfsFileBuilder implements CommandBuilder {

  @Override
  public Collection<String> getNames() {
    return Collections.singletonList("downloadHdfsFile");
  }

  @Override
  public Command build(Config config, Command parent, Command child, MorphlineContext context) {
    try {
      return new DownloadHdfsFile(this, config, parent, child, context);
    } catch (IOException e) {
      throw new MorphlineCompilationException("Cannot compile", config, e);
    }
  }

  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  private static final class DownloadHdfsFile extends AbstractCommand {

    // global lock; contains successfully copied file paths
    private static final Set<String> DONE = new HashSet<String>();

    public DownloadHdfsFile(CommandBuilder builder, Config config, Command parent, Command child,
        MorphlineContext context) throws IOException {
      super(builder, config, parent, child, context);
      List<String> uris = getConfigs().getStringList(config, "inputFiles",
          Collections.<String>emptyList());
      File dstRootDir = new File(getConfigs().getString(config, "outputDir", "."));
      Configuration conf = new Configuration();
      String defaultFileSystemUri = getConfigs().getString(config, "fs", null);
      if (defaultFileSystemUri != null) {
        FileSystem.setDefaultUri(conf, defaultFileSystemUri); // see Hadoop's GenericOptionsParser
      }
      for (String value : getConfigs().getStringList(config, "conf", Collections.<String>emptyList())) {
        conf.addResource(new Path(value)); // see Hadoop's GenericOptionsParser
      }
      validateArguments();
      download(uris, conf, dstRootDir);
    }

    /*
     * To prevent races, we lock out other commands that delete and write the same local files,
     * and we delete and write any given file only once. This ensures that local file reads only
     * occur after local file writes are completed. For example, this handles N parallel SolrSink
     * clones in the same VM.
     *
     * TODO: consider extending this scheme to add filesystem based locking (advisory) in order to
     * also lock out clones in other JVM processes on the same file system.
     */
    private void download(List<String> uris, Configuration conf, File dstRootDir) throws IOException {
      synchronized (DONE) {
        for (String uri : uris) {
          Path path = new Path(uri);
          File dst = new File(dstRootDir, path.getName()).getCanonicalFile();
          if (!DONE.contains(dst.getPath())) {
            if (dst.isDirectory()) {
              LOG.debug("Deleting dir {}", dst);
              FileUtils.deleteDirectory(dst); // remove any stale local copy before downloading afresh
            }
            FileSystem fs = path.getFileSystem(conf);
            if (fs.isFile(path)) {
              dst.getParentFile().mkdirs(); // ensure the parent dir exists for a plain file copy
            }
            LOG.debug("Downloading {} to {}", uri, dst);
            if (!FileUtil.copy(fs, path, dst, false, conf)) {
              throw new IOException("Cannot download URI " + uri + " to " + dst);
            }
            DONE.add(dst.getPath());
            LOG.debug("Succeeded downloading {} to {}", uri, dst);
          }
        }
      }
    }

  }

}
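/*
 * A minimal usage sketch, not part of the original source: the HOCON snippet below shows how
 * this command might be declared in a morphline configuration file. The config keys
 * (inputFiles, outputDir, fs, conf) are the ones read by the constructor above; the morphline
 * id, paths, and URIs are hypothetical placeholders.
 *
 * morphlines : [
 *   {
 *     id : downloadConfigFiles
 *     importCommands : ["org.kitesdk.**"]
 *     commands : [
 *       {
 *         downloadHdfsFile {
 *           # HDFS files or directory trees to download on startup
 *           inputFiles : ["/user/example/solr/conf"]
 *           # local destination root directory (defaults to ".")
 *           outputDir : "/tmp/solr/conf"
 *           # optional default filesystem URI, analogous to Hadoop's -fs generic option
 *           fs : "hdfs://namenode.example.com:8020"
 *           # optional extra Hadoop configuration resources to load
 *           conf : ["/etc/hadoop/conf/core-site.xml"]
 *         }
 *       }
 *     ]
 *   }
 * ]
 */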