/*
 *                  Eoulsan development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public License version 2.1 or
 * later and CeCILL-C. This should be distributed with the code.
 * If you do not have a copy, see:
 *
 *      http://www.gnu.org/licenses/lgpl-2.1.txt
 *      http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
 *
 * Copyright for this code is held jointly by the Genomic platform
 * of the Institut de Biologie de l'École normale supérieure and
 * the individual authors. These should be listed in @author doc
 * comments.
 *
 * For more information on the Eoulsan project and its aims,
 * or to join the Eoulsan Google group, visit the home page
 * at:
 *
 *      http://outils.genomique.biologie.ens.fr/eoulsan
 *
 */

package fr.ens.biologie.genomique.eoulsan.modules.mgmt.upload;

import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import fr.ens.biologie.genomique.eoulsan.CommonHadoop;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.EoulsanRuntime;
import fr.ens.biologie.genomique.eoulsan.Globals;
import fr.ens.biologie.genomique.eoulsan.annotations.HadoopOnly;
import fr.ens.biologie.genomique.eoulsan.core.ContextUtils;
import fr.ens.biologie.genomique.eoulsan.core.Parameter;
import fr.ens.biologie.genomique.eoulsan.core.StepConfigurationContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskResult;
import fr.ens.biologie.genomique.eoulsan.core.TaskStatus;
import fr.ens.biologie.genomique.eoulsan.core.Version;
import fr.ens.biologie.genomique.eoulsan.core.workflow.StepOutputDataFile;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
import fr.ens.biologie.genomique.eoulsan.data.DataFormatConverter;
import fr.ens.biologie.genomique.eoulsan.io.CompressionType;
import fr.ens.biologie.genomique.eoulsan.modules.AbstractModule;
import fr.ens.biologie.genomique.eoulsan.modules.mgmt.hadoop.DistCp;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.PathUtils;

/**
 * This class defines a download module that retrieves data from HDFS at the
 * end of an analysis.
 * @since 1.0
 * @author Laurent Jourdren
 */
@HadoopOnly
public class HDFSDataDownloadModule extends AbstractModule {

  /**
   * Key in the settings to use to save the list of DataFormat of the files to
   * download.
   */
  public static final String DATAFORMATS_TO_DOWNLOAD_SETTING =
      "dataformat.to.download";

  /**
   * Key in the settings to use to disable the downloads.
   */
  public static final String NO_HDFS_DOWNLOAD = "no.hdfs.download";
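
  // Illustrative usage (assuming the usual key=value syntax of the Eoulsan
  // settings file): the download step can be skipped by adding
  //
  //   no.hdfs.download=true
  //
  // which is checked at the beginning of execute() below.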

  /** Module name. */
  public static final String MODULE_NAME = "_download";

  private Configuration conf;

  @Override
  public String getName() {

    return MODULE_NAME;
  }

  @Override
  public String getDescription() {

    return "Download output data from HDFS filesystem";
  }

  @Override
  public Version getVersion() {

    return Globals.APP_VERSION;
  }

  @Override
  public void configure(final StepConfigurationContext context,
      final Set<Parameter> stepParameters) throws EoulsanException {

    this.conf = CommonHadoop.createConfiguration(EoulsanRuntime.getSettings());
  }

  @Override
  public TaskResult execute(final TaskContext context,
      final TaskStatus status) {

    // Skip the step if the global parameter NO_HDFS_DOWNLOAD is set
    final String noDownloadValue =
        context.getSettings().getSetting(NO_HDFS_DOWNLOAD);
    if (noDownloadValue != null
        && "true".equals(noDownloadValue.trim().toLowerCase())) {

      status.setProgressMessage("Download step skipped in settings.");
      return status.createTaskResult();
    }

    final String hadoopWorkingPathname =
        ContextUtils.getHadoopWorkingDirectory(context).getSource();

    getLogger().info("Start copying results.");
    getLogger().info("inpath="
        + hadoopWorkingPathname + "\toutpath=" + context.getOutputDirectory());

    final Configuration conf = this.conf;

    if (hadoopWorkingPathname == null) {
      throw new NullPointerException("The input path is null");
    }

    if (context.getOutputDirectory() == null) {
      throw new NullPointerException("The output path is null");
    }

    // Set the output directory
    final DataFile outputDir = context.getOutputDirectory();

    try {

      final Path inPath = new Path(hadoopWorkingPathname);

      if (!PathUtils.isExistingDirectoryFile(inPath, conf)) {
        throw new EoulsanException(
            "The base directory is not a directory: " + inPath);
      }

      // Map with the files to download
      final Map<DataFile, DataFile> files = new HashMap<>();

      // Get the output files of the workflow
      // (the lookup is currently commented out, so an empty set is used)
      final Set<StepOutputDataFile> outFiles =
          // context.getWorkflow().getWorkflowFilesAtFirstStep().getOutputFiles();
          new HashSet<>();

      // Add the output files of the workflow to the list of files to download
      for (StepOutputDataFile file : outFiles) {

        final DataFile in = file.getDataFile();
        final DataFile out = new DataFile(outputDir,
            in.getName() + CompressionType.BZIP2.getExtension());
        files.put(in, out);
      }

      // If there is no file to copy, do nothing
      if (files.size() > 0) {

        if (outputDir.isLocalFile()) {

          // Local FileSystem output
          for (Map.Entry<DataFile, DataFile> e : files.entrySet()) {

            // Copy the file
            getLogger().info("Copy " + e.getKey() + " to " + e.getValue());
            new DataFormatConverter(e.getKey(), e.getValue()).convert();
          }

        } else {

          // Use distributed copy if the output is not on the local FileSystem
          final Map<DataFile, DataFile> filesToTranscode = new HashMap<>();
          final Map<DataFile, DataFile> filesToDistCp = new HashMap<>();

          // Test if a temporary file is needed
          for (Map.Entry<DataFile, DataFile> e : files.entrySet()) {

            final DataFile src = e.getKey();
            final DataFile dest = e.getValue();

            if (src.getName().equals(dest.getName())) {
              filesToDistCp.put(src, dest);
            } else {
              final DataFile tmp =
                  new DataFile(src.getParent(), dest.getName());
              filesToTranscode.put(src, tmp);
              filesToDistCp.put(tmp, dest);
            }
          }

          // Create temporary files
          final Path jobPath = PathUtils.createTempPath(
              new Path(hadoopWorkingPathname), "distcp-", "", this.conf);

          final DataFileDistCp dsdcp = new DataFileDistCp(this.conf, jobPath);
          dsdcp.copy(filesToTranscode);

          // Remove the job path directory
          final FileSystem fs = jobPath.getFileSystem(conf);
          if (!fs.delete(jobPath, true)) {
            getLogger()
                .warning("Cannot remove DataFileDistCp job path: " + jobPath);
          }
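
          // Note: files that keep their name are dist-copied directly from
          // HDFS, while files that needed a name change (e.g. a new
          // compression extension) were first rewritten by DataFileDistCp to
          // temporary files next to their sources and are dist-copied from
          // there.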

          // Copy files to destination
          hadoopDistCp(conf, filesToDistCp);
        }
      }

      final StringBuilder logMsg = new StringBuilder();
      for (Map.Entry<DataFile, DataFile> e : files.entrySet()) {
        logMsg.append("Copy ");
        logMsg.append(e.getKey());
        logMsg.append(" to ");
        logMsg.append(e.getValue());
        logMsg.append('\n');
      }

      status.setProgressMessage(logMsg.toString());
      return status.createTaskResult();

    } catch (EoulsanException e) {
      return status.createTaskResult(e,
          "Error while downloading results: " + e.getMessage());
    } catch (IOException e) {
      return status.createTaskResult(e,
          "Error while downloading results: " + e.getMessage());
    }
  }

  /**
   * Copy files using Hadoop DistCp.
   * @param conf Hadoop configuration
   * @param files files to copy
   * @throws EoulsanException if an error occurs while copying
   */
  private void hadoopDistCp(final Configuration conf,
      final Map<DataFile, DataFile> files) throws EoulsanException {

    final DistCp distcp = new DistCp(conf);
    final Map<DataFile, Set<DataFile>> toCopy = new HashMap<>();

    // Create a map of the files to copy with the destination directory as key
    for (Map.Entry<DataFile, DataFile> e : files.entrySet()) {

      final DataFile destDir;
      try {
        destDir = e.getValue().getParent();
      } catch (IOException exp) {
        throw new EoulsanException(exp.getMessage(), exp);
      }

      if (destDir == null) {
        throw new EoulsanException("Destination directory is null.");
      }

      final Set<DataFile> inputFiles;
      if (toCopy.containsKey(destDir)) {
        inputFiles = toCopy.get(destDir);
      } else {
        inputFiles = new HashSet<>();
        toCopy.put(destDir, inputFiles);
      }

      inputFiles.add(e.getKey());
    }

    // For each destination, run distcp
    for (Map.Entry<DataFile, Set<DataFile>> e : toCopy.entrySet()) {

      final List<String> argsList = new ArrayList<>();

      // Add the input files
      for (DataFile f : e.getValue()) {
        argsList.add(f.toString());
      }

      // Add the destination
      argsList.add(e.getKey().toString());

      // Convert the arguments into an array
      final String[] args = argsList.toArray(new String[argsList.size()]);

      // Run distcp
      distcp.runWithException(args);
    }
  }

}