/* * Eoulsan development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public License version 2.1 or * later and CeCILL-C. This should be distributed with the code. * If you do not have a copy, see: * * http://www.gnu.org/licenses/lgpl-2.1.txt * http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt * * Copyright for this code is held jointly by the Genomic platform * of the Institut de Biologie de l'École normale supérieure and * the individual authors. These should be listed in @author doc * comments. * * For more information on the Eoulsan project and its aims, * or to join the Eoulsan Google group, visit the home page * at: * * http://outils.genomique.biologie.ens.fr/eoulsan * */ package fr.ens.biologie.genomique.eoulsan.modules; import java.io.File; import java.io.IOException; import java.nio.file.FileSystems; import java.nio.file.PathMatcher; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import com.google.common.base.Splitter; import fr.ens.biologie.genomique.eoulsan.EoulsanException; import fr.ens.biologie.genomique.eoulsan.Globals; import fr.ens.biologie.genomique.eoulsan.annotations.LocalOnly; import fr.ens.biologie.genomique.eoulsan.annotations.NoLog; import fr.ens.biologie.genomique.eoulsan.annotations.ReuseModuleInstance; import fr.ens.biologie.genomique.eoulsan.core.DataUtils; import fr.ens.biologie.genomique.eoulsan.core.FileNaming; import fr.ens.biologie.genomique.eoulsan.core.FileNamingParsingRuntimeException; import fr.ens.biologie.genomique.eoulsan.core.Modules; import fr.ens.biologie.genomique.eoulsan.core.Naming; import fr.ens.biologie.genomique.eoulsan.core.OutputPorts; import fr.ens.biologie.genomique.eoulsan.core.OutputPortsBuilder; import fr.ens.biologie.genomique.eoulsan.core.Parameter; import fr.ens.biologie.genomique.eoulsan.core.StepConfigurationContext; import fr.ens.biologie.genomique.eoulsan.core.TaskContext; import fr.ens.biologie.genomique.eoulsan.core.TaskResult; import fr.ens.biologie.genomique.eoulsan.core.TaskStatus; import fr.ens.biologie.genomique.eoulsan.core.Version; import fr.ens.biologie.genomique.eoulsan.core.workflow.DataMetadataStorage; import fr.ens.biologie.genomique.eoulsan.data.Data; import fr.ens.biologie.genomique.eoulsan.data.DataFile; import fr.ens.biologie.genomique.eoulsan.data.DataFiles; import fr.ens.biologie.genomique.eoulsan.data.DataFormat; import fr.ens.biologie.genomique.eoulsan.data.DataFormatRegistry; import fr.ens.biologie.genomique.eoulsan.design.Sample; import fr.ens.biologie.genomique.eoulsan.io.CompressionType; /** * This class define a import step. * @since 2.0 * @author Laurent Jourdren */ @LocalOnly @ReuseModuleInstance @NoLog public class ImportModule extends AbstractModule { public static final String MODULE_NAME = "import"; private static final Splitter SPACE_SPLITTER = Splitter.on(' ').trimResults().omitEmptyStrings(); private Set<DataFile> files; private OutputPorts outputPorts; @Override public String getName() { return MODULE_NAME; } @Override public Version getVersion() { return Globals.APP_VERSION; } @Override public OutputPorts getOutputPorts() { return this.outputPorts; } @Override public void configure(final StepConfigurationContext context, final Set<Parameter> stepParameters) throws EoulsanException { DataFile baseDir = new DataFile(new File(".")); String pattern = ""; // Parse parameters for (Parameter p : stepParameters) { switch (p.getName()) { case "files": pattern = p.getStringValue(); break; case "directory": if (p.getStringValue().length() > 0) { baseDir = new DataFile(p.getStringValue()); } break; default: Modules.unknownParameter(context, p); } } // Set the output ports try { // Check if base directory exists if (!(baseDir.exists() && baseDir.getMetaData().isDir())) { Modules.invalidConfiguration(context, "The directory does not exists: " + baseDir); } // Get the list of the files to import this.files = listFilesFromPatterns(baseDir, pattern); // Check if some files has been found if (this.files.isEmpty()) { Modules.invalidConfiguration(context, "No input file found in the " + getName() + " step"); } // Get the format and the compression of the files final Map<DataFormat, CompressionType> formats = listDataFormatFromFileList(this.files); // Create the output ports final OutputPortsBuilder builder = new OutputPortsBuilder(); int count = 0; for (Map.Entry<DataFormat, CompressionType> e : formats.entrySet()) { builder.addPort("output" + (++count), true, e.getKey(), e.getValue()); } this.outputPorts = builder.create(); } catch (IOException e) { throw new EoulsanException(e); } } @Override public TaskResult execute(final TaskContext context, final TaskStatus status) { final DataFormatRegistry registry = DataFormatRegistry.getInstance(); // Create a map with the samples final Map<String, Sample> samples = new HashMap<>(); for (Sample sample : context.getWorkflow().getDesign().getSamples()) { samples.put(Naming.toValidName(sample.getId()), sample); } try { // Sort the list of files to process final List<DataFile> sortedFiles = new ArrayList<>(this.files); Collections.sort(sortedFiles); // Group files related to the same data final Set<List<DataFile>> groupedFiles = groupFiles(registry, files); // For each data for (List<DataFile> inputFiles : groupedFiles) { Data data = null; // For each files of the data for (DataFile inputFile : inputFiles) { final DataFormat format = fileFormat(registry, inputFile); final FileNaming fileNaming = fileNaming(inputFile); // Define the data object if (data == null) { // If file use the Eoulsan naming if (fileNaming != null) { data = context.getOutputData(format, format.getPrefix()) .addDataToList(fileNaming.getDataName(), fileNaming.getPart()); // Set metadata of imported files final boolean isMetadataSet = DataMetadataStorage.getInstance() .loadMetadata(data, Collections.singletonList(inputFile)); // Set the metadata from sample metadata if (!isMetadataSet && samples.containsKey(data.getName())) { DataUtils.setDataMetaData(data, samples.get(data.getName())); } } // If file does not use Eoulsan naming else { // Define the data name final String dataName = Naming.toValidName(inputFile.getBasename()); data = context.getOutputData(format, format.getPrefix()) .addDataToList(dataName); } } DataFile outputFile; if (format.getMaxFilesCount() > 1) { if (fileNaming != null) { outputFile = data.getDataFile(fileNaming.getFileIndex()); } else { outputFile = data.getDataFile(0); } } else { outputFile = data.getDataFile(); } // Copy or create symbolic link DataFiles.symlinkOrCopy(inputFile, outputFile, true); } // Set the metadata for the data DataMetadataStorage.getInstance().loadMetadata(data, inputFiles); } } catch (EoulsanException | IOException e) { return status.createTaskResult(e); } return status.createTaskResult(); } // // Other methods // /** * Build collection of PathMatcher for selection files to tread according to a * pattern file define in test configuration. Patterns set in string with * space to separator. Get input and output patterns files. * @param patterns sequences of patterns filesList. * @return a set of PathMatcher, one per pattern. */ private static Set<PathMatcher> createPathMatchers(final String patterns) { // No pattern defined if (patterns == null || patterns.trim().isEmpty()) { return Collections.emptySet(); } // Initialize collection final Set<PathMatcher> result = new HashSet<>(); // Parse patterns for (final String globSyntax : SPACE_SPLITTER.split(patterns)) { // Convert in syntax reading by Java final PathMatcher matcher = FileSystems.getDefault().getPathMatcher("glob:" + globSyntax); // Add in list patterns files to treat result.add(matcher); } // Return unmodifiable collection return Collections.unmodifiableSet(result); } /** * Listing recursively all files in the source directory which match with * patterns files * @param patternKey the pattern key * @return the list with all files which match with pattern * @throws IOException if an error occurs while parsing input directory * @throws EoulsanException if no file to compare found */ private Set<DataFile> listFilesFromPatterns(final DataFile directory, final String patternKey) throws IOException, EoulsanException { final Set<PathMatcher> fileMatchers = createPathMatchers(patternKey); final Set<DataFile> files = listFilesFromPatterns(directory, fileMatchers); // Return unmodifiable list return Collections.unmodifiableSet(files); } /** * Create list files matching to the patterns * @param patterns set of pattern to filter file in result directory * @return unmodifiable list of files or empty list * @throws IOException */ private Set<DataFile> listFilesFromPatterns(final DataFile directory, final Set<PathMatcher> patterns) throws IOException { final Set<DataFile> filesFound = new HashSet<>(); final List<DataFile> files = directory.list(); for (final PathMatcher matcher : patterns) { for (DataFile f : files) { if (matcher.matches(new File(f.getName()).toPath())) { filesFound.add(f); } } } return Collections.unmodifiableSet(filesFound); } /** * Get the format and compression of a list of files. * @param files the list of file * @return a map with for each format the common compression of the files * @throws EoulsanException if format of a file cannot be determined */ private static Map<DataFormat, CompressionType> listDataFormatFromFileList( final Set<DataFile> files) throws EoulsanException { if (files == null) { return Collections.emptyMap(); } final Map<DataFormat, CompressionType> result = new HashMap<>(); final DataFormatRegistry registry = DataFormatRegistry.getInstance(); for (DataFile file : files) { final DataFormat format = fileFormat(registry, file); final CompressionType compression = file.getCompressionType(); final CompressionType previous = result.get(format); if (previous == null || previous == CompressionType.NONE) { result.put(format, compression); } } return Collections.unmodifiableMap(result); } /** * Get the format of a file. * @param registry the format registry * @param file the file which name must be parsed * @return the DataFormat of the file * @throws EoulsanException if no format or several format for the file has * been found */ private static DataFormat fileFormat(final DataFormatRegistry registry, final DataFile file) throws EoulsanException { try { // First try to get format of file if file name use Eoulsan file naming FileNaming name = FileNaming.parse(file); return name.getFormat(); } catch (FileNamingParsingRuntimeException e) { // If not work, try to get the format of the file from its file extension final String extension = file.getExtension(); final Set<DataFormat> formats = registry.getDataFormatsFromExtension(extension); if (formats.isEmpty()) { throw new EoulsanException("No format found for file: " + file); } if (formats.size() > 1) { throw new EoulsanException( "More than one format found for file: " + file); } return formats.iterator().next(); } } /** * Get the FileNaming related to a file if can be created. * @param file file which name must be parsed * @return a FileNaming object or null, if the file name cannot be parsed */ private static FileNaming fileNaming(final DataFile file) { try { return FileNaming.parse(file); } catch (FileNamingParsingRuntimeException e) { return null; } } /** * Group file by data. * @param registry format registry * @param files files to process */ private static Set<List<DataFile>> groupFiles( final DataFormatRegistry registry, final Set<DataFile> files) { final Set<List<DataFile>> result = new HashSet<>(); List<DataFile> group = new ArrayList<>(); DataFile previous = null; // Sort files List<DataFile> sortedFiles = new ArrayList<>(); sortedFiles.addAll(files); Collections.sort(sortedFiles); // For each files for (DataFile file : sortedFiles) { // Test if current file is related to the same data than the previous data if (group.isEmpty() || FileNaming.dataEquals(previous, file)) { group.add(file); } else { result.add(group); group = new ArrayList<>(); group.add(file); } previous = file; } if (!group.isEmpty()) { result.add(group); } return Collections.unmodifiableSet(result); } }