package fr.ens.biologie.genomique.eoulsan.modules.generators; import static com.google.common.base.Preconditions.checkNotNull; import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.ANNOTATION_GFF; import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.ANNOTATION_GTF; import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.GENOME_DESC_TXT; import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.GENOME_FASTA; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.math.BigInteger; import java.nio.file.Files; import java.security.DigestInputStream; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import fr.ens.biologie.genomique.eoulsan.Common; import fr.ens.biologie.genomique.eoulsan.EoulsanException; import fr.ens.biologie.genomique.eoulsan.Globals; import fr.ens.biologie.genomique.eoulsan.annotations.Generator; import fr.ens.biologie.genomique.eoulsan.annotations.LocalOnly; import fr.ens.biologie.genomique.eoulsan.bio.readsmappers.STARReadsMapper; import fr.ens.biologie.genomique.eoulsan.bio.readsmappers.SequenceReadsMapper; import fr.ens.biologie.genomique.eoulsan.core.InputPorts; import fr.ens.biologie.genomique.eoulsan.core.InputPortsBuilder; import fr.ens.biologie.genomique.eoulsan.core.Modules; import fr.ens.biologie.genomique.eoulsan.core.OutputPorts; import fr.ens.biologie.genomique.eoulsan.core.OutputPortsBuilder; import fr.ens.biologie.genomique.eoulsan.core.Parameter; import fr.ens.biologie.genomique.eoulsan.core.Step; import fr.ens.biologie.genomique.eoulsan.core.StepConfigurationContext; import fr.ens.biologie.genomique.eoulsan.core.TaskContext; import fr.ens.biologie.genomique.eoulsan.core.TaskResult; import fr.ens.biologie.genomique.eoulsan.core.TaskStatus; import fr.ens.biologie.genomique.eoulsan.core.Version; import fr.ens.biologie.genomique.eoulsan.data.Data; import fr.ens.biologie.genomique.eoulsan.data.DataFile; import fr.ens.biologie.genomique.eoulsan.data.DataFiles; import fr.ens.biologie.genomique.eoulsan.data.protocols.DataProtocol; import fr.ens.biologie.genomique.eoulsan.data.protocols.StorageDataProtocol; import fr.ens.biologie.genomique.eoulsan.io.CompressionType; import fr.ens.biologie.genomique.eoulsan.modules.AbstractModule; import fr.ens.biologie.genomique.eoulsan.modules.expression.AbstractExpressionModule; /** * This class define a module that generate a STAR mapper index. * @since 2.0 * @author Laurent Jourdren */ @LocalOnly @Generator public class STARIndexGeneratorModule extends AbstractModule { public static final String MODULE_NAME = "starindexgenerator"; private static final int OVERHANG_DEFAULT = 100; private final SequenceReadsMapper mapper = new STARReadsMapper(); private Integer overhang = OVERHANG_DEFAULT; private boolean gtfFile; private boolean gtfFormat; private String chrStartEndFilename; private String gtfFeatureExon; private String gtfTagExonParentTranscript; private Integer genomeSAindexNbases; private Integer genomeChrBinNbits; private boolean useExpressionStepParameters; private int localThreads; private int maxLocalThreads; @Override public String getName() { return MODULE_NAME; } @Override public String getDescription() { return "Generate Mapper index"; } @Override public Version getVersion() { return Globals.APP_VERSION; } @Override public InputPorts getInputPorts() { final InputPortsBuilder builder = new InputPortsBuilder(); builder.addPort("genome", GENOME_FASTA).addPort("genomedescription", GENOME_DESC_TXT); if (this.gtfFile) { builder.addPort("annotation", this.gtfFormat ? ANNOTATION_GTF : ANNOTATION_GFF); } return builder.create(); } @Override public OutputPorts getOutputPorts() { return OutputPortsBuilder.singleOutputPort(this.mapper.getArchiveFormat()); } @Override public void configure(final StepConfigurationContext context, final Set<Parameter> stepParameters) throws EoulsanException { if (stepParameters == null) { throw new EoulsanException( "No parameters set in " + getName() + " generator"); } for (Parameter p : stepParameters) { switch (p.getName()) { case "overhang": this.overhang = p.getIntValueGreaterOrEqualsTo(1); break; case "gtf.file": Modules.renamedParameter(context, p, "use.gtf.file"); case "use.gtf.file": this.gtfFile = p.getBooleanValue(); break; case "file.chr.start.end": this.chrStartEndFilename = p.getStringValue(); break; case "gtf.feature.exon": this.gtfFeatureExon = p.getStringValue(); break; case "gtf.tag.exon.parent.transcript": this.gtfTagExonParentTranscript = p.getStringValue(); break; case "genome.sa.index.nbases": this.genomeSAindexNbases = p.getIntValueGreaterOrEqualsTo(0); break; case "genome.chr.bin.nbits": this.genomeChrBinNbits = p.getIntValueGreaterOrEqualsTo(0); break; case "use.expression.step.parameters": this.useExpressionStepParameters = p.getBooleanValue(); break; case "local.threads": this.localThreads = p.getIntValueGreaterOrEqualsTo(1); break; case "max.local.threads": this.maxLocalThreads = p.getIntValueGreaterOrEqualsTo(1); break; case "features.file.format": switch (p.getLowerStringValue()) { case "gtf": this.gtfFormat = true; break; case "gff": case "gff3": this.gtfFormat = false; break; default: Modules.badParameterValue(context, p, "Unknown annotation file format"); break; } break; default: throw new EoulsanException( "Unknown parameter for " + getName() + " step: " + p.getName()); } } } /** * Set the "gtf.feature.exon" and "gtf.tag.exon.parent.transcript" parameter * from the expression step parameters. * @param context the context of the task * @throws EoulsanException if more than one expression step exists */ private void searchExpressionStepParameters(final TaskContext context) throws EoulsanException { int count = 0; for (Step step : context.getWorkflow().getSteps()) { if (AbstractExpressionModule.MODULE_NAME.equals(step.getModuleName())) { for (Parameter p : step.getParameters()) { switch (p.getName()) { case AbstractExpressionModule.OLD_GENOMIC_TYPE_PARAMETER_NAME: case AbstractExpressionModule.GENOMIC_TYPE_PARAMETER_NAME: gtfFeatureExon = p.getStringValue(); break; case AbstractExpressionModule.OLD_ATTRIBUTE_ID_PARAMETER_NAME: case AbstractExpressionModule.ATTRIBUTE_ID_PARAMETER_NAME: gtfTagExonParentTranscript = p.getStringValue(); break; default: break; } } count++; } } if (count == 0) { throw new EoulsanException("No expression step found in the workflow"); } if (count > 1) { throw new EoulsanException( "Found more than one expression step in the workflow"); } } @Override public TaskResult execute(final TaskContext context, final TaskStatus status) { try { final StringBuilder additionalArguments = new StringBuilder(); final Map<String, String> additionalDescription = new HashMap<>(); final List<File> temporaryFiles = new ArrayList<>(); // Search expression parameter is needed if (this.useExpressionStepParameters) { searchExpressionStepParameters(context); } if (this.gtfFile) { // Get the annotation data final Data annotationData = context .getInputData(this.gtfFormat ? ANNOTATION_GTF : ANNOTATION_GFF); // Get the annotation DataFile final DataFile gffFile = annotationData.getDataFile(); final File gffFilePath = uncompressFileIfNecessary(context, temporaryFiles, gffFile); additionalArguments.append("--sjdbGTFfile"); additionalArguments.append(' '); additionalArguments.append(gffFilePath.getAbsolutePath()); additionalArguments.append(' '); additionalDescription.put("sjdbGTFfile", computeMD5SumFile(gffFilePath)); } if (this.overhang != null) { additionalArguments.append("--sjdbOverhang"); additionalArguments.append(' '); additionalArguments.append(this.overhang.toString()); additionalArguments.append(' '); additionalDescription.put("sjdbOverhang", this.overhang.toString()); } if (this.gtfTagExonParentTranscript != null) { additionalArguments.append("--sjdbGTFtagExonParentTranscript"); additionalArguments.append(' '); additionalArguments.append(this.gtfTagExonParentTranscript); additionalArguments.append(' '); additionalDescription.put("sjdbGTFtagExonParentTranscript", this.gtfTagExonParentTranscript); } if (this.gtfFeatureExon != null) { additionalArguments.append("--sjdbGTFfeatureExon"); additionalArguments.append(' '); additionalArguments.append(this.gtfFeatureExon); additionalArguments.append(' '); additionalDescription.put("sjdbGTFfeatureExon", this.gtfFeatureExon); } if (this.chrStartEndFilename != null) { DataFile chrStartEndFile = new DataFile(this.chrStartEndFilename); if (!chrStartEndFile.exists()) { throw new IOException( "Unable to read chromosome startend file: " + chrStartEndFile); } final File chrStartEndFilePath = uncompressFileIfNecessary(context, temporaryFiles, chrStartEndFile); additionalArguments.append("--sjdbFileChrStartEnd"); additionalArguments.append(' '); additionalArguments.append(chrStartEndFilePath.getAbsolutePath()); additionalArguments.append(' '); additionalDescription.put("sjdbFileChrStartEnd", computeMD5SumFile(chrStartEndFilePath)); } if (this.genomeSAindexNbases != null) { additionalArguments.append("--genomeSAindexNbases"); additionalArguments.append(' '); additionalArguments.append(this.genomeSAindexNbases.toString()); additionalArguments.append(' '); additionalDescription.put("genomeSAindexNbases", this.genomeSAindexNbases.toString()); } if (this.genomeChrBinNbits != null) { additionalArguments.append("--genomeChrBinNbits"); additionalArguments.append(' '); additionalArguments.append(this.genomeChrBinNbits.toString()); additionalArguments.append(' '); additionalDescription.put("genomeChrBinNbits", this.genomeChrBinNbits.toString()); } status .setProgressMessage(this.mapper.getMapperName() + " index creation"); // Create the index GenomeMapperIndexGeneratorModule.execute(this.mapper, context, additionalArguments.toString(), additionalDescription, Common.getThreadsNumber(this.localThreads, this.maxLocalThreads)); // Remove temporary files for (File temporaryFile : temporaryFiles) { if (!temporaryFile.delete()) { context.getLogger() .warning("Cannot remove temporary file: " + temporaryFile); } } } catch (IOException | EoulsanException e) { return status.createTaskResult(e); } return status.createTaskResult(); } // // Other methods // /** * Uncompress a file if compressed. * @param context the step context * @param temporaryFiles the list of temporary files * @param file the file to process * @return the absolute path of the file (once uncompressed or not) * @throws IOException if an error occurs while uncompressing the file */ private File uncompressFileIfNecessary(final TaskContext context, List<File> temporaryFiles, final DataFile file) throws IOException { checkNotNull(file, "file argument cannot be null"); final File result; if (file.getCompressionType() != CompressionType.NONE || !file.isLocalFile()) { // Uncompress file final File uncompressedFile = uncompressFile(context, file); // Add the temporary file to the file of the file to remove temporaryFiles.add(uncompressedFile); result = uncompressedFile; } else { result = file.toFile(); } return result; } /** * Uncompress a file to a temporary file. * @param context Step context * @param file file to uncompress * @return the path to the uncompressed file * @throws IOException if an error occurs while creating the uncompressed file */ private File uncompressFile(final TaskContext context, final DataFile file) throws IOException { checkNotNull(file, "file argument cannot be null"); final DataFile realFile; final DataProtocol protocol = file.getProtocol(); // Get the underlying file if the file protocol is a storage protocol if (protocol instanceof StorageDataProtocol) { realFile = ((StorageDataProtocol) protocol).getUnderLyingData(file); } else { realFile = file; } final File outputFile = Files.createTempFile(context.getLocalTempDirectory().toPath(), MODULE_NAME + "-", realFile.getExtension()).toFile(); context.getLogger() .fine("Uncompress/copy " + realFile + " to " + outputFile); DataFiles.copy(realFile, new DataFile(outputFile)); return outputFile; } /** * Compute the md5 sum of a file. * @param file the file * @return a string with the md5sum of a file * @throws IOException if an error occurs while computing the md5sum */ private static String computeMD5SumFile(File file) throws IOException { MessageDigest md5Digest; try { md5Digest = MessageDigest.getInstance("MD5"); } catch (NoSuchAlgorithmException e) { throw new IOException(e); } try (InputStream is = new FileInputStream(file)) { new DigestInputStream(is, md5Digest); } return new BigInteger(1, md5Digest.digest()).toString(16); } }