/*
* Eoulsan development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public License version 2.1 or
* later and CeCILL-C. This should be distributed with the code.
* If you do not have a copy, see:
*
* http://www.gnu.org/licenses/lgpl-2.1.txt
* http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
*
* Copyright for this code is held jointly by the Genomic platform
* of the Institut de Biologie de l'École normale supérieure and
* the individual authors. These should be listed in @author doc
* comments.
*
* For more information on the Eoulsan project and its aims,
* or to join the Eoulsan Google group, visit the home page
* at:
*
* http://outils.genomique.biologie.ens.fr/eoulsan
*
*/
package fr.ens.biologie.genomique.eoulsan.modules.diffana;
import static com.google.common.base.Preconditions.checkNotNull;
import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
import static fr.ens.biologie.genomique.eoulsan.util.StringUtils.toCompactTime;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.Globals;
import fr.ens.biologie.genomique.eoulsan.core.TaskContext;
import fr.ens.biologie.genomique.eoulsan.data.Data;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
import fr.ens.biologie.genomique.eoulsan.data.DataFormat;
import fr.ens.biologie.genomique.eoulsan.data.DataFormats;
import fr.ens.biologie.genomique.eoulsan.design.Design;
import fr.ens.biologie.genomique.eoulsan.design.DesignUtils;
import fr.ens.biologie.genomique.eoulsan.design.Experiment;
import fr.ens.biologie.genomique.eoulsan.design.Sample;
import fr.ens.biologie.genomique.eoulsan.util.FileUtils;
import fr.ens.biologie.genomique.eoulsan.util.r.RExecutor;
import fr.ens.biologie.genomique.eoulsan.util.r.RSConnection;
/**
* This class create and launch an R script to compute normalisation of
* expression data
* @since 1.2
* @author Vivien Deshaies
*/
public class Normalization {
protected static final String TARGET_CREATION = "/DESeq1/targetCreation.Rnw";
protected static final String NORMALIZATION_FUNCTIONS =
"/DESeq1/normalization_anaDiff_RNAseq_Functions.R";
private static final String NORMALISATION_PART1_WHITH_TECHREP =
"/DESeq1/normalisationPart1WithTechRep.Rnw";
private static final String NORMALIZATION_PART1_WHITHOUT_TECHREP =
"/DESeq1/normalisationPart1WithoutTechRep.Rnw";
private static final String CLUSTERING_PCA_RAW =
"/DESeq1/clusteringAndPCARaw.Rnw";
private static final String CLUSTERING_PCA_NORM =
"/DESeq1/clusteringAndPCANorm.Rnw";
private static final String NORMALIZATION_PART2 =
"/DESeq1/normalizationPart2.Rnw";
protected final Design design;
protected final String expressionFilesPrefix;
protected final String expressionFilesSuffix;
protected RSConnection rConnection = null;
protected final RExecutor executor;
//
// Run methods
//
/**
* Run normalisation step
* @throws EoulsanException
*/
public void run(final TaskContext context, final Data data)
throws EoulsanException {
// Check if there more than one file to launch the analysis
if (data.size() < 2) {
throw new EoulsanException(
"Cannot run the analysis with less than 2 input files");
}
runRExecutor(context, data);
}
/**
* Execute Rnw script.
* @param context Step context
* @param data data to process
* @throws EoulsanException if an error occurs while executing the script
*/
protected void runRExecutor(final TaskContext context, final Data data)
throws EoulsanException {
final boolean saveRScript = context.getSettings().isSaveRscripts();
final DataFile workflowOutputDir = context.getOutputDirectory();
try {
// create an iterator on the map values
for (Experiment experiment : this.design.getExperiments()) {
// Skip experiment if required in design
if (DesignUtils.isSkipped(experiment)) {
continue;
}
getLogger().info("Experiment : " + experiment.getName());
// Open executor connection
executor.openConnection();
// Put input input files
for (Data d : data.getListElements()) {
final int sampleId = d.getMetadata().getSampleNumber();
// Check if the sample ID exists
if (sampleId == -1) {
throw new EoulsanException(
"No sample Id found for input file: " + d.getDataFile());
}
final String linkFilename = this.expressionFilesPrefix
+ sampleId + this.expressionFilesSuffix;
executor.putInputFile(d.getDataFile(), linkFilename);
}
// Generate the R script
final String rScript = generateScript(experiment, context);
// Set the description of the analysis
final String description = context.getCurrentStep().getId()
+ '_' + experiment.getId() + '-'
+ toCompactTime(System.currentTimeMillis());
// Set the Sweave output
final String sweaveOutput = context.getCurrentStep().getId()
+ '_' + experiment.getId() + ".tex";
// Execute the R script
executor.executeRScript(rScript, true, sweaveOutput, saveRScript,
description, workflowOutputDir);
// Remove input files
executor.removeInputFiles();
// Retrieve output files
executor.getOutputFiles();
// Close executor connection
executor.closeConnection();
}
} catch (IOException e) {
throw new EoulsanException(
"Error while running differential analysis: " + e.getMessage(), e);
}
}
//
// Getters
//
/**
* Test if there is Technical replicates into rRepTechGroup field.
* @param rRepTechGroup list of the technical replicate group
*/
protected boolean isTechnicalReplicates(final List<String> rRepTechGroup) {
Map<String, String> rtgMap = new HashMap<>();
for (String repTechGroup : rRepTechGroup) {
if (rtgMap.containsKey(repTechGroup)) {
return true;
}
rtgMap.put(repTechGroup, "");
}
return false;
}
//
// R code generation methods
//
/**
* Read a static part of the generated script.
* @param staticFile the name of a file containing a part of the script
* @return A String with the static part of the script
* @throws EoulsanException if an error occurs while reading the script
*/
protected String readStaticScript(final String staticFile)
throws EoulsanException {
final StringBuilder sb = new StringBuilder();
final InputStream is = DiffAna.class.getResourceAsStream(staticFile);
try {
final BufferedReader br = FileUtils.createBufferedReader(is);
String line;
while ((line = br.readLine()) != null) {
sb.append(line);
sb.append('\n');
}
} catch (IOException e) {
throw new EoulsanException("Error while reading a file" + e.getMessage());
}
return sb.toString();
}
/**
* Generate the R script.
* @param experiment the experiment
* @param context step context
* @return String rScript R script to execute
* @throws EoulsanException if an error occurs while generate the R script
*/
protected String generateScript(final Experiment experiment,
final TaskContext context) throws EoulsanException {
final Map<String, List<Integer>> conditionsMap = new HashMap<>();
final List<Integer> rSampleIds = new ArrayList<>();
final List<String> rSampleNames = new ArrayList<>();
final List<String> rCondNames = new ArrayList<>();
List<String> rRepTechGroup = new ArrayList<>();
int i = 0;
// Get samples ids, conditions names/indexes and repTechGoups
for (Sample s : experiment.getSamples()) {
final String condition = DesignUtils.getCondition(experiment, s);
if (condition == null) {
throw new EoulsanException("No condition field found in design file.");
}
if ("".equals(condition)) {
throw new EoulsanException("No value for condition in sample: "
+ s.getName() + " (" + s.getId() + ")");
}
final String repTechGroup = DesignUtils.getRepTechGroup(experiment, s);
if (repTechGroup != null && !"".equals(repTechGroup)) {
rRepTechGroup.add(repTechGroup);
}
final List<Integer> index;
if (!conditionsMap.containsKey(condition)) {
index = new ArrayList<>();
conditionsMap.put(condition, index);
} else {
index = conditionsMap.get(condition);
}
index.add(i);
rSampleIds.add(s.getNumber());
rSampleNames.add(s.getName());
rCondNames.add(condition);
i++;
}
checkRepTechGroupCoherence(rRepTechGroup, rCondNames);
// Create Rnw script stringbuilder with preamble
String pdfTitle = escapeUnderScore(experiment.getName()) + " normalisation";
String filePrefix =
"normalization_" + escapeUnderScore(experiment.getName());
final StringBuilder sb =
generateRnwpreamble(experiment.getSamples(), pdfTitle, filePrefix);
/*
* Replace "na" values of repTechGroup by unique sample ids to avoid pooling
* problem while executing R script
*/
replaceRtgNA(rRepTechGroup, rSampleNames);
// Add sampleNames vector
generateSampleNamePart(rSampleNames, sb);
// Add SampleIds vector
generateSampleIdsPart(rSampleIds, sb);
// Add file names vector
generateExpressionFileNamesPart(sb);
// Add repTechGroupVector
generateRepTechGroupPart(rRepTechGroup, sb);
// Add condition to R script
generateConditionPart(rCondNames, sb);
// Add projectPath, outPath and projectName
sb.append("# projectPath : path of count files directory\n");
sb.append("projectPath <- \"\"\n");
sb.append("# outPath path of the outputs\n");
sb.append("outPath <- \"./\"\n");
sb.append("projectName <- \"");
sb.append(experiment.getName());
sb.append("\"\n@\n\n");
// Add target creation
sb.append(readStaticScript(TARGET_CREATION));
sb.append("\\section{Analysis}\n\n");
sb.append("\t\\subsection{Normalization}\n\n");
sb.append("\\begin{itemize}\n\n");
if (experiment.getSamples().size() > 2) {
sb.append(readStaticScript(CLUSTERING_PCA_RAW));
}
// Add normalization part
if (isTechnicalReplicates(rRepTechGroup)) {
sb.append(readStaticScript(NORMALISATION_PART1_WHITH_TECHREP));
} else {
sb.append(readStaticScript(NORMALIZATION_PART1_WHITHOUT_TECHREP));
}
// Add normalise data clustering if it's possible
if (isEnoughRepTechGroup(rRepTechGroup)) {
sb.append(readStaticScript(CLUSTERING_PCA_NORM));
}
sb.append(readStaticScript(NORMALIZATION_PART2));
// end document
sb.append("\\end{document}\n");
return sb.toString();
}
/**
* Write Rnw preamble.
* @param experimentSamplesList sample experiment list
* @param title title of the document
* @param filePrefix Sweave file prefix
* @return a StringBuilder with Rnw preamble
*/
protected StringBuilder generateRnwpreamble(
final List<Sample> experimentSamplesList, final String title,
final String filePrefix) {
StringBuilder sb = new StringBuilder();
// Add packages to the LaTeX StringBuilder
sb.append("\\documentclass[a4paper,10pt]{article}\n");
sb.append("\\usepackage[utf8]{inputenc}\n");
sb.append("\\usepackage{lmodern}\n");
sb.append("\\usepackage{a4wide}\n");
sb.append("\\usepackage{marvosym}\n");
sb.append("\\usepackage{graphicx}\n\n");
// Set Sweave options
sb.append("\\SweaveOpts{eps = FALSE, pdf = TRUE, prefix.string=");
sb.append(filePrefix);
sb.append("}\n\n");
sb.append("\\setkeys{Gin}{width=0.95\textwidth}\n\n");
// Add document title
sb.append("\\title{");
sb.append(title);
sb.append("}\n\n");
// Begin document...
sb.append("\\begin{document}\n");
sb.append("\\maketitle\n\n");
// Add a begin R code chunck mark
sb.append("<<functions, echo=FALSE>>=\n");
// Add the auto generate info
sb.append("### Auto generated by ");
sb.append(Globals.APP_NAME);
sb.append(" ");
sb.append(Globals.APP_VERSION_STRING);
sb.append(" on ");
sb.append(new Date(System.currentTimeMillis()));
sb.append(" ###\n\n");
// Add function part to string builder
try {
sb.append(readStaticScript(NORMALIZATION_FUNCTIONS));
} catch (EoulsanException e) {
e.printStackTrace();
}
// Add a end R code chunck mark
sb.append("@\n\n");
// Add initialization part
sb.append("\\section{Initialization}\n");
sb.append("<<>>=\n");
return sb;
}
/**
* Add sampleNames vector to R script.
* @param rSampleNames sample names
* @param sb StringBuilder where write the part of the script
*/
protected void generateSampleNamePart(final List<String> rSampleNames,
final StringBuilder sb) {
// Add samples names to R script
sb.append("# create sample names vector\n");
sb.append("sampleNames <- c(");
boolean first = true;
for (String r : rSampleNames) {
if (first) {
first = false;
} else {
sb.append(',');
}
sb.append('\"');
sb.append(r);
sb.append('\"');
}
sb.append(")\n\n");
}
/**
* Add SampleIds vector to R script.
* @param rSampleIds samples identifiers
* @param sb StringBuilder where write the part of the script
*/
protected void generateSampleIdsPart(final List<Integer> rSampleIds,
final StringBuilder sb) {
// Put sample ids into R vector
sb.append("sampleIds <- c(");
int i = 0;
for (int id : rSampleIds) {
i++;
sb.append(id);
if (i < rSampleIds.size()) {
sb.append(",");
}
}
sb.append(")\n\n");
}
/**
* Add expression file name vector to R script.
* @param sb StringBuilder where write the part of the script
*/
protected void generateExpressionFileNamesPart(final StringBuilder sb) {
// Add file names vector
sb.append("#create file names vector\n");
sb.append("fileNames <- paste(\"");
sb.append(this.expressionFilesPrefix);
sb.append("\",sampleIds" + ',' + '\"');
sb.append(this.expressionFilesSuffix);
sb.append("\",sep=\"\")\n\n");
}
/**
* Write the section of the script that handle technical replicate groups.
* @param rRepTechGroup list of technical replicate groups
* @param sb StringBuilder where write the part of the script
*/
protected void generateRepTechGroupPart(final List<String> rRepTechGroup,
final StringBuilder sb) {
if (isTechnicalReplicates(rRepTechGroup)) {
// Add repTechGroup vector
sb.append("# create technical replicates groups vector\n");
sb.append("repTechGroup <- c(");
boolean first = true;
for (String r : rRepTechGroup) {
if (first) {
first = false;
} else {
sb.append(',');
}
sb.append('\"');
sb.append(r);
sb.append('\"');
}
sb.append(")\n\n");
} else {
// Add repTechGroup vector equal to sampleNames to avoid error in R
// function buildTarget
sb.append("# create technical replicates groups vector\n");
sb.append("repTechGroup <- sampleNames\n\n");
}
}
/**
* Add condition vector to R script.
* @param rCondNames condition names
* @param sb StringBuilder where write the part of the script
*/
protected void generateConditionPart(final List<String> rCondNames,
final StringBuilder sb) {
sb.append("# create condition vector\n");
sb.append("condition <- c(");
boolean first = true;
for (String r : rCondNames) {
if (first) {
first = false;
} else {
sb.append(',');
}
sb.append('\"');
sb.append(r);
sb.append('\"');
}
sb.append(")\n\n");
}
//
// Other methods
//
/**
* Check if there is a problem in the repTechGroup coherence.
* @param rRepTechGroup technical replicate group
* @param rCondNames condition names
* @throws EoulsanException if an error if found in the design file
*/
protected void checkRepTechGroupCoherence(final List<String> rRepTechGroup,
final List<String> rCondNames) throws EoulsanException {
// Check repTechGroup field coherence
Map<String, String> condRepTGMap = new HashMap<>();
for (int i = 0; i < rRepTechGroup.size(); i++) {
String repTechGroup = rRepTechGroup.get(i);
String condition = rCondNames.get(i);
if (!repTechGroup.toLowerCase().equals("na")) {
if (!condRepTGMap.containsKey(repTechGroup)) {
condRepTGMap.put(repTechGroup, condition);
} else if (!condRepTGMap.get(repTechGroup).equals(condition)) {
throw new EoulsanException(
"There is a mistake in RepTechGroup field of design file : "
+ "two condition have the same repTechGroup value : "
+ repTechGroup);
}
}
}
}
/**
* Escape underscore for LaTeX title.
* @param s string to escape
* @return s with escaped underscore
*/
protected String escapeUnderScore(final String s) {
return s.replace("_", "\\_");
}
/**
* Replace na values in RepTechGroup list to avoid pooling error.
* @param rRepTechGroup list of technical replicate groups
* @param rSampleNames sample names
*/
protected void replaceRtgNA(final List<String> rRepTechGroup,
final List<String> rSampleNames) {
for (int j = 0; j < rRepTechGroup.size(); j++) {
if (rRepTechGroup.get(j).toLowerCase().equals("na")) {
rRepTechGroup.set(j, rSampleNames.get(j));
}
}
}
/*
* Private methods
*/
/**
* Test if there is enough distinct repTechGroup (>2) to perform clustering.
* @param rRepTechGroup list of technical replicate groups
* @return true if there is enough distinct repTechGroup (>2) to perform
* clustering
*/
private boolean isEnoughRepTechGroup(final List<String> rRepTechGroup) {
List<String> repTechGroupMap = new ArrayList<>();
for (String r : rRepTechGroup) {
if (!repTechGroupMap.contains(r)) {
repTechGroupMap.add(r);
}
if (repTechGroupMap.size() > 2) {
return true;
}
}
return false;
}
//
// Constructor
//
/**
* Public constructor.
* @param executor executor to use to execute the normalization
* @param design The design object
* @throws EoulsanException if an error occurs if connection to RServe server
* cannot be established
*/
public Normalization(final RExecutor executor, final Design design)
throws EoulsanException {
checkNotNull(design, "design is null.");
this.design = design;
final DataFormat eDF = DataFormats.EXPRESSION_RESULTS_TSV;
this.expressionFilesPrefix = eDF.getPrefix();
this.expressionFilesSuffix = eDF.getDefaultExtension();
this.executor = executor;
}
}