/*
* Eoulsan development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public License version 2.1 or
* later and CeCILL-C. This should be distributed with the code.
* If you do not have a copy, see:
*
* http://www.gnu.org/licenses/lgpl-2.1.txt
* http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
*
* Copyright for this code is held jointly by the Genomic platform
* of the Institut de Biologie de l'École normale supérieure and
* the individual authors. These should be listed in @author doc
* comments.
*
* For more information on the Eoulsan project and its aims,
* or to join the Eoulsan Google group, visit the home page
* at:
*
* http://outils.genomique.biologie.ens.fr/eoulsan
*
*/
package fr.ens.biologie.genomique.eoulsan.modules.expression.hadoop;
import static fr.ens.biologie.genomique.eoulsan.CommonHadoop.createConfiguration;
import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
import static fr.ens.biologie.genomique.eoulsan.core.InputPortsBuilder.allPortsRequiredInWorkingDirectory;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.ANNOTATION_GFF;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.EXPRESSION_RESULTS_TSV;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.GENOME_DESC_TXT;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.MAPPER_RESULTS_SAM;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import fr.ens.biologie.genomique.eoulsan.CommonHadoop;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.EoulsanRuntime;
import fr.ens.biologie.genomique.eoulsan.Globals;
import fr.ens.biologie.genomique.eoulsan.Settings;
import fr.ens.biologie.genomique.eoulsan.annotations.HadoopOnly;
import fr.ens.biologie.genomique.eoulsan.bio.BadBioEntryException;
import fr.ens.biologie.genomique.eoulsan.bio.GenomeDescription;
import fr.ens.biologie.genomique.eoulsan.bio.GenomicArray;
import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.HTSeqCounter;
import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.HTSeqUtils;
import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.OverlapMode;
import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.StrandUsage;
import fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.ExpressionOutputFormat;
import fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.SAMInputFormat;
import fr.ens.biologie.genomique.eoulsan.core.InputPorts;
import fr.ens.biologie.genomique.eoulsan.core.Parameter;
import fr.ens.biologie.genomique.eoulsan.core.StepConfigurationContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskResult;
import fr.ens.biologie.genomique.eoulsan.core.TaskStatus;
import fr.ens.biologie.genomique.eoulsan.data.Data;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
import fr.ens.biologie.genomique.eoulsan.modules.expression.AbstractExpressionModule;
import fr.ens.biologie.genomique.eoulsan.modules.expression.FinalExpressionFeaturesCreator;
import fr.ens.biologie.genomique.eoulsan.util.StringUtils;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.MapReduceUtils;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.PathUtils;
import fr.ens.biologie.genomique.eoulsan.util.locker.Locker;
import fr.ens.biologie.genomique.eoulsan.util.locker.ZooKeeperLocker;
/**
* This class is the main class for the expression program of the reads in
* hadoop mode.
* @since 1.0
* @author Laurent Jourdren
*/
@HadoopOnly
public class ExpressionHadoopModule extends AbstractExpressionModule {
private static final String TSAM_EXTENSION = ".tsam";
private static final String SERIALIZATION_EXTENSION = ".ser";
static final char SAM_RECORD_PAIRED_END_SERPARATOR = '£';
static final String GENOME_DESC_PATH_KEY =
Globals.PARAMETER_PREFIX + ".expression.genome.desc.file";
private Configuration conf;
/**
* Create JobConf object for HTSeq-count.
* @param context the task context
* @param alignmentsData alignment data
* @param featureAnnotationData feature annotations data
* @param gtfFormat true if the annotation file is in GTF format
* @param genomeDescriptionData genome description data
* @param genomicType genomic type
* @param attributeId attributeId
* @param splitAttributeValues split attribute values
* @param stranded stranded mode
* @param overlapMode overlap mode
* @param removeAmbiguousCases true to remove ambiguous cases
* @throws IOException if an error occurs while creating job
* @throws BadBioEntryException if an entry of the annotation file is invalid
* @throws EoulsanException if the job creating fails
*/
private static Job createJobHTSeqCounter(final Configuration parentConf,
final TaskContext context, final Data alignmentsData,
final Data featureAnnotationData, final boolean gtfFormat,
final Data genomeDescriptionData, final Data outData,
final String genomicType, final String attributeId,
final boolean splitAttributeValues, final StrandUsage stranded,
final OverlapMode overlapMode, final boolean removeAmbiguousCases,
final boolean tsamFormat)
throws IOException, BadBioEntryException, EoulsanException {
final Configuration jobConf = new Configuration(parentConf);
// Get input DataFile
DataFile inputDataFile = alignmentsData.getDataFile();
if (inputDataFile == null) {
throw new IOException("No input file found.");
}
final String dataFileSource;
if (tsamFormat) {
dataFileSource =
StringUtils.filenameWithoutExtension(inputDataFile.getSource())
+ TSAM_EXTENSION;
} else {
dataFileSource = inputDataFile.getSource();
}
// Set input path
final Path inputPath = new Path(dataFileSource);
// Get annotation DataFile
final DataFile annotationDataFile = featureAnnotationData.getDataFile();
// Get output file
final DataFile outFile = outData.getDataFile();
// Get temporary file
final DataFile tmpFile =
new DataFile(outFile.getParent(), outFile.getBasename() + ".tmp");
getLogger().fine("sample: " + alignmentsData.getName());
getLogger().fine("inputPath.getName(): " + inputPath.getName());
getLogger().fine("annotationDataFile: " + annotationDataFile.getSource());
getLogger().fine("outFile: " + outFile.getSource());
getLogger().fine("tmpFile: " + tmpFile.getSource());
jobConf.set("mapred.child.java.opts", "-Xmx1024m");
// Set counter group
jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, COUNTER_GROUP);
// Set Genome description path
final DataFile genomeDescDataFile = genomeDescriptionData.getDataFile();
jobConf.set(GENOME_DESC_PATH_KEY, genomeDescDataFile.getSource());
// Set the "stranded" parameter
jobConf.set(HTSeqCountMapper.STRANDED_PARAM, stranded.getName());
// Set the "overlap mode" parameter
jobConf.set(HTSeqCountMapper.OVERLAP_MODE_PARAM, overlapMode.getName());
// Set the "remove ambiguous cases" parameter
jobConf.setBoolean(HTSeqCountMapper.REMOVE_AMBIGUOUS_CASES,
removeAmbiguousCases);
final Path featuresIndexPath =
getAnnotationIndexSerializedPath(featureAnnotationData.getDataFile());
getLogger().info("featuresIndexPath: " + featuresIndexPath);
// Create serialized feature index
if (!PathUtils.isFile(featuresIndexPath, jobConf)) {
final Locker lock = createZookeeperLock(parentConf, context);
lock.lock();
createFeaturesIndex(context, annotationDataFile, gtfFormat, genomicType,
attributeId, splitAttributeValues, stranded, genomeDescDataFile,
featuresIndexPath, jobConf);
lock.unlock();
}
// Create the job and its name
final Job job = Job.getInstance(jobConf,
"Expression computation with htseq-count ("
+ alignmentsData.getName() + ", " + inputPath.getName() + ", "
+ annotationDataFile.getSource() + ", " + genomicType + ", "
+ attributeId + ", stranded: " + stranded
+ ", removeAmbiguousCases: " + removeAmbiguousCases + ")");
// Set the path to the features index
job.addCacheFile(featuresIndexPath.toUri());
// Set the jar
job.setJarByClass(ExpressionHadoopModule.class);
// Set input path
FileInputFormat.setInputPaths(job, inputPath);
// Set input format
job.setInputFormatClass(SAMInputFormat.class);
// Set the mapper class
job.setMapperClass(HTSeqCountMapper.class);
// Set the combiner class
job.setCombinerClass(HTSeqCountReducer.class);
// Set the reducer class
job.setReducerClass(HTSeqCountReducer.class);
// Set the output format
job.setOutputFormatClass(ExpressionOutputFormat.class);
// Set the output key class
job.setOutputKeyClass(Text.class);
// Set the output value class
job.setOutputValueClass(LongWritable.class);
// Set output path
FileOutputFormat.setOutputPath(job, new Path(tmpFile.getSource()));
return job;
}
private static Job createJobPairedEnd(final Configuration parentConf,
final TaskContext context, final Data alignmentsData,
final Data genomeDescriptionData)
throws IOException, BadBioEntryException {
final Configuration jobConf = new Configuration(parentConf);
// Get the source
final DataFile inputDataFile = alignmentsData.getDataFile();
// Set input path
final Path inputPath = new Path(inputDataFile.getSource());
// Set counter group
jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, COUNTER_GROUP);
// Set Genome description path
jobConf.set(GENOME_DESC_PATH_KEY, genomeDescriptionData.getDataFilename());
// Create the job and its name
final Job job = Job.getInstance(jobConf,
"Pretreatment for the expression estimation step ("
+ alignmentsData.getName() + ", " + inputDataFile.getSource()
+ ")");
// Set the jar
job.setJarByClass(ExpressionHadoopModule.class);
// Set input path
FileInputFormat.addInputPath(job, inputPath);
// Set the Mapper class
job.setMapperClass(PreTreatmentExpressionMapper.class);
// Set the Reducer class
job.setReducerClass(PreTreatmentExpressionReducer.class);
// Set the output key class
job.setOutputKeyClass(Text.class);
// Set the output value class
job.setOutputValueClass(Text.class);
// Output name
String outputName =
StringUtils.filenameWithoutExtension(inputPath.getName());
outputName = outputName.substring(0, outputName.length());
outputName += TSAM_EXTENSION;
// Set output path
FileOutputFormat.setOutputPath(job,
new Path(inputPath.getParent(), outputName));
return job;
}
/**
* @param context Eoulsan context
* @param annotationFile GFF annotation file path
* @param gtfFormat true if the annotation file is in GTF format
* @param featureType feature type to use
* @param attributeId attribute id
* @param splitAttributeValues split attribute values
* @param stranded strand mode
* @param genomeDescDataFile genome description DataFile
* @param featuresIndexPath feature index output path
* @param conf Hadoop configuration object
* @throws IOException if an error occurs while creating the feature index
* file
* @throws BadBioEntryException if an entry of the annotation file is invalid
* @throws EoulsanException if an error occurs with feature types and feature
* identifiers
*/
private static void createFeaturesIndex(final TaskContext context,
final DataFile annotationFile, final boolean gtfFormat,
final String featureType, final String attributeId,
final boolean splitAttributeValues, final StrandUsage stranded,
final DataFile genomeDescDataFile, final Path featuresIndexPath,
final Configuration conf)
throws IOException, BadBioEntryException, EoulsanException {
// Do nothing if the file already exists
if (PathUtils.isFile(featuresIndexPath, conf)) {
return;
}
final GenomicArray<String> features = new GenomicArray<>();
final GenomeDescription genomeDescription =
GenomeDescription.load(genomeDescDataFile.open());
final Map<String, Integer> counts = new HashMap<>();
HTSeqUtils.storeAnnotation(features, annotationFile.open(), gtfFormat,
featureType, stranded, attributeId, splitAttributeValues, counts);
if (counts.size() == 0) {
throw new EoulsanException(
"Warning: No features of type '" + featureType + "' found.\n");
}
final File featuresIndexFile = context.getRuntime()
.createFileInTempDir(StringUtils.basename(annotationFile.getName())
+ SERIALIZATION_EXTENSION);
// Add all chromosomes even without annotations to the feature object
features.addChromosomes(genomeDescription);
// Save the annotation
features.save(featuresIndexFile);
PathUtils.copyLocalFileToPath(featuresIndexFile, featuresIndexPath, conf);
if (!featuresIndexFile.delete()) {
getLogger().warning("Can not delete features index file: "
+ featuresIndexFile.getAbsolutePath());
}
}
private static void createFinalExpressionFeaturesFile(
final TaskContext context, final Data featureAnnotationData,
final Data outData, final Job job, final Configuration conf)
throws IOException {
FinalExpressionFeaturesCreator fefc = null;
// Load the annotation index
final Path featuresIndexPath =
getAnnotationIndexSerializedPath(featureAnnotationData.getDataFile());
final FileSystem fs = featuresIndexPath.getFileSystem(conf);
fefc = new FinalExpressionFeaturesCreator(fs.open(featuresIndexPath));
// Set the result path
final Path resultPath = new Path(outData.getDataFile().getSource());
fefc.initializeExpressionResults();
// Load map-reduce results
fefc.loadPreResults(new DataFile(job.getConfiguration()
.get("mapreduce.output.fileoutputformat.outputdir")).open());
fefc.saveFinalResults(fs.create(resultPath));
}
/**
* Create the path to the serialized annotation index.
* @param featureAnnotationFile feature annotation file
* @return an Hadoop path with the path of the serialized annotation
* @throws IOException if an error occurs while getting the path
*/
private static Path getAnnotationIndexSerializedPath(
final DataFile featureAnnotationFile) throws IOException {
final DataFile file = new DataFile(featureAnnotationFile.getParent(),
featureAnnotationFile.getBasename() + SERIALIZATION_EXTENSION);
return new Path(file.getSource());
}
//
// Module methods
//
@Override
public InputPorts getInputPorts() {
return allPortsRequiredInWorkingDirectory(super.getInputPorts());
}
@Override
public void configure(final StepConfigurationContext context,
final Set<Parameter> stepParameters) throws EoulsanException {
super.configure(context, stepParameters);
this.conf = CommonHadoop.createConfiguration(EoulsanRuntime.getSettings());
}
@Override
public TaskResult execute(final TaskContext context,
final TaskStatus status) {
final Data alignmentsData = context.getInputData(MAPPER_RESULTS_SAM);
final Data featureAnnotationData =
context.getInputData(isGTFFormat() ? ANNOTATION_GFF : ANNOTATION_GFF);
final Data genomeDescriptionData = context.getInputData(GENOME_DESC_TXT);
final Data outData =
context.getOutputData(EXPRESSION_RESULTS_TSV, alignmentsData);
if (getCounter().getCounterName().equals(HTSeqCounter.COUNTER_NAME)) {
return executeJobHTSeqCounter(context, alignmentsData,
featureAnnotationData, genomeDescriptionData, outData, status);
}
return status.createTaskResult(
new EoulsanException(
"Unknown counter: " + getCounter().getCounterName()),
"Unknown counter: " + getCounter().getCounterName());
}
/**
* Execute HTSeq-count counter as an Hadoop job.
* @param context Eoulsan context
* @param status Eoulsan status
* @return a StepResult object
*/
private TaskResult executeJobHTSeqCounter(final TaskContext context,
final Data alignmentsData, final Data featureAnnotationData,
final Data genomeDescriptionData, final Data outData,
final TaskStatus status) {
// Create configuration object
final Configuration conf = createConfiguration();
try {
final long startTime = System.currentTimeMillis();
getLogger().info("Genomic type: " + getGenomicType());
// Get the paired end mode
boolean pairedEnd = HTSeqCounter.isPairedData(alignmentsData.getDataFile().open());
// Paired-end pre-processing
if (pairedEnd) {
MapReduceUtils.submitAndWaitForJob(
createJobPairedEnd(conf, context, alignmentsData,
genomeDescriptionData),
alignmentsData.getName(), CommonHadoop.CHECK_COMPLETION_TIME,
status, COUNTER_GROUP);
}
// Create the list of jobs to run
final Job job = createJobHTSeqCounter(conf, context, alignmentsData,
featureAnnotationData, isGTFFormat(), genomeDescriptionData, outData,
getGenomicType(), getAttributeId(), isSplitAttributeValues(),
getStranded(), getOverlapMode(), isRemoveAmbiguousCases(), pairedEnd);
// Compute map-reduce part of the expression computation
MapReduceUtils.submitAndWaitForJob(job, alignmentsData.getName(),
CommonHadoop.CHECK_COMPLETION_TIME, status, COUNTER_GROUP);
final long mapReduceEndTime = System.currentTimeMillis();
getLogger().info("Finish the first part of the expression computation in "
+ ((mapReduceEndTime - startTime) / 1000) + " seconds.");
// Create the final expression files
createFinalExpressionFeaturesFile(context, featureAnnotationData, outData,
job, this.conf);
getLogger().info("Finish the create of the final expression files in "
+ ((System.currentTimeMillis() - mapReduceEndTime) / 1000)
+ " seconds.");
return status.createTaskResult();
} catch (IOException e) {
return status.createTaskResult(e,
"Error while running job: " + e.getMessage());
} catch (BadBioEntryException e) {
return status.createTaskResult(e,
"Invalid annotation entry: " + e.getEntry());
} catch (EoulsanException e) {
return status.createTaskResult(e,
"Error while reading the annotation file: " + e.getMessage());
}
}
/**
* Create a Zookeeper lock.
* @param conf Hadoop configuration
* @param context Eoulsan task context
* @return a Lock object
* @throws IOException if an error occurs while creating the lock
*/
private static Locker createZookeeperLock(final Configuration conf,
final TaskContext context) throws IOException {
final Settings settings = context.getSettings();
String connectString = settings.getZooKeeperConnectString();
if (connectString == null) {
connectString = conf.get("yarn.resourcemanager.hostname").split(":")[0]
+ ":" + settings.getZooKeeperDefaultPort();
}
return new ZooKeeperLocker(connectString,
settings.getZooKeeperSessionTimeout(),
"/eoulsan-locks-" + InetAddress.getLocalHost().getHostName(),
"expression-lock-job-"
+ context.getJobUUID() + "-step-"
+ context.getCurrentStep().getNumber());
}
}