/*
* Eoulsan development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public License version 2.1 or
* later and CeCILL-C. This should be distributed with the code.
* If you do not have a copy, see:
*
* http://www.gnu.org/licenses/lgpl-2.1.txt
* http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
*
* Copyright for this code is held jointly by the Genomic platform
* of the Institut de Biologie de l'École normale supérieure and
* the individual authors. These should be listed in @author doc
* comments.
*
* For more information on the Eoulsan project and its aims,
* or to join the Eoulsan Google group, visit the home page
* at:
*
* http://outils.genomique.biologie.ens.fr/eoulsan
*
*/
package fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop;
import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.MappingCounters.OUTPUT_MAPPING_ALIGNMENTS_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.util.StringUtils.unDoubleQuotes;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.InetAddress;
import java.net.URI;
import java.nio.channels.FileLock;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingDeque;
import java.util.concurrent.LinkedBlockingDeque;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import com.google.common.base.Splitter;
import fr.ens.biologie.genomique.eoulsan.CommonHadoop;
import fr.ens.biologie.genomique.eoulsan.EoulsanLogger;
import fr.ens.biologie.genomique.eoulsan.EoulsanRuntime;
import fr.ens.biologie.genomique.eoulsan.Globals;
import fr.ens.biologie.genomique.eoulsan.HadoopEoulsanRuntime;
import fr.ens.biologie.genomique.eoulsan.bio.FastqFormat;
import fr.ens.biologie.genomique.eoulsan.bio.readsmappers.MapperProcess;
import fr.ens.biologie.genomique.eoulsan.bio.readsmappers.SequenceReadsMapper;
import fr.ens.biologie.genomique.eoulsan.bio.readsmappers.SequenceReadsMapperService;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
import fr.ens.biologie.genomique.eoulsan.util.ProcessUtils;
import fr.ens.biologie.genomique.eoulsan.util.StringUtils;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.HadoopReporter;
import fr.ens.biologie.genomique.eoulsan.util.locker.Locker;
import fr.ens.biologie.genomique.eoulsan.util.locker.DistributedLocker;
/**
* This class defines a generic mapper for reads mapping.
* @since 1.0
* @author Laurent Jourdren
*/
public class ReadsMapperMapper extends Mapper<Text, Text, Text, Text> {

  // Hadoop job configuration parameter keys
  static final String MAPPER_NAME_KEY =
      Globals.PARAMETER_PREFIX + ".mapper.name";
  static final String MAPPER_VERSION_KEY =
      Globals.PARAMETER_PREFIX + ".mapper.version";
  static final String MAPPER_FLAVOR_KEY =
      Globals.PARAMETER_PREFIX + ".mapper.flavor";
  static final String PAIR_END_KEY =
      Globals.PARAMETER_PREFIX + ".mapper.pairend";
  static final String MAPPER_ARGS_KEY =
      Globals.PARAMETER_PREFIX + ".mapper.args";
  static final String MAPPER_THREADS_KEY =
      Globals.PARAMETER_PREFIX + ".mapper.nb.threads";
  static final String FASTQ_FORMAT_KEY =
      Globals.PARAMETER_PREFIX + ".mapper.fastq.format";
  static final String INDEX_CHECKSUM_KEY =
      Globals.PARAMETER_PREFIX + ".mapper.index.checksum";
  static final String ZOOKEEPER_CONNECT_STRING_KEY =
      Globals.PARAMETER_PREFIX + ".mapper.zookeeper.connect.string";
  static final String ZOOKEEPER_SESSION_TIMEOUT_KEY =
      Globals.PARAMETER_PREFIX + ".mapper.zookeeper.session.timeout";

  private static final Splitter TAB_SPLITTER = Splitter.on('\t').trimResults();

  // Prefix of the local directories where genome indexes are decompressed
  private static final String MAPPER_INDEX_DIR_PREFIX =
      Globals.APP_NAME + "-mapper-index-";

  // Marker file whose mtime records the last use of a mapper index directory
  private static final String MAPPER_LAST_USED_FILENAME =
      Globals.APP_NAME.toUpperCase() + "_LAST_USED";

  // Age (in days) after which an unused mapper index is removed
  private static final long DEFAULT_AGE_OF_UNUSED_MAPPER_INDEXES = 7;

  private static final String LOCK_SUFFIX = ".lock";

  private String counterGroup = this.getClass().getName();
  private File mapperIndexDir;
  private Locker lock;
  private SequenceReadsMapper mapper;
  private MapperProcess process;

  // Thread that reads the SAM output of the mapper process and feeds the queue
  private Thread samResultsParserThread;

  // SAM lines produced by the parser thread, consumed by writeResults()
  private final BlockingDeque<String> queue = new LinkedBlockingDeque<>();

  // Holder for an IOException thrown in the parser thread, rethrown later in
  // the mapper thread by writeResults()
  private final ExceptionWrapper exception = new ExceptionWrapper();

  private int entriesParsed;

  // True only for the first task attempt: SAM header lines are written once
  private boolean writeHeaders;

  // Reused buffers to avoid per-record allocations
  private final List<String> fields = new ArrayList<>();
  private final Text outKey = new Text();
  private final Text outValue = new Text();

  /** Mutable holder used to pass an exception across threads. */
  private static final class ExceptionWrapper {
    private IOException exception;
  }

  /**
   * 'key': offset of the beginning of the line from the beginning of the TFQ
   * file. 'value': the TFQ line (3 fields if data are in single-end mode, 6
   * fields if data are in paired-end mode). Lines with any other field count
   * are silently ignored.
   */
  @Override
  protected void map(final Text key, final Text value, final Context context)
      throws IOException, InterruptedException {

    this.fields.clear();
    for (String e : TAB_SPLITTER.split(value.toString())) {
      this.fields.add(e);
    }

    final int fieldsSize = this.fields.size();

    if (fieldsSize == 3) {
      // Single end
      this.process.writeEntry(this.fields.get(0), this.fields.get(1),
          this.fields.get(2));
    } else if (fieldsSize == 6) {
      // Pair end
      this.process.writeEntry(this.fields.get(0), this.fields.get(1),
          this.fields.get(2), this.fields.get(3), this.fields.get(4),
          this.fields.get(5));
    }

    // Drain any SAM lines already produced by the mapper process
    writeResults(context, this.writeHeaders);
  }

  /**
   * Configure the mapper before processing records: read the job
   * configuration, locate the genome index from the distributed cache,
   * acquire the ZooKeeper distributed lock, initialize the mapper process and
   * start the SAM parsing thread.
   * @param context the Hadoop context
   * @throws IOException if the configuration is invalid or the mapper cannot
   *         be initialized
   */
  @Override
  protected void setup(final Context context) throws IOException {

    EoulsanLogger.initConsoleHandler();
    getLogger().info("Start of setup()");

    final Configuration conf = context.getConfiguration();

    // Initialize Eoulsan Settings
    if (!EoulsanRuntime.isRuntime()) {
      HadoopEoulsanRuntime.newEoulsanRuntime(conf);
    }

    // Get mapper name
    final String mapperName = conf.get(MAPPER_NAME_KEY);
    if (mapperName == null) {
      throw new IOException("No mapper set");
    }

    // Set the mapper
    this.mapper =
        SequenceReadsMapperService.getInstance().newService(mapperName);

    // Set the mapper version
    this.mapper.setMapperVersionToUse(conf.get(MAPPER_VERSION_KEY));

    // Set the mapper flavor
    this.mapper.setMapperFlavorToUse(conf.get(MAPPER_FLAVOR_KEY));

    // Get counter group
    final String counterGroup = conf.get(CommonHadoop.COUNTER_GROUP_KEY);
    if (counterGroup != null) {
      this.counterGroup = counterGroup;
    }

    final boolean pairedEnd = Boolean.parseBoolean(conf.get(PAIR_END_KEY));
    final FastqFormat fastqFormat =
        FastqFormat.getFormatFromName(conf.get(FASTQ_FORMAT_KEY,
            "" + EoulsanRuntime.getSettings().getDefaultFastqFormat()));

    // Download genome reference
    final URI[] localCacheFiles = context.getCacheFiles();

    if (localCacheFiles == null || localCacheFiles.length == 0) {
      throw new IOException("Unable to retrieve genome index");
    }

    if (localCacheFiles.length > 1) {
      throw new IOException("Retrieve more than one file in distributed cache");
    }

    // Get the local genome index zip file
    getLogger().info("localCacheFiles[0]: " + localCacheFiles[0]);
    final DataFile archiveIndexFile =
        new DataFile(localCacheFiles[0].toString());

    getLogger().info("Genome index compressed file (from distributed cache): "
        + archiveIndexFile);

    // Set index directory; the checksum in the name makes the directory
    // unique per index content, so it can be shared between task attempts
    this.mapperIndexDir =
        new File(EoulsanRuntime.getRuntime().getTempDirectory(),
            MAPPER_INDEX_DIR_PREFIX
                + this.mapper.getMapperName() + "-index-"
                + conf.get(INDEX_CHECKSUM_KEY));

    getLogger()
        .info("Genome index directory where decompressed: " + mapperIndexDir);

    // Set FASTQ format
    this.mapper.setFastqFormat(fastqFormat);
    getLogger().info("Fastq format: " + fastqFormat);

    // Distributed lock (via ZooKeeper) serializing index decompression and,
    // when the mapper cannot run as multiple instances, mapper execution
    this.lock = new DistributedLocker(conf.get(ZOOKEEPER_CONNECT_STRING_KEY),
        Integer.parseInt(conf.get(ZOOKEEPER_SESSION_TIMEOUT_KEY)),
        "/eoulsan-locks-" + InetAddress.getLocalHost().getHostName(),
        "eoulsan-mapper-lock");

    // Get Mapper arguments
    final String mapperArguments = unDoubleQuotes(conf.get(MAPPER_ARGS_KEY));
    if (mapperArguments != null) {
      this.mapper.setMapperArguments(mapperArguments);
    }

    // Get the number of threads to use, clamped to the number of available
    // processors
    int mapperThreads = Integer.parseInt(conf.get(MAPPER_THREADS_KEY,
        "" + Runtime.getRuntime().availableProcessors()));

    if (mapperThreads > Runtime.getRuntime().availableProcessors()
        || mapperThreads < 1) {
      mapperThreads = Runtime.getRuntime().availableProcessors();
    }

    if (!this.mapper.isMultipleInstancesEnabled()) {
      this.mapper.setThreadsNumber(mapperThreads);
    }

    getLogger().info("Use "
        + this.mapper.getMapperName() + " with " + mapperThreads
        + " threads option");

    // Create temporary directory if not exists
    final File tempDir = EoulsanRuntime.getRuntime().getTempDirectory();
    if (!tempDir.exists()) {

      getLogger()
          .fine("Create temporary directory: " + tempDir.getAbsolutePath());
      if (!tempDir.mkdirs()) {
        throw new IOException(
            "Unable to create local Hadoop temporary directory: " + tempDir);
      }
    }

    // Set mapper temporary directory
    this.mapper.setTempDirectory(tempDir);

    // Set mapper executable temporary directory
    this.mapper.setExecutablesTempDirectory(tempDir);

    // Enable multiple instance of the mapper, if not supported
    // this.mapper.isMultipleInstancesEnabled() will return false
    this.mapper.setMultipleInstancesEnabled(true);

    // Update last used file timestamp for the mapper indexes clean up
    updateLastUsedMapperIndex(this.mapperIndexDir);

    context.setStatus("Wait lock");

    // Random sleep avoids a thundering herd of tasks hitting the lock at once
    ProcessUtils.waitRandom(5000);
    this.lock.lock();

    // Init mapper (decompresses the index under the lock if needed)
    this.mapper.init(archiveIndexFile.open(), this.mapperIndexDir,
        new HadoopReporter(context), this.counterGroup);

    // Keep the lock for the whole mapping if multiple instances of the
    // mapper cannot run concurrently on this node
    if (this.mapper.isMultipleInstancesEnabled()) {

      // Unlock
      this.lock.unlock();
    } else {

      context.setStatus(
          "Wait free JVM for running " + this.mapper.getMapperName());

      // Wait free JVM
      waitFreeJVM(context);
    }

    if (pairedEnd) {
      this.process = this.mapper.mapPE();
    } else {
      this.process = this.mapper.mapSE();
    }

    // Only the first task writes the SAM header lines, so the merged output
    // contains them once
    this.writeHeaders = context.getTaskAttemptID().getTaskID().getId() == 0;
    this.samResultsParserThread = startParseSAMResultsThread(this.process);

    context.setStatus("Run " + this.mapper.getMapperName());
    getLogger().info("End of setup()");
  }

  /**
   * Finish the mapping: close the mapper input, wait for the end of the
   * mapper process and of the SAM parsing thread, write the remaining
   * results, release the lock if still held and clean up old indexes.
   * @param context the Hadoop context
   * @throws IOException if an error occurs while finishing the mapping
   * @throws InterruptedException if interrupted while waiting for threads
   */
  @Override
  protected void cleanup(final Context context)
      throws IOException, InterruptedException {

    getLogger().info("Start of cleanup() of the mapper.");

    // Close the writers
    this.process.closeEntriesWriter();

    // Wait the end of the SAM parsing
    this.samResultsParserThread.join();

    this.process.waitFor();
    this.mapper.throwMappingException();

    // Unlock if no multiple instances enabled
    if (!this.mapper.isMultipleInstancesEnabled()) {
      this.lock.unlock();
    }

    // Write the remaining results (and headers, on the first task)
    writeResults(context, this.writeHeaders);

    getLogger().info(this.entriesParsed
        + " entries parsed in " + this.mapper.getMapperName() + " output file");

    // Clear old mapper indexes
    removeUnusedMapperIndexes(context.getConfiguration());

    getLogger().info("End of close() of the mapper.");
  }

  //
  // Other mapping methods
  //

  /**
   * Wait a free JVM.
   * @param context the Hadoop context
   */
  private void waitFreeJVM(final Context context) {

    final long waitStartTime = System.currentTimeMillis();

    ProcessUtils
        .waitUntilExecutableRunning(this.mapper.getMapperExecutableName());

    getLogger().info("Wait "
        + StringUtils
            .toTimeHumanReadable(System.currentTimeMillis() - waitStartTime)
        + " before running " + this.mapper.getMapperName());

    context.setStatus("Run " + this.mapper.getMapperName());
  }

  /**
   * Start SAM parser result thread. The thread reads the standard output of
   * the mapper process line by line and pushes each line into {@code queue};
   * any I/O error is stored in {@code exception} and rethrown later by
   * {@link #writeResults}.
   * @param mp the mapper process
   * @return the created thread
   */
  private Thread startParseSAMResultsThread(final MapperProcess mp) {

    final Thread t = new Thread(new Runnable() {

      @Override
      public void run() {

        // Parse SAM result file
        String line;
        // NOTE(review): uses the platform default charset; SAM is ASCII in
        // practice but an explicit charset would be safer — confirm upstream
        try (BufferedReader readerResults =
            new BufferedReader(new InputStreamReader(mp.getStout()))) {

          while ((line = readerResults.readLine()) != null) {
            queue.add(line);
          }
        } catch (IOException e) {
          exception.exception = e;
        }
      }
    });

    t.start();

    return t;
  }

  /**
   * Write results. Drains the queue filled by the SAM parser thread, writes
   * each alignment with the read id as key, writes header lines (empty key)
   * only when requested, and rethrows any exception raised in the parser
   * thread.
   * @param context the Hadoop context
   * @param writeHeader true if SAM header must be written
   * @throws InterruptedException if an error occurs while writing data
   * @throws IOException if an error occurs while writing data
   */
  private void writeResults(final Context context, boolean writeHeader)
      throws InterruptedException, IOException {

    while (!this.queue.isEmpty()) {

      final String line = this.queue.take().trim();

      if (line.length() == 0) {
        continue;
      }

      // Test if line is an header line
      final boolean headerLine = line.charAt(0) == '@';

      // Only write header lines once (on the first output file)
      if (headerLine && !writeHeader) {
        continue;
      }

      if (!headerLine) {

        // Set the output key as the read id
        final int tabPos = line.indexOf('\t');
        if (tabPos == -1) {
          outKey.set("");
        } else {
          outKey.set(line.substring(0, tabPos));
        }

        // Increment counters if not header
        this.entriesParsed++;
        context.getCounter(this.counterGroup,
            OUTPUT_MAPPING_ALIGNMENTS_COUNTER.counterName()).increment(1);
      } else {

        // Set empty key for headers
        this.outKey.set("");
      }

      // Set the output value
      this.outValue.set(line);

      // Write the result
      context.write(this.outKey, this.outValue);
    }

    // Throw reader exception if exists
    if (this.exception.exception != null) {
      throw this.exception.exception;
    }
  }

  //
  // Old mappers indexes cleanup methods
  //

  /**
   * Update the last usage of the current mapper index. The update is done
   * under an OS-level file lock so concurrent tasks do not race with the
   * index removal code.
   * @param mapperIndexDir the mapper index directory
   */
  private void updateLastUsedMapperIndex(final File mapperIndexDir) {

    final File lockFile = new File(mapperIndexDir.getParentFile(),
        mapperIndexDir.getName() + LOCK_SUFFIX);

    try (FileOutputStream out = new FileOutputStream(lockFile)) {

      // Lock the mapper directory
      FileLock lock = out.getChannel().lock();

      final File lastMapperUsedFile =
          new File(mapperIndexDir, MAPPER_LAST_USED_FILENAME);

      if (lastMapperUsedFile.exists()) {

        if (!lastMapperUsedFile.setLastModified(System.currentTimeMillis())) {
          getLogger()
              .warning("Unable to set the modification time of the file: "
                  + lastMapperUsedFile);
        }
      }

      // Unlock the mapper directory
      lock.release();
    } catch (IOException e) {
      getLogger().warning(
          "Cannot update the timestamp of the last usage of the current mapper index: "
              + e.getMessage());
    }
  }

  /**
   * Remove unused mapper indexes. Scans the parent directory of the current
   * mapper index for sibling index directories and removes those not used
   * for more than {@code DEFAULT_AGE_OF_UNUSED_MAPPER_INDEXES} days.
   * @param conf Hadoop configuration
   */
  private void removeUnusedMapperIndexes(final Configuration conf) {

    final File mapperIndexesDir = this.mapperIndexDir.getParentFile();

    // listFiles() returns null when the directory does not exist or an I/O
    // error occurs: nothing to clean up in that case
    final File[] indexDirs = mapperIndexesDir.listFiles(new FilenameFilter() {

      @Override
      public boolean accept(final File dir, final String name) {

        final File f = new File(dir, name);
        return f.isDirectory() && name.startsWith(MAPPER_INDEX_DIR_PREFIX);
      }
    });

    if (indexDirs == null) {
      return;
    }

    for (File dir : indexDirs) {

      // First check without lock on the mapper index directory.
      // Fix: test the candidate index directory itself, not its parent
      // directory (the parent never contains the last-used marker file,
      // so the old code never removed any index)
      if (isMapperIndexMustBeRemoved(dir)) {
        removeUnusedMapperIndex(dir, conf);
      }
    }
  }

  /**
   * Check if a mapper index directory must be removed.
   * @param mapperIndexDir the mapper index directory
   * @return true if the mapper index directory must be removed
   */
  private boolean isMapperIndexMustBeRemoved(final File mapperIndexDir) {

    final File lastModifiedFile =
        new File(mapperIndexDir, MAPPER_LAST_USED_FILENAME);

    // Without a marker file the age of the index is unknown: keep it
    if (!lastModifiedFile.exists()) {
      return false;
    }

    final long duration =
        System.currentTimeMillis() - lastModifiedFile.lastModified();

    // Age threshold in milliseconds (days * 24 h * 3600 s * 1000 ms)
    return duration > (DEFAULT_AGE_OF_UNUSED_MAPPER_INDEXES * 24 * 3600 * 1000);
  }

  /**
   * Remove an unused mapper index directory. The removal is done under an
   * OS-level file lock, and the age condition is re-checked once the lock is
   * held, so a concurrent task refreshing the index timestamp wins.
   * @param mapperIndexDir the mapper index directory to remove
   * @param conf Hadoop configuration
   */
  private void removeUnusedMapperIndex(final File mapperIndexDir,
      final Configuration conf) {

    final File lockFile = new File(mapperIndexDir.getParentFile(),
        mapperIndexDir.getName() + LOCK_SUFFIX);

    try (FileOutputStream out = new FileOutputStream(lockFile)) {

      // Lock the mapper directory
      FileLock lock = out.getChannel().lock();

      // Second check with lock on the mapper index directory
      if (isMapperIndexMustBeRemoved(mapperIndexDir)) {

        getLogger()
            .info("Remove unused mapper index directory: " + mapperIndexDir);

        // Remove the mapper index
        // TODO use Datafile.delete(true)
        final Path mapperIndexPath = new Path(mapperIndexDir.toURI());
        final FileSystem fs = FileSystem.get(mapperIndexDir.toURI(), conf);
        fs.delete(mapperIndexPath, true);
      }

      // Unlock the mapper directory
      lock.release();
    } catch (IOException e) {
      getLogger().warning("Cannot remove unused mapper index directory ("
          + mapperIndexDir + "): " + e.getMessage());
    }
  }
}