package fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop;
import static com.google.common.base.Preconditions.checkNotNull;
import static fr.ens.biologie.genomique.eoulsan.bio.io.BioCharsets.SAM_CHARSET;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import com.google.common.base.Splitter;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.PathUtils;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
/**
 * This class contains methods and classes for saving and loading SAM file
 * headers in Hadoop mappers and reducers.
 * @author Laurent Jourdren
 * @since 2.0
 */
public class SAMHeaderHadoopUtils {

  static final String SAM_HEADER_FILE_PREFIX = "_samheader_";

  /** Hadoop configuration key that holds the job output directory. */
  private static final String OUTPUT_DIR_KEY =
      "mapreduce.output.fileoutputformat.outputdir";

  /** Splitter used to cut a SAM header line into its tab separated fields. */
  private static final Splitter TAB_SPLITTER = Splitter.on('\t');

  /**
   * This class allows to save the SAM header read by a mapper. Header lines
   * are buffered in memory and written in one go to a file named
   * <code>SAM_HEADER_FILE_PREFIX + attemptId</code> in the job output
   * directory.
   */
  public static class SAMHeaderWriter {

    // Pending header lines, null when there is nothing to write
    private List<String> headers;
    private final String attemptId;

    /**
     * Buffer the line if it is a SAM header line. When the line is not an
     * header line (and is not empty), the header lines buffered so far are
     * written to the SAM header file.
     * @param context the Hadoop context
     * @param line the line read
     * @return true if the line is an header line or an empty line, false
     *         otherwise
     * @throws IOException if an error occurs while writing the SAM file header
     */
    public boolean writeIfHeaderLine(final JobContext context,
        final String line) throws IOException {

      checkNotNull(line, "line argument cannot be null");

      // Empty lines are handled like header lines: the caller must skip them
      if (line.length() == 0) {
        return true;
      }

      // Test if the line is a SAM header
      if (line.charAt(0) == '@') {

        if (this.headers == null) {
          this.headers = new ArrayList<>();
        }
        this.headers.add(line);

        return true;
      }

      // First line after the header: flush the buffered header lines
      close(context);

      return false;
    }

    /**
     * Close the SAM file header: write the buffered header lines, if any, to
     * the SAM header file and clear the buffer. Calling this method when no
     * header line is buffered does nothing.
     * @param context the Hadoop context
     * @throws IOException if an error occurs while writing the SAM file header
     */
    public void close(final JobContext context) throws IOException {

      // Nothing to write if no header line has been buffered
      if (this.headers == null) {
        return;
      }

      checkNotNull(context, "context argument cannot be null");

      final Path outputPath =
          new Path(context.getConfiguration().get(OUTPUT_DIR_KEY));
      final Path headerPath =
          new Path(outputPath, SAM_HEADER_FILE_PREFIX + this.attemptId);

      // try-with-resources: the stream must be released even if a write fails
      try (Writer writer = new OutputStreamWriter(
          PathUtils.createOutputStream(headerPath, context.getConfiguration()),
          SAM_CHARSET)) {

        for (String l : this.headers) {
          writer.write(l);
          writer.write('\n');
        }
      }

      // The buffer is only cleared once the file has been successfully written
      this.headers = null;
    }

    /**
     * Constructor.
     * @param attemptId Hadoop task attempt Id
     */
    public SAMHeaderWriter(final String attemptId) {

      checkNotNull(attemptId, "attemptId argument cannot be null");

      this.attemptId = attemptId;
    }
  }

  /**
   * Load SAM headers. The header is read from the largest file of the job
   * output directory whose name starts with
   * <code>SAM_HEADER_FILE_PREFIX</code>.
   * @param context the Hadoop context
   * @return a list of String with the SAM headers
   * @throws IOException if an error occurs while loading the headers or if no
   *           SAM header file exists in the output directory
   */
  public static List<String> loadSAMHeaders(final JobContext context)
      throws IOException {

    checkNotNull(context, "context argument cannot be null");

    final List<String> result = new ArrayList<>();

    // Get the output path of the reducer
    final Path outputPath =
        new Path(context.getConfiguration().get(OUTPUT_DIR_KEY));

    // Resolve the file system from the output path itself: the output
    // directory is not guaranteed to be on the same file system as the job
    // working directory
    final FileSystem fs = outputPath.getFileSystem(context.getConfiguration());

    // Search the largest SAM header file
    Path bestFile = null;
    long maxLen = -1;

    for (FileStatus status : fs.listStatus(outputPath)) {

      final boolean headerFile =
          status.getPath().getName().startsWith(SAM_HEADER_FILE_PREFIX);

      if (headerFile && status.getLen() > maxLen) {
        maxLen = status.getLen();
        bestFile = status.getPath();
      }
    }

    // Check if the SAM header file has been found
    if (bestFile == null) {
      throw new IOException(
          "No SAM header file found in reducer output directory: "
              + outputPath);
    }

    // The reader is closed by the try-with-resources statement
    try (BufferedReader reader = new BufferedReader(
        new InputStreamReader(fs.open(bestFile), SAM_CHARSET))) {

      String line;

      while ((line = reader.readLine()) != null) {
        result.add(line);
      }
    }

    return result;
  }

  /**
   * Create a SAMSequenceDictionary from the SAM header in a list of String.
   * Only the "@SQ" lines are used. An "@SQ" line without a sequence name
   * (SN field) or without a parsable sequence length (LN field) is skipped.
   * @param headers the list of String
   * @return a new SAMSequenceDictionary object with the SAM headers
   */
  public static SAMSequenceDictionary createSAMSequenceDictionaryFromSAMHeader(
      final List<String> headers) {

    checkNotNull(headers, "headers argument cannot be null");

    // Dictionary for sequences
    final SAMSequenceDictionary result = new SAMSequenceDictionary();

    for (String line : headers) {

      if (!line.startsWith("@SQ\t")) {
        continue;
      }

      // Parse sequence name and length
      String sequenceName = null;
      int sequenceLength = -1;

      for (String field : TAB_SPLITTER.split(line)) {

        if (field.startsWith("SN:")) {
          sequenceName = field.substring(3);
        } else if (field.startsWith("LN:")) {

          try {
            sequenceLength = Integer.parseInt(field.substring(3));
          } catch (NumberFormatException ignored) {
            // Deliberately ignored: a malformed length leaves sequenceLength
            // unchanged, so a line without any valid length is skipped below
          }
        }
      }

      // Add sequence to SAM header
      if (sequenceName != null && sequenceLength != -1) {
        result.addSequence(new SAMSequenceRecord(sequenceName, sequenceLength));
      }
    }

    return result;
  }
}