/*
* Copyright 2015 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.hpg.bigdata.tools.sequence;
import hbparquet.hadoop.util.ContextUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.seqdoop.hadoop_bam.*;
import org.seqdoop.hadoop_bam.FormatConstants.BaseQualityEncoding;
import org.seqdoop.hadoop_bam.util.ConfHelper;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Created by jtarraga on 21/05/15.
 *
 * Modified copy of the Hadoop-BAM {@code FastqInputFormat}. The only change is that
 * {@code context.setStatus(...)} is no longer called when creating the record reader,
 * because {@code TaskAttemptContext} became an interface (rather than a class) as of
 * Hadoop 2.4.1.
 */
public class FastqInputFormatMODIF extends FileInputFormat<Text, SequencedFragment>
{
    // NOTE(review): these three constants duplicate the keys/default defined in the upstream
    // org.seqdoop.hadoop_bam.FastqInputFormat; setConf() below reads the configuration through
    // the upstream class's constants, so these local copies appear to be unused — confirm.

    /** Configuration key selecting the input base-quality encoding ("sanger" or "illumina"). */
    public static final String CONF_BASE_QUALITY_ENCODING = "hbam.fastq-input.base-quality-encoding";
    /** Configuration key: when true, records whose Illumina filter flag marks failed QC are skipped. */
    public static final String CONF_FILTER_FAILED_QC = "hbam.fastq-input.filter-failed-qc";
    /** Default base-quality encoding used when no encoding key is set. */
    public static final String CONF_BASE_QUALITY_ENCODING_DEFAULT = "sanger";

    /**
     * RecordReader that parses FASTQ records (key = read id, value = sequence + quality)
     * from a FileSplit, assuming single-line sequence and quality strings.
     */
    public static class FastqRecordReader extends RecordReader<Text, SequencedFragment> {
        /*
         * fastq format:
         * <fastq> := <block>+
         * <block := @<seqname>\n<seq>\n+[<seqname>]\n<qual>\n
         * <seqname> := [A-Za-z0-9_.:-]+
         * <seq> := [A-Za-z\n\.~]+
         * <qual> := [!-~\n]+
         *
         * LP: this format is broken, no? You can have multi-line sequence and quality strings,
         * and the quality encoding includes '@' in its valid character range. So how should one
         * distinguish between \n@ as a record delimiter and \n@ as part of a multi-line
         * quality string?
         *
         * For now I'm going to assume single-line sequences. This works for our sequencing
         * application. We'll see if someone complains in other applications.
         */

        // start: first valid data index
        private long start;
        // end: first index value beyond the slice, i.e. slice is in range [start,end)
        private long end;
        // pos: current position in file
        private long pos;
        // file: the file being read
        private Path file;

        private LineReader lineReader;
        private InputStream inputStream;
        private Text currentKey = new Text();
        private SequencedFragment currentValue = new SequencedFragment();

        /* If true, will scan the identifier for read data as specified in the Casava
         * users' guide v1.8:
         * @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:<is filtered>:
         * <control number>:<index sequence>
         * After the first name that doesn't match, lookForIlluminaIdentifier will be
         * set to false and no further scanning will be done.
         */
        private boolean lookForIlluminaIdentifier = true;
        private static final Pattern ILLUMINA_PATTERN =
            Pattern.compile("([^:]+):(\\d+):([^:]*):(\\d+):(\\d+):(-?\\d+):(-?\\d+)\\s+([123]):([YN]):(\\d+):(.*)");

        // Scratch buffer reused for lines we only inspect (e.g. the '+' separator line).
        private Text buffer = new Text();

        private BaseQualityEncoding qualityEncoding;
        private boolean filterFailedQC = false;

        // How long can a read get?
        private static final int MAX_LINE_LENGTH = 10000;

        /**
         * Creates a reader positioned at the first whole FASTQ record of the split.
         *
         * @param conf  job configuration; quality-encoding and QC-filter options are read from it
         * @param split the file region to read
         * @throws IOException if the file cannot be opened or scanned
         * @throws RuntimeException if the file is compressed and the split does not start at 0
         */
        public FastqRecordReader(Configuration conf, FileSplit split) throws IOException {
            setConf(conf);
            file = split.getPath();
            start = split.getStart();
            end = start + split.getLength();

            FileSystem fs = file.getFileSystem(conf);
            FSDataInputStream fileIn = fs.open(file);

            CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
            CompressionCodec codec = codecFactory.getCodec(file);

            if (codec == null) { // no codec: uncompressed file, seek to the first record boundary
                positionAtFirstRecord(fileIn);
                inputStream = fileIn;
            } else { // compressed file: not splittable, so the split must cover the whole file
                if (start != 0) {
                    throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");
                }
                inputStream = codec.createInputStream(fileIn);
                end = Long.MAX_VALUE; // read until the end of the file
            }

            lineReader = new LineReader(inputStream);
        }

        /**
         * Reads quality-encoding and QC-filtering options from the configuration.
         *
         * NOTE(review): the lookup uses the upstream FastqInputFormat / FormatConstants keys,
         * not the constants declared on the enclosing class.
         *
         * @throws RuntimeException if the configured encoding is neither "sanger" nor "illumina"
         */
        protected void setConf(Configuration conf) {
            String encoding = conf.get(FastqInputFormat.CONF_BASE_QUALITY_ENCODING,
                    conf.get(
                            FormatConstants.CONF_INPUT_BASE_QUALITY_ENCODING,
                            FastqInputFormat.CONF_BASE_QUALITY_ENCODING_DEFAULT)
            );

            if ("illumina".equals(encoding)) {
                qualityEncoding = BaseQualityEncoding.Illumina;
            } else if ("sanger".equals(encoding)) {
                qualityEncoding = BaseQualityEncoding.Sanger;
            } else {
                throw new RuntimeException("Unknown input base quality encoding value " + encoding);
            }

            filterFailedQC = ConfHelper.parseBoolean(
                    conf.get(FastqInputFormat.CONF_FILTER_FAILED_QC,
                            conf.get(FormatConstants.CONF_INPUT_FILTER_FAILED_QC)),
                    false);
        }

        /*
         * Position the input stream at the start of the first record.
         *
         * A line starting with '@' may be either a record header or a quality line (the
         * Sanger quality alphabet includes '@'), so a candidate header is only accepted
         * after verifying that the line two below it starts with '+'.
         */
        private void positionAtFirstRecord(FSDataInputStream stream) throws IOException {
            if (start > 0) {
                // Advance to the start of the first record
                // We use a temporary LineReader to read lines until we find the
                // position of the right one.  We then seek the file to that position.
                stream.seek(start);
                LineReader reader = new LineReader(stream);

                int bytesRead = 0;
                do {
                    bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start));
                    if (bytesRead > 0 && (buffer.getLength() <= 0 || buffer.getBytes()[0] != '@')) {
                        // not a candidate header line: skip it and keep scanning
                        start += bytesRead;
                    } else {
                        // line starts with @. Read two more and verify that it starts with a +
                        //
                        // If this isn't the start of a record, we want to backtrack to its end
                        long backtrackPosition = start + bytesRead;

                        bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start));
                        bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start));
                        if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') {
                            break; // all good! 'start' still points at the '@' header line
                        } else {
                            // backtrack to the end of the record we thought was the start.
                            start = backtrackPosition;
                            stream.seek(start);
                            reader = new LineReader(stream);
                        }
                    }
                } while (bytesRead > 0);

                stream.seek(start);
            }
            // else
            //   if start == 0 we presume it starts with a valid fastq record
            pos = start;
        }

        /*
         * Added to use mapreduce API.
         * No-op: all initialization is performed in the constructor, which already
         * receives the Configuration and FileSplit from createRecordReader().
         */
        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        }

        /*
         * Added to use mapreduce API.
         * Returns the key last read by nextKeyValue().
         */
        public Text getCurrentKey() {
            return currentKey;
        }

        /*
         * Added to use mapreduce API.
         * Returns the value last read by nextKeyValue().
         */
        public SequencedFragment getCurrentValue() {
            return currentValue;
        }

        /*
         * Added to use mapreduce API.
         * Delegates to next(), reusing the internal key/value objects.
         */
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return next(currentKey, currentValue);
        }

        /*
         * Close this RecordReader to future operations.
         */
        public void close() throws IOException {
            inputStream.close();
        }

        /*
         * Create an object of the appropriate type to be used as a key.
         */
        public Text createKey() {
            return new Text();
        }

        /*
         * Create an object of the appropriate type to be used as a value.
         */
        public SequencedFragment createValue() {
            return new SequencedFragment();
        }

        /*
         * Returns the current position in the input.
         */
        public long getPos() {
            return pos;
        }

        /*
         * How much of the input has the RecordReader consumed i.e.
         * fraction of [start,end) already read, clamped to 1.0.
         */
        public float getProgress() {
            if (start == end) {
                return 1.0f;
            } else {
                return Math.min(1.0f, (pos - start) / (float)(end - start));
            }
        }

        /** Returns "file:position" for use in error messages. */
        public String makePositionMessage() {
            return file.toString() + ":" + pos;
        }

        /**
         * Reads one raw FASTQ record (4 lines) into key/value.
         *
         * @param key   receives the read id (the header line without its leading '@')
         * @param value receives the sequence and quality lines
         * @return false on clean EOF (nothing read), true if a record was parsed
         * @throws RuntimeException if the third line does not start with '+'
         * @throws IOException (EOFException via readLineInto) if the record is truncated
         */
        protected boolean lowLevelFastqRead(Text key, SequencedFragment value) throws IOException {
            // ID line
            long skipped = lineReader.skip(1); // skip @
            pos += skipped;
            if (skipped == 0) {
                return false; // EOF
            }

            // ID
            readLineInto(key);
            // sequence
            value.clear();
            readLineInto(value.getSequence());
            // separator line: must start with '+'
            readLineInto(buffer);
            if (buffer.getLength() == 0 || buffer.getBytes()[0] != '+') {
                throw new RuntimeException("unexpected fastq line separating sequence and quality at "
                        + makePositionMessage() + ". Line: " + buffer + ". \nSequence ID: " + key);
            }
            readLineInto(value.getQuality());

            // look for the Illumina-formatted name. Once it isn't found lookForIlluminaIdentifier will be set to false
            lookForIlluminaIdentifier = lookForIlluminaIdentifier && scanIlluminaId(key, value);
            if (!lookForIlluminaIdentifier) {
                scanNameForReadNumber(key, value);
            }
            return true;
        }

        /*
         * Reads the next key/value pair from the input for processing.
         *
         * Skips records flagged as failed QC when filtering is enabled, then converts
         * (Illumina) or verifies (Sanger) the base qualities of the returned record.
         * Returns false once pos has reached the end of the slice or the input is exhausted.
         */
        public boolean next(Text key, SequencedFragment value) throws IOException {
            if (pos >= end) {
                return false; // past end of slice
            }
            try {
                boolean gotData;
                boolean goodRecord;
                do {
                    gotData = lowLevelFastqRead(key, value);
                    // a record is "good" unless QC filtering is on and the filter flag says it failed
                    goodRecord = gotData
                            && (!filterFailedQC || value.getFilterPassed() == null || value.getFilterPassed());
                } while (gotData && !goodRecord);

                if (goodRecord) { // goodRecord is false also when we couldn't read any more data
                    if (qualityEncoding == BaseQualityEncoding.Illumina) {
                        try {
                            // convert illumina to sanger scale
                            SequencedFragment.convertQuality(value.getQuality(),
                                    BaseQualityEncoding.Illumina, BaseQualityEncoding.Sanger);
                        } catch (FormatException e) {
                            // re-throw with file position and read id for diagnosis
                            throw new FormatException(e.getMessage() + " Position: "
                                    + makePositionMessage() + "; Sequence ID: " + key);
                        }
                    } else { // sanger qualities.
                        int outOfRangeElement = SequencedFragment.verifyQuality(value.getQuality(),
                                BaseQualityEncoding.Sanger);
                        if (outOfRangeElement >= 0) {
                            throw new FormatException("Base quality out of range for Sanger Phred+33 format (found "
                                    + (value.getQuality().getBytes()[outOfRangeElement] - FormatConstants.SANGER_OFFSET)
                                    + ").\n"
                                    + "Maybe qualities are in Illumina Phred+64 format?\n"
                                    + "Position: " + makePositionMessage() + "; Sequence ID: " + key);
                        }
                    }
                }
                return goodRecord;
            } catch (EOFException e) {
                throw new RuntimeException("unexpected end of file in fastq record at "
                        + makePositionMessage() + ". Id: " + key.toString());
            }
        }

        /**
         * Fallback read-number detection: a trailing "/<digit>" in the read name
         * (pre-Casava-1.8 convention) sets the fragment's read number.
         */
        private void scanNameForReadNumber(Text name, SequencedFragment fragment) {
            // look for a /[0-9] at the end of the name
            if (name.getLength() >= 2) {
                byte[] bytes = name.getBytes();
                int last = name.getLength() - 1;

                if (bytes[last - 1] == '/' && bytes[last] >= '0' && bytes[last] <= '9') {
                    fragment.setRead(bytes[last] - '0');
                }
            }
        }

        /**
         * Parses a Casava 1.8 style read name into the fragment's metadata fields.
         * Group 9 is the "is filtered" flag: 'N' (not filtered) means the read passed QC.
         *
         * @return true if the name matched ILLUMINA_PATTERN and the fields were set
         */
        private boolean scanIlluminaId(Text name, SequencedFragment fragment) {
            Matcher m = ILLUMINA_PATTERN.matcher(name.toString());
            boolean matches = m.matches();
            if (matches) {
                fragment.setInstrument(m.group(1));
                fragment.setRunNumber(Integer.parseInt(m.group(2)));
                fragment.setFlowcellId(m.group(3));
                fragment.setLane(Integer.parseInt(m.group(4)));
                fragment.setTile(Integer.parseInt(m.group(5)));
                fragment.setXpos(Integer.parseInt(m.group(6)));
                fragment.setYpos(Integer.parseInt(m.group(7)));
                fragment.setRead(Integer.parseInt(m.group(8)));
                fragment.setFilterPassed("N".equals(m.group(9)));
                fragment.setControlNumber(Integer.parseInt(m.group(10)));
                fragment.setIndexSequence(m.group(11));
            }

            return matches;
        }

        /**
         * Reads one line into dest, advancing pos.
         *
         * @return the number of bytes consumed (always &gt; 0)
         * @throws EOFException if no bytes could be read
         */
        private int readLineInto(Text dest) throws IOException {
            int bytesRead = lineReader.readLine(dest, MAX_LINE_LENGTH);
            if (bytesRead <= 0) {
                throw new EOFException();
            }
            pos += bytesRead;
            return bytesRead;
        }
    }

    /**
     * A FASTQ file is splittable only when no compression codec applies to its path.
     */
    @Override
    public boolean isSplitable(JobContext context, Path path) {
        CompressionCodec codec = new CompressionCodecFactory(ContextUtil.getConfiguration(context)).getCodec(path);
        return codec == null;
    }

    /**
     * Creates the record reader. Unlike the upstream Hadoop-BAM version this does NOT
     * call context.setStatus(...): TaskAttemptContext became an interface from Hadoop
     * 2.4.1 (see the class comment), so the call is deliberately left commented out.
     */
    public RecordReader<Text, SequencedFragment> createRecordReader(
            InputSplit genericSplit,
            TaskAttemptContext context) throws IOException, InterruptedException {
        //context.setStatus(genericSplit.toString());
        return new FastqRecordReader(ContextUtil.getConfiguration(context), (FileSplit)genericSplit);
    }
}