/*
* The MIT License
*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.samtools;
import htsjdk.samtools.util.StringUtil;
import java.io.File;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
/**
 * Creates {@link SAMRecord} objects from lines of SAM text.
 *
 * <p>Each call to {@link #parseLine(String, int)} splits the line on tabs, checks the eleven
 * mandatory columns and any optional tags, and reports problems according to the configured
 * {@link ValidationStringency}: {@code STRICT} throws a {@link SAMFormatException},
 * {@code LENIENT} prints the problem to {@code System.err} and carries on, and any other
 * stringency ignores the problem. Two kinds of problem are always fatal regardless of
 * stringency: a line with fewer than eleven columns and a non-numeric value in an integer
 * column.
 *
 * <p>An instance keeps a reusable field buffer and the line currently being parsed as instance
 * state, both of which are overwritten by every {@code parseLine} call without any
 * synchronization, so a single instance must not be used by several threads at once.
 */
public class SAMLineParser {

    // Column indexes of the mandatory fields, from the SAM specification.
    private static final int QNAME_COL = 0;
    private static final int FLAG_COL = 1;
    private static final int RNAME_COL = 2;
    private static final int POS_COL = 3;
    private static final int MAPQ_COL = 4;
    private static final int CIGAR_COL = 5;
    private static final int MRNM_COL = 6;
    private static final int MPOS_COL = 7;
    private static final int ISIZE_COL = 8;
    private static final int SEQ_COL = 9;
    private static final int QUAL_COL = 10;
    private static final int NUM_REQUIRED_FIELDS = 11;

    // Read string must contain only these characters. The pattern itself is not evaluated by
    // this class (validateReadBases() checks character by character instead).
    private static final Pattern VALID_BASES = Pattern
            .compile("^[acmgrsvtwyhkdbnACMGRSVTWYHKDBN.=]+$");

    /**
     * Allocate this once rather than for every line as a performance
     * optimization. The size is arbitrary -- merely large enough to handle the
     * maximum number of fields we might expect from a reasonable SAM file.
     */
    private final String[] mFields = new String[10000];

    /** Reader attached to every parsed record as its file source; may be null. */
    private final SamReader mParentReader;
    private final SAMRecordFactory samRecordFactory;
    private final ValidationStringency validationStringency;
    private final SAMFileHeader mFileHeader;
    /** File being read; only used to enrich error messages, may be null. */
    private final File mFile;
    private final TextTagCodec tagCodec = new TextTagCodec();

    /** Line number of the line being parsed; values &lt;= 0 mean "unknown". */
    private int currentLineNumber;
    /** Text of the line being parsed, quoted in error messages. */
    private String currentLine;

    //
    // Constructors
    //

    /**
     * Public constructor. Use the default SAMRecordFactory and stringency.
     *
     * @param samFileHeader SAM file header
     */
    public SAMLineParser(final SAMFileHeader samFileHeader) {
        this(new DefaultSAMRecordFactory(),
                ValidationStringency.DEFAULT_STRINGENCY, samFileHeader,
                null, null);
    }

    /**
     * Public constructor. Use the default SAMRecordFactory and stringency.
     *
     * @param samFileHeader SAM file header
     * @param samFileReader SAM file reader for passing to SAMRecord.setFileSource, may be null.
     * @param samFile SAM file being read (for error message only, may be null)
     */
    public SAMLineParser(final SAMFileHeader samFileHeader,
            final SAMFileReader samFileReader, final File samFile) {
        this(new DefaultSAMRecordFactory(),
                ValidationStringency.DEFAULT_STRINGENCY, samFileHeader,
                samFileReader, samFile);
    }

    /**
     * Public constructor.
     *
     * @param samRecordFactory factory used to instantiate each parsed record
     * @param validationStringency validation stringency
     * @param samFileHeader SAM file header
     * @param samFileReader SAM file reader for passing to SAMRecord.setFileSource, may be null.
     * @param samFile SAM file being read (for error message only, may be null)
     * @throws NullPointerException if the factory, the stringency or the header is null
     */
    public SAMLineParser(final SAMRecordFactory samRecordFactory,
            final ValidationStringency validationStringency,
            final SAMFileHeader samFileHeader, final SamReader samFileReader,
            final File samFile) {
        if (samRecordFactory == null) {
            throw new NullPointerException("The SamRecordFactory must be set");
        }
        if (validationStringency == null) {
            throw new NullPointerException("The validationStringency must be set");
        }
        if (samFileHeader == null) {
            throw new NullPointerException("The mFileHeader must be set");
        }

        this.samRecordFactory = samRecordFactory;
        this.validationStringency = validationStringency;
        this.mFileHeader = samFileHeader;
        // Can be null
        this.mParentReader = samFileReader;
        // Can be null
        this.mFile = samFile;
    }

    /**
     * Get the File header.
     *
     * @return the SAM file header
     */
    public SAMFileHeader getFileHeader() {
        return this.mFileHeader;
    }

    /**
     * Get validation stringency.
     *
     * @return validation stringency
     */
    public ValidationStringency getValidationStringency() {
        return this.validationStringency;
    }

    //
    // Parsing
    //

    /**
     * Parse a SAM line whose line number is not known.
     *
     * @param line line to parse
     * @return a new SAMRecord object
     */
    public SAMRecord parseLine(final String line) {
        return parseLine(line, -1);
    }

    /**
     * Parse a SAM line.
     *
     * @param line line to parse
     * @param lineNumber line number in the file. If the line number is not known
     *        can be &lt;=0.
     * @return a new SAMRecord object
     * @throws SAMFormatException if the line has fewer than eleven tab-separated fields, if an
     *         integer column is not numeric, or, under STRICT stringency, on any other
     *         validation problem
     */
    public SAMRecord parseLine(final String line, final int lineNumber) {
        // Remember the line so that error messages can quote it.
        this.currentLineNumber = lineNumber;
        this.currentLine = line;

        final int numFields = StringUtil.split(line, mFields, '\t');
        if (numFields < NUM_REQUIRED_FIELDS) {
            throw reportFatalErrorParsingLine("Not enough fields");
        }
        if (numFields == mFields.length) {
            // The reusable buffer was filled completely.
            reportErrorParsingLine("Too many fields in SAM text record.");
        }
        for (int i = 0; i < numFields; ++i) {
            if (mFields[i].isEmpty()) {
                reportErrorParsingLine("Empty field at position " + i + " (zero-based)");
            }
        }

        final SAMRecord samRecord = samRecordFactory.createSAMRecord(this.mFileHeader);
        samRecord.setValidationStringency(this.validationStringency);
        if (mParentReader != null) {
            samRecord.setFileSource(new SAMFileSource(mParentReader, null));
        }
        samRecord.setHeader(this.mFileHeader);
        samRecord.setReadName(mFields[QNAME_COL]);
        // Flags must be set first: the checks below consult the record's flag accessors.
        samRecord.setFlags(parseInt(mFields[FLAG_COL], "FLAG"));

        parseAlignmentFields(samRecord);
        parseMateFields(samRecord);
        parseBasesAndQualities(samRecord);

        for (int i = NUM_REQUIRED_FIELDS; i < numFields; ++i) {
            parseTag(samRecord, mFields[i]);
        }

        final List<SAMValidationError> validationErrors = samRecord.isValid();
        if (validationErrors != null) {
            for (final SAMValidationError errorMessage : validationErrors) {
                reportErrorParsingLine(errorMessage.getMessage());
            }
        }
        return samRecord;
    }

    /** Reads RNAME, POS, MAPQ and CIGAR from the field buffer into the record. */
    private void parseAlignmentFields(final SAMRecord samRecord) {
        String rname = mFields[RNAME_COL];
        if (!rname.equals("*")) {
            rname = SAMSequenceRecord.truncateSequenceName(rname);
            validateReferenceName(rname, "RNAME");
            samRecord.setReferenceName(rname);
        } else if (!samRecord.getReadUnmappedFlag()) {
            reportErrorParsingLine("RNAME is not specified but flags indicate mapped");
        }

        final int pos = parseInt(mFields[POS_COL], "POS");
        final int mapq = parseInt(mFields[MAPQ_COL], "MAPQ");
        final String cigar = mFields[CIGAR_COL];

        if (!SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(samRecord.getReferenceName())) {
            if (pos == 0) {
                reportErrorParsingLine("POS must be non-zero if RNAME is specified");
            }
            if (!samRecord.getReadUnmappedFlag() && cigar.equals("*")) {
                reportErrorParsingLine("CIGAR must not be '*' if RNAME is specified");
            }
        } else {
            if (pos != 0) {
                reportErrorParsingLine("POS must be zero if RNAME is not specified");
            }
            if (mapq != 0) {
                reportErrorParsingLine("MAPQ must be zero if RNAME is not specified");
            }
            if (!cigar.equals("*")) {
                reportErrorParsingLine("CIGAR must be '*' if RNAME is not specified");
            }
        }

        samRecord.setAlignmentStart(pos);
        samRecord.setMappingQuality(mapq);
        samRecord.setCigarString(cigar);
    }

    /** Reads MRNM, MPOS and ISIZE from the field buffer into the record. */
    private void parseMateFields(final SAMRecord samRecord) {
        String mateRName = mFields[MRNM_COL];
        if (mateRName.equals("*")) {
            if (samRecord.getReadPairedFlag() && !samRecord.getMateUnmappedFlag()) {
                reportErrorParsingLine("MRNM not specified but flags indicate mate mapped");
            }
        } else {
            if (!samRecord.getReadPairedFlag()) {
                reportErrorParsingLine("MRNM specified but flags indicate unpaired");
            }
            final boolean sameAsRName = "=".equals(mateRName);
            if (!sameAsRName) {
                mateRName = SAMSequenceRecord.truncateSequenceName(mateRName);
            }
            validateReferenceName(mateRName, "MRNM");
            if (sameAsRName) {
                // NOTE(review): other checks in this class detect an unset RNAME by comparing
                // with SAMRecord.NO_ALIGNMENT_REFERENCE_NAME rather than null -- confirm
                // whether getReferenceName() can actually return null here.
                if (samRecord.getReferenceName() == null) {
                    reportErrorParsingLine("MRNM is '=', but RNAME is not set");
                }
                samRecord.setMateReferenceName(samRecord.getReferenceName());
            } else {
                samRecord.setMateReferenceName(mateRName);
            }
        }

        final int matePos = parseInt(mFields[MPOS_COL], "MPOS");
        final int isize = parseInt(mFields[ISIZE_COL], "ISIZE");

        if (!samRecord.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) {
            if (matePos == 0) {
                reportErrorParsingLine("MPOS must be non-zero if MRNM is specified");
            }
        } else {
            if (matePos != 0) {
                reportErrorParsingLine("MPOS must be zero if MRNM is not specified");
            }
            if (isize != 0) {
                reportErrorParsingLine("ISIZE must be zero if MRNM is not specified");
            }
        }

        samRecord.setMateAlignmentStart(matePos);
        samRecord.setInferredInsertSize(isize);
    }

    /** Reads SEQ and QUAL from the field buffer into the record. */
    private void parseBasesAndQualities(final SAMRecord samRecord) {
        final String seq = mFields[SEQ_COL];
        if (!seq.equals("*")) {
            validateReadBases(seq);
            samRecord.setReadString(seq);
        } else {
            samRecord.setReadBases(SAMRecord.NULL_SEQUENCE);
        }

        final String qual = mFields[QUAL_COL];
        if (!qual.equals("*")) {
            // Identity comparison on purpose: tests for the shared NULL_SEQUENCE marker array.
            if (samRecord.getReadBases() == SAMRecord.NULL_SEQUENCE) {
                reportErrorParsingLine("QUAL should not be specified if SEQ is not specified");
            }
            if (samRecord.getReadString().length() != qual.length()) {
                reportErrorParsingLine("length(QUAL) != length(SEQ)");
            }
            samRecord.setBaseQualityString(qual);
        } else {
            samRecord.setBaseQualities(SAMRecord.NULL_QUALS);
        }
    }

    /**
     * Parses an integer column.
     *
     * @param s text of the column
     * @param fieldName column name used in the error message
     * @return the parsed value
     * @throws SAMFormatException (always, regardless of stringency) if {@code s} is not an
     *         integer; the original {@link NumberFormatException} is attached as the cause
     */
    private int parseInt(final String s, final String fieldName) {
        try {
            return Integer.parseInt(s);
        } catch (NumberFormatException e) {
            throw reportFatalErrorParsingLine(
                    "Non-numeric value in " + fieldName + " column", e);
        }
    }

    /**
     * Checks a reference name against the header's sequence dictionary.
     *
     * <p>"=" is accepted without lookup for the MRNM field and reported as an error for any
     * other field. The dictionary lookup is skipped when the dictionary is empty.
     *
     * @param rname reference name to check
     * @param fieldName name of the column the value came from, used in error messages
     */
    private void validateReferenceName(final String rname, final String fieldName) {
        if (rname.equals("=")) {
            if (fieldName.equals("MRNM")) {
                return;
            }
            reportErrorParsingLine("= is not a valid value for "
                    + fieldName + " field.");
        }
        if (this.mFileHeader.getSequenceDictionary().size() != 0
                && this.mFileHeader.getSequence(rname) == null) {
            reportErrorParsingLine(fieldName
                    + " '" + rname + "' not found in any SQ record");
        }
    }

    /**
     * Reports a (non-fatal) error if the read bases contain a character that
     * {@link #isValidReadBase(char)} rejects. Only the first offending character is reported.
     */
    private void validateReadBases(final String bases) {
        // Using regex is slow, so check for invalid characters via isValidReadBase(),
        // which hopefully the JIT will optimize.
        for (int i = 0; i < bases.length(); ++i) {
            if (!isValidReadBase(bases.charAt(i))) {
                reportErrorParsingLine("Invalid character in read bases");
                return;
            }
        }
    }

    /** Returns true for the letters acmgrsvtwyhkdbn in either case, '.' and '='. */
    private boolean isValidReadBase(final char base) {
        switch (base) {
            case 'a': case 'c': case 'm': case 'g': case 'r':
            case 's': case 'v': case 't': case 'w': case 'y':
            case 'h': case 'k': case 'd': case 'b': case 'n':
            case 'A': case 'C': case 'M': case 'G': case 'R':
            case 'S': case 'V': case 'T': case 'W': case 'Y':
            case 'H': case 'K': case 'D': case 'B': case 'N':
            case '.':
            case '=':
                return true;
            default:
                return false;
        }
    }

    /**
     * Decodes one optional tag field and stores it on the record. A tag that fails to decode
     * is reported through the stringency-aware error path and, if that does not throw, skipped.
     */
    private void parseTag(final SAMRecord samRecord, final String tag) {
        Map.Entry<String, Object> entry = null;
        try {
            entry = tagCodec.decode(tag);
        } catch (SAMFormatException e) {
            reportErrorParsingLine(e);
        }
        if (entry == null) {
            return;
        }

        final Object value = entry.getValue();
        if (value instanceof TagValueAndUnsignedArrayFlag) {
            final TagValueAndUnsignedArrayFlag valueAndFlag = (TagValueAndUnsignedArrayFlag) value;
            if (valueAndFlag.isUnsignedArray) {
                samRecord.setUnsignedArrayAttribute(entry.getKey(), valueAndFlag.value);
            } else {
                samRecord.setAttribute(entry.getKey(), valueAndFlag.value);
            }
        } else {
            samRecord.setAttribute(entry.getKey(), value);
        }
    }

    //
    // Error methods
    //

    /** Builds (does not throw) the exception for an error that is fatal at every stringency. */
    private RuntimeException reportFatalErrorParsingLine(final String reason) {
        return new SAMFormatException(makeErrorString(reason));
    }

    /** As {@link #reportFatalErrorParsingLine(String)}, keeping the underlying cause. */
    private RuntimeException reportFatalErrorParsingLine(final String reason,
            final Throwable cause) {
        return new SAMFormatException(makeErrorString(reason), cause);
    }

    /** Reports a validation problem described by {@code reason}, honouring the stringency. */
    private void reportErrorParsingLine(final String reason) {
        handleValidationError(makeErrorString(reason), null);
    }

    /**
     * Reports a validation problem described by an exception, honouring the stringency. Under
     * STRICT the exception is attached as the cause of the thrown SAMFormatException.
     */
    private void reportErrorParsingLine(final Exception e) {
        handleValidationError(makeErrorString(e.getMessage()), e);
    }

    /**
     * Single place where the stringency policy is applied: STRICT throws, LENIENT prints to
     * stderr, anything else ignores the problem.
     *
     * @param errorMessage fully formatted message
     * @param cause underlying exception, or null if there is none
     */
    private void handleValidationError(final String errorMessage, final Throwable cause) {
        if (validationStringency == ValidationStringency.STRICT) {
            if (cause == null) {
                throw new SAMFormatException(errorMessage);
            }
            throw new SAMFormatException(errorMessage, cause);
        } else if (validationStringency == ValidationStringency.LENIENT) {
            System.err
                    .println("Ignoring SAM validation error due to lenient parsing:");
            System.err.println(errorMessage);
        }
    }

    /**
     * Formats an error message containing the reason, the file (when known), the line number
     * (or "unknown" when it is &lt;= 0) and the text of the line being parsed.
     */
    private String makeErrorString(final String reason) {
        String fileMessage = "";
        if (mFile != null) {
            fileMessage = "File " + mFile + "; ";
        }
        return "Error parsing text SAM file. "
                + reason + "; " + fileMessage + "Line "
                + (this.currentLineNumber <= 0 ? "unknown" : this.currentLineNumber)
                + "\nLine: " + this.currentLine;
    }
}