/*
* The MIT License (MIT)
*
* Copyright (c) 2016 University of California San Diego
* Author: Jim Robinson
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.broad.igv.maf;
import htsjdk.samtools.*;
import org.broad.igv.Globals;
import org.broad.igv.feature.genome.ChromosomeNameComparator;
import org.broad.igv.tools.sort.SAMSorter;
import org.broad.igv.tools.sort.Sorter;
import org.broad.igv.tools.sort.SorterFactory;
import org.broad.igv.util.ParsingUtils;
import java.io.*;
import java.util.*;
/**
* Created by jrobinson on 5/5/16.
* <p>
* Parses a MAF file for Alignment records, as opposed to Multiple Alignments. For LAST aligner output.
*/
public class MAFtoSAM {
public static final int READ_PAIRED_FLAG = 0x1;
public static final int PROPER_PAIR_FLAG = 0x2;
public static final int READ_UNMAPPED_FLAG = 0x4;
public static final int MATE_UNMAPPED_FLAG = 0x8;
public static final int READ_STRAND_FLAG = 0x10;
public static final int MATE_STRAND_FLAG = 0x20;
public static final int FIRST_OF_PAIR_FLAG = 0x40;
public static final int SECOND_OF_PAIR_FLAG = 0x80;
public static final int NOT_PRIMARY_ALIGNMENT_FLAG = 0x100;
public static final int READ_FAILS_VENDOR_QUALITY_CHECK_FLAG = 0x200;
public static final int DUPLICATE_READ_FLAG = 0x400;
public static final int SUPPLEMENTARY_FLAG = 0x800;
public static void main(String[] args) throws IOException {
String inputPath = args[0];
String outputPath = args.length > 1 ? args[1] : args[0] + ".sam";
convert(inputPath, outputPath, false);
}
public static void convert(String path, String outputPath, boolean noSATag) throws IOException {
File unsortedOutput = new File(outputPath + ".unsorted.sam");
File outputFile = new File(outputPath);
BufferedReader reader = ParsingUtils.openBufferedReader(path);
PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(unsortedOutput)));
Map<String, Integer> sequenceDictionary = new LinkedHashMap<String, Integer>();
String line;
while ((line = reader.readLine()) != null) {
if (line.startsWith("a ")) {
// Parse alignments until blank line
parseBlock(reader, out, sequenceDictionary);
}
}
out.flush();
out.close();
reader.close();
// Insert header and SA tags
addSATags(unsortedOutput, outputFile, sequenceDictionary);
unsortedOutput.deleteOnExit();
}
private static void addSATags(File inputFile, File outputFile, Map<String, Integer> sequenceDictionary) throws IOException {
String groupedOutput = inputFile.getAbsolutePath() + ".grouped.sam";
Sorter sorter = SorterFactory.getSorter(inputFile, new File(groupedOutput));
sorter.setComparator(SAMSorter.ReadNameComparator);
sorter.run();
// Get Sam file reader, output header, then iterate through sam records combining those with like read names
BufferedReader reader = new BufferedReader(new FileReader(groupedOutput));
PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(outputFile)));
outputHeader(sequenceDictionary, out);
String line;
String[] lastRecord = null;
List<String[]> chimericAlignments = new ArrayList<>();
// Records should now be sorted by read name
while ((line = reader.readLine()) != null) {
if (line.startsWith("@")) {
out.println(line);
} else {
String[] record = Globals.tabPattern.split(line);
if (lastRecord == null || lastRecord[0].equals(record[0])) {
chimericAlignments.add(record);
} else {
if (chimericAlignments.size() > 0) {
printChimericAlignments(out, chimericAlignments);
}
chimericAlignments.clear();
chimericAlignments.add(record);
}
lastRecord = record;
}
}
// Last one
if (chimericAlignments.size() > 0) {
printChimericAlignments(out, chimericAlignments);
}
out.flush();
out.close();
(new File(groupedOutput)).deleteOnExit();
}
private static void printChimericAlignments(PrintWriter out, List<String[]> chimericAlignments) {
if (chimericAlignments.size() == 1) {
// Not chimeric, just print
printRecord(out, chimericAlignments.get(0));
out.println();
} else {
// Mark all but one alignment in the list as primary
int primaryRecordIndex = 0;
int longestSequence = chimericAlignments.get(0)[9].length();
for(int i=1; i<chimericAlignments.size(); i++) {
String[] thisRecord = chimericAlignments.get(i);
int seqLength = thisRecord[9].length();
if(seqLength > longestSequence) {
longestSequence = seqLength;
primaryRecordIndex = i;
}
}
for(int i=0; i<chimericAlignments.size(); i++) {
if(i == primaryRecordIndex) continue;
String[] thisRecord = chimericAlignments.get(i);
thisRecord[1] = String.valueOf(Integer.parseInt(thisRecord[1]) | NOT_PRIMARY_ALIGNMENT_FLAG);
}
for (int i = 0; i < chimericAlignments.size(); i++) {
String[] thisRecord = chimericAlignments.get(i);
StringBuffer saTag = new StringBuffer("SA:Z:");
// Build up the SA string -- essentially a list of the other parts of this chmeric alignment.
for (int j = 0; j < chimericAlignments.size(); j++) {
if (j == i) continue;
String[] supRecord = chimericAlignments.get(j);
// This is fragile, but we control the order
String nm;
String nmString = supRecord[11];
if (nmString.startsWith("NM:i:")) {
nm = nmString.substring(5);
} else {
nm = "0";
}
boolean isNegativeStrand = (Integer.parseInt(supRecord[1]) & READ_STRAND_FLAG) != 0;
saTag.append(supRecord[2] + "," + supRecord[3] + "," + (isNegativeStrand ? "-," : "+,") +
supRecord[5] + "," + supRecord[4] + "," + nm + ";");
}
if (i > 0) {
int flags = Integer.parseInt(thisRecord[1]) | SUPPLEMENTARY_FLAG;
thisRecord[1] = Integer.toString(flags);
}
printRecord(out, thisRecord);
out.print("\t" + saTag.toString());
out.println();
}
}
}
private static void printRecord(PrintWriter out, String[] record) {
out.print(record[0]);
for (int i = 1; i < record.length; i++) {
out.print("\t" + record[i]);
}
}
private static void parseBlock(BufferedReader reader, PrintWriter out, Map<String, Integer> sequenceDictionary) throws IOException {
String line;
SequenceLine referenceLine = null;
SequenceLine queryLine;
byte[] refBytes = null;
String chr = null;
float score = -1;
while ((line = reader.readLine()) != null) {
if (line.trim().length() == 0) {
return; // return someething
}
if (line.startsWith("a")) {
score = -1; // Reset score
String[] tokens = Globals.whitespacePattern.split(line);
for (String token : tokens) {
if (token.startsWith("score=")) {
try {
score = Float.parseFloat(token.substring(6));
} catch (NumberFormatException e) {
System.err.println("Could not parse score: " + token);
}
}
}
}
if (line.startsWith("s ")) {
if (null == referenceLine) {
referenceLine = parseSequenceLine(line);
refBytes = referenceLine.text.getBytes();
if (referenceLine.src.contains(".")) {
int idx = referenceLine.src.lastIndexOf('.') + 1;
chr = referenceLine.src.substring(idx);
} else {
chr = referenceLine.src;
}
sequenceDictionary.put(chr, referenceLine.srcSize);
} else {
queryLine = parseSequenceLine(line);
int flags = queryLine.strand == '+' ? 0 : 16;
// Build cigar string one byte at a time
byte[] queryBytes = queryLine.text.getBytes();
if (queryBytes.length != refBytes.length) {
throw new RuntimeException("Query and ref sequence unequal length");
}
String cigarString = "";
int nm = 0;
boolean indel = false;
for (int i = 0; i < queryBytes.length; i++) {
byte q = queryBytes[i];
byte ref = refBytes[i];
if (q == '-') {
if (ref == '-') {
//ignore, caused by insertion in another alignment;
} else {
cigarString += "D";
if (!indel) {
indel = true;
nm++;
}
}
} else {
if (ref == '-') {
cigarString += "I";
if (!indel) {
indel = true;
nm++;
}
} else {
indel = false;
cigarString += "M";
if (queryBytes[i] != refBytes[i]) {
nm++;
}
}
}
}
cigarString = collapseCigar(cigarString);
String readName = queryLine.src;
String barcode = null;
if (readName.contains("Barcode")) {
int bcIdx = readName.indexOf("Barcode") + 8;
int endIdx = readName.indexOf(':', bcIdx);
barcode = readName.substring(bcIdx, endIdx);
}
// Output SAM record
int start = referenceLine.start + 1;
int mapq = 30;
String rnext = "*";
int pnext = 0;
int tlen = 0;
String seq = collapseSequence(queryLine.text);
String qual = "*";
out.print(readName + "\t" + flags + "\t" + chr + "\t" + start + "\t" + mapq + "\t" + cigarString + "\t" +
rnext + "\t" + pnext + "\t" + tlen + "\t" + seq + "\t" + qual + "\tNM:i:" + nm);
if (barcode != null) {
out.print("\tBC:Z:" + barcode);
}
if(score > 0) {
out.print("\tsc:f:" + String.valueOf(score));
}
out.println();
}
}
}
// Output SAN header
}
private static void outputHeader(Map<String, Integer> sequenceDictionary, PrintWriter out) {
List<String> chrNames = new ArrayList<String>(sequenceDictionary.keySet());
Collections.sort(chrNames, ChromosomeNameComparator.get());
out.println("@HD\tVN:1.5");
for (String chr : chrNames) {
out.println("@SQ\tSN:" + chr + "\tLN:" + sequenceDictionary.get(chr));
}
out.println("@PG\tPN:MAFtoSAM\tID:MAFtoSAM");
}
private static String collapseCigar(String cigarString) {
if (cigarString.length() == 0) return "";
String collapsedCigar = "";
char lastOperator = cigarString.charAt(0);
int counter = 1;
for (int i = 1; i < cigarString.length(); i++) {
if (cigarString.charAt(i) == lastOperator) {
counter++;
} else {
collapsedCigar += ("" + counter + lastOperator);
lastOperator = cigarString.charAt(i);
counter = 1;
}
}
collapsedCigar += ("" + counter + lastOperator);
return collapsedCigar;
}
private static String collapseSequence(String text) {
return text.replaceAll("-", "");
}
/**
* Parse an alignment block. Assumes 1 alignment per block, first "s" record is reference, second is alignment
*
* @param line
*/
private static SequenceLine parseSequenceLine(String line) throws IOException {
String[] tokens = Globals.whitespacePattern.split(line);
SequenceLine sl = new SequenceLine();
sl.src = tokens[1];
sl.start = Integer.parseInt(tokens[2]);
sl.size = Integer.parseInt(tokens[3]);
sl.strand = tokens[4].charAt(0);
sl.srcSize = Integer.parseInt(tokens[5]);
sl.text = tokens[6];
return sl;
}
static class SequenceLine {
String src;
int start;
int size;
char strand;
int srcSize;
String text;
}
}