package org.seqcode.data.readdb.tools;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;
import htsjdk.samtools.util.CloseableIterator;
import java.io.*;
import java.util.*;
import org.apache.commons.cli.*;
/**
* Reads two files of SAM or BAM data and produces output on stdout in the
* format expected by ImportHits. Both files must be sorted in the *same order*.
* Note that Bowtie (perhaps others) DOES NOT preserve input order perfectly if
* it's using multiple processors, so you can only use one processor if you're
* going to use this program to pair the reads up (you can however split the
* input file and run a separate bowtie on each part).
*
* Only reads present in both files will be included in the output (on stdout).
*
* The matching of reads between files is done by stripping "/\d" from the end of the
* read name, as reads usually end in /1 or /2.
*
* Usage:
* java PairedSAMToReadDB --left leftreads.bam --right rightreads.bam
*
*
* Options: --nosuboptimal (flag to only take the hits with the minimum number of mismatches)
* --uniquehits (flag to only print 1:1 read to hit mappings)
*
* nosuboptimal is applied before uniquehits
*
* Output columns are
* 1) left chromname
* 2) left position
* 3) left strand
* 4) left readlen
* 5) right chromname
* 6) right position
* 7) right strand
* 8) right length
* 9) weight
* 10) paircode
*/
public class PairedSAMToReadDB {
public static boolean uniqueOnly, filterSubOpt, debug;
public static int chunksize = 2000;
public static ArrayList<SAMRecord> leftbuffer, rightbuffer;
public static CloseableIterator<SAMRecord> leftiter, rightiter;
public static void dumpRecords(Collection<SAMRecord> lefts,
Collection<SAMRecord> rights) {
if (filterSubOpt) {
lefts = filterSubOpt(filterNoChrom(lefts));
rights = filterSubOpt(filterNoChrom(rights));
} else {
lefts = filterNoChrom(lefts);
rights = filterNoChrom(rights);
}
int mapcount = lefts.size() * rights.size();
if (mapcount == 0) {
return;
}
if (uniqueOnly && mapcount > 1) {
return;
}
float weight = 1 / ((float)mapcount);
for (SAMRecord left : lefts) {
for (SAMRecord right : rights) {
System.out.println(String.format("%s\t%d\t%s\t%d\t%s\t%d\t%s\t%d\t%f\t%d",
left.getReferenceName(),
left.getReadNegativeStrandFlag() ?
left.getAlignmentEnd() :
left.getAlignmentStart(),
left.getReadNegativeStrandFlag() ? "-" : "+",
left.getReadLength(),
right.getReferenceName(),
right.getReadNegativeStrandFlag() ?
right.getAlignmentEnd() :
right.getAlignmentStart(),
right.getReadNegativeStrandFlag() ? "-" : "+",
right.getReadLength(),
weight,
1));
}
}
}
public static Collection<SAMRecord> filterNoChrom(Collection<SAMRecord> input) {
if (input.size() == 0) {
return input;
}
Collection<SAMRecord> output = new ArrayList<SAMRecord>();
for (SAMRecord r : input) {
if (!r.getReferenceName().equals("*")) {
if(!uniqueOnly || r.getMappingQuality()!=0) //For BWA
output.add(r);
}
}
return output;
}
public static Collection<SAMRecord> filterSubOpt(Collection<SAMRecord> input) {
if (input == null || input.size() ==0) {
return input;
}
int maxqual = SAMRecord.NO_MAPPING_QUALITY;
for (SAMRecord r : input) {
if (r.getMappingQuality() > maxqual) {
maxqual = r.getMappingQuality();
}
}
Collection<SAMRecord> output = new ArrayList<SAMRecord>();
for (SAMRecord r : input) {
if (maxqual == r.getMappingQuality()) {
if(!uniqueOnly || r.getMappingQuality()!=0) //For BWA
output.add(r);
}
}
return output;
}
public static boolean fillLeft(int n) {
boolean filled = false;
for (int i = 0; i < n; i++) {
if (leftiter.hasNext()) {
SAMRecord r = leftiter.next();
r.setReadName(r.getReadName().replaceAll("/\\d$",""));
leftbuffer.add(r);
filled = true;
}
}
return filled;
}
public static boolean fillRight(int n) {
boolean filled = false;
for (int i = 0; i < n; i++) {
if (rightiter.hasNext()) {
SAMRecord r = rightiter.next();
r.setReadName(r.getReadName().replaceAll("/\\d$",""));
rightbuffer.add(r);
filled = true;
}
}
return filled;
}
public static boolean fill(int n) {
return fillLeft(n) && fillRight(n);
}
public static void makePairs() {
ArrayList<SAMRecord> leftrecords = new ArrayList<SAMRecord>();
ArrayList<SAMRecord> rightrecords = new ArrayList<SAMRecord>();
while (fill(chunksize) || leftbuffer.size() > 0) {
int clearL = 0, clearR = 0;
int lbs = Math.min(leftbuffer.size(),10000);
for (int i = 0; i < lbs; i++) {
String readname = leftbuffer.get(i).getReadName();
String nextreadname = i < leftbuffer.size() - 1 ? leftbuffer.get(i+1).getReadName() : null;
int j = clearR;
while (j < rightbuffer.size()) {
if (readname.equals(rightbuffer.get(j).getReadName())) {
/* having found a match, find the rest of the reads with that ID and output */
int k = i;
int l = j;
/* make sure we'll be able to find all reads with this name */
while (readname.equals(leftbuffer.get(leftbuffer.size()-1).getReadName()) &&
fillLeft(chunksize)) {
}
while (readname.equals(rightbuffer.get(rightbuffer.size()-1).getReadName()) &&
fillRight(chunksize)) {
}
do {
leftrecords.add(leftbuffer.get(k++));
} while (k < leftbuffer.size() && readname.equals(leftbuffer.get(k).getReadName()));
do {
rightrecords.add(rightbuffer.get(l++));
} while (l < rightbuffer.size() && readname.equals(rightbuffer.get(l).getReadName()));
dumpRecords(leftrecords, rightrecords);
leftrecords.clear();
rightrecords.clear();
clearL = k;
clearR = l;
i = k-1;
break;
} else if (nextreadname != null && nextreadname.equals(rightbuffer.get(j).getReadName())) {
clearR = j;
break;
}
j++;
}
}
if (rightiter.hasNext()) {
leftbuffer.subList(0,clearL).clear();
rightbuffer.subList(0,clearR).clear();
} else {
leftbuffer.clear();
}
//System.err.println(String.format("loop %d, left %d, right %d", loop++, leftbuffer.size(), rightbuffer.size()));
}
}
public static void main(String args[]) throws IOException, ParseException {
Options options = new Options();
options.addOption("l","left",true,"filename of left side of read");
options.addOption("r","right",true,"filename of right side of read");
options.addOption("u","uniquehits",false,"only output hits with a single mapping");
options.addOption("s","nosuboptimal",false,"do not include hits whose score is not equal to the best score for the read");
options.addOption("D","debug",false,"enable debugging spew?");
CommandLineParser parser = new GnuParser();
CommandLine cl = parser.parse( options, args, false );
uniqueOnly = cl.hasOption("uniquehits");
filterSubOpt = cl.hasOption("nosuboptimal");
debug = cl.hasOption("debug");
String leftfile = cl.getOptionValue("left");
String rightfile = cl.getOptionValue("right");
SamReaderFactory factory =
SamReaderFactory.makeDefault()
.enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
.validationStringency(ValidationStringency.SILENT);
SamReader leftreader = factory.open(SamInputResource.of(new FileInputStream(leftfile)));
SamReader rightreader = factory.open(SamInputResource.of(new FileInputStream(rightfile)));
leftiter = leftreader.iterator();
rightiter = rightreader.iterator();
leftbuffer = new ArrayList<SAMRecord>();
rightbuffer = new ArrayList<SAMRecord>();
makePairs();
leftreader.close();
rightreader.close();
}
}