package com.github.lindenb.jvarkit.tools.impactdup;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParametersDelegate;
import com.github.lindenb.jvarkit.util.jcommander.Launcher;
import com.github.lindenb.jvarkit.util.jcommander.Program;
import com.github.lindenb.jvarkit.util.log.Logger;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.RuntimeEOFException;
import htsjdk.samtools.filter.SamRecordIntervalIteratorFactory;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;
import htsjdk.samtools.util.BinaryCodec;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.SortingCollection;
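/**
 * Counts, for every (chromosome, position, insert-size) signature, the number of
 * supporting read pairs in each input BAM, to measure the impact of duplicates per sample.
 *
 * A possible invocation (file names are hypothetical and the launcher jar depends on
 * how jvarkit was built):
 *
 *   java -jar dist/impactofduplicates.jar -B capture.bed -o out.tsv sample1.bam sample2.bam
 */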
@Program(name="impactofduplicates",
description="Impact of Duplicates per BAM.",
keywords={"bam"}
)
public class ImpactOfDuplicates extends Launcher
{
private static final Logger LOG = Logger.build(ImpactOfDuplicates.class).make();
@Parameter(names={"-o","--output"},description="Output file. Optional . Default: stdout")
private File outputFile = null;
@Parameter(names="-B", description="BED File")
private File BEDFILE = null;
@ParametersDelegate
private WritingSortingCollection sortingCollectionArgs=new WritingSortingCollection();
/** one SAM sequence dictionary per input BAM, used to resolve sequence names from a tid */
private final List<SAMSequenceDictionary> samFileDicts = new ArrayList<>();
/** buffer of consecutive Duplicate records sharing the same (chromosome,position,size) */
private final List<Duplicate> duplicatesBuffer = new ArrayList<>();
/** output */
private PrintStream out=System.out;
/* current index in BAM list */
private int bamIndex;
/* all duplicates, sorted */
private SortingCollection<Duplicate> duplicates;
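/** one candidate duplicate signature: the chromosome (tid), leftmost position and insert size
 * of a properly-paired read pair, plus the index of the BAM it came from */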
private class Duplicate implements Comparable<Duplicate>
{
int tid;
int pos;
int size;
int bamIndex;
public String getReferenceName()
{
return samFileDicts.get(this.bamIndex).getSequence(this.tid).getSequenceName();
}
public int compareChromPosSize(final Duplicate o)
{
int i = getReferenceName().compareTo(o.getReferenceName());
if(i!=0) return i;
i = Integer.compare(this.pos, o.pos);
if(i!=0) return i;
return Integer.compare(this.size, o.size);
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + bamIndex;
result = prime * result + pos;
result = prime * result + size;
result = prime * result + tid;
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
Duplicate other = (Duplicate) obj;
return compareTo(other)==0;
}
@Override
public int compareTo(final Duplicate o)
{
int i=compareChromPosSize(o);
if(i!=0) return i;
return Integer.compare(this.bamIndex, o.bamIndex);
}
@Override
public String toString() {
return "(bamIndex:"+bamIndex+" pos:"+pos+" size:"+size+" tid:"+tid+")";
}
}
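/** binary codec used by the SortingCollection to write/read Duplicate records to/from temporary files */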
private class DuplicateCodec
extends BinaryCodec
implements SortingCollection.Codec<Duplicate >
{
@Override
public void encode(final Duplicate d)
{
this.writeInt(d.tid);
this.writeInt(d.pos);
this.writeInt(d.size);
this.writeInt(d.bamIndex);
}
@Override
public Duplicate decode()
{
Duplicate d=new Duplicate();
try
{
d.tid=this.readInt();
}
catch(RuntimeEOFException err)
{
return null;
}
d.pos=this.readInt();
d.size=this.readInt();
d.bamIndex=this.readInt();
return d;
}
@Override
public DuplicateCodec clone()
{
return new DuplicateCodec();
}
}
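/** prints one report line for the current group of Duplicate records (same chromosome/position/size)
 * and clears the buffer; groups where no BAM reaches the count threshold are silently skipped */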
private void dumpDuplicatesBuffer(final List<File> INPUT)
{
if(this.duplicatesBuffer.isEmpty()) return;
int counts[]=new int[INPUT.size()];
Arrays.fill(counts, 0);
int maxDup=0;
for(int i=0;i< this.duplicatesBuffer.size();++i)
{
Duplicate di=this.duplicatesBuffer.get(i);
counts[di.bamIndex]++;
maxDup=Math.max(maxDup,counts[di.bamIndex]);
}
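/* ignore this locus unless at least one BAM contributes 10 or more pairs with this signature */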
if(maxDup<10)
{
this.duplicatesBuffer.clear();
return;
}
int total=0;
for(int i:counts) total+=i;
Duplicate front=this.duplicatesBuffer.get(0);
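/* output columns: interval (chrom:start-end of the insert), max per-BAM count, mean count truncated to an integer, then one raw count per input BAM */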
out.print(
front.getReferenceName()+":"+
front.pos+"-"+
(front.pos+front.size)
);
out.print("\t"+maxDup+"\t"+(int)(total/(1.0*counts.length)));
for(int i=0;i< counts.length;++i)
{
out.print('\t');
out.print(counts[i]);
}
out.println();
this.duplicatesBuffer.clear();
}
@Override
public int doWork(final List<String> args) {
CloseableIterator<Duplicate> dupIter=null;
final List<File> INPUT = args.stream().map(S->new File(S)).collect(Collectors.toList());
try
{
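/* disk-backed sorting collection: Duplicate records from all BAMs are spilled to temporary
files when exceeding maxRecordsInRam, then read back sorted by (chromosome, position, size, bamIndex) */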
this.duplicates=SortingCollection.newInstance(
Duplicate.class,
new DuplicateCodec(),
(o1,o2)->o1.compareTo(o2),
this.sortingCollectionArgs.getMaxRecordsInRam(),
this.sortingCollectionArgs.getTmpDirectories()
);
for(this.bamIndex=0;
this.bamIndex< INPUT.size();
this.bamIndex++)
{
int prev_tid=-1;
int prev_pos=-1;
long nLines=0L;
File inFile= INPUT.get(this.bamIndex);
LOG.info("Processing "+inFile);
IOUtil.assertFileIsReadable(inFile);
SamReader samReader=null;
CloseableIterator<SAMRecord> iter=null;
try
{
samReader= SamReaderFactory.make().validationStringency(ValidationStringency.LENIENT).open(inFile);
final SAMFileHeader header=samReader.getFileHeader();
this.samFileDicts.add(header.getSequenceDictionary());
if(BEDFILE==null)
{
iter=samReader.iterator();
}
else
{
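/* load the BED intervals; BED is 0-based half-open while htsjdk Interval is 1-based closed, hence the +1 on the start */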
IntervalList intervalList=new IntervalList(header);
try(BufferedReader in=new BufferedReader(new FileReader(BEDFILE)))
{
String line=null;
while((line=in.readLine())!=null)
{
if(line.isEmpty() || line.startsWith("#")) continue;
final String tokens[]=line.split("[\t]");
final Interval interval=new Interval(tokens[0], 1+Integer.parseInt(tokens[1]), Integer.parseInt(tokens[2]));
intervalList.add(interval);
}
}
intervalList=intervalList.sorted();
List<Interval> uniqueIntervals=IntervalList.getUniqueIntervals(intervalList,false);
SamRecordIntervalIteratorFactory sriif=new SamRecordIntervalIteratorFactory();
iter=sriif.makeSamRecordIntervalIterator(samReader, uniqueIntervals, false);
}
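/* keep only the first read of mapped, properly-paired pairs whose mate maps to the same chromosome, so each fragment is counted once */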
while(iter.hasNext())
{
SAMRecord rec=iter.next();
if(rec.getReadUnmappedFlag()) continue;
if(!rec.getReadPairedFlag()) continue;
if(!rec.getReferenceIndex().equals(rec.getMateReferenceIndex())) continue;
if(!rec.getProperPairFlag()) continue;
if(!rec.getFirstOfPairFlag()) continue;
/* enforce coordinate sort order: chromosomes must be non-decreasing and positions non-decreasing within a chromosome */
if(prev_tid!=-1 &&
(prev_tid > rec.getReferenceIndex() ||
(prev_tid==rec.getReferenceIndex() && prev_pos>rec.getAlignmentStart())))
{
throw new IOException("Bad sort order from "+rec);
}
prev_tid=rec.getReferenceIndex();
prev_pos=rec.getAlignmentStart();
if((++nLines)%1000000==0)
{
LOG.info("In "+inFile+" N="+nLines);
}
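/* record the pair as a Duplicate: leftmost start of the two mates and the absolute insert size */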
Duplicate dup=new Duplicate();
dup.bamIndex=this.bamIndex;
dup.pos=Math.min(rec.getAlignmentStart(),rec.getMateAlignmentStart());
dup.tid=rec.getReferenceIndex();
dup.size=Math.abs(rec.getInferredInsertSize());
this.duplicates.add(dup);
}
}
finally
{
if(iter!=null) iter.close();
if(samReader!=null) samReader.close();
}
LOG.info("done "+inFile);
}
/** loop done, now scan the duplicates */
LOG.info("doneAdding");
this.duplicates.doneAdding();
this.out= super.openFileOrStdoutAsPrintStream(outputFile);
out.print("#INTERVAL\tMAX\tMEAN");
for(int i=0;i< INPUT.size();++i)
{
out.print('\t');
out.print(INPUT.get(i));
}
out.println();
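/* duplicates come back sorted, so records sharing the same (chromosome, position, size) are consecutive: buffer each group and report it when the signature changes */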
dupIter=this.duplicates.iterator();
while(dupIter.hasNext())
{
Duplicate dup=dupIter.next();
if( this.duplicatesBuffer.isEmpty() ||
dup.compareChromPosSize(this.duplicatesBuffer.get(0))==0)
{
this.duplicatesBuffer.add(dup);
}
else
{
dumpDuplicatesBuffer(INPUT);
this.duplicatesBuffer.add(dup);
}
}
dumpDuplicatesBuffer(INPUT);
LOG.info("end iterator");
out.flush();
out.close();
}
catch (Exception e) {
LOG.error(e);
return -1;
}
finally
{
if(dupIter!=null) dupIter.close();
LOG.info("cleaning duplicates");
if(this.duplicates!=null) this.duplicates.cleanup();
}
return 0;
}
public static void main(final String[] argv)
{
new ImpactOfDuplicates().instanceMainWithExit(argv);
}
}