package com.github.lindenb.jvarkit.tools.mem; import java.io.File; import java.io.PrintWriter; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFlag; import htsjdk.samtools.SAMProgramRecord; import htsjdk.samtools.SamReader; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.ProgressLoggerInterface; import htsjdk.samtools.SAMFileWriter; import com.beust.jcommander.Parameter; import com.beust.jcommander.ParametersDelegate; import com.github.lindenb.jvarkit.util.Counter; import com.github.lindenb.jvarkit.util.jcommander.Launcher; import com.github.lindenb.jvarkit.util.jcommander.Program; import com.github.lindenb.jvarkit.util.log.Logger; import com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress; import com.github.lindenb.jvarkit.util.picard.OtherCanonicalAlign; import com.github.lindenb.jvarkit.util.picard.OtherCanonicalAlignFactory; @Program(name="findmyvirus",description="Find my Virus. Created for Adrien Inserm. Proportion of reads mapped on HOST/VIRUS. ") public class FindMyVirus extends Launcher { private static final Logger LOG= Logger.build(FindMyVirus.class).make(); /* how to split the bam, witch categories */ private enum CAT{ both_ref() { @Override public String getDescription() { return "Both reads map the HOST reference"; } } , both_virus() { @Override public String getDescription() { return "Both reads map the VIRUS reference"; } }, ref_orphan() { @Override public String getDescription() { return "In a pair: the read is mapped on the HOST Reference, the mate is unmapped. No spliced read mapped on VIRUS."; } }, virus_orphan() { @Override public String getDescription() { return "In a pair: the read is mapped on the VIRUS Reference, the mate is unmapped. No spliced read mapped on HOST."; } }, ref_and_virus() { @Override public String getDescription() { return "In a pair: a read is mapped on the VIRUS Reference, the other is mapped on HOST. No spliced read mapped on the other genome."; } }, ref_and_virus_spliced() { @Override public String getDescription() { return "In a pair: a read is mapped on the VIRUS Reference, the other is mapped on HOST. A spliced read is mapped on the other genome."; } }, unpaired() { @Override public String getDescription() { return "read is not paired."; } }, undetermined() { @Override public String getDescription() { return "Unknown category/case"; } }, unmapped() { @Override public String getDescription() { return "Read is unmapped"; } }, duplicate() { @Override public String getDescription() { return "Read is duplicate"; } }, secondary { @Override public String getDescription() { return "Secondary alignment"; } }, failsqual{ @Override public String getDescription() { return "Fails mapper quality-control"; } }; public String getDescription() { return name(); } }; @Parameter(names={"-o","--out"},description="output",required=true) private File outputFile=null; @Parameter(names={"-V"},description=" virus chrom name") private Set<String> virusNames=new HashSet<String>(); @ParametersDelegate private WritingBamArgs writingBamArgs=new WritingBamArgs(); @Override public int doWork(List<String> args) { if(virusNames.isEmpty()) { LOG.error("no virus name"); return -1; } SamReader sfr=null; SAMFileWriter sfwArray[]=new SAMFileWriter[CAT.values().length]; try { sfr = openSamReader(oneFileOrNull(args)); SAMFileHeader header=sfr.getFileHeader(); for(CAT category:CAT.values()) { LOG.info("Opening "+category); SAMFileHeader header2=header.clone(); header2.addComment("Category:"+category.name()); header2.addComment("Description:"+category.getDescription()); SAMProgramRecord rec=header2.createProgramRecord(); rec.setCommandLine(this.getProgramCommandLine()); rec.setProgramName(getProgramName()); rec.setProgramVersion(getVersion()); rec.setAttribute("CAT", category.name()); File outputFile=new File(this.outputFile.getParentFile(),this.outputFile.getName()+"."+category.name()+".bam"); LOG.info("Opening "+outputFile); File countFile=new File(outputFile.getParentFile(),outputFile.getName()+"."+category.name()+".count.txt"); SAMFileWriter sfw= writingBamArgs.openSAMFileWriter(outputFile, header2, true); sfw=new SAMFileWriterCount(sfw, countFile,category); sfwArray[category.ordinal()]=sfw; } SAMSequenceDictionaryProgress progress=new SAMSequenceDictionaryProgress(header.getSequenceDictionary()); OtherCanonicalAlignFactory xpAlignFactory=new OtherCanonicalAlignFactory(header); SAMRecordIterator iter=sfr.iterator(); while(iter.hasNext()) { SAMRecord rec=iter.next(); progress.watch(rec); CAT category=null; List<OtherCanonicalAlign> xpList=Collections.emptyList(); if(category==null && !rec.getReadPairedFlag()) { category=CAT.unpaired; } if(category==null && rec.isSecondaryOrSupplementary()) { category=CAT.secondary; } if(category==null && rec.getReadFailsVendorQualityCheckFlag()) { category=CAT.failsqual; } if(category==null && rec.getDuplicateReadFlag()) { category=CAT.duplicate; } if(category==null && rec.getReadUnmappedFlag()) { category=CAT.unmapped; } if(category==null) { xpList=xpAlignFactory.getXPAligns(rec); } boolean xp_containsVirus=false; boolean xp_containsChrom=false; for(OtherCanonicalAlign xpa:xpList) { if(virusNames.contains(xpa.getReferenceName())) { xp_containsVirus=true; } else { xp_containsChrom=true; } } /* both reads mapped on ref */ if(category==null && !rec.getReadUnmappedFlag() && !rec.getMateUnmappedFlag() && !virusNames.contains(rec.getReferenceName()) && !virusNames.contains(rec.getMateReferenceName()) ) { if(!xp_containsVirus) { category=CAT.both_ref; } else { category=CAT.ref_and_virus_spliced; } } /* pair(unmapped,mapped on reference) */ if(category==null && ( (!rec.getReadUnmappedFlag() && rec.getMateUnmappedFlag() && !virusNames.contains(rec.getReferenceName())) || (rec.getReadUnmappedFlag() && !rec.getMateUnmappedFlag() && !virusNames.contains(rec.getMateReferenceName())) )) { if(!xp_containsVirus) { category=CAT.ref_orphan; } else { category=CAT.ref_and_virus_spliced; } } /* both reads mapped on virus */ if(category==null && !rec.getReadUnmappedFlag() && !rec.getMateUnmappedFlag() && virusNames.contains(rec.getReferenceName()) && virusNames.contains(rec.getMateReferenceName()) ) { if(!xp_containsChrom) { category=CAT.both_virus; } else { category=CAT.ref_and_virus_spliced; } } if(category==null && ( (!rec.getReadUnmappedFlag() && rec.getMateUnmappedFlag() && virusNames.contains(rec.getReferenceName())) || (rec.getReadUnmappedFlag() && !rec.getMateUnmappedFlag() && virusNames.contains(rec.getMateReferenceName())) )) { if(!xp_containsChrom) { category=CAT.virus_orphan; } else { category=CAT.ref_and_virus_spliced; } } if(category==null && !rec.getReadUnmappedFlag() && !rec.getMateUnmappedFlag() && ( (virusNames.contains(rec.getReferenceName()) && !virusNames.contains(rec.getMateReferenceName())) || (!virusNames.contains(rec.getReferenceName()) && virusNames.contains(rec.getMateReferenceName())) ) ) { category=CAT.ref_and_virus; } /*dispatch */ if(category==null) { LOG.warning("Not handled: "+rec); category=CAT.undetermined; } sfwArray[category.ordinal()].addAlignment(rec); } for(SAMFileWriter sfw:sfwArray) { LOG.info("Closing "+sfw); sfw.close(); } return 0; } catch(Exception err) { LOG.error(err); return -1; } finally { LOG.info("Closing"); CloserUtil.close(sfr); CloserUtil.close(sfwArray); } } public static void main(String[] args) { new FindMyVirus().instanceMainWithExit(args); } /** wrapper of SAMFileWriter to get some statistics about a BAM. write data on close() */ private class SAMFileWriterCount implements SAMFileWriter { private CAT category; private SAMFileWriter delegate; private File countFile; private Counter<String> chrom=new Counter<String>(); private Counter<Integer> flags=new Counter<Integer>(); @SuppressWarnings("unused") private ProgressLoggerInterface progressLogger; SAMFileWriterCount(SAMFileWriter delegate,File countFile,CAT category) { this.category=category; this.countFile=countFile; this.delegate=delegate; for(SAMSequenceRecord rec:delegate.getFileHeader().getSequenceDictionary().getSequences()) { chrom.initializeIfNotExists(rec.getSequenceName()); } chrom.initializeIfNotExists(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME); } @Override public void setProgressLogger(ProgressLoggerInterface progressLogger) { this.progressLogger=progressLogger; } @Override public void addAlignment(SAMRecord alignment) { this.delegate.addAlignment(alignment); this.flags.incr(alignment.getFlags()); if(!alignment.getReadUnmappedFlag()) { chrom.incr(alignment.getReferenceName()); } else { chrom.incr(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME); } } @Override public SAMFileHeader getFileHeader() { return this.delegate.getFileHeader(); } @Override public void close() { LOG.info("Closing SAMFileWriterCount "); PrintWriter fw=null; try { LOG.info("Writing "+countFile); fw=new PrintWriter(countFile); fw.println(this.category.name()); fw.println(this.category.getDescription()); fw.println("#CHROMOSOME\tCOUNT"); for(String c:this.chrom.keySetDecreasing()) { fw.println(c+"\t"+this.chrom.count(c)); } fw.println("Total\t"+this.chrom.getTotal()); fw.println("#FLAG\tCOUNT\texplain"); for(Integer c:this.flags.keySetDecreasing()) { fw.print(c+"\t"+this.flags.count(c)+"\t"); for(SAMFlag flg:SAMFlag.values()) { if(flg.isSet(c)) { fw.write(flg.name()+" "); } } fw.println(); } fw.flush(); } catch(Exception err) { LOG.error(err); throw new RuntimeException("Boum:"+countFile,err); } finally { CloserUtil.close(fw); } this.delegate.close(); } @Override public String toString() { return "SAMFileWriterCount "+countFile; } } }