package com.github.lindenb.jvarkit.tools.mem; import java.io.BufferedReader; import java.io.Closeable; import java.io.File; import java.io.IOException; import java.io.PrintWriter; import java.util.List; import com.beust.jcommander.Parameter; import com.github.lindenb.jvarkit.io.IOUtils; import com.github.lindenb.jvarkit.util.jcommander.Launcher; import com.github.lindenb.jvarkit.util.jcommander.Program; import com.github.lindenb.jvarkit.util.log.Logger; import com.github.lindenb.jvarkit.util.picard.SamSequenceRecordTreeMap; import com.github.lindenb.jvarkit.util.picard.OtherCanonicalAlign; import com.github.lindenb.jvarkit.util.picard.OtherCanonicalAlignFactory; import htsjdk.samtools.CigarElement; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SamReader; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.util.CloserUtil; /** * : ant bwamemdigest && ~/package/bwa/bwa-0.7.4/bwa mem \ * -M ~/tmp/DATASANGER/hg18/chr1.fa \ * ~/tmp/DATASANGER/4368_1_1.fastq.gz ~/tmp/DATASANGER/4368_1_2.fastq.gz 2> /dev/null | * java -jar dist/bwamemdigest.jar BED=<(curl -s "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/gap.txt.gz" | gunzip -c | cut -f2,3,4 ) XBED=500 | tee /dev/tty | gzip --best > /tmp/jeter.mem.bed.gz */ @Program(name="bwamemdigest",description="") public class BWAMemDigest extends Launcher { private static final Logger LOG = Logger.build(BWAMemDigest.class).make(); @Parameter(names={"-B"}, description="BED of Regions to ignore.") public File IGNORE_BED = null; @Parameter(names={"-x"}, description=" Extend BED Regions to ignore by 'x' bases. ") public int IGNORE_EXTEND = 0; private abstract class AbstractMemOuput implements Closeable { } private class DefaultMemOuput extends AbstractMemOuput { PrintWriter out=new PrintWriter(stdout()); public void insertion(SAMRecord record,long readNum,float countS,float countM) { stdout().println( bed(record)+ "\t"+readNum+ "\tINSERT\t"+ record.getCigarString()+"\t"+(countS/countM) ); } public void xp(SAMRecord record,long readNum,OtherCanonicalAlign xp) { stdout().println( bed(record)+ "\t"+readNum+ "\tXP"+ "\t"+record.getCigarString()+ "\t"+xp.getReferenceName()+ "\t"+xp.getAlignmentStart()+ "\t"+(xp.getReadNegativeStrandFlag()?"-":"+")+ "\t"+xp.getCigarString() ); } public void orphan(SAMRecord record,long readNum) { stdout().println( bed(record)+ "\t"+readNum+ "\tORPHAN" ); } public void deletion(SAMRecord record,long readNum) { stdout().println( bed(record)+ "\t"+readNum+ "\tDEL"+ "\t"+mate(record) ); } public void transloc(SAMRecord record,long readNum) { stdout().println( bed(record)+ "\t"+readNum+ "\tTRANSLOC"+ "\t"+mate(record) ); } @Override public void close() throws IOException { out.flush(); out.close(); } } public BWAMemDigest() { } private static String bed(SAMRecord record) { return record.getReferenceName()+"\t"+ (record.getAlignmentStart()-1)+"\t"+ record.getAlignmentEnd()+"\t"+ (record.getReadNegativeStrandFlag()?"-":"+") ; } private static String mate(SAMRecord record) { return record.getMateReferenceName()+"\t"+ (record.getMateAlignmentStart()-1)+"\t"+ (record.getMateNegativeStrandFlag()?"-":"+") ; } @Override public int doWork(List<String> args) { SamSequenceRecordTreeMap<Boolean> ignore=null; DefaultMemOuput output=new DefaultMemOuput(); final float limitcigar=0.15f; SamReader r=null; try { r = super.openSamReader(oneFileOrNull(args)); SAMFileHeader header=r.getFileHeader(); if(IGNORE_BED!=null) { LOG.info("open "+IGNORE_BED); ignore=new SamSequenceRecordTreeMap<Boolean>(header.getSequenceDictionary()); BufferedReader in=IOUtils.openFileForBufferedReading(IGNORE_BED); String line; while((line=in.readLine())!=null) { if(line.isEmpty() || line.startsWith("#")) continue; String tokens[]=line.split("[\t]"); if(tokens.length<3) continue; if(ignore.put(tokens[0], Math.max(1,Integer.parseInt(tokens[1])-(1+IGNORE_EXTEND)), Integer.parseInt(tokens[2])+IGNORE_EXTEND, Boolean.TRUE )) { LOG.warn("BED:ignoring "+line); } } in.close(); } OtherCanonicalAlignFactory xPalignFactory=new OtherCanonicalAlignFactory(header); SAMRecordIterator iter=r.iterator(); long readNum=0L; while(iter.hasNext()) { SAMRecord record=iter.next(); ++readNum; if(!record.getReadPairedFlag()) continue; if(record.getProperPairFlag()) continue; if(record.getReadFailsVendorQualityCheckFlag()) continue; if(record.getDuplicateReadFlag()) continue; if(record.getReadUnmappedFlag()) continue; if(ignore!=null && ignore.containsOverlapping( record.getReferenceIndex(), record.getAlignmentStart(), record.getAlignmentEnd()) ) { LOG.info("ignore "+record); continue; } float countM=0f; float countS=0f; for(CigarElement c:record.getCigar().getCigarElements()) { switch(c.getOperator()) { case M: countM+=c.getLength(); break; case S: countS+=c.getLength(); break; default: break; } } if(countM>10 && ((countS/countM)>(1f-limitcigar) && (countS/countM)< (1f+limitcigar) )) { output.insertion(record, readNum, countS, countM); } for(OtherCanonicalAlign xp: xPalignFactory.getXPAligns(record)) { if(ignore!=null && ignore.containsOverlapping( xp.getReferenceName(), xp.getAlignmentStart(), xp.getAlignmentStart() )) { LOG.info("ignore "+record); continue; } output.xp(record, readNum, xp); } if(record.getMateUnmappedFlag()) { output.orphan(record, readNum); continue; } if(ignore!=null && ignore.containsOverlapping( record.getMateReferenceIndex(), record.getMateAlignmentStart(), record.getMateAlignmentStart()) ) { LOG.info("ignore "+record); continue; } if(record.getReferenceIndex()==record.getMateReferenceIndex()) { output.deletion(record, readNum); } else { output.transloc(record, readNum); } } } catch (Exception e) { e.printStackTrace(); LOG.error(e); return -1; } finally { CloserUtil.close(r); CloserUtil.close(output); } return 0; } public static void main(String[] args) { new BWAMemDigest().instanceMainWithExit(args); } }