package com.github.lindenb.jvarkit.tools.mem; import java.io.File; import java.util.ArrayList; import java.util.List; import htsjdk.samtools.Cigar; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import htsjdk.samtools.DefaultSAMRecordFactory; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SamReader; import htsjdk.samtools.SAMFileWriter; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecord.SAMTagAndValue; import htsjdk.samtools.SAMRecordFactory; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.SAMValidationError; import htsjdk.samtools.util.CloserUtil; import com.beust.jcommander.Parameter; import com.beust.jcommander.ParametersDelegate; import com.github.lindenb.jvarkit.util.jcommander.Launcher; import com.github.lindenb.jvarkit.util.jcommander.Program; import com.github.lindenb.jvarkit.util.log.Logger; import com.github.lindenb.jvarkit.util.picard.OtherCanonicalAlign; import com.github.lindenb.jvarkit.util.picard.OtherCanonicalAlignFactory; @Program(name="bwamemnop",description="Merge the other BWA-MEM alignements with its SA:Z:* attributes to an alignment containing a cigar string with 'N' ( Skipped region from the reference.)") public class BWAMemNOp extends Launcher { private static final Logger LOG=Logger.build(BWAMemNOp.class).make(); @Parameter(names={"-o"},description="output file") private File outputFile=null; @Parameter(names={"-m"},description="min soft clip length") private int min_soft_clip_length=10; @Parameter(names={"-S"},description=" only print converted reads") private boolean print_only_spit_read=false; @ParametersDelegate private WritingBamArgs writingBamArgs=new WritingBamArgs(); static class CigarEvt { CigarOperator op; int read0; int ref1; } private static List<CigarEvt> cigarEvents(int read0,int ref1,Cigar cigar) { List<CigarEvt> L=new ArrayList<CigarEvt>(); for(CigarElement ce:cigar.getCigarElements()) { for(int i=0;i< ce.getLength();++i) { CigarEvt evt=new CigarEvt(); evt.op=ce.getOperator(); evt.read0=read0; evt.ref1=ref1; L.add(evt); switch(ce.getOperator()) { case P:break; case H:break; case I: case S: { read0++; break; } case D: case N: { ref1++; break; } case X:case EQ:case M: { read0++; ref1++; break; } default:throw new IllegalStateException(); } } } return L; } @Override public int doWork(List<String> args) { SamReader r=null; SAMFileWriter w=null; try { r = super.openSamReader(oneFileOrNull(args)); SAMFileHeader header=r.getFileHeader(); OtherCanonicalAlignFactory ocaf=new OtherCanonicalAlignFactory(header); w = writingBamArgs.openSAMFileWriter(outputFile, header, true); SAMRecordFactory samRecordFactory=new DefaultSAMRecordFactory(); SAMRecordIterator iter=r.iterator(); while(iter.hasNext()) { SAMRecord rec=iter.next(); if(rec.getSupplementaryAlignmentFlag()) { continue; } if(rec.getReadUnmappedFlag()) { if(!print_only_spit_read) w.addAlignment(rec); continue; } Cigar cigar1=rec.getCigar(); if(cigar1==null || cigar1.isEmpty() || !(cigar1.getCigarElement(cigar1.numCigarElements()-1).getOperator().equals(CigarOperator.S) || cigar1.getCigarElement(0).getOperator().equals(CigarOperator.S) ) ) //last or first is soft clipping { if(!print_only_spit_read) w.addAlignment(rec); continue; } rec.getAlignmentStart(); List<OtherCanonicalAlign> xps=ocaf.getXPAligns(rec); if(xps.isEmpty()) { if(!print_only_spit_read) w.addAlignment(rec); continue; } boolean found_one=false; for(OtherCanonicalAlign xp:xps) { if(!rec.getReferenceName().equals(xp.getReferenceName())) continue; if(xp.getReadNegativeStrandFlag()!=rec.getReadNegativeStrandFlag() ) continue; Cigar cigar2=xp.getCigar(); if(cigar2==null || cigar2.isEmpty() ) { continue; } SAMRecord newrec=null; List<CigarEvt> L1=null; List<CigarEvt> L2=null; if( cigar1.getCigarElement(cigar1.numCigarElements()-1).getOperator().equals(CigarOperator.S) && cigar1.getCigarElement(cigar1.numCigarElements()-1).getLength() >= this.min_soft_clip_length && cigar2.getCigarElement(0).getOperator().equals(CigarOperator.S) && cigar2.getCigarElement(0).getLength() >= this.min_soft_clip_length && rec.getAlignmentEnd()< xp.getAlignmentStart() ) { newrec=samRecordFactory.createSAMRecord(header); int ref1=rec.getAlignmentStart(); newrec.setAlignmentStart(ref1); L1=cigarEvents(0, ref1, cigar1); L2=cigarEvents(0, xp.getAlignmentStart(), cigar2); } else if( cigar2.getCigarElement(cigar2.numCigarElements()-1).getOperator().equals(CigarOperator.S) && cigar2.getCigarElement(cigar2.numCigarElements()-1).getLength() >= this.min_soft_clip_length && cigar1.getCigarElement(0).getOperator().equals(CigarOperator.S) && cigar1.getCigarElement(0).getLength() >= this.min_soft_clip_length && xp.getAlignmentEnd()< rec.getAlignmentStart() ) { newrec=samRecordFactory.createSAMRecord(header); int ref1=xp.getAlignmentStart(); newrec.setAlignmentStart(ref1); L1=cigarEvents(0, ref1, cigar2); L2=cigarEvents(0, rec.getAlignmentStart(), cigar1); } if(newrec==null) continue; newrec.setFlags(rec.getFlags()); newrec.setReadName(rec.getReadName()); newrec.setReadBases(rec.getReadBases()); newrec.setMappingQuality(rec.getMappingQuality()); newrec.setReferenceIndex(rec.getReferenceIndex()); newrec.setBaseQualities(rec.getBaseQualities()); if(found_one) { newrec.setNotPrimaryAlignmentFlag(true); } found_one=true; for(SAMTagAndValue tav: rec.getAttributes()) { if(tav.tag.equals(ocaf.getAttributeKey())) continue; if(tav.tag.equals("NM")) continue; newrec.setAttribute(tav.tag, tav.value); } if(rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) { newrec.setMateAlignmentStart(rec.getMateAlignmentStart()); newrec.setMateReferenceIndex(rec.getMateReferenceIndex()); newrec.setInferredInsertSize(rec.getInferredInsertSize()); } while(!L1.isEmpty() && (L1.get(L1.size()-1).op.equals(CigarOperator.S) || L1.get(L1.size()-1).op.equals(CigarOperator.D) || L1.get(L1.size()-1).op.equals(CigarOperator.H))) { L1.remove(L1.size()-1); } while(!L2.isEmpty() && L2.get(0).read0<=L1.get(L1.size()-1).read0) { L2.remove(0); } List<CigarElement> cigarElements=new ArrayList<CigarElement>(); int i=0; while(i< L1.size()) { int j=i+1; while(j< L1.size() && L1.get(i).op.equals(L1.get(j).op)) { j++; } cigarElements.add(new CigarElement(j-i, L1.get(i).op)); i=j; } //add 'N' cigarElements.add( new CigarElement((L2.get(0).ref1-L1.get(L1.size()-1).ref1)-1, CigarOperator.N) ); i=0; while(i< L2.size()) { int j=i+1; while(j< L2.size() && L2.get(i).op.equals(L2.get(j).op)) { j++; } cigarElements.add(new CigarElement(j-i, L2.get(i).op)); i=j; } //cleanup : case where 'S' is close to 'N' i=0; while(i+1< cigarElements.size()) { CigarElement ce1=cigarElements.get(i); CigarElement ce2=cigarElements.get(i+1); if( i>0 && ce1.getOperator().equals(CigarOperator.S) && ce2.getOperator().equals(CigarOperator.N) ) { cigarElements.set(i, new CigarElement( ce1.getLength(), CigarOperator.X)); } else if(i+2 < cigarElements.size() && ce1.getOperator().equals(CigarOperator.N) && ce2.getOperator().equals(CigarOperator.S) ) { cigarElements.set(i+1, new CigarElement( ce2.getLength(), CigarOperator.X)); } i++; } newrec.setCigar(new Cigar(cigarElements)); List<SAMValidationError> validations=newrec.isValid(); if(validations!=null) { for(SAMValidationError err:validations) { LOG.warning(err.getType()+":"+err.getMessage()); } } w.addAlignment(newrec); } if(!found_one) { if(!print_only_spit_read) w.addAlignment(rec); } } iter.close(); return 0; } catch(Exception err) { LOG.error(err); return -1; } finally { CloserUtil.close(r); CloserUtil.close(w); } } /** * @param args */ public static void main(String[] args) { new BWAMemNOp().instanceMainWithExit(args); } }