package com.github.lindenb.jvarkit.tools.misc; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileHeader.SortOrder; import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.Cigar; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMFileWriter; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.SamReader; import java.io.File; import java.util.ArrayList; import java.util.List; import com.github.lindenb.jvarkit.lang.SubSequence; import com.github.lindenb.jvarkit.util.align.LongestCommonSequence; import com.github.lindenb.jvarkit.util.bio.RevCompCharSequence; import com.github.lindenb.jvarkit.util.picard.GenomicSequence; import com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress; import com.beust.jcommander.Parameter; import com.beust.jcommander.ParametersDelegate; import com.github.lindenb.jvarkit.util.jcommander.Launcher; import com.github.lindenb.jvarkit.util.jcommander.Program; import com.github.lindenb.jvarkit.util.log.Logger; @Program(name="localrealignreads",description="Local Realignment of Reads") public class LocalRealignReads extends Launcher { private static final Logger LOG = Logger.build(LocalRealignReads.class).make(); @Parameter(names={"-o","--output"},description="Output file. Optional . Default: stdout") private File outputFile = null; @Parameter(names={"-R","--reference"},description="Indexed fasta Reference") private File faidxFile = null; @ParametersDelegate private WritingBamArgs writingBamArgs=new WritingBamArgs(); private int MIN_ALIGN_LEN=15; private void align( final List<LongestCommonSequence.Hit> hits, final LongestCommonSequence matrix, final CharSequence S1, int start1, int end1, final CharSequence S2, int start2, int end2, int side ) { if(end1-start1<MIN_ALIGN_LEN) return; if(end2-start2<MIN_ALIGN_LEN) return; LongestCommonSequence.Hit hit = matrix.align( S1, start1,end1, S2, start2,end2 ); int align_size = hit.size(); if(align_size<MIN_ALIGN_LEN) return; hits.add(hit); if(side==-1 || side==1) align(hits,matrix,S1,start1,end1,S2,hit.getEndY(),end2,1); if(side==-1 || side==2) align(hits,matrix,S1,start1,end1,S2,0,hit.getStartY(),2); } @Override public int doWork(List<String> args) { if(this.faidxFile==null) { LOG.error("REFerence file missing;"); return -1; } IndexedFastaSequenceFile indexedFastaSequenceFile =null; SamReader samReader =null; SAMFileWriter w = null; SAMRecordIterator iter = null; GenomicSequence genomicSequence = null; LongestCommonSequence matrix =new LongestCommonSequence(); try { indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.faidxFile); samReader = openSamReader(oneFileOrNull(args)); final SAMFileHeader header1 = samReader.getFileHeader(); final SAMFileHeader header2 = header1.clone(); header2.setSortOrder(SortOrder.unsorted); w = this.writingBamArgs.setReferenceFile(faidxFile).openSAMFileWriter(outputFile,header2, true); final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1); iter = samReader.iterator(); while(iter.hasNext()) { final SAMRecord rec = progress.watch(iter.next()); if( rec.getReadUnmappedFlag() || rec.isSecondaryOrSupplementary() || rec.getReadFailsVendorQualityCheckFlag() || rec.getDuplicateReadFlag()) { w.addAlignment(rec); continue; } final Cigar cigar = rec.getCigar(); final int nCigarElement = cigar.numCigarElements(); if( nCigarElement <2) { w.addAlignment(rec); continue; } final CigarElement ce5 = cigar.getCigarElement(0); final CigarElement ce3 = cigar.getCigarElement(nCigarElement-1); if(!( (ce3.getOperator().equals(CigarOperator.S) && ce3.getLength()>=MIN_ALIGN_LEN) || (ce5.getOperator().equals(CigarOperator.S) && ce5.getLength()>=MIN_ALIGN_LEN)) ) { w.addAlignment(rec); continue; } if( genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName()) ) { genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName()); } final CharSequence readseq = rec.getReadString(); /** short invert */ if(ce5.getOperator()==CigarOperator.S && ce5.getLength()>=MIN_ALIGN_LEN) { CharSequence clipseq = new SubSequence(readseq, 0, ce5.getLength()); CharSequence revcomp = new RevCompCharSequence(clipseq); LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0,rec.getUnclippedStart()-rec.getReadLength()), Math.min(rec.getUnclippedEnd()+rec.getReadLength(),genomicSequence.length()), revcomp, 0, revcomp.length() ); if(hit.size()>=MIN_ALIGN_LEN) { System.err.println("REVCOMP5' "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString()); } /* hit = matrix.align( readseq, 0, readseq.length(), revcomp, 0, revcomp.length() ); if(hit.size()>=MIN_ALIGN_LEN) { System.err.println("REVCOMP5' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString()); } */ } if(ce3.getOperator()==CigarOperator.S && ce3.getLength()>=MIN_ALIGN_LEN) { CharSequence clipseq = new SubSequence(readseq,readseq.length()-ce3.getLength(),readseq.length()); CharSequence revcomp = new RevCompCharSequence(clipseq); LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0,rec.getUnclippedStart()-rec.getReadLength()), Math.min(rec.getUnclippedEnd()+rec.getReadLength(),genomicSequence.length()), revcomp, 0, revcomp.length() ); if(hit.size()>=MIN_ALIGN_LEN) { System.err.println("REVCOMP3' "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()); } /* hit = matrix.align( readseq, 0, readseq.length(), revcomp, 0, revcomp.length() ); if(hit.size()>=MIN_ALIGN_LEN) { System.err.println("REVCOMP3' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString()); }*/ } /* other */ List<LongestCommonSequence.Hit> hits = new ArrayList<>(); align(hits, matrix, genomicSequence, Math.max(0,rec.getUnclippedStart()-rec.getReadLength()), Math.min(rec.getUnclippedEnd()+rec.getReadLength(),genomicSequence.length()), readseq, 0,readseq.length(), -1 ); if(hits.size()<2000) continue; for(LongestCommonSequence.Hit hit:hits) { int readstart=0; boolean in_M=false; for(int i=0;i< nCigarElement;++i) { final CigarElement ce = cigar.getCigarElement(i); if(ce.getOperator().consumesReadBases()) { int readend = readstart + ce.getLength(); if( ce.getOperator() == CigarOperator.M || ce.getOperator() == CigarOperator.X || ce.getOperator() == CigarOperator.EQ) { if(!(hit.getEndY() <=readstart || readend<= hit.getStartY())) { in_M=true; break; } } readstart=readend; } } if(in_M) continue; int align_size = hit.size(); System.err.println(rec.getReadName()+" "+rec.getCigarString()+" "+rec.getAlignmentStart()+"-"+rec.getAlignmentEnd()); System.err.println("REF: "+hit.getStartX()+"-"+hit.getEndX()); System.err.println("READ "+hit.getStartY()+"-"+hit.getEndY()); System.err.print("REF :"); for(int i=0;i< align_size;++i) { System.err.print(genomicSequence.charAt(hit.getStartX()+i)); } System.err.println(); System.err.print("READ :"); for(int i=0;i< align_size;++i) { System.err.print(readseq.charAt(hit.getStartY()+i)); } System.err.println(); } System.err.println(); } progress.finish(); return RETURN_OK; } catch (Exception e) { return wrapException(e); } finally { genomicSequence=null; CloserUtil.close(iter); CloserUtil.close(samReader); CloserUtil.close(w); CloserUtil.close(indexedFastaSequenceFile); matrix=null; } } public static void main(String[] args) { new LocalRealignReads().instanceMainWithExit(args); } }