package com.github.lindenb.jvarkit.tools.rnaseq;
import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalTreeMap;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMProgramRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.CloserUtil;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParametersDelegate;
import com.github.lindenb.jvarkit.io.IOUtils;
import com.github.lindenb.jvarkit.io.NullOuputStream;
import com.github.lindenb.jvarkit.util.jcommander.Launcher;
import com.github.lindenb.jvarkit.util.jcommander.Program;
import com.github.lindenb.jvarkit.util.log.Logger;
import com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress;
import com.github.lindenb.jvarkit.util.ucsc.KnownGene;
import com.github.lindenb.jvarkit.util.ucsc.KnownGene.Exon;
@Program(name="findnewsplicesites",description="use the 'N' operator in the cigar string to find unknown splice sites")
public class FindNewSpliceSites extends Launcher
{
private static final Logger LOG = Logger.build(FindNewSpliceSites.class).make();
private IntervalTreeMap<List<KnownGene>> knownGenesMap=new IntervalTreeMap<>();
@Parameter(names={"-out","--out"},description="output")
private File outputFile = null;
@Parameter(names="-k",description="UCSC Known Gene URI")
private Set<String> knownGeneUris = new LinkedHashSet<>();
@Parameter(names="-d",description="max distance between known splice site and cigar end")
private int max_distance=10;
@ParametersDelegate
private WritingBamArgs writingBamArgs = new WritingBamArgs();
private SAMFileWriter sfw=null;
private SAMFileWriter weird=null;
private FindNewSpliceSites()
{
}
private boolean is_close_to(int d1,int d2)
{
return Math.abs(d1-d2)<=this.max_distance;
}
private boolean findJunction(
Collection<KnownGene> genes,
int start1,
int end1
)
{
for(KnownGene g:genes)
{
for(int k=0;k+1< g.getExonCount();++k)
{
Exon ex0=g.getExon(k);
Exon ex1=g.getExon(k+1);
if( is_close_to(ex0.getEnd()+1,start1) &&
is_close_to(ex1.getStart()+1,end1))
{
return true;
}
}
}
return false;
}
private static boolean isMatch(CigarElement e)
{
switch(e.getOperator())
{
case X:case EQ: case M: return true;
default: return false;
}
}
private void scanRead(
SAMRecord rec,
SAMSequenceDictionary dict
)
{
Cigar cigar=rec.getCigar();
//if(cigar==null || !rec.getCigarString().contains("N")) return; //aleady checked
final Interval interval=new Interval(rec.getReferenceName(), rec.getAlignmentStart(), rec.getAlignmentEnd());
final List<KnownGene> genes=new ArrayList<>();
for(final List<KnownGene> list:this.knownGenesMap.getOverlapping(interval))
{
genes.addAll(list);
}
if(genes.isEmpty())
{
return;
}
int refPos1=rec.getAlignmentStart();
for(int cIdx=0;cIdx< cigar.numCigarElements();++cIdx)
{
CigarElement ce=cigar.getCigarElement(cIdx);
switch(ce.getOperator())
{
case S: break;
case I: break;
case N:
{
if(cIdx+1<cigar.numCigarElements() &&
isMatch(cigar.getCigarElement(cIdx+1)) &&
!findJunction(genes,refPos1-1,refPos1+ce.getLength()))
{
this.sfw.addAlignment(rec);//unknown junction
return;
}
refPos1+=ce.getLength();
break;
}
case D:
case X:
case EQ:
case M:
{
refPos1+=ce.getLength();
break;
}
case H:case P: break;//ignore
default:throw new RuntimeException("operator not handled. ops.");
}
}
}
private static boolean isWeird(SAMRecord rec,SAMSequenceDictionary dict)
{
if(rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() &&
rec.getReferenceIndex().equals(rec.getMateReferenceIndex()) &&
(
rec.getReadNegativeStrandFlag()==rec.getMateNegativeStrandFlag() ||
(rec.getReadNegativeStrandFlag()&& !rec.getMateNegativeStrandFlag() && rec.getAlignmentStart() < rec.getMateAlignmentStart()) ||
(!rec.getReadNegativeStrandFlag() && rec.getMateNegativeStrandFlag() && rec.getAlignmentStart() > rec.getMateAlignmentStart())
))
{
if(rec.getAlignmentStart() < rec.getMateAlignmentStart())
{
if(rec.getMateAlignmentStart() < rec.getAlignmentEnd()) return true;
}
if(rec.getAlignmentStart() > rec.getMateAlignmentStart())
{
if(rec.getMateAlignmentStart() + Math.abs(rec.getInferredInsertSize() ) > rec.getAlignmentStart()) return true;
}
return true;
}
return false;
}
private void scan(SamReader in)
{
SAMSequenceDictionary dict=in.getFileHeader().getSequenceDictionary();
if(dict==null) throw new RuntimeException("Sequence dictionary missing");
SAMRecordIterator iter=in.iterator();
SAMSequenceDictionaryProgress progress=new SAMSequenceDictionaryProgress(dict);
while(iter.hasNext())
{
SAMRecord rec=iter.next();
if(rec.getReadUnmappedFlag()) continue;
if(rec.isSecondaryOrSupplementary()) continue;
progress.watch(rec);
if(isWeird(rec,dict))
{
this.weird.addAlignment(rec);
continue;
}
for(CigarElement ce:rec.getCigar().getCigarElements())
{
if(ce.getOperator().equals(CigarOperator.N))
{
scanRead(rec,dict);
break;
}
}
}
iter.close();
progress.finish();
}
@Override
public int doWork(List<String> args) {
if(this.knownGeneUris.isEmpty())
{
LOG.error("known Gene file undefined");
return -1;
}
SamReader sfr=null;
try
{
Pattern tab=Pattern.compile("[\t]");
for(String kgUri: this.knownGeneUris)
{
LOG.info("Opening "+kgUri);
LineIterator r=IOUtils.openURIForLineIterator(kgUri);
while(r.hasNext())
{
final KnownGene g=new KnownGene(tab.split(r.next()));
if(g.getExonCount()==1) continue;//need spliced one
final Interval interval = new Interval(g.getContig(), g.getTxStart()+1, g.getTxEnd());
List<KnownGene> L = this.knownGenesMap.get(interval);
if(L==null) {
L= new ArrayList<>();
this.knownGenesMap.put(interval,L);
}
L.add(g);
}
LOG.info("Done reading: "+kgUri);
}
sfr = super.openSamReader(oneFileOrNull(args));
SAMFileHeader header=sfr.getFileHeader().clone();
SAMProgramRecord p=header.createProgramRecord();
p.setCommandLine(getProgramCommandLine());
p.setProgramVersion(getVersion());
p.setProgramName(getProgramName());
this.sfw=this.writingBamArgs.openSAMFileWriter(outputFile, header, true);
header=sfr.getFileHeader().clone();
p=header.createProgramRecord();
p.setCommandLine(getProgramCommandLine());
p.setProgramVersion(getVersion());
p.setProgramName(getProgramName());
this.weird=this.writingBamArgs.createSAMFileWriterFactory().makeSAMWriter(header,true, new NullOuputStream());
scan(sfr);
sfr.close();
LOG.info("Done");
return 0;
}
catch(Exception err)
{
LOG.error(err);
return -1;
}
finally
{
CloserUtil.close(sfr);
CloserUtil.close(this.sfw);
CloserUtil.close(this.weird);
}
}
/**
* @param args
*/
public static void main(String[] args) {
new FindNewSpliceSites().instanceMainWithExit(args);
}
}