package com.github.lindenb.jvarkit.tools.blast;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import gov.nih.nlm.ncbi.blast.Hit;
import gov.nih.nlm.ncbi.blast.HitHsps;
import gov.nih.nlm.ncbi.blast.Hsp;
import gov.nih.nlm.ncbi.blast.Iteration;
import gov.nih.nlm.ncbi.blast.IterationHits;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Marshaller;
import javax.xml.bind.Unmarshaller;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLResolver;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;
import htsjdk.samtools.util.CloserUtil;
import com.github.lindenb.jvarkit.io.IOUtils;
import com.github.lindenb.jvarkit.util.bio.blast.BlastHspAlignment;
import com.github.lindenb.jvarkit.util.jcommander.Launcher;
import com.github.lindenb.jvarkit.util.jcommander.Program;
import com.github.lindenb.jvarkit.util.log.Logger;
@Program(name="mergesplittedblast",description="merge blast Hits from splitted BLAST database. See http://www.biostars.org/p/90186/",biostars=90186)
public class MergeSplittedBlast extends Launcher {
private static final Logger LOG=Logger.build(MergeSplittedBlast.class).make();
private Unmarshaller unmarshaller;
private Marshaller marshaller;
/* force javac to compile */
@SuppressWarnings("unused")
private gov.nih.nlm.ncbi.blast.ObjectFactory _ignore_for_javac=null;
private static class Split
{
String chrom;
int start1;
int end1;
int len;
}
private MergeSplittedBlast()
{
}
private Split parse(Hit hit)
{
String s=hit.getHitDef();
if(s==null ) return null;
int colon=s.indexOf(':');
if(colon==-1) return null;
int dash=s.indexOf('-', colon+1);
if(dash==-1) return null;
int colon2=s.indexOf(':', dash+1);
if(colon2==-1) return null;
Split split=new Split();
try {
split.chrom=s.substring(0, colon).trim();
if(split.chrom.isEmpty()) return null;
split.start1=Integer.parseInt(s.substring(colon+1,dash));
if(split.start1<1) return null;
split.end1=Integer.parseInt(s.substring(dash+1,colon2));
if(split.end1<=split.start1) return null;
split.len=Integer.parseInt(s.substring(colon2+1));
}
catch(NumberFormatException err)
{
LOG.error("expect (chrom:start-end:length) Cannot parse Hit.def: "+s ,err);
return null;
}
return split;
}
private boolean overlap(int a_start1,int a_end1,int b_start1,int b_end1)
{
//debug(" overlapp ["+a_start1+"-"+a_end1+"] vs ["+b_start1+"-"+b_end1+"]");
if(b_end1+1 < a_start1) return false;
if(a_end1+1 < b_start1) return false;
//debug(" ok overlap");
return true;
}
private Hsp merge(Hsp hh0,Hsp hh1)
{
BlastHspAlignment aln0=new BlastHspAlignment(hh0);
BlastHspAlignment aln1=new BlastHspAlignment(hh1);
//order hsp on query pos
if(aln0.getQueryFrom1() > aln1.getQueryFrom1())
{
Hsp th=hh0;
hh0=hh1;
hh1=th;
aln0=new BlastHspAlignment(hh0);
aln1=new BlastHspAlignment(hh1);
}
/* not the same strand */
if(aln0.isPlusPlus()!=aln1.isPlusPlus())
{
//debug("strand");
return null;
}
/* hits overlap */
if(!overlap(aln0.getHitFrom1(),aln0.getHitTo1(),aln1.getHitFrom1(),aln1.getHitTo1()))
{
//debug("no overlap hit");
return null;
}
/* query overlap */
if(!overlap(aln0.getQueryFrom1(),aln0.getQueryTo1(),aln1.getQueryFrom1(),aln1.getQueryTo1()))
{
//debug("no overlap query");
return null;
}
//hit1 is contained in hit0
if(aln0.getQueryFrom1() <= aln1.getQueryFrom1() && aln1.getQueryTo1()<=aln0.getQueryTo1())
{
//debug("contained");
return hh0;
}
StringBuilder qsb=new StringBuilder();
StringBuilder msb=new StringBuilder();
StringBuilder hsb=new StringBuilder();
int expect_hit=-1;
int found_hit=-1;
for(BlastHspAlignment.Align a:aln0)
{
if(a.getQueryIndex1()>=aln1.getQueryFrom1())
{
//debug("###BREAK###############");
expect_hit=a.getHitIndex1();
break;
}
qsb.append(a.getQueryChar());
msb.append(a.getMidChar());
hsb.append(a.getHitChar());
}
if(expect_hit==-1)
{
//debug("HU?");
return null;
}
for(BlastHspAlignment.Align a:aln1)
{
if(a.getQueryIndex1()<aln1.getQueryFrom1()) continue;
if(found_hit==-1)
{
found_hit=a.getHitIndex1();
if(expect_hit!=found_hit)
{
LOG.info("Not the expected hit position "+expect_hit+"/"+found_hit);
return null;
}
}
qsb.append(a.getQueryChar());
msb.append(a.getMidChar());
hsb.append(a.getHitChar());
}
//info("\n"+qsb+"\n"+msb+"\n"+hsb);
Hsp newHsp=BlastHspAlignment.cloneHsp(hh0);
newHsp.setHspAlignLen(String.valueOf(msb.length()));
newHsp.setHspMidline(msb.toString());
newHsp.setHspQseq(qsb.toString());
newHsp.setHspHseq(hsb.toString());
newHsp.setHspQueryFrom(String.valueOf(Math.min(aln0.getQueryFrom1(),aln1.getQueryFrom1())));
newHsp.setHspQueryTo(String.valueOf(Math.max(aln0.getQueryTo1(),aln1.getQueryTo1())));
if(aln0.isPlusPlus())
{
newHsp.setHspHitFrom(String.valueOf(Math.min(aln0.getHitFrom1(),aln1.getHitFrom1())));
newHsp.setHspHitTo(String.valueOf(Math.max(aln0.getHitTo1(),aln1.getHitTo1())));
}
else
{
newHsp.setHspHitFrom(String.valueOf(Math.max(aln0.getHitFrom1(),aln1.getHitFrom1())));
newHsp.setHspHitTo(String.valueOf(Math.min(aln0.getHitTo1(),aln1.getHitTo1())));
}
newHsp.setHspGaps(String.valueOf(newHsp.getHspMidline().replaceAll("[^ ]", "").length()));
newHsp.setHspScore(String.valueOf(newHsp.getHspMidline().replaceAll("[ ]", "").length()));
//debug("success");
return newHsp;
}
private Hit merge(List<Hit> hits)
{
Hit first=hits.get(0);
Split firstSplit=parse(first);
Hit newHit=new Hit();
newHit.setHitAccession(first.getHitAccession());
newHit.setHitDef(firstSplit.chrom);
newHit.setHitId(first.getHitId());
newHit.setHitNum(first.getHitNum());
newHit.setHitLen(String.valueOf(firstSplit.len));
newHit.setHitHsps(new HitHsps());
List<Hsp> hsps=new ArrayList<Hsp>();
for(Hit h0:hits)
{
/* fix hit_from/hit_to */
Split split=parse(h0);
List<Hsp> L=h0.getHitHsps().getHsp();
for(int i=0;i< L.size();++i)
{
Hsp newHsp=BlastHspAlignment.cloneHsp(L.get(i));
BlastHspAlignment aln=new BlastHspAlignment(newHsp);
int h_from=aln.getHitFrom1();
int h_to=aln.getHitTo1();
h_from+=(split.start1-1);
h_to+=(split.start1-1);
newHsp.setHspHitFrom(String.valueOf(h_from));
newHsp.setHspHitTo(String.valueOf(h_to));
hsps.add(newHsp);
}
}
boolean done=false;
while(!done)
{
done=true;
for(int i=0;i+1< hsps.size() ;++i)
{
Hsp hsp0=hsps.get(i);
for(int j=i+1;j< hsps.size();++j)
{
//debug("comparing hsp "+i+" vs "+j+" N="+hsps.size());
Hsp hsp1=hsps.get(j);
Hsp newHitHsp=merge(hsp0,hsp1);
if(newHitHsp!=null)
{
hsps.set(i,newHitHsp);
hsps.remove(j);
done=false;
break;
}
}
if(!done) break;
}
}
for(int i=0;i< hsps.size();++i) hsps.get(i).setHspNum(String.valueOf(i+1));
newHit.getHitHsps().getHsp().addAll(hsps);
return newHit;
}
private Iteration merge(Iteration iteration)
{
if(iteration.getIterationHits().getHit().size()<=1) return iteration;
Map<String,List<Hit>> chrom2hits=new LinkedHashMap<String,List<Hit>>();
for(Hit hit:iteration.getIterationHits().getHit())
{
Split s=parse(hit);
if(s==null) return iteration;
List<Hit> L= chrom2hits.get(s.chrom);
if(L==null)
{
L=new ArrayList<Hit>();
chrom2hits.put(s.chrom,L);
}
L.add(hit);
}
Iteration newiteration=new Iteration();
List<Hit> newHits=new ArrayList<Hit>();
for(String chrom:chrom2hits.keySet())
{
List<Hit> L= chrom2hits.get(chrom);
Hit newHit=merge(L);
newHits.add(newHit);
}
newiteration.setIterationIterNum(iteration.getIterationIterNum());
newiteration.setIterationQueryID(iteration.getIterationQueryID());
newiteration.setIterationQueryLen(iteration.getIterationQueryLen());
newiteration.setIterationQueryDef(iteration.getIterationQueryDef());
newiteration.setIterationMessage(iteration.getIterationMessage());
newiteration.setIterationHits(new IterationHits());
newiteration.getIterationHits().getHit().addAll(newHits);
return newiteration;
}
private void run(
XMLEventReader r,
XMLEventWriter w
)
throws XMLStreamException,JAXBException
{
while(r.hasNext())
{
XMLEvent evt=r.peek();
if(!(evt.isStartElement()))
{
w.add(r.nextEvent());
continue;
}
String name=evt.asStartElement().getName().getLocalPart();
if(name.equals("BlastOutput_program"))
{
w.add(r.nextEvent());
evt=r.nextEvent();//TEXT
if(!evt.isCharacters()) throw new XMLStreamException("Expected a string after "+name);
String prog=evt.asCharacters().getData();
if(!"blastn".equals(prog))
{
throw new XMLStreamException("Not a blastn input:"+prog);
}
w.add(evt);
continue;
}
if(!name.equals("Iteration"))
{
w.add(r.nextEvent());
continue;
}
Iteration iter= this.unmarshaller.unmarshal(r, Iteration.class).getValue();
iter=merge(iter);
this.marshaller.marshal(iter, w);
}
}
@Override
public int doWork(List<String> args) {
XMLEventReader rx=null;
XMLEventWriter wx=null;
try
{
JAXBContext jc = JAXBContext.newInstance("gov.nih.nlm.ncbi.blast");
this.unmarshaller=jc.createUnmarshaller();
this.marshaller=jc.createMarshaller();
this.marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT,true);
this.marshaller.setProperty(Marshaller.JAXB_FRAGMENT,true);
XMLInputFactory xmlInputFactory=XMLInputFactory.newFactory();
xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.FALSE);
xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
xmlInputFactory.setXMLResolver(new XMLResolver()
{
@Override
public Object resolveEntity(String arg0, String arg1, String arg2,
String arg3) throws XMLStreamException
{
LOG.info("resolveEntity:" +arg0+"/"+arg1+"/"+arg2);
return null;
}
});
XMLOutputFactory xof=XMLOutputFactory.newFactory();
wx=xof.createXMLEventWriter(System.out, "UTF-8");
if(args.isEmpty())
{
LOG.info("Reading from stdin");
rx=xmlInputFactory.createXMLEventReader(System.in);
}
else if(args.size()==1)
{
LOG.info("Reading from "+args.get(0));
rx=xmlInputFactory.createXMLEventReader(IOUtils.openURIForBufferedReading(args.get(0)));
}
else
{
LOG.error("Illegal number of args");
return -1;
}
run(rx,wx);
return 0;
}
catch(Exception err)
{
LOG.error(err);
return -1;
}
finally
{
CloserUtil.close(wx);
CloserUtil.close(rx);
}
}
public static void main(String[] args) {
new MergeSplittedBlast().instanceMainWithExit(args);
}
}