package com.github.lindenb.jvarkit.tools.blastmapannots; /** * Author: * Pierre LIndenbaum PhD * WWW: * http://plindenbaum.blogspot.com * Mail: * plindenbaum@yahoo.fr * Motivation: * map the annotations of a genbank uniprot file to a blast hit * prints the results as a BED file. * */ import gov.nih.nlm.ncbi.blast.BlastOutput; import gov.nih.nlm.ncbi.blast.BlastOutputIterations; import gov.nih.nlm.ncbi.blast.Hit; import gov.nih.nlm.ncbi.blast.Hsp; import gov.nih.nlm.ncbi.blast.Iteration; import gov.nih.nlm.ncbi.gb.GBFeature; import gov.nih.nlm.ncbi.gb.GBInterval; import gov.nih.nlm.ncbi.gb.GBQualifier; import gov.nih.nlm.ncbi.gb.GBSeq; import gov.nih.nlm.ncbi.gb.GBSet; import java.awt.Color; import java.io.File; import java.io.IOException; import java.io.StringReader; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.TreeMap; import javax.xml.bind.JAXBContext; import javax.xml.bind.Unmarshaller; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import org.uniprot.Entry; import org.uniprot.FeatureType; import org.uniprot.LocationType; import org.uniprot.Uniprot; import org.w3c.dom.Document; import org.xml.sax.EntityResolver; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import com.beust.jcommander.Parameter; import com.github.lindenb.jvarkit.util.jcommander.Launcher; import com.github.lindenb.jvarkit.util.jcommander.Program; import com.github.lindenb.jvarkit.util.log.Logger; @Program(name="blastmapannots",description="Maps uniprot/genbank annotations on a blast result.") public class BlastMapAnnotations extends Launcher { private static Logger LOG=Logger.build(BlastMapAnnotations.class).make(); private BlastOutput blastOutput=null; @Parameter(names={"-u","-g","--genbank","--uniprot"}, description="XML sequence file Genbank.xml or uniprot.xml.",required=true) private File IN=null; @Parameter(names={"-APC"}, description="append the sequence accession before the feature name.",required=true) private boolean APPEND_ACN=false; @Parameter(names= "--include", description="Restrict to uniprot/feature/type of genbank/feature/key.") private Set<String> INCL=new HashSet<String>(); @Parameter(names= "--exclude", description="Exclude uniprot/feature/type of genbank/feature/key.") private Set<String> EXCL=new HashSet<String>(); private static class BlastPos { int query; int hit; @Override public String toString() { return "{hit:"+hit+",query:"+query+"}"; } } private abstract class Interval { Hit hit; Hsp hsp; protected final BlastPos convertQuery(int qPos1) { BlastPos left=getHspStart1(); String qS=this.hsp.getHspQseq(); String hS=this.hsp.getHspHseq(); for(int i=0;i< qS.length() && i< hS.length();++i) { if(left.query>=qPos1) break; if(isSeqLetter(qS.charAt(i))) { left.query+=this.queryShift(); } if(isSeqLetter(hS.charAt(i))) { left.hit+=this.hitShift(); } } return left; } protected final int queryShift() { return 1; } protected final int hitShift() { int shift=1; if(blastOutput.getBlastOutputProgram().equals("tblastn")) { shift=3; } else if(blastOutput.getBlastOutputProgram().equals("blastn")) { shift=1; } else { throw new RuntimeException("Sorry program not handled: "+blastOutput.getBlastOutputProgram()); } return shift*(isHspForward()?1:-1); } private final boolean isSeqLetter(char c) { if(c=='-' || c=='*' || Character.isWhitespace(c)) return false; if(Character.isLetter(c)) return true; throw new IllegalArgumentException("letter: "+c); } private final boolean isMidMatch(char c) { if(Character.isWhitespace(c)) return false; if(Character.isLetter(c) || c=='|' || c=='+') return true; return false; } protected final BlastPos getHspStart1() { BlastPos p=new BlastPos(); p.query=Integer.parseInt(this.hsp.getHspQueryFrom()); p.hit=Integer.parseInt(this.hsp.getHspHitFrom()); return p; } protected final BlastPos getHspEnd1() { BlastPos p=new BlastPos(); p.query=Integer.parseInt(this.hsp.getHspQueryTo()); p.hit=Integer.parseInt(this.hsp.getHspHitTo()); return p; } protected final boolean isHspForward() { if(getHspStart1().query>getHspEnd1().query) throw new IllegalStateException(); return getHspStart1().hit<=getHspEnd1().hit; } public final int getHspQueryStart0() { return Math.min(getHspStart1().query,getHspEnd1().query)-1; } public final int getHspQueryEnd0() { return Math.max(getHspStart1().query,getHspEnd1().query); } public final int getBedScore() { int len=featureEnd0()-featureStart0(); BlastPos left=getHspStart1(); float match=0f; String qS=this.hsp.getHspQseq(); String hS=this.hsp.getHspHseq(); String mid=this.hsp.getHspMidline(); for(int i=0;i< qS.length() && i< hS.length();++i) { if(left.query-1 >= featureStart0() && left.query-1 < featureEnd0()) { if(isSeqLetter(qS.charAt(i)) ) { match+=(1/3.0); } if(isSeqLetter(hS.charAt(i))) { match+=(1/3.0); } if(isMidMatch(mid.charAt(i))) { match+=(1/3.0); } } if(isSeqLetter(qS.charAt(i)) ) { left.query+=this.queryShift(); } if(isSeqLetter(hS.charAt(i))) { left.hit+=this.hitShift(); } } int score= (int)((match/len)*1000f); if(score<0) score=0; if(score>1000) { LOG.error("SCORE > 10000 in "+toString()); score=1000; } if(score==0) LOG.info("score==0 "+match+"/"+len); return score; } public final boolean isFeatureOverlapHsp() { if(featureEnd0()<=getHspQueryStart0()) return false; if(featureStart0()>=getHspQueryEnd0()) return false; return true; } public abstract Color getColor(); public final String getBedColor() { Color c= getColor(); if(c==null) c=Color.WHITE; return ""+c.getRed()+","+c.getGreen()+","+c.getBlue(); } public final String getChrom() { return hit.getHitDef(); } protected abstract int featureStart0(); protected abstract int featureEnd0(); public abstract String getBedName(); public abstract int getBedStart(); public abstract int getBedEnd(); public abstract char getBedStrand(); private String normalizeName(String s) { return s.replaceAll("[ \t,]+","_"); } public final String toBedString() { StringBuilder b=new StringBuilder(); b.append(normalizeName(getChrom())).append('\t'); b.append(getBedStart()).append('\t'); b.append(getBedEnd()).append('\t'); b.append(normalizeName(getBedName())).append('\t'); b.append(getBedScore()).append('\t'); b.append(getBedStrand()).append('\t'); b.append(getBedStart()).append('\t'); b.append(getBedEnd()).append('\t'); b.append(getBedColor()).append('\t'); b.append(1).append('\t'); b.append(getBedEnd()-getBedStart()).append('\t'); b.append(0); return b.toString(); } } /**=================================================================================== * Interval for Uniprot */ private class UniprotInterval extends Interval { Entry entry; FeatureType featureType; public int getEntryStart1() { LocationType lt=featureType.getLocation(); if(lt==null) throw new IllegalStateException(); if(lt.getPosition()!=null) { return lt.getPosition().getPosition().intValue(); } else if(lt.getBegin()!=null) { return lt.getBegin().getPosition().intValue(); } else { throw new IllegalStateException(); } } public int getEntryEnd1() { LocationType lt=featureType.getLocation(); if(lt==null) throw new IllegalStateException(); if(lt.getPosition()!=null) { return lt.getPosition().getPosition().intValue(); } else if(lt.getEnd()!=null) { return lt.getEnd().getPosition().intValue(); } else { throw new IllegalStateException(); } } @Override public String toString() { return "start1:"+getEntryStart1()+" end1:"+getEntryEnd1()+" acn:"+getBedName()+" start0:"+getEntryStart0()+" end0:"+getEntryEnd0()+"\n"+ " hsp.start:"+getHspStart1()+" hsp.end:"+getHspEnd1()+" hsp.foward:"+isHspForward()+"\nHSP:overlap-gb:"+isFeatureOverlapHsp(); } private String entryName() { for(String s:entry.getName()) return s; for(String s:entry.getAccession()) return s; throw new IllegalStateException(); } @Override public String getBedName() { String s=this.featureType.getType(); if(this.featureType.getDescription()!=null && !s.equals("sequence conflict")) { s= this.featureType.getDescription(); } if(s.equals("sequence conflict") && featureType.getOriginal()!=null && featureType.getVariation()!=null && !featureType.getVariation().isEmpty()) { s+=":"+featureType.getOriginal()+"/"; for(String v:featureType.getVariation()) { if(!s.endsWith("/")) s+=","; s+=v; } } if(!APPEND_ACN) return s; return entryName()+":"+s; } private int getEntryStart0() { return getEntryStart1()-1; } private int getEntryEnd0() { return getEntryEnd1(); } @Override protected int featureEnd0() { return getEntryEnd1(); } @Override protected int featureStart0() { return getEntryStart0(); } @Override public int getBedStart() { return Math.min(convertQuery(getEntryStart1()).hit,convertQuery(getEntryEnd1()).hit)-1; } @Override public int getBedEnd() { int bedEnd= Math.max(convertQuery(getEntryStart1()).hit,convertQuery(getEntryEnd1()).hit); if(bedEnd==getBedStart()) { LOG.warning("Empty bed:"); } return bedEnd; } @Override public char getBedStrand() { int str=1; if(!isHspForward()) str*=-1; return str==1?'+':'-'; } @Override public Color getColor() { String fkey="";//TODO if(fkey.equals("CDS")) { return Color.YELLOW; } if(fkey.equals("gene")) { return Color.ORANGE; } if(fkey.equals("gene")) { return Color.GREEN; } return Color.WHITE; } } static enum QualKey { region_name,product,mol_type,gene,locus_tag,site_type,note }; /**=================================================================================== * Interval for Genbank */ private class GenbankInterval extends Interval { GBSeq gbSeq; GBInterval gbInterval; GBFeature gbFeature; public int getGBStart1() { if(gbInterval.getGBIntervalPoint()!=null) { return Integer.parseInt(gbInterval.getGBIntervalPoint()); } else if(gbInterval.getGBIntervalFrom()!=null) { return Integer.parseInt(gbInterval.getGBIntervalFrom()); } else { throw new IllegalStateException(); } } public int getGBEnd1() { if(gbInterval.getGBIntervalPoint()!=null) { return Integer.parseInt(gbInterval.getGBIntervalPoint()); } else if(gbInterval.getGBIntervalTo()!=null) { return Integer.parseInt(gbInterval.getGBIntervalTo()); } else { throw new IllegalStateException(); } } @Override public String toString() { return "start1:"+getGBStart1()+" end1:"+getGBEnd1()+" forward:"+isGbForward()+" acn:"+getBedName()+" start0:"+getGBStart0()+" end0:"+getGBEnd0()+"\n"+ " hsp.start:"+getHspStart1()+" hsp.end:"+getHspEnd1()+" hsp.foward:"+isHspForward()+"\nHSP:overlap-gb:"+isFeatureOverlapHsp(); } private String gbName() { String s=null; s=gbSeq.getGBSeqLocus(); if(s!=null) return s; s=gbSeq.getGBSeqAccessionVersion(); if(s!=null) return s; s=gbSeq.getGBSeqEntryVersion(); if(s!=null) return s; return "?"; } @Override public String getBedName() { String fkey=this.gbFeature.getGBFeatureKey(); TreeMap<QualKey, String> t=new TreeMap<QualKey, String>(); if(this.gbFeature.getGBFeatureQuals()!=null && this.gbFeature.getGBFeatureQuals().getGBQualifier()!=null ) { for(GBQualifier q:this.gbFeature.getGBFeatureQuals().getGBQualifier()) { String key=q.getGBQualifierName(); for(QualKey qk:QualKey.values()) { if(qk.name().equals(key)) { t.put(qk, q.getGBQualifierValue()); break; } } } } if(t.isEmpty()) { LOG.info("not qual for "+fkey); if(!APPEND_ACN) return fkey; return gbName()+":"+fkey; } if(!APPEND_ACN) return fkey+":"+t.get(t.keySet().iterator().next()); return gbName()+":"+fkey+":"+t.get(t.keySet().iterator().next()); } public boolean isGbForward() { return getGBStart1()<=getGBEnd1(); } private int getGBStart0() { return Math.min(getGBStart1(), getGBEnd1())-1; } private int getGBEnd0() { return Math.max(getGBStart1(), getGBEnd1()); } @Override protected int featureEnd0() { return getGBEnd0(); } @Override protected int featureStart0() { return getGBStart0(); } @Override public int getBedStart() { return Math.min(convertQuery(getGBStart1()).hit,convertQuery(getGBEnd1()).hit)-1; } @Override public int getBedEnd() { return Math.max(convertQuery(getGBStart1()).hit,convertQuery(getGBEnd1()).hit); } @Override public char getBedStrand() { int str=1; if(!isGbForward()) str*=-1; if(!isHspForward()) str*=-1; return str==1?'+':'-'; } @Override public Color getColor() { String fkey=this.gbFeature.getGBFeatureKey(); if(fkey.equals("CDS")) { return Color.YELLOW; } if(fkey.equals("gene")) { return Color.ORANGE; } if(fkey.equals("gene")) { return Color.GREEN; } return Color.WHITE; } } private void printUniprot(Uniprot uniprotSet) { if(uniprotSet.getEntry().isEmpty()) { LOG.warn("empty uniprot entry."); return; } if(uniprotSet.getEntry().size()>1) { LOG.warn("entry contains more than one sequence."); } for(Entry entry:uniprotSet.getEntry()) { BlastOutputIterations iterations=this.blastOutput.getBlastOutputIterations(); for(Iteration iteration:iterations.getIteration()) { for(FeatureType feature:entry.getFeature()) { if(!acceptfeature(feature.getType())) continue; for(Hit hit:iteration.getIterationHits().getHit()) { for(Hsp hsp :hit.getHitHsps().getHsp()) { UniprotInterval bi=new UniprotInterval(); bi.entry=entry; bi.featureType=feature; bi.hit=hit; bi.hsp=hsp; LOG.debug("interval "+bi); if(!bi.isFeatureOverlapHsp()) { continue; } stdout().println(bi.toBedString()); } } } break; } break; } //System.err.println("OK"); } private void printGB(GBSet gbSet) { for(GBSeq gbSeq:gbSet.getGBSeq()) { BlastOutputIterations iterations=this.blastOutput.getBlastOutputIterations(); for(Iteration iteration:iterations.getIteration()) { for(GBFeature feature:gbSeq.getGBSeqFeatureTable().getGBFeature()) { if(feature.getGBFeatureIntervals()==null) continue; if(!acceptfeature(feature.getGBFeatureKey())) continue; for(GBInterval interval:feature.getGBFeatureIntervals().getGBInterval()) { for(Hit hit:iteration.getIterationHits().getHit()) { for(Hsp hsp :hit.getHitHsps().getHsp()) { GenbankInterval bi=new GenbankInterval(); bi.gbSeq=gbSeq; bi.gbFeature=feature; bi.gbInterval=interval; bi.hit=hit; bi.hsp=hsp; LOG.debug("interval "+bi); if(!bi.isGbForward()) LOG.info("CHECK INTERVAL REVERSE"); if(!bi.isFeatureOverlapHsp()) continue; stdout().println(bi.toBedString()); } } } } break; } } //System.err.println("OK"); } @Override public int doWork(List<String> args) { try { /** xml parser */ DocumentBuilder docBuilder; /** transforms XML/DOM to GBC entry */ Unmarshaller unmarshaller; //create a DOM parser DocumentBuilderFactory f=DocumentBuilderFactory.newInstance(); f.setCoalescing(true); //f.setNamespaceAware(true); no, why does it break the parsing of uniprot ?? f.setValidating(false); f.setExpandEntityReferences(true); docBuilder= f.newDocumentBuilder(); docBuilder.setEntityResolver(new EntityResolver() { @Override public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException { return new InputSource(new StringReader("")); } }); //create a Unmarshaller for NCBI JAXBContext jc = JAXBContext.newInstance("gov.nih.nlm.ncbi.gb:gov.nih.nlm.ncbi.blast:org.uniprot"); unmarshaller=jc.createUnmarshaller(); LOG.info("reading entry "+IN); Document domEntry=docBuilder.parse(IN); GBSet gbSet=null; Uniprot uniprotSet=null; if("GBSet".equals(domEntry.getDocumentElement().getNodeName())) { LOG.info("parsing as GBSet"); gbSet=unmarshaller.unmarshal(domEntry,GBSet.class).getValue(); } else if("uniprot".equals(domEntry.getDocumentElement().getNodeName())) { LOG.info("parsing as Uniprot "+domEntry.getDocumentElement()); uniprotSet=unmarshaller.unmarshal(domEntry,Uniprot.class).getValue(); //LOG.info(uniprotSet.getEntry().size()); //jc.createMarshaller().marshal(uniprotSet, System.err); } else { LOG.info("unknown root element:"+domEntry.getDocumentElement().getNodeName()); return -1; } Document blastDom; if(args.size()==1) { LOG.info("reading "+args.get(0)); blastDom=docBuilder.parse(new File(args.get(0))); } else if(args.isEmpty()) { LOG.info("reading from stdin"); blastDom=docBuilder.parse(stdin()); } else { LOG.error("Illegal number of args"); return -1; } this.blastOutput=unmarshaller.unmarshal(blastDom,BlastOutput.class).getValue(); if(uniprotSet!=null) printUniprot(uniprotSet); if(gbSet!=null) printGB(gbSet); return 0; } catch(Exception err) { LOG.error(err); return -1; } } private boolean acceptfeature(String s) { if(s==null || s.isEmpty()) return false; if(!this.INCL.isEmpty()) { if(!INCL.contains(s)) { return false; } } if(!this.EXCL.isEmpty()) { if(EXCL.contains(s)) { return false; } } return true; } public static void main(String[] args) { /* force javac to compile those */ @SuppressWarnings("unused") gov.nih.nlm.ncbi.blast.ObjectFactory of1=null; @SuppressWarnings("unused") gov.nih.nlm.ncbi.gb.ObjectFactory of2=null; @SuppressWarnings("unused") org.uniprot.ObjectFactory of3=null; new BlastMapAnnotations().instanceMainWithExit(args); } }