package com.github.lindenb.jvarkit.tools.evs2bed; import java.io.InputStream; import java.io.PrintStream; import java.io.StringWriter; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.List; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.soap.SOAPConstants; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import com.beust.jcommander.Parameter; import com.github.lindenb.jvarkit.util.jcommander.Launcher; import com.github.lindenb.jvarkit.util.jcommander.Program; import com.github.lindenb.jvarkit.util.log.Logger; @Program(name="evs2bed",description= "Download data from EVS http://evs.gs.washington.edu/EVS as a BED chrom/start/end/XML For later use, see VCFTabixml.") public class DumpExomeVariantServerData extends Launcher { private static final Logger LOG=Logger.build(DumpExomeVariantServerData.class).make(); @Parameter(names="-N",description=" download using a step of 'N' bases") private int STEP_SIZE=25000; @Parameter(names="-L",description="limit to L records (for debugging)") private long LIMIT=-1L; private long count_records=0L; private long genome_total_size=0L; private long genome_curr_size=0L; public static final String EVS_NS="http://webservice.evs.gs.washington.edu/"; private DocumentBuilder documentBuilder; private Transformer transformer; private DumpExomeVariantServerData() { } private static Element first(Element root,String namespaceuri,String localName) { if(root==null) return null; for(Node n=root.getFirstChild();n!=null;n=n.getNextSibling()) { if(n.getNodeType()!=Node.ELEMENT_NODE) continue; if(namespaceuri!=null && !namespaceuri.equals(n.getNamespaceURI())) continue; if(namespaceuri!=null && !localName.equals(n.getLocalName())) continue; if(namespaceuri==null && !localName.equals(n.getNodeName())) continue; return Element.class.cast(n); } return null; } private final int MAX_TRY=10; private Element fetchEvsData(String chrom,int start,int end) { double ratio=100.0*(this.genome_curr_size+start)/(double)this.genome_total_size; LOG.info(chrom+":"+start+"-"+end+ " N="+count_records+" "+(int)ratio+"%"); try { URL url = new URL("http://gvs-1.gs.washington.edu/wsEVS/EVSDataQueryService"); // Send data URLConnection conn = null; for(int n_try=0;n_try<MAX_TRY;++n_try) { try { conn=url.openConnection(); } catch(java.net.ConnectException err) { if(n_try+1==MAX_TRY) throw err; System.err.println( "Error: trying "+(n_try)+"/"+MAX_TRY+" "+url ); } } conn.setDoOutput(true); PrintStream wr=new PrintStream(conn.getOutputStream()); wr.print("<?xml version='1.0' ?>"+ "<S:Envelope xmlns:S='http://schemas.xmlsoap.org/soap/envelope/'>"+ "<S:Body>"+ "<ns2:getEvsData xmlns:ns2='http://webservice.evs.gs.washington.edu/'>"+ "<arg0>" ); wr.print(chrom); wr.print(":"); wr.print(String.valueOf(start)); wr.print("-"); wr.print(String.valueOf(end)); wr.print("</arg0>"+ "</ns2:getEvsData>"+ "</S:Body>"+ "</S:Envelope>" ); wr.flush(); InputStream rd = conn.getInputStream(); Document dom=this.documentBuilder.parse(rd); wr.close(); rd.close(); Element e=first(dom.getDocumentElement(), SOAPConstants.URI_NS_SOAP_ENVELOPE, "Body"); e=first(e, EVS_NS, "getEvsDataResponse"); e=first(e, null, "return"); if(e==null) return null; return e; } catch(Exception err) { err.printStackTrace(); return null; } } private class Fetcher { String chrom; int length; Fetcher(String chrom,int length) { this.chrom=chrom; this.length=length; } public void run() throws Exception { if(DumpExomeVariantServerData.this.LIMIT>0 && DumpExomeVariantServerData.this.count_records>=DumpExomeVariantServerData.this.LIMIT) return; final int step=DumpExomeVariantServerData.this.STEP_SIZE; int start=1; do { Element root=fetchEvsData(chrom,start,start+step+10); for(Node n=(root==null?null:root.getFirstChild());n!=null;n=n.getNextSibling()) { if(n.getNodeType()!=Node.ELEMENT_NODE) continue; if(!n.getNodeName().equals("snpList")) continue; String chromosome=null; String chrPosition=null; for(Node n2=n.getFirstChild();n2!=null;n2=n2.getNextSibling()) { if(n2.getNodeType()!=Node.ELEMENT_NODE) continue; if(n2.getNodeName().equals("chromosome")) { chromosome=n2.getTextContent(); } else if(n2.getNodeName().equals("chrPosition")) { chrPosition=n2.getTextContent(); } } count_records++; if(LIMIT>0 && count_records>=LIMIT) break; StringWriter sw=new StringWriter(); transformer.transform( new DOMSource(n), new StreamResult(sw) ); sw.flush(); String xml=sw.toString().replace("\n", ""); System.out.print(chromosome); System.out.print('\t'); System.out.print(Integer.parseInt(chrPosition)-1); System.out.print('\t'); System.out.print(chrPosition); System.out.print('\t'); System.out.println(xml); } start+=step; if(LIMIT>0 && count_records>=LIMIT) break; } while(start<=length); } } private Fetcher fetch(String chrom,int length) throws Exception { return new Fetcher(chrom, length); } private int doWork() { try { DocumentBuilderFactory f=DocumentBuilderFactory.newInstance(); f.setCoalescing(true); f.setNamespaceAware(true); f.setValidating(false); f.setExpandEntityReferences(true); f.setIgnoringComments(false); this.documentBuilder= f.newDocumentBuilder(); TransformerFactory factory=TransformerFactory.newInstance(); this.transformer=factory.newTransformer(); this.transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); List<Fetcher> fetchers=new ArrayList<Fetcher>(24); fetchers.add( fetch("1",249250621) ); fetchers.add( fetch("2",243199373) ); fetchers.add( fetch("3",198022430) ); fetchers.add( fetch("4",191154276) ); fetchers.add( fetch("5",180915260) ); fetchers.add( fetch("6",171115067) ); fetchers.add( fetch("7",159138663 )); fetchers.add( fetch("8",146364022) ); fetchers.add( fetch("9",141213431) ); fetchers.add( fetch("10",135534747) ); fetchers.add( fetch("11",135006516) ); fetchers.add( fetch("12",133851895) ); fetchers.add( fetch("13",115169878) ); fetchers.add( fetch("14",107349540) ); fetchers.add( fetch("15",102531392) ); fetchers.add( fetch("16",90354753) ); fetchers.add( fetch("17",81195210) ); fetchers.add( fetch("18",78077248) ); fetchers.add( fetch("19",59128983) ); fetchers.add( fetch("20",63025520) ); fetchers.add( fetch("21",48129895) ); fetchers.add( fetch("22",51304566) ); fetchers.add( fetch("X",155270560) ); //fetch("Y",59373566); not in evs //fetch("M",16571); this.genome_total_size=0L; this.genome_curr_size=0L; for(Fetcher fetcher: fetchers) { this.genome_total_size += fetcher.length; } for(Fetcher fetcher: fetchers) { fetcher.run(); this.genome_curr_size += fetcher.length; } } catch (Exception e) { e.printStackTrace(); return -1; } return 0; } @Override public int doWork(List<String> args) { try { return doWork(); } catch(Exception err) { LOG.error(err); return -1; } finally { } } public static void main(String[] args) { new DumpExomeVariantServerData().instanceMainWithExit(args); } }