package converters.dbgap; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.TreeMap; import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBException; import javax.xml.bind.Unmarshaller; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.w3c.dom.Document; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import converters.dbgap.jaxb.Study; import converters.dbgap.jaxb.data_dict.Data_Dict; import converters.dbgap.jaxb.var_report.Var_Report; /** * Used primarily from the DbGapToPheno to access dbGaP ftp and download data */ public class DbGapService { private Document document; private boolean debug = false; private List<Study> studyCache = null; private List<Data_Dict> dictionaryCache = null; private List<Var_Report> reportCache = null; private File fileCache = null; public DbGapService(URL url, File cache) throws ParserConfigurationException, SAXException, IOException { DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); document = builder.parse(url.openStream()); if (cache != null && !cache.exists()) throw new IOException("cache folder " + cache + " doesn't exist"); fileCache = cache; } /** * List all studies in dbGaP by browsing the FTP_Table_of_Contents.xml file. * Alternative versions are listed as separated studies. */ public List<Study> listStudies() throws ParserConfigurationException, SAXException, IOException { if (studyCache == null) { Map<String, Study> studies = new TreeMap<String, Study>(); for (Data_Dict d : listDictionaries()) { if (studies.get(d.study_id) == null) { Study s = new Study(); s.id = d.study_id; s.version = d.study_version; // find description NodeList dirs = document.getElementsByTagName("directory"); for (int i = 0; i < dirs.getLength(); i++) { String name = dirs.item(i).getAttributes().getNamedItem("name").getNodeValue(); if (name.startsWith(s.id)) { if (dirs.item(i).getAttributes().getNamedItem("comment") != null) s.description = dirs .item(i).getAttributes().getNamedItem("comment").getNodeValue(); } } studies.put(d.study_id + "." + d.study_version, s); } } System.out.println("listStudies: " + studies.values().size()); studyCache = new ArrayList<Study>(studies.values()); } return studyCache; } public List<Data_Dict> listDictionaries() throws SAXException, IOException, ParserConfigurationException { if (dictionaryCache == null) { List<Data_Dict> result = new ArrayList<Data_Dict>(); // find files and filter on *data_dict*.xml NodeList files = document.getElementsByTagName("file"); for (int i = 0; i < files.getLength(); i++) { String name = files.item(i).getAttributes().getNamedItem("name").getNodeValue(); String link = files.item(i).getAttributes().getNamedItem("link").getNodeValue(); String comment = null; if (files.item(i).getAttributes().getNamedItem("comment") != null) comment = files.item(i) .getAttributes().getNamedItem("comment").getNodeValue(); if (name.contains("data_dict") && name.endsWith(".xml") && !link.contains("Archive")) { Data_Dict dd = new Data_Dict(); String[] ids = name.split("\\."); dd.description = comment; dd.study_id = ids[0]; dd.study_version = ids[1]; dd.id = ids[2]; dd.version = ids[3]; dd.url = new URL(link); result.add(dd); } } System.out.println("listDictionaries: " + result.size()); dictionaryCache = result; } return dictionaryCache; } public List<Var_Report> listVariableReports() throws SAXException, IOException, ParserConfigurationException { if (reportCache == null) { List<Var_Report> result = new ArrayList<Var_Report>(); // find files and filter on *var_report*.xml NodeList files = document.getElementsByTagName("file"); for (int i = 0; i < files.getLength(); i++) { String name = files.item(i).getAttributes().getNamedItem("name").getNodeValue(); String link = files.item(i).getAttributes().getNamedItem("link").getNodeValue(); String comment = null; if (files.item(i).getAttributes().getNamedItem("comment") != null) comment = files.item(i) .getAttributes().getNamedItem("comment").getNodeValue(); if (name.contains("var_report") && name.endsWith(".xml") && !link.contains("Archive")) { Var_Report vr = new Var_Report(); String[] ids = name.split("\\."); vr.description = comment; vr.study_id = ids[0]; vr.study_version = ids[1]; vr.dataset_id = ids[2]; vr.version = ids[3]; vr.url = new URL(link); result.add(vr); } } System.out.println("listVariableReports: " + result.size()); reportCache = result; } return reportCache; } public void loadDictionaries(Study s) throws JAXBException, IOException, SAXException, ParserConfigurationException, InterruptedException { for (Data_Dict d : listDictionaries()) { // System.out.println("testing "+ d.study_id +" = "+s.id + // " having "+d.url); if (s.id.equals(d.study_id) && s.version.equals(d.study_version)) { Data_Dict loaded = loadDictionary(d); loaded.description = d.description; loaded.url = d.url; loaded.study_id = d.study_id; s.dictionaries.add(loaded); } } System.out.println("loadDictionaries: " + s.dictionaries.size() + " loaded"); } public void loadVariableReports(Study s) throws JAXBException, IOException, SAXException, ParserConfigurationException, InterruptedException { for (Var_Report r : listVariableReports()) { // System.out.println("testing "+ r.study_id +" againsts "+s.id); if (r.study_id.equals(s.id) && s.version.equals(r.study_version)) { Var_Report loaded = loadVariableReport(r); loaded.description = r.description; loaded.url = r.url; loaded.study_id = r.study_id; s.reports.add(loaded); if (debug) break; } } System.out.println("loadVariableReports: " + s.reports.size() + " loaded"); } // public void loadSummaries(Study s) throws JAXBException, IOException, // SAXException, ParserConfigurationException, // InterruptedException // { // for (Data_Dict d : listVariableReports()) // { // if (d.study_id.equals(s.id)) // { // Data_Dict loaded = loadDictionary(d.url); // loaded.description = d.description; // loaded.url = d.url; // loaded.study_id = d.study_id; // s.dictionaries.add(loaded); // Thread.sleep(1000); // } // } // System.out.println("loadSummaries(): " + s.dictionaries.size()); // // } public Var_Report loadVariableReport(Var_Report r) throws JAXBException, IOException { System.out.println("loadVariableReport from " + r.url); URL url = r.url; if (fileCache != null) { File cachedFile = new File(fileCache.getAbsolutePath() + "\\" + new File(new File(r.url.getFile()).getName())); if (!cachedFile.exists()) downloadFile(r.url, cachedFile); url = cachedFile.toURI().toURL(); } JAXBContext jaxbContext = JAXBContext.newInstance("converters.dbgap.jaxb.var_report"); Unmarshaller m = jaxbContext.createUnmarshaller(); return (Var_Report) m.unmarshal(url.openStream()); } public Data_Dict loadDictionary(Data_Dict d) throws JAXBException, IOException { System.out.println("loadDictionary from " + d.url); URL url = d.url; if (fileCache != null) { File cachedFile = new File(fileCache.getAbsolutePath() + "\\" + new File(new File(d.url.getFile()).getName())); if (!cachedFile.exists()) downloadFile(d.url, cachedFile); url = cachedFile.toURI().toURL(); } JAXBContext jaxbContext = JAXBContext.newInstance("converters.dbgap.jaxb.data_dict"); Unmarshaller m = jaxbContext.createUnmarshaller(); return (Data_Dict) m.unmarshal(url.openStream()); } public static void downloadFile(URL url, File destination) throws IOException { System.out.println("downloading " + url + " to " + destination); BufferedInputStream in = null; BufferedOutputStream out = null; try { URLConnection urlc = url.openConnection(); in = new BufferedInputStream(urlc.getInputStream()); out = new BufferedOutputStream(new FileOutputStream(destination)); byte[] buf = new byte[1024]; int len; while ((len = in.read(buf)) > 0) { out.write(buf, 0, len); } } finally { if (in != null) try { in.close(); } catch (IOException ioe) { ioe.printStackTrace(); } if (out != null) try { out.close(); } catch (IOException ioe) { ioe.printStackTrace(); } } // because of NCBI abuse rules try { Thread.sleep(500); } catch (InterruptedException e) { e.printStackTrace(); } } }