package converters.dbgap; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.bind.JAXBException; import org.apache.log4j.Logger; import org.molgenis.core.OntologyTerm; import org.molgenis.organization.Investigation; import org.molgenis.pheno.Category; import org.molgenis.pheno.Measurement; import org.molgenis.pheno.ObservedValue; import org.molgenis.pheno.Panel; import org.molgenis.protocol.Protocol; import app.CsvExport; import converters.dbgap.jaxb.Study; import converters.dbgap.jaxb.data_dict.Data_Dict; import converters.dbgap.jaxb.data_dict.Value; import converters.dbgap.jaxb.data_dict.Variable; import converters.dbgap.jaxb.var_report.Stat; import converters.dbgap.jaxb.var_report.Var_Report; import converters.dbgap.jaxb.var_report.VariableSummary; public class DbGapToPheno { static Logger logger = Logger.getLogger(DbGapToPheno.class); List<Investigation> investigations = new ArrayList<Investigation>(); List<Protocol> protocols = new ArrayList<Protocol>(); Map<String, Measurement> measurements = new LinkedHashMap<String, Measurement>(); Set<OntologyTerm> ontologyterms = new HashSet<OntologyTerm>(); Map<String, Category> categories = new TreeMap<String, Category>(); List<Panel> panels = new ArrayList<Panel>(); List<ObservedValue> observedValues = new ArrayList<ObservedValue>(); public static void main(String[] args) throws Exception { // This will need updating if run on a different machine // String outputFolder = "d:/Data/dbgap/"; String outputFolder = "../pheno_data/dbgap/"; String dbgapUrl = outputFolder + "FTP_Table_of_Contents.xml"; DbGapService dbgap = new DbGapService(new File(dbgapUrl).toURI().toURL(), new File(outputFolder)); int count = 1; // get studies List<Study> studies = dbgap.listStudies(); // filter out last versions only Map<String, Study> lastversions = new TreeMap<String, Study>(); for (Study s : studies) { // System.out.println("filtering "+ s.id + " "+s.version+ " " // +s.description); // pht000182 no v if (s.version.startsWith("v") && (lastversions.get(s.id) == null || extractVersion(lastversions.get(s.id).version) < extractVersion(s.version))) { lastversions.put(s.id, s); } } // caching all files // for (Data_Dict vr : dbgap.listDictionaries()) // { // File f = new File(outputFolder + "download/" + vr.id + ".xml"); // if(!f.exists()) downloadFile(vr.url,f); // } // for (Var_Report vr : dbgap.listVariableReports()) // { // File f = new File(outputFolder + "download/" + vr.dataset_id + // ".xml"); // if(!f.exists()) downloadFile(vr.url, f); // } // download the last versions System.out.println("lastversions = " + lastversions.size()); for (Study s : lastversions.values()) { DbGapToPheno converter = new DbGapToPheno(); // writing the data_dicts File dir = new File(outputFolder + s.id); System.out.println("converting " + s.id + " " + s.version + " " + s.description + " to " + dir); dbgap.loadDictionaries(s); dbgap.loadVariableReports(s); converter.read(s); dir.mkdirs(); // System.out.println(convertor.toString()); converter.write(dir); // debug purposes only count++; if (count > 6) { System.out.println("skipped other studies!"); break; } } } public void write(File dir) throws Exception { new CsvExport().exportAll(dir, investigations, new ArrayList<OntologyTerm>(ontologyterms), protocols, new ArrayList(measurements.values()), new ArrayList(categories.values()), panels, observedValues); } public static void downloadFile(URL url, File destination) throws IOException { logger.debug("downloading " + url + " to " + destination); BufferedInputStream in = null; BufferedOutputStream out = null; try { URLConnection urlc = url.openConnection(); in = new BufferedInputStream(urlc.getInputStream()); out = new BufferedOutputStream(new FileOutputStream(destination)); byte[] buf = new byte[1024]; int len; while ((len = in.read(buf)) > 0) { out.write(buf, 0, len); } } finally { if (in != null) try { in.close(); } catch (IOException ioe) { ioe.printStackTrace(); } if (out != null) try { out.close(); } catch (IOException ioe) { ioe.printStackTrace(); } } } /** * Read the dbGaP study into the convertor * * @param s * @throws JAXBException * @throws IOException */ public void read(Study s) throws JAXBException, IOException { Investigation i = new Investigation(); i.setDescription(s.description); i.setName(s.id + "." + s.version); investigations.add(i); // data dictionaries = protocols + variables + features (+ ontologies) for (Data_Dict dd : s.dictionaries) { Protocol p = new Protocol(); p.setName(dd.description); p.setInvestigation_Name(i.getName()); p.setName(dd.id); protocols.add(p); for (Variable var : dd.variables) { Measurement measurement = new Measurement(); measurement.setInvestigation_Name(i.getName()); measurement.setName(var.name.toLowerCase()); measurement.setDescription(var.description); // todo: add annotation feature NVT type? if (var.type != null && !var.type.equals("")) { String dataType; // available types: // xref,string,categorical,datetime,int,code,image,decimal,bool,file,log,data if (var.type.contains("decimal")) { // that handles 'decimal, encoded' dataType = "decimal"; } else if (var.type.contains("integer")) { // same for encoded and enumerated dataType = "int"; } else { dataType = "string"; } measurement.setDataType(dataType); // f.set__Type(var.type); } if (var.logical_min != null) measurement.setDescription(measurement.getDescription() + " LogicalMin=" + var.logical_min + "."); if (var.logical_min != null) measurement.setDescription(measurement.getDescription() + " LogicalMax=" + var.logical_max + "."); measurement.setUnit_Name(var.unit); if (var.unit != null && !var.unit.equals("")) { OntologyTerm ot = new OntologyTerm(); ot.setName(var.unit); ontologyterms.add(ot); } measurements.put(measurement.getName(), measurement); p.getFeatures_Name().add(measurement.getName()); // if (var.unit != null && terms.get(var.unit) == null) { // Category t = new Category(); // t.setName(var.unit); // t.setCode_String(var.unit); // t.setLabel(var.unit); // t.setDescription("N/A."); // t.setInvestigation_Name(i.getName()); // // //link category to measurement // measurement.getCategories_Name().add(t.getName()); // //t.getFeature_Name().add(f.getName()); // // // t.setInvestigationLabel(i.getName()); // // if (terms.containsKey(var.unit)) // logger.warn("duplicate term " + var.unit); // terms.put(var.unit, t); // } if (var.values.size() > 0) { for (Value v : var.values) { Category category = new Category(); category.setCode_String(v.code); category.setLabel(v.value); category.setDescription("NA"); category.setName(measurement.getName() + "_" + v.code); category.setInvestigation_Name(i.getName()); categories.put(category.getName(), category); // code.getFeature_Name().add(f.getName()); measurement.getCategories_Name().add(category.getName()); // give error on duplicate term if (v.code == null) { logger.warn("empty code on " + v.value); } if (v.code != null && categories.containsKey(v.code)) { logger.warn("duplicate term " + v.code); if (v.code != null) { categories.put(v.code, category); } } } } } } // var report = observedValues, protocolApplication, panels Panel total_panel = new Panel(); total_panel.setName("total"); total_panel.setInvestigation_Name(i.getName()); Panel cases_panel = new Panel(); cases_panel.setName("cases"); cases_panel.setInvestigation_Name(i.getName()); Panel controls_panel = new Panel(); controls_panel.setName("controls"); controls_panel.setInvestigation_Name(i.getName()); this.panels.add(total_panel); this.panels.add(cases_panel); this.panels.add(controls_panel); for (Var_Report vr : s.reports) { logger.debug("var_report " + vr.dataset_id); for (VariableSummary vs : vr.variables) { if (vs.total != null) addStatsToPanel(total_panel, i, vs, vs.total.stats); if (vs.cases != null) addStatsToPanel(cases_panel, i, vs, vs.cases.stats); if (vs.controls != null) addStatsToPanel(controls_panel, i, vs, vs.controls.stats); } } } /** Is there an ontology for these stat terms? */ private void addStatsToPanel(Panel panel, Investigation investigation, VariableSummary vs, List<Stat> stats) { for (Stat stat : stats) { if (stat.n != null) addObservedValue(panel, investigation, vs, stat.n, "n"); if (stat.nulls != null) addObservedValue(panel, investigation, vs, stat.nulls, "nulls"); if (stat.invalid_values != null) addObservedValue(panel, investigation, vs, stat.invalid_values, "invalid_values"); if (stat.special_values != null) addObservedValue(panel, investigation, vs, stat.special_values, "special_values"); if (stat.mean != null) addObservedValue(panel, investigation, vs, stat.mean, "mean"); if (stat.mean_count != null) addObservedValue(panel, investigation, vs, stat.mean_count, "mean_count"); if (stat.sd != null) addObservedValue(panel, investigation, vs, stat.sd, "sd"); if (stat.median != null) addObservedValue(panel, investigation, vs, stat.median, "median"); if (stat.median_count != null) addObservedValue(panel, investigation, vs, stat.median_count, "median_count"); if (stat.min != null) addObservedValue(panel, investigation, vs, stat.min, "min"); if (stat.min_count != null) addObservedValue(panel, investigation, vs, stat.min_count, "min_count"); if (stat.max != null) addObservedValue(panel, investigation, vs, stat.max, "max"); if (stat.max_count != null) addObservedValue(panel, investigation, vs, stat.max_count, "max_count"); } } private void addObservedValue(Panel p, Investigation i, VariableSummary vs, String value, String inferenceType) { Measurement inference = measurements.get(inferenceType); if (inference == null) { inference = new Measurement(); inference.setName(inferenceType); inference.setDescription("N/A."); inference.setInvestigation_Name(i.getName()); // t.setInvestigation_Name(i.getName()); measurements.put(inference.getName(), inference); } Measurement feature = measurements.get(vs.var_name); if (feature == null) { logger.warn("var_name '" + vs.var_name + "' not found. Is it missing in dictionary? We add it now..."); feature = new Measurement(); feature.setName(vs.var_name); feature.setDataType(vs.description); if ("integer".equals(vs.calculated_type) || "enum_integer".equals(vs.calculated_type)) { feature.setDataType("int"); } else if ("decimal".equals(vs.calculated_type)) { feature.setDataType("decimal"); } else if ("string".equals(vs.calculated_type)) { feature.setDataType("string"); } else { logger.error("cannot get data type " + vs.calculated_type); } measurements.put(feature.getName(), feature); } if (feature.getName().contains("Specific diagnosis Mutation")) { logger.debug("found"); } ObservedValue v = new ObservedValue(); v.setInvestigation_Name(i.getName()); v.setFeature_Name(feature.getName()); v.setValue(value); v.setRelation_Name(inferenceType); // v.setInferenceTypeLabel(inferenceType); v.setTarget_Name(p.getName()); // System.out.println("inferfed value " + v); this.observedValues.add(v); } public String toString() { String result = ""; for (Investigation i : investigations) result += i + "\n"; for (Protocol p2 : protocols) result += p2 + "\n"; for (Measurement f2 : measurements.values()) result += f2 + "\n"; for (Category t : categories.values()) result += t + "\n"; for (Panel p : panels) result += p + "\n"; for (ObservedValue i : observedValues) result += i + "\n"; return result; } public static Integer extractVersion(String s) { Pattern p = Pattern.compile("\\d+$"); Matcher m = p.matcher(s); m.find(); return Integer.parseInt(m.group()); } }