// PathVisio, // a tool for data visualization and analysis using Biological Pathways // Copyright 2006-2009 BiGCaT Bioinformatics // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // package org.bridgedb.util.hmdb; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.LineNumberReader; import java.text.SimpleDateFormat; import java.util.Date; import java.util.zip.ZipFile; import org.bridgedb.IDMapperException; import org.bridgedb.Xref; import org.bridgedb.bio.BioDataSource; import org.bridgedb.rdb.construct.DBConnector; import org.bridgedb.rdb.construct.DataDerby; import org.bridgedb.rdb.construct.GdbConstruct; import org.bridgedb.rdb.construct.GdbConstructImpl3; import org.bridgedb.util.hmdb.ParseHmdb.Compound; import org.bridgedb.util.hmdb.ParseHmdb.ParseException; /** * Program to create a metabolite database based on a * metabocards flat text file, which can be downloaded from http://www.hmdb.ca * * In fall '08 HMDB changed the metabocard file format, * This program is requires the newer format. */ public class Hmdb2Gdb { /** * @param args command line arguments * * Commandline: * - output database: .pgdb * - input metabocards .txt file */ public static void main(String[] args) { String dbname = args[0]; String file = args[1]; Hmdb2Gdb h2g = new Hmdb2Gdb(); try { GdbConstruct simpleGdb = GdbConstructImpl3.createInstance(dbname, new DataDerby(), DBConnector.PROP_RECREATE); h2g.init (dbname, simpleGdb); InputStream is; if (file.toLowerCase().endsWith(".zip")) { ZipFile zip = new ZipFile(file); is = zip.getInputStream(zip.entries().nextElement()); // is = new ZipInputStream(new FileInputStream(new File(file))); } else { is = new FileInputStream (new File (file)); } h2g.run(is); h2g.done(); } catch (IDMapperException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } } GdbConstruct simpleGdb; String dbName; private void init(String dbname, GdbConstruct simpleGdb) throws IDMapperException, ClassNotFoundException { this.simpleGdb = simpleGdb; this.dbName = dbname; // simpleGdb.connect (true); simpleGdb.createGdbTables(); simpleGdb.preInsert(); String dateStr = new SimpleDateFormat("yyyyMMdd").format(new Date()); simpleGdb.setInfo("BUILDDATE", dateStr); simpleGdb.setInfo("DATASOURCENAME", "HMDB"); simpleGdb.setInfo("DATASOURCEVERSION", "metabocards_" + dateStr); simpleGdb.setInfo("DATATYPE", "Metabolite"); simpleGdb.setInfo("SERIES", "standard_metabolite"); } private void done() throws IDMapperException { simpleGdb.commit(); //TODO // System.out.println("total ids in gene table: " + simpleGdb.getGeneCount()); System.out.println("total errors (duplicates): " + error); System.out.println("END processing text file"); System.out.println("Compacting database"); System.out.println("Closing connections"); simpleGdb.finalize(); } int error = 0; int progress = 0; private void addCompound (Compound c) throws IDMapperException { Xref ref = c.idHmdb; error += simpleGdb.addGene(ref); error += simpleGdb.addLink(ref, ref); error += simpleGdb.addAttribute(ref, "Symbol", c.symbol); error += simpleGdb.addAttribute(ref, "BrutoFormula", c.formula); if (c.symbol != null) { // hmdb id is actually also the NUGOWIKI id. Xref right = c.idHmdb; error += simpleGdb.addGene (right); error += simpleGdb.addLink (ref, right); } if (c.inchi != null) { error += simpleGdb.addAttribute(ref, "InChI", c.inchi); } for (Xref right : c.idKegg) { error += simpleGdb.addGene(right); error += simpleGdb.addLink(ref, right); } for (Xref right : c.idChebi) { error += simpleGdb.addGene(right); error += simpleGdb.addLink(ref, right); } for (Xref right : c.idPubchem) { error += simpleGdb.addGene(right); error += simpleGdb.addLink(ref, right); } for (Xref right : c.idCas) { error += simpleGdb.addGene(right); error += simpleGdb.addLink(ref, right); } for (Xref right : c.idWikipedia) { error += simpleGdb.addGene(right); error += simpleGdb.addLink(ref, right); } if (c.smiles != null) { error += simpleGdb.addAttribute(ref, "SMILES", c.smiles); } if (c.synonyms != null) for (String synonym : c.synonyms) { error += simpleGdb.addAttribute(ref, "Synonym", synonym); } } private void run(InputStream is) throws IOException, IDMapperException { ParseHmdb parser = new ParseHmdb(); LineNumberReader br = new LineNumberReader (new InputStreamReader(is)); Compound c; try { while ((c = parser.readNext(br)) != null) { progress++; addCompound (c); if(progress % PROGRESS_INTERVAL == 0) { System.out.println("Processed " + progress + " record"); simpleGdb.commit(); } System.out.println (c.symbol + " added"); } System.out.println ("Total: " + progress); } catch (ParseException pe) { System.err.println (pe.getMessage()); System.err.println ("Please check that this is a valid metabocards file"); pe.printStackTrace(); } } private final static long PROGRESS_INTERVAL = 100; }