/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package act.installer.kegg; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.DataInputStream; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.ggasoftware.indigo.Indigo; import com.ggasoftware.indigo.IndigoInchi; import com.ggasoftware.indigo.IndigoObject; import org.json.JSONObject; import org.json.JSONArray; import act.shared.ConsistentInChI; import act.server.DBIterator; import act.server.MongoDB; import act.shared.Chemical; import act.shared.Reaction; import act.shared.helpers.P; import org.biopax.paxtools.model.level3.ConversionDirectionType; import org.biopax.paxtools.model.level3.StepDirection; public class KeggParser { private static final String keggXrefUrlPrefix = "http://www.kegg.jp/entry/"; /** * All the params are file names from KEGG * See data/kegg * @param reactionList * @param compound * @param reactions * @param cofactors * @throws IOException */ public static void parseKegg(String reactionList, String compoundInchi, String compound, String reactions, String cofactors, MongoDB db) { try { // First figure out what compounds are used in reactions Set<String> requiredKeggCompounds = parseReactions(reactionList, db); // Get kegg compounds to inchi mapping Map<String, String> keggIDInchi = parseChemicalInchis(compoundInchi, db); // Get required kegg compounds to inchi mapping Map<String, String> requiredKeggInchi = new HashMap<String, String>(); for (String c : requiredKeggCompounds) requiredKeggInchi.put(c, keggIDInchi.get(c)); Set<String> cofactorsSet = parseCofactors(cofactors); // Add chemicals to db parseChemicalsDetailed(compound, db, requiredKeggInchi, cofactorsSet); // Add reactions to db parseReactionsDetailed(reactions, db); } catch (IOException e) { e.printStackTrace(); } } /** * @param filename - File containing list of KEGG ids marked as cofactors * @return set of KEGG ids marked as cofactors */ public static Set<String> parseCofactors(String filename) { Set<String> result = new HashSet<String>(); try{ BufferedReader br = new BufferedReader( new InputStreamReader( new DataInputStream(new FileInputStream(filename)))); String strLine = ""; while ((strLine = br.readLine()) != null) { result.add(strLine.trim()); } br.close(); } catch(Exception e) { } return result; } /** * Takes the KEGG reaction.lst file. * @param filename * @param db * @return set of KEGG compounds involved in kegg reactions * @throws IOException */ public static Set<String> parseReactions(String filename, MongoDB db) throws IOException { //set up existing reactions List<Long> ids = db.getAllReactionUUIDs(); Set<Long> cofactorIDs = Chemical.getChemicalIDs(db.getCofactorChemicals()); Set<P> pairs = new HashSet<P>(); for (Long id : ids) { Reaction reaction = db.getReactionFromUUID(id); Set<Long> substrateIDs = new HashSet(Arrays.asList(reaction.getSubstrates())); substrateIDs.removeAll(cofactorIDs); Set<Long> productIDs = new HashSet(Arrays.asList(reaction.getProducts())); productIDs.removeAll(cofactorIDs); pairs.add(new P(substrateIDs, productIDs)); } int numExisting = pairs.size(); Set<String> newReactionsToAdd = new HashSet<String>(); //keggIDs to add Set<String> missingKeggCompounds = new HashSet<String>(); //kegg compound ids we don't have Set<String> reactionsMissingKeggCompounds = new HashSet<String>(); //kegg reactions that require the above ids Set<String> allRequiredKeggCompounds = new HashSet<String>(); Map<String, Long> keggID_ActID = db.getKeggID_ActID(false); try{ BufferedReader br = new BufferedReader( new InputStreamReader( new DataInputStream(new FileInputStream(filename)))); String strLine; int numGood = 0, numNew = 0, i = 0; while ((strLine = br.readLine()) != null) { String[] splitted = strLine.split("\\s+"); String keggID = splitted[0].substring(0, 6); //removes colon after id boolean bad = false, productSide = false; Set<Long> reactantIDs = new HashSet<Long>(); Set<Long> productIDs = new HashSet<Long>(); // does KEGG provide consumed/modified cofactor information? // currently unpopulated Set<Long> reactantCofactorIDs = new HashSet<Long>(); Set<Long> productCofactorIDs = new HashSet<Long>(); // does KEGG provide auxiliary coenzyme information? // currently unpopulated Set<Long> coenzymeIDs = new HashSet<Long>(); for (String s : splitted) { if (s.charAt(0) == 'C' || s.charAt(0) == 'G') { //is it a chemical or glycan? s = s.substring(0, 6); //remove the possible parameters allRequiredKeggCompounds.add(s); if (!keggID_ActID.containsKey(s)) { bad = true; missingKeggCompounds.add(s); } else if (!productSide) reactantIDs.add(keggID_ActID.get(s)); else productIDs.add(keggID_ActID.get(s)); } else if (s.contains("=")) { productSide = true; } } if (!bad) { numGood++; if (reactantIDs.isEmpty() || productIDs.isEmpty()) { System.out.println("KeggParser.parseReactions: no reactants or products" + strLine); } else { Reaction toAdd = new Reaction(-1, reactantIDs.toArray(new Long[0]), productIDs.toArray(new Long[0]), reactantCofactorIDs.toArray(new Long[0]), productCofactorIDs.toArray(new Long[0]), coenzymeIDs.toArray(new Long[0]), null, // ecnum ConversionDirectionType.LEFT_TO_RIGHT, StepDirection.LEFT_TO_RIGHT, keggID, // readable name Reaction.RxnDetailType.CONCRETE ); reactantIDs.removeAll(cofactorIDs); productIDs.removeAll(cofactorIDs); P<Set, Set> pair = new P(reactantIDs, productIDs); if (!pairs.contains(pair)) pair = new P(productIDs, reactantIDs); if (pairs.add(pair)) { newReactionsToAdd.add(keggID); } } } else { // add reactions with a chemical not in database into this set reactionsMissingKeggCompounds.add(keggID); } if (i % 1000 == 0) System.out.println("KeggParser.parseReactions: Done " + i); i++; } System.out.println("KeggParser.parseReactions: Num Missing KEGG Chemical IDs: " + missingKeggCompounds.size()); System.out.println("KeggParser.parseReactions: Num Missing KEGG Reactions: " + reactionsMissingKeggCompounds.size()); System.out.println("KeggParser.parseReactions: Num Found Reactions: " + numGood + " New: " + (pairs.size() - numExisting)); } catch (Exception e){ e.printStackTrace(); } return allRequiredKeggCompounds; } public static Set<String> getAllKeggIDsWInchis(String filename, MongoDB db) { return parseChemicalInchis(filename, db).keySet(); } /** * Parses the KEGG file mapping from KEGG ID to InChIs * and creates a chemical entry in db for each not already in db * @param filename * @param db * @return map from KEGG id to consistent InChI */ public static Map<String, String> parseChemicalInchis(String filename, MongoDB db) { Map<String, String> keggID_InChI = new HashMap<String, String>(); DBIterator it = db.getIteratorOverChemicals(); Map<String, Long> inchi_ID = new HashMap<String, Long>(); while (it.hasNext()) { Chemical chemical = db.getNextChemical(it); inchi_ID.put(chemical.getInChI(), chemical.getUuid()); } try{ BufferedReader br = new BufferedReader( new InputStreamReader( new DataInputStream(new FileInputStream(filename)))); FileWriter fstream = new FileWriter("keggCompoundsNotFound.log.html"); BufferedWriter notFound = new BufferedWriter(fstream); notFound.write("<html><head></head><body>"); String strLine; int i = 0; int numFound = 0; Indigo indigo = new Indigo(); IndigoInchi indigoInchi = new IndigoInchi(indigo); while ((strLine = br.readLine()) != null) { String[] splitted = strLine.split("\\s+"); String keggID = splitted[0]; String inchi = splitted[1]; keggID_InChI.put(keggID, inchi); inchi = ConsistentInChI.consistentInChI(inchi, "Kegg Parser"); String inchiKey = indigoInchi.getInchiKey(inchi); Chemical chemical = db.getChemicalFromInChI(inchi); i++; if (chemical == null) { try { if (inchi_ID.containsKey(inchi)) { chemical = db.getChemicalFromChemicalUUID(inchi_ID.get(inchi)); } if (chemical == null) { IndigoObject test = indigoInchi.loadMolecule(inchi); if (keggID.startsWith("G")) notFound.write("<a href=\"http://www.kegg.jp/dbget-bin/www_bget?gl:" + keggID + "\">" + keggID + "</a>\n"); else notFound.write("<a href=\"http://www.kegg.jp/dbget-bin/www_bget?cpd:" + keggID + "\">" + keggID + "</a>\n"); Chemical newChemical = new Chemical(inchi); // calls setInchi which sets the inchikey newChemical.setSmiles(test.canonicalSmiles()); addKeggRef(keggID, newChemical); db.submitToActChemicalDB(newChemical, db.getNextAvailableChemicalDBid()); continue; } } catch (Exception e){ if (keggID.startsWith("G")) notFound.write("<i><a href=\"http://www.kegg.jp/dbget-bin/www_bget?gl:" + keggID + "\">" + keggID + "</a></i>\n"); else notFound.write("<i><a href=\"http://www.kegg.jp/dbget-bin/www_bget?cpd:" + keggID + "\">" + keggID + "</a></i>\n"); Chemical newChemical = new Chemical(inchi); addKeggRef(keggID, newChemical); db.submitToActChemicalDB(newChemical, db.getNextAvailableChemicalDBid()); continue; } } numFound++; addKeggRef(keggID, chemical); db.updateActChemical(chemical, chemical.getUuid()); if (i % 1000 == 0) System.out.println("KeggParser.parseChemicalInchis: Done " + i); } notFound.write("</body><html>"); br.close(); notFound.close(); System.out.println("KeggParser.parseChemicalInchis: # already in DB " + numFound); } catch (Exception e){ e.printStackTrace(); } return keggID_InChI; } /** * Adds keggID reference to chemical if it doesn't exist already * @param keggID * @param chemical */ private static void addKeggRef(String keggID, Chemical chemical) { JSONObject existing = (JSONObject) chemical.getRef(Chemical.REFS.KEGG); if (existing != null) { JSONArray ids = (JSONArray) existing.get("id"); if (!jsonArrayContains(ids, keggID)) ids.put(keggID); if (!existing.has("url")) existing.put("url", keggXrefUrlPrefix + keggID); } else { JSONObject entry = new JSONObject(); JSONArray ids = new JSONArray(); ids.put(keggID); entry.put("id", ids); if (!entry.has("url")) entry.put("url", keggXrefUrlPrefix + keggID); chemical.putRef(Chemical.REFS.KEGG, entry); } } private static boolean jsonArrayContains(JSONArray a, Object contains) { for (int i = 0; i < a.length(); i++) { if (a.get(i).equals(contains)) return true; } return false; } private static void jsonArrayRemove(JSONArray a, Object toRemove) { for (int i = 0; i< a.length(); i++) { if (a.get(i).equals(toRemove)) a.remove(i); } } /** * Parses file KEGG file with chemical details. * Anything in requiredIdInchi not already in db is added. * @param filename * @param db * @param requiredIdInchi * @param cofactorsSet * @throws IOException */ public static void parseChemicalsDetailed(String filename, MongoDB db, Map<String, String> requiredIdInchi, Set<String> cofactorsSet) throws IOException { System.out.println("KeggParser.parseChemicalsDetailed: start"); BufferedReader br = new BufferedReader( new InputStreamReader( new DataInputStream(new FileInputStream(filename)))); Map<String, Long> keggID_ActID = db.getKeggID_ActID(false); int numEntriesUpdated = 0; int numFailedToFind = 0; int numNewKegg = 0; // set of variables to keep track of when parsing one entry String currKeggID = null; Long currActID = null; String currFormula = null; List<String> currSynonyms = null; String strLine; while ((strLine = br.readLine()) != null) { if (strLine.startsWith(" ")) continue; //in middle of a field we don't need now String[] field_val = strLine.split(" +", 2); String key = field_val[0]; if (key.equals("ENTRY")) { currKeggID = field_val[1].split(" +")[0]; currActID = keggID_ActID.get(currKeggID); if (currActID == null) { currActID = keggID_ActID.get(currKeggID + "n"); keggID_ActID.put(currKeggID, currActID); } currFormula = null; currSynonyms = null; } else if (key.equals("FORMULA")) { currFormula = field_val[1]; } else if (key.equals("NAME")) { currSynonyms = new ArrayList<String>(); String currName = field_val[1]; while (true) { if (currName.endsWith(";")) { currSynonyms.add(currName.substring(0, currName.length() - 1)); strLine = br.readLine(); currName = strLine.split(" +", 2)[1]; } else { currSynonyms.add(currName); break; } } } else if (key.equals("///")) { if (requiredIdInchi.containsKey(currKeggID)) { boolean needUpdate = false; //if (requiredIdInchi.get(currKeggID) == null) // System.out.println("NAMES " + currSynonyms + " ACT_ID " + currActID + " FOR " + currKeggID + " " + currFormula); if (currActID == null) { if (currSynonyms != null) { for (String s : currSynonyms) { Long id = db.getChemicalIDFromName(s); if (id != null && id != -1L) { Chemical chemical = db.getChemicalFromChemicalUUID(id); addKeggRef(currKeggID, chemical); db.updateActChemical(chemical, id); keggID_ActID.put(currKeggID, id); currActID = id; break; } } } if (currActID == null ) { // chemical not in compound.inchi and not in db currActID = db.getNextAvailableChemicalDBid(); Chemical chemical = new Chemical(currActID); chemical.setInchi("none " + currKeggID); addKeggRef(currKeggID, chemical); db.submitToActChemicalDB(chemical, currActID); numFailedToFind++; } needUpdate = true; } Chemical toUpdate = db.getChemicalFromChemicalUUID(currActID); if (cofactorsSet.contains(currKeggID) && !toUpdate.isCofactor()) { toUpdate.setAsCofactor(); needUpdate = true; } if (currSynonyms == null) currSynonyms = new ArrayList<String>(); List<String> existingSynonyms = toUpdate.getSynonyms(); currSynonyms.removeAll(existingSynonyms); for (String syn : currSynonyms) { toUpdate.addSynonym(syn); needUpdate = true; } // if no formula or formula contains n, append "n" to keggid for now. // this avoids adding any reactions that'll involve these chemicals if ((currFormula == null || currFormula.contains(")n")) && !toUpdate.isCofactor()) { JSONObject o = (JSONObject) toUpdate.getRef(Chemical.REFS.KEGG); JSONArray list = (JSONArray) o.get("id"); jsonArrayRemove(list, currKeggID); System.out.printf("KeggParser.parseChemicalsDetailed: Needs parameter: %s, %s\n", currKeggID, currFormula); if (!jsonArrayContains(list, currKeggID + "n")) { list.put(currKeggID + "n"); } if (!o.has("url")) { o.put("url", keggXrefUrlPrefix + currKeggID); } needUpdate = true; } if (needUpdate) { numEntriesUpdated++; db.updateActChemical(toUpdate, currActID); } } currKeggID = null; currSynonyms = null; currFormula = null; } } br.close(); System.out.format("KeggParser.parseChemicalsDetailed: Num total entries added %d, Num no inchi but added %d\n", numEntriesUpdated, numFailedToFind); } /** * Parses KEGG file with reaction details. * Add all reactions that involve only chemicals in our database. * @param filename * @param db * @throws IOException */ public static void parseReactionsDetailed(String filename, MongoDB db) throws IOException { Map<String, Long> keggID_ActID = db.getKeggID_ActID(false); BufferedReader br = new BufferedReader( new InputStreamReader( new DataInputStream(new FileInputStream(filename)))); int numEntriesAdded = 0; int failed = 0; // set of variables to keep track of when parsing one entry String currKeggID = null; String currName = null; Map<Long, Integer> currProducts = null, currReactants = null; //maps chemicals to coefficients Map<Long, Integer> currCofactorProducts = null, currCofactorReactants = null; //maps chemicals to coefficients Map<Long, Integer> currCoenzymes = null; String currECNum = null; String strLine; while ((strLine = br.readLine()) != null) { if (strLine.startsWith(" ")) continue; //in middle of a field we don't need now String[] field_val = strLine.split(" +", 2); String key = field_val[0]; if (key.equals("ENTRY")) { currName = null; currProducts = null; currReactants = null; currCofactorProducts = null; currCofactorReactants = null; currCoenzymes = null; currECNum = null; currKeggID = field_val[1].split(" +")[0]; } else if (key.equals("ENZYME")) { currECNum = field_val[1]; } else if (key.equals("///")) { if (currKeggID == null) { failed++; continue; } if (currProducts == null || currReactants == null) continue; Long[] productArr = (Long[]) currProducts.keySet().toArray(new Long[1]); Long[] reactantArr = (Long[]) currReactants.keySet().toArray(new Long[1]); Long[] productCofactorArr = (Long[]) currCofactorProducts.keySet().toArray(new Long[1]); Long[] reactantCofactorArr = (Long[]) currCofactorReactants.keySet().toArray(new Long[1]); Long[] coenzymeArr = (Long[]) currCoenzymes.keySet().toArray(new Long[1]); Reaction toAdd = new Reaction(-1L, reactantArr, productArr, reactantCofactorArr, productCofactorArr, coenzymeArr, currECNum, ConversionDirectionType.LEFT_TO_RIGHT, StepDirection.LEFT_TO_RIGHT, currName, Reaction.RxnDetailType.CONCRETE ); toAdd.addReference(Reaction.RefDataSource.KEGG, currKeggID); toAdd.addReference(Reaction.RefDataSource.KEGG, keggXrefUrlPrefix + currKeggID); for (Long p : productArr) toAdd.setProductCoefficient(p, currProducts.get(p)); for (Long r : reactantArr) toAdd.setSubstrateCoefficient(r, currReactants.get(r)); numEntriesAdded++; toAdd.setDataSource(Reaction.RxnDataSource.KEGG); db.submitToActReactionDB(toAdd); currKeggID = null; } else if (key.equals("DEFINITION")) { currName = field_val[1]; if (currName == null) currName = ""; currName = "{} " + currName.replace("<=>", "->") + " <KEGG:" + currKeggID + ">"; } else if (key.equals("EQUATION")) { String[] tokens = field_val[1].split(" +"); currProducts = new HashMap<Long, Integer>(); currReactants = new HashMap<Long, Integer>(); // does KEGG provide consumed/modified cofactor information? // currently unpopulated currCofactorProducts = new HashMap<Long, Integer>(); currCofactorReactants = new HashMap<Long, Integer>(); // does KEGG provide auxiliary coenzyme information? // currently unpopulated currCoenzymes = new HashMap<Long, Integer>(); boolean isProducts = false; for (int t = 0; t < tokens.length; t++) { String token = tokens[t]; if (token.equals("+")) continue; if (token.equals("<=>")) { isProducts = true; continue; } Integer coeff = 1; if (token.matches("\\d+")) { //is numeric coeff = Integer.parseInt(token); t++; token = tokens[t]; } if (token.startsWith("C") || token.startsWith("G")) { token = token.substring(0, 6); //remove the possible parameters Long actID = keggID_ActID.get(token); if (actID == null) { //allow use of chemicals with parameter actID = keggID_ActID.get(token + "n"); } if (actID != null) { if (isProducts) { currProducts.put(actID, coeff); } else { currReactants.put(actID, coeff); } } else { /*if (token.startsWith("G") && currKeggID != null) { failed--; //not parsing glycans now so ignoring those failures }*/ currKeggID = null; } } } if (currKeggID == null) { System.out.println("KeggParser.parseReactionsDetailed failed" + currName); } } } br.close(); System.out.format("KeggParser.parseReactionsDetailed: Num entries added %d, Failed %d\n", numEntriesAdded, failed); } public static void main(String[] args) { MongoDB db = new MongoDB(); parseKegg("data/kegg/reaction.lst", "data/kegg/compound.inchi", "data/kegg/compound", "data/kegg/reaction", "data/kegg/cofactors.txt", db); } }