KeggParser.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package act.installer.kegg;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.ggasoftware.indigo.Indigo;
import com.ggasoftware.indigo.IndigoInchi;
import com.ggasoftware.indigo.IndigoObject;

import org.json.JSONObject;
import org.json.JSONArray;

import act.shared.ConsistentInChI;
import act.server.DBIterator;
import act.server.MongoDB;
import act.shared.Chemical;
import act.shared.Reaction;
import act.shared.helpers.P;

import org.biopax.paxtools.model.level3.ConversionDirectionType;
import org.biopax.paxtools.model.level3.StepDirection;

public class KeggParser {
  private static final String keggXrefUrlPrefix = "http://www.kegg.jp/entry/";

  /**
   * All the params are file names from KEGG
   * See data/kegg
   * @param reactionList
   * @param compound
   * @param reactions
   * @param cofactors
   * @throws IOException
   */
  public static void parseKegg(String reactionList, String compoundInchi,
      String compound, String reactions, String cofactors, MongoDB db) {

    try {
      // First figure out what compounds are used in reactions
      Set<String> requiredKeggCompounds = parseReactions(reactionList, db);

      // Get kegg compounds to inchi mapping
      Map<String, String> keggIDInchi = parseChemicalInchis(compoundInchi, db);

      // Get required kegg compounds to inchi mapping
      Map<String, String> requiredKeggInchi = new HashMap<String, String>();
      for (String c : requiredKeggCompounds) requiredKeggInchi.put(c, keggIDInchi.get(c));

      Set<String> cofactorsSet = parseCofactors(cofactors);
      // Add chemicals to db
      parseChemicalsDetailed(compound, db, requiredKeggInchi, cofactorsSet);

      // Add reactions to db
      parseReactionsDetailed(reactions, db);
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  /**
   * @param filename - File containing list of KEGG ids marked as cofactors
   * @return set of KEGG ids marked as cofactors
   */
  public static Set<String> parseCofactors(String filename) {
    Set<String> result = new HashSet<String>();
    try{
      BufferedReader br = new BufferedReader(
          new InputStreamReader(
              new DataInputStream(new FileInputStream(filename))));
      String strLine = "";
      while ((strLine = br.readLine()) != null) {
        result.add(strLine.trim());
      }
      br.close();
    } catch(Exception e) {

    }
    return result;
  }

  /**
   * Takes the KEGG reaction.lst file.
   * @param filename
   * @param db
   * @return set of KEGG compounds involved in kegg reactions
   * @throws IOException
   */
  public static Set<String> parseReactions(String filename, MongoDB db) throws IOException {
    //set up existing reactions
    List<Long> ids = db.getAllReactionUUIDs();
    Set<Long> cofactorIDs = Chemical.getChemicalIDs(db.getCofactorChemicals());
    Set<P> pairs = new HashSet<P>();
    for (Long id : ids) {
      Reaction reaction = db.getReactionFromUUID(id);
      Set<Long> substrateIDs = new HashSet(Arrays.asList(reaction.getSubstrates()));
      substrateIDs.removeAll(cofactorIDs);
      Set<Long> productIDs = new HashSet(Arrays.asList(reaction.getProducts()));
      productIDs.removeAll(cofactorIDs);
      pairs.add(new P(substrateIDs, productIDs));
    }
    int numExisting = pairs.size();

    Set<String> newReactionsToAdd = new HashSet<String>(); //keggIDs to add
    Set<String> missingKeggCompounds = new HashSet<String>(); //kegg compound ids we don't have
    Set<String> reactionsMissingKeggCompounds = new HashSet<String>(); //kegg reactions that require the above ids
    Set<String> allRequiredKeggCompounds = new HashSet<String>();

    Map<String, Long> keggID_ActID = db.getKeggID_ActID(false);
    try{
      BufferedReader br = new BufferedReader(
          new InputStreamReader(
              new DataInputStream(new FileInputStream(filename))));

      String strLine;
      int numGood = 0, numNew = 0, i = 0;
      while ((strLine = br.readLine()) != null) {
        String[] splitted = strLine.split("\\s+");
        String keggID = splitted[0].substring(0, 6); //removes colon after id
        boolean bad = false, productSide = false;

        Set<Long> reactantIDs = new HashSet<Long>();
        Set<Long> productIDs = new HashSet<Long>();
        // does KEGG provide consumed/modified cofactor information?
        // currently unpopulated
        Set<Long> reactantCofactorIDs = new HashSet<Long>();
        Set<Long> productCofactorIDs = new HashSet<Long>();
        // does KEGG provide auxiliary coenzyme information?
        // currently unpopulated
        Set<Long> coenzymeIDs = new HashSet<Long>();
        for (String s : splitted) {
          if (s.charAt(0) == 'C' || s.charAt(0) == 'G') { //is it a chemical or glycan?
            s = s.substring(0, 6); //remove the possible parameters
            allRequiredKeggCompounds.add(s);
            if (!keggID_ActID.containsKey(s)) {
              bad = true;
              missingKeggCompounds.add(s);
            } else if (!productSide)
              reactantIDs.add(keggID_ActID.get(s));
            else
              productIDs.add(keggID_ActID.get(s));
          } else if (s.contains("=")) {
            productSide = true;
          }
        }

        if (!bad) {
          numGood++;
          if (reactantIDs.isEmpty() || productIDs.isEmpty()) {
            System.out.println("KeggParser.parseReactions: no reactants or products" + strLine);

          } else {
            Reaction toAdd = new Reaction(-1,
                reactantIDs.toArray(new Long[0]),
                productIDs.toArray(new Long[0]),
                reactantCofactorIDs.toArray(new Long[0]),
                productCofactorIDs.toArray(new Long[0]),
                coenzymeIDs.toArray(new Long[0]),
                null, // ecnum
                ConversionDirectionType.LEFT_TO_RIGHT,
                StepDirection.LEFT_TO_RIGHT,
                keggID, // readable name
                Reaction.RxnDetailType.CONCRETE
                );
            reactantIDs.removeAll(cofactorIDs);
            productIDs.removeAll(cofactorIDs);
            P<Set, Set> pair = new P(reactantIDs, productIDs);
            if (!pairs.contains(pair))
              pair = new P(productIDs, reactantIDs);

            if (pairs.add(pair)) {
              newReactionsToAdd.add(keggID);
            }
          }
        } else {
          // add reactions with a chemical not in database into this set
          reactionsMissingKeggCompounds.add(keggID);
        }
        if (i % 1000 == 0) System.out.println("KeggParser.parseReactions: Done " + i);
        i++;
      }
      System.out.println("KeggParser.parseReactions: Num Missing KEGG Chemical IDs: " + missingKeggCompounds.size());
      System.out.println("KeggParser.parseReactions: Num Missing KEGG Reactions: " + reactionsMissingKeggCompounds.size());
      System.out.println("KeggParser.parseReactions: Num Found Reactions: " + numGood + " New: " + (pairs.size() - numExisting));
    } catch (Exception e){
      e.printStackTrace();
    }
    return allRequiredKeggCompounds;
  }


  public static Set<String> getAllKeggIDsWInchis(String filename, MongoDB db) {
    return parseChemicalInchis(filename, db).keySet();
  }

  /**
   * Parses the KEGG file mapping from KEGG ID to InChIs
   * and creates a chemical entry in db for each not already in db
   * @param filename
   * @param db
   * @return map from KEGG id to consistent InChI
   */
  public static Map<String, String> parseChemicalInchis(String filename, MongoDB db) {
    Map<String, String> keggID_InChI = new HashMap<String, String>();
    DBIterator it = db.getIteratorOverChemicals();
    Map<String, Long> inchi_ID = new HashMap<String, Long>();
    while (it.hasNext()) {
      Chemical chemical = db.getNextChemical(it);
      inchi_ID.put(chemical.getInChI(), chemical.getUuid());
    }

    try{
      BufferedReader br = new BufferedReader(
          new InputStreamReader(
              new DataInputStream(new FileInputStream(filename))));
      FileWriter fstream = new FileWriter("keggCompoundsNotFound.log.html");
      BufferedWriter notFound = new BufferedWriter(fstream);
      notFound.write("<html><head></head><body>");
      String strLine;
      int i = 0;
      int numFound = 0;
      Indigo indigo = new Indigo();
      IndigoInchi indigoInchi = new IndigoInchi(indigo);
      while ((strLine = br.readLine()) != null) {
        String[] splitted = strLine.split("\\s+");
        String keggID = splitted[0];
        String inchi = splitted[1];
        keggID_InChI.put(keggID, inchi);
        inchi = ConsistentInChI.consistentInChI(inchi, "Kegg Parser");
        String inchiKey = indigoInchi.getInchiKey(inchi);
        Chemical chemical = db.getChemicalFromInChI(inchi);
        i++;
        if (chemical == null) {
          try {
            if (inchi_ID.containsKey(inchi)) {
              chemical = db.getChemicalFromChemicalUUID(inchi_ID.get(inchi));
            }
            if (chemical == null) {
              IndigoObject test = indigoInchi.loadMolecule(inchi);
              if (keggID.startsWith("G"))
                notFound.write("<a href=\"http://www.kegg.jp/dbget-bin/www_bget?gl:" + keggID + "\">" + keggID + "</a>\n");
              else
                notFound.write("<a href=\"http://www.kegg.jp/dbget-bin/www_bget?cpd:" + keggID + "\">" + keggID + "</a>\n");
              Chemical newChemical = new Chemical(inchi); // calls setInchi which sets the inchikey
              newChemical.setSmiles(test.canonicalSmiles());
              addKeggRef(keggID, newChemical);
              db.submitToActChemicalDB(newChemical, db.getNextAvailableChemicalDBid());
              continue;
            }
          } catch (Exception e){
            if (keggID.startsWith("G"))
              notFound.write("<i><a href=\"http://www.kegg.jp/dbget-bin/www_bget?gl:" + keggID + "\">" + keggID + "</a></i>\n");
            else
              notFound.write("<i><a href=\"http://www.kegg.jp/dbget-bin/www_bget?cpd:" + keggID + "\">" + keggID + "</a></i>\n");
            Chemical newChemical = new Chemical(inchi);
            addKeggRef(keggID, newChemical);
            db.submitToActChemicalDB(newChemical, db.getNextAvailableChemicalDBid());
            continue;
          }
        }

        numFound++;
        addKeggRef(keggID, chemical);
        db.updateActChemical(chemical, chemical.getUuid());
        if (i % 1000 == 0) System.out.println("KeggParser.parseChemicalInchis: Done " + i);
      }
      notFound.write("</body><html>");

      br.close();
      notFound.close();
      System.out.println("KeggParser.parseChemicalInchis: # already in DB " + numFound);

    } catch (Exception e){
      e.printStackTrace();
    }
    return keggID_InChI;
  }

  /**
   * Adds keggID reference to chemical if it doesn't exist already
   * @param keggID
   * @param chemical
   */
  private static void addKeggRef(String keggID, Chemical chemical) {
    JSONObject existing = (JSONObject) chemical.getRef(Chemical.REFS.KEGG);
    if (existing != null) {
      JSONArray ids = (JSONArray) existing.get("id");
      if (!jsonArrayContains(ids, keggID))
        ids.put(keggID);
      if (!existing.has("url"))
        existing.put("url", keggXrefUrlPrefix + keggID);
    } else {
      JSONObject entry = new JSONObject();
      JSONArray ids = new JSONArray();
      ids.put(keggID);
      entry.put("id", ids);
      if (!entry.has("url"))
        entry.put("url", keggXrefUrlPrefix + keggID);
      chemical.putRef(Chemical.REFS.KEGG, entry);
    }
  }

  private static boolean jsonArrayContains(JSONArray a, Object contains) {
    for (int i = 0; i < a.length(); i++) {
      if (a.get(i).equals(contains))
        return true;
    }
    return false;
  }

  private static void jsonArrayRemove(JSONArray a, Object toRemove) {
    for (int i = 0; i< a.length(); i++) {
      if (a.get(i).equals(toRemove))
        a.remove(i);
    }
  }

  /**
   * Parses file KEGG file with chemical details.
   * Anything in requiredIdInchi not already in db is added.
   * @param filename
   * @param db
   * @param requiredIdInchi
   * @param cofactorsSet
   * @throws IOException
   */
  public static void parseChemicalsDetailed(String filename, MongoDB db,
      Map<String, String> requiredIdInchi, Set<String> cofactorsSet) throws IOException {
    System.out.println("KeggParser.parseChemicalsDetailed: start");
    BufferedReader br = new BufferedReader(
        new InputStreamReader(
            new DataInputStream(new FileInputStream(filename))));
    Map<String, Long> keggID_ActID = db.getKeggID_ActID(false);
    int numEntriesUpdated = 0;
    int numFailedToFind = 0;
    int numNewKegg = 0;
    // set of variables to keep track of when parsing one entry
    String currKeggID = null;
    Long currActID = null;
    String currFormula = null;
    List<String> currSynonyms = null;

    String strLine;
    while ((strLine = br.readLine()) != null) {
      if (strLine.startsWith(" ")) continue; //in middle of a field we don't need now

      String[] field_val = strLine.split(" +", 2);

      String key = field_val[0];
      if (key.equals("ENTRY")) {
        currKeggID = field_val[1].split(" +")[0];
        currActID = keggID_ActID.get(currKeggID);
        if (currActID == null) {
          currActID = keggID_ActID.get(currKeggID + "n");
          keggID_ActID.put(currKeggID, currActID);
        }
        currFormula = null;
        currSynonyms = null;
      } else if (key.equals("FORMULA")) {
        currFormula = field_val[1];
      } else if (key.equals("NAME")) {
        currSynonyms = new ArrayList<String>();
        String currName = field_val[1];
        while (true) {
          if (currName.endsWith(";")) {
            currSynonyms.add(currName.substring(0, currName.length() - 1));
            strLine = br.readLine();
            currName = strLine.split(" +", 2)[1];
          } else {
            currSynonyms.add(currName);
            break;
          }
        }
      } else if (key.equals("///")) {
        if (requiredIdInchi.containsKey(currKeggID)) {
          boolean needUpdate = false;
          //if (requiredIdInchi.get(currKeggID) == null)
          //  System.out.println("NAMES " + currSynonyms + " ACT_ID " + currActID + " FOR " + currKeggID + " " + currFormula);
          if (currActID == null) {
            if (currSynonyms != null) {
              for (String s : currSynonyms) {
                Long id = db.getChemicalIDFromName(s);
                if (id != null && id != -1L) {
                  Chemical chemical = db.getChemicalFromChemicalUUID(id);
                  addKeggRef(currKeggID, chemical);
                  db.updateActChemical(chemical, id);
                  keggID_ActID.put(currKeggID, id);
                  currActID = id;
                  break;
                }
              }
            }

            if (currActID == null ) {
              // chemical not in compound.inchi and not in db
              currActID = db.getNextAvailableChemicalDBid();
              Chemical chemical = new Chemical(currActID);
              chemical.setInchi("none " + currKeggID);
              addKeggRef(currKeggID, chemical);
              db.submitToActChemicalDB(chemical, currActID);
              numFailedToFind++;
            }
            needUpdate = true;
          }

          Chemical toUpdate = db.getChemicalFromChemicalUUID(currActID);
          if (cofactorsSet.contains(currKeggID) && !toUpdate.isCofactor()) {
            toUpdate.setAsCofactor();
            needUpdate = true;
          }
          if (currSynonyms == null) currSynonyms = new ArrayList<String>();
          List<String> existingSynonyms = toUpdate.getSynonyms();
          currSynonyms.removeAll(existingSynonyms);
          for (String syn : currSynonyms) {
            toUpdate.addSynonym(syn);
            needUpdate = true;
          }

          // if no formula or formula contains n, append "n" to keggid for now.
          // this avoids adding any reactions that'll involve these chemicals
          if ((currFormula == null || currFormula.contains(")n")) && !toUpdate.isCofactor()) {
            JSONObject o = (JSONObject) toUpdate.getRef(Chemical.REFS.KEGG);
            JSONArray list = (JSONArray) o.get("id");
            jsonArrayRemove(list, currKeggID);
            System.out.printf("KeggParser.parseChemicalsDetailed: Needs parameter: %s, %s\n", currKeggID, currFormula);
            if (!jsonArrayContains(list, currKeggID + "n")) {
              list.put(currKeggID + "n");
            }
            if (!o.has("url")) {
              o.put("url", keggXrefUrlPrefix + currKeggID);
            }
            needUpdate = true;
          }
          if (needUpdate) {
            numEntriesUpdated++;
            db.updateActChemical(toUpdate, currActID);
          }
        }
        currKeggID = null;
        currSynonyms = null;
        currFormula = null;
      }
    }
    br.close();
    System.out.format("KeggParser.parseChemicalsDetailed: Num total entries added %d, Num no inchi but added %d\n", numEntriesUpdated, numFailedToFind);
  }

  /**
   * Parses KEGG file with reaction details.
   * Add all reactions that involve only chemicals in our database.
   * @param filename
   * @param db
   * @throws IOException
   */
  public static void parseReactionsDetailed(String filename, MongoDB db) throws IOException {
    Map<String, Long> keggID_ActID = db.getKeggID_ActID(false);
    BufferedReader br = new BufferedReader(
        new InputStreamReader(
            new DataInputStream(new FileInputStream(filename))));

    int numEntriesAdded = 0;
    int failed = 0;
    // set of variables to keep track of when parsing one entry
    String currKeggID = null;
    String currName = null;
    Map<Long, Integer> currProducts = null, currReactants = null; //maps chemicals to coefficients
    Map<Long, Integer> currCofactorProducts = null, currCofactorReactants = null; //maps chemicals to coefficients
    Map<Long, Integer> currCoenzymes = null;
    String currECNum = null;

    String strLine;
    while ((strLine = br.readLine()) != null) {
      if (strLine.startsWith(" ")) continue; //in middle of a field we don't need now

      String[] field_val = strLine.split(" +", 2);

      String key = field_val[0];
      if (key.equals("ENTRY")) {
        currName = null;
        currProducts = null;
        currReactants = null;
        currCofactorProducts = null;
        currCofactorReactants = null;
        currCoenzymes = null;
        currECNum = null;

        currKeggID = field_val[1].split(" +")[0];
      } else if (key.equals("ENZYME")) {
        currECNum = field_val[1];
      } else if (key.equals("///")) {
        if (currKeggID == null) {
          failed++;
          continue;
        }
        if (currProducts == null || currReactants == null) continue;

        Long[] productArr = (Long[]) currProducts.keySet().toArray(new Long[1]);
        Long[] reactantArr = (Long[]) currReactants.keySet().toArray(new Long[1]);
        Long[] productCofactorArr = (Long[]) currCofactorProducts.keySet().toArray(new Long[1]);
        Long[] reactantCofactorArr = (Long[]) currCofactorReactants.keySet().toArray(new Long[1]);
        Long[] coenzymeArr = (Long[]) currCoenzymes.keySet().toArray(new Long[1]);
        Reaction toAdd = new Reaction(-1L,
            reactantArr, productArr,
            reactantCofactorArr, productCofactorArr,
            coenzymeArr,
            currECNum,
            ConversionDirectionType.LEFT_TO_RIGHT,
            StepDirection.LEFT_TO_RIGHT,
            currName,
            Reaction.RxnDetailType.CONCRETE
            );
        toAdd.addReference(Reaction.RefDataSource.KEGG, currKeggID);
        toAdd.addReference(Reaction.RefDataSource.KEGG, keggXrefUrlPrefix + currKeggID);
        for (Long p : productArr) toAdd.setProductCoefficient(p, currProducts.get(p));
        for (Long r : reactantArr) toAdd.setSubstrateCoefficient(r, currReactants.get(r));
        numEntriesAdded++;
        toAdd.setDataSource(Reaction.RxnDataSource.KEGG);
        db.submitToActReactionDB(toAdd);
        currKeggID = null;
      } else if (key.equals("DEFINITION")) {
        currName = field_val[1];
        if (currName == null) currName = "";
        currName = "{} " + currName.replace("<=>", "->") + " <KEGG:" + currKeggID + ">";
      } else if (key.equals("EQUATION")) {
        String[] tokens = field_val[1].split(" +");
        currProducts = new HashMap<Long, Integer>();
        currReactants = new HashMap<Long, Integer>();
        // does KEGG provide consumed/modified cofactor information?
        // currently unpopulated
        currCofactorProducts = new HashMap<Long, Integer>();
        currCofactorReactants = new HashMap<Long, Integer>();
        // does KEGG provide auxiliary coenzyme information?
        // currently unpopulated
        currCoenzymes = new HashMap<Long, Integer>();
        boolean isProducts = false;
        for (int t = 0; t < tokens.length; t++) {
          String token = tokens[t];
          if (token.equals("+")) continue;
          if (token.equals("<=>")) {
            isProducts = true;
            continue;
          }

          Integer coeff = 1;
          if (token.matches("\\d+")) { //is numeric
            coeff = Integer.parseInt(token);
            t++;
            token = tokens[t];
          }

          if (token.startsWith("C") || token.startsWith("G")) {
            token = token.substring(0, 6); //remove the possible parameters
            Long actID = keggID_ActID.get(token);
            if (actID == null) { //allow use of chemicals with parameter
              actID = keggID_ActID.get(token + "n");
            }
            if (actID != null) {
              if (isProducts) {
                currProducts.put(actID, coeff);
              } else {
                currReactants.put(actID, coeff);
              }
            } else {
              /*if (token.startsWith("G") && currKeggID != null) {
                failed--; //not parsing glycans now so ignoring those failures
              }*/
              currKeggID = null;
            }

          }
        }
        if (currKeggID == null) {
          System.out.println("KeggParser.parseReactionsDetailed failed" + currName);
        }
      }
    }
    br.close();
    System.out.format("KeggParser.parseReactionsDetailed: Num entries added %d, Failed %d\n", numEntriesAdded, failed);
  }

  public static void main(String[] args) {
    MongoDB db = new MongoDB();
    parseKegg("data/kegg/reaction.lst", "data/kegg/compound.inchi", "data/kegg/compound", "data/kegg/reaction", "data/kegg/cofactors.txt", db);
  }
}