OrganismCompositionMongoWriter.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package act.installer.metacyc;

import act.installer.metacyc.annotations.BioSource;
import act.installer.metacyc.annotations.Stoichiometry;
import act.installer.metacyc.annotations.Term;
import act.installer.metacyc.entities.ChemicalStructure;
import act.installer.metacyc.entities.ProteinRNARef;
import act.installer.metacyc.entities.SmallMolecule;
import act.installer.metacyc.entities.SmallMoleculeRef;
import act.installer.metacyc.processes.BiochemicalPathwayStep;
import act.installer.metacyc.processes.Catalysis;
import act.installer.metacyc.processes.Conversion;
import act.installer.metacyc.references.Publication;
import act.installer.metacyc.references.Relationship;
import act.installer.metacyc.references.Unification;
import act.installer.sequence.MetacycEntry;
import act.installer.sequence.SequenceEntry;
import act.server.MongoDB;
import act.shared.Chemical;
import act.shared.ConsistentInChI;
import act.shared.Reaction;
import act.shared.Seq;
import com.ggasoftware.indigo.Indigo;
import com.ggasoftware.indigo.IndigoInchi;
import com.ggasoftware.indigo.IndigoObject;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.biopax.paxtools.model.level3.CatalysisDirectionType;
import org.biopax.paxtools.model.level3.StepDirection;
import org.json.JSONArray;
import org.json.JSONObject;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class OrganismCompositionMongoWriter {
  // Destination database; used below for all chemical and reaction reads/writes.
  MongoDB db;
  // The parsed organism data being installed; used to resolve and traverse biopax references.
  OrganismComposition src;
  // Which external reference collection the installed chemicals/reactions are attributed to.
  Chemical.REFS originDB;
  // The `origin` label handed to the constructor; printed in logs and appended to reaction references.
  String originDBSubID;
  // Entity maps pulled from `src` in the constructor, keyed by biopax Resource.
  HashMap<Resource, SmallMolecule> smallmolecules;
  HashMap<Resource, Catalysis> enzyme_catalysis;
  HashMap<Resource, BiochemicalPathwayStep> biochemicalPathwaySteps;
  // Populated from `src.getUniqueKeyToInChImap()` in the constructor.
  HashMap<String, String> uniqueKeyToInChImap;
  // When true, write() prints the expanded JSON of small molecule refs that have no chemical structure.
  boolean debugFails = false;

  // Cache these values as they'll be the same throughout.
  private Map<String, Long> organismNameToIdCache = new LinkedHashMap<String, Long>(101, 1.0f, true) {
    /* An access-ordered LinkedHashMap (third constructor argument `true`) plus this override is all that is
     * required to create an LRU cache: after each insertion the least recently used entry is dropped whenever
     * the map has grown past 100 entries. */
    @Override
    protected boolean removeEldestEntry(Map.Entry<String, Long> eldest) {
      return this.size() > 100; // Retain last 100 used organisms.
    }
  };

  // metacyc id's are in Unification DB=~name of origin; the regex below accepts ids made only of upper-case
  // letters, digits and hyphens.
  // NOTE(review): the earlier comment said ID.matches(METACYC_URI_PREFIX); presumably METACYC_URI_IDS was
  // meant, since that is the regex -- confirm against the code that consumes these fields.
  String METACYC_URI_IDS = "^[A-Z0-9-]+$";
  // Prefix to which a Metacyc object id is appended to get a valid Metacyc website URL.
  String METACYC_URI_PREFIX = "http://www.metacyc.org/META/NEW-IMAGE?object=";

  // Pattern to extract ecnums from metacyc standard names: matches a parenthesised "(EC <token>...)" and
  // captures the token (letters, digits, '_', '.', '-') that follows "EC ".
  private final static Pattern metacycStandardNameEcnum = Pattern.compile("\\(EC ([0-9a-zA-Z_.-]+)[^)]*\\)");

  // Metacyc ids/metadata will be written to these fields in the DB.
  public static final String METACYC_OBJECT_MODEL_XREF_ID_PATH = "xref.METACYC.id";
  public static final String METACYC_OBJECT_MODEL_XREF_METADATA_PATH = "xref.METACYC.meta";

  // Indigo handles; the InChI plugin is bound to this writer's Indigo instance.
  Indigo indigo = new Indigo();
  IndigoInchi indigoInchi = new IndigoInchi(indigo);

  // Counters reported at the end of write().  They are not modified by write() itself; presumably they are
  // updated by structure-handling code elsewhere in this class -- verify.
  int ignoredMoleculesWithMultipleStructures = 0;
  int totalSmallMolecules = 0;

  /**
   * Creates a writer that installs one organism's Metacyc data into the given DB.
   *
   * @param db       destination database
   * @param o        parsed organism composition; its entity maps are captured here
   * @param origin   label of the data being installed; logged immediately and kept as the origin sub-id
   * @param originDB reference collection the installed data is attributed to
   */
  OrganismCompositionMongoWriter(MongoDB db, OrganismComposition o, String origin, Chemical.REFS originDB) {
    System.out.println("Writing DB: " + origin);

    // Destination and provenance.
    this.db = db;
    this.originDB = originDB;
    this.originDBSubID = origin;

    // Source data and the per-type entity maps pulled out of it.
    this.src = o;
    this.smallmolecules = o.getMap(SmallMolecule.class);
    this.enzyme_catalysis = o.getMap(Catalysis.class);
    this.biochemicalPathwaySteps = o.getMap(BiochemicalPathwayStep.class);
    this.uniqueKeyToInChImap = o.getUniqueKeyToInChImap();
  }

  /**
   * Each Metacyc biopax file contains collections of reactions and chemicals, organized by organism.
   * The reactions reference the chemicals using biopax-specific (or Metacyc-specific?) identifiers that don't match
   * our internal id scheme (for good reason--our identifier approach is far less complex!).  This method writes the
   * contents of one organism's reactions and chemicals to the DB.  The chemicals are written first so that we can
   * accumulate a mapping of Metacyc small molecule reference ids to our DB's chemical ids.  The reactions' substrates
   * and products are then written to the DB using our internal chemical IDs, allowing us to unify Metacyc's chemical
   * and reaction data with whatever has already been written.
   *
   * The method runs in three phases:
   *   1) gather one ChemInfoContainer per distinct small molecule reference (plus metadata per small molecule);
   *   2) write each gathered chemical once and record its structure id -> DB id mapping;
   *   3) write reactions, first from BiochemicalPathwaySteps and then from any Catalysis not reached that way.
   * Progress and statistics are printed to stdout/stderr along the way. */
  public void write() {


    // Debugging hook; permanently disabled by the constant condition.
    if (false)
      writeStdout(); // for debugging, if you need a full copy of the data in stdout

    // while going through this organism's chemicals (optionally installing
    // into db if required), we map its rdfID to the inchi (in db)
    HashMap<String, Long> rdfID2MongoID = new HashMap<String, Long>();
    // for debugging, we log only the number of new reactions with sequences seen.  Note how it is counted below:
    // once per pathway step that has catalyses (however many reactions that step added), once per stand-alone
    // Catalysis, and not at all for the Conversion fallback.
    int newRxns = 0;
    int resolvedViaDirectInChISpecified = 0;
    int resolvedViaSmallMoleculeRelationship = 0;

    // Stores chemical strings derived from CML to avoid repeated processing for reused small molecule references.
    HashMap<Resource, ChemInfoContainer> smRefsCollections = new HashMap<>();

    // Phase 1: collect structure strings (once per small molecule ref) and metadata (once per small molecule).
    for (Resource id : smallmolecules.keySet()) {
      SmallMolecule sm = (SmallMolecule) smallmolecules.get(id);
      SmallMoleculeRef smref = (SmallMoleculeRef) this.src.resolve(sm.getSMRef());
      if (smref == null) {
        continue; // only happens in one case standardName="a ribonucleic acid"
      }

      /* De-duplicate structureToChemStrs calls by storing already accessed small molecule structures in a hash.
       * If we find the same molecule in our hash, we don't need to process it again! */
      ChemInfoContainer chemInfoContainer = smRefsCollections.get(sm.getSMRef());
      if (chemInfoContainer == null) {
        ChemicalStructure c = (ChemicalStructure) this.src.resolve(smref.getChemicalStructure());

        ChemStrs chemStrs = null;
        if (c != null) { // Only produce ChemStrs if we have a chemical structure to store.
          // InChI sources are tried in order: the structure's own InChI, then an xref-based lookup, then
          // derivation from the structure itself.
          String lookupInChI;
          if (c.getInChI() != null) {
            chemStrs = new ChemStrs(c.getInChI(), null, null);
            resolvedViaDirectInChISpecified++;
          } else if ((lookupInChI = lookupInChIByXRefs(sm)) != null) {
            // TODO: should we track these?  They could just be bogus compounds or compound classes.
            chemStrs = new ChemStrs(lookupInChI, null, null);
            resolvedViaSmallMoleculeRelationship++;
          } else {
            // Extract various canonical representations (like InChI) for this molecule based on the structure.
            chemStrs = structureToChemStrs(c);
          }
        } else {
          /* This occurs for Metacyc entries that are treated as classes of molecules rather than individual molecules.
           * See https://github.com/20n/act/issues/40. */
          System.out.format("--- warning, null ChemicalStructure for %s; %s; %s\n",
              smref.getStandardName(), smref.getID(), smref.getChemicalStructure());
          // TODO: we could probably call `continue` here safely.
        }

        // Wrap all of the nominal/structural information for this molecule together for de-duplication.
        // Note that a container is stored even when both `chemStrs` and `c` are null.
        chemInfoContainer = new ChemInfoContainer(smref, chemStrs, c);
        smRefsCollections.put(sm.getSMRef(), chemInfoContainer);
      }

      if (chemInfoContainer.c == null) {
        if (debugFails) System.out.println("No structure: " + smref.expandedJSON(this.src).toString(2));
        // mostly big molecules (e.g., a ureido compound, a sulfhydryl reagent, a macrolide antibiotic), but
        // sometimes complexes (their members fields has small molecule structures), and sometimes just no
        // structure given (colanic acid, a reduced nitroaromatic compound)
        continue;
      }

      SmallMolMetaData meta = getSmallMoleculeMetaData(sm, smref);

      chemInfoContainer.addSmallMolMetaData(meta);
    }

    // The two "resolved" counters are only incremented when a new container is created (i.e. per distinct
    // small molecule ref), while the totals printed next to them are the number of small molecules.
    System.out.format("*** Resolved %d of %d small molecules' InChIs via InChI structures.\n",
        resolvedViaDirectInChISpecified, smallmolecules.size());
    System.out.format("*** Resolved %d of %d small molecules' InChIs via compounds.dat lookup.\n",
        resolvedViaSmallMoleculeRelationship, smallmolecules.size());
    System.out.format("--- writing chemicals for %d collections from %d molecules\n",
        smRefsCollections.size(), smallmolecules.size());

    // Phase 2: write all referenced small molecules only once.  We de-duplicated while reading, so we should be
    // ready to go!
    for (ChemInfoContainer cic : smRefsCollections.values()) {
      // actually add chemical to DB
      Long dbId = writeChemicalToDB(cic.structure, cic.c, cic.metas);
      if (dbId == null) {
        // writeChemicalToDB returns null when it is given no ChemStrs, which includes the containers stored
        // above for refs without a ChemicalStructure.
        System.err.format("ERROR: unable to find/write chemical '%s'\n",
            cic.smRef == null ? null : cic.smRef.getStandardName());
        continue;
      }

      /* Put rdfID -> mongodb ID in rdfID2MongoID map.  These ids will be used to reference the chemicals in Metacyc
       * substrates/products entries, so it's important to get them right (and for the mapping to be complete). */
      rdfID2MongoID.put(cic.c.getID().getLocal(), dbId);
    }

    /* It appears that Catalysis objects can appear outside of BiochemicalPathwaySteps in biopax files.  Record which
     * catalyses we've installed from BiochemicalPathwaySteps so that we can ensure full coverage without duplicating
     * reactions in the DB. */
    Set<Resource> seenCatalyses = new HashSet<>(this.enzyme_catalysis.size());

    // Phase 3a: iterate over the BiochemicalPathwaySteps, extracting either Catalyses if available or the raw
    // Conversion if not.
    for (Map.Entry<Resource, BiochemicalPathwayStep> entry : this.biochemicalPathwaySteps.entrySet()) {
      BiochemicalPathwayStep bps = entry.getValue();

      // TODO: does this correctly handle the case where the process consists only of Modulations?  Is that possible?
      Set<Resource> catalyses = bps.getProcess();
      if (catalyses == null || catalyses.size() == 0) {
        System.out.format("%s: No catalyses, falling back to conversion %s\n",
            bps.getID(), bps.getConversion());
        Conversion c = (Conversion)this.src.resolve(bps.getConversion());
        if (c == null) {
          System.err.format("ERROR: could not find expected conversion %s for %s\n", bps.getConversion(), bps.getID());
        } else {
          addReaction(c, rdfID2MongoID, bps.getDirection());
        }
      } else {
        System.out.format("%s: Found %d catalyses\n", bps.getID(), catalyses.size());
        for (Resource res : catalyses) {
          Catalysis c = this.enzyme_catalysis.get(res);
          // Don't warn here, as the stepProcess could be a Modulation and we don't necessarily care about those.
          if (c != null) {
            seenCatalyses.add(res);
            addReaction(c, rdfID2MongoID, bps.getDirection());
          }
        }
        newRxns++;
      }
    }

    /* Phase 3b: some Catalysis objects exist outside BiochemicalPathwaySteps, so iterate over all the Catalyses in
     * this file and install any we haven't already seen. */
    for (Map.Entry<Resource, Catalysis> entry : enzyme_catalysis.entrySet()) {
      // Don't re-install Catalysis objects that were part of BiochemicalPathwaySteps, but make sure we get 'em all.
      if (seenCatalyses.contains(entry.getKey())) {
        continue;
      }
      // actually add reaction to DB; no pathway step direction is available here, hence the null.
      addReaction(entry.getValue(), rdfID2MongoID, null);
      newRxns++;
    }

    // Output stats (the two small-molecule counters below are fields that this method only reads):
    System.out.format("New writes: %s (%d) :: (rxns)\n", this.originDBSubID, newRxns);
    System.out.format("Ignored %d of %d small molecules with multiple chemical structures\n",
        ignoredMoleculesWithMultipleStructures, totalSmallMolecules);
  }

  /**
   * Groups a SmallMoleculeRef with its ChemStrs and ChemicalStructure, together with the metadata of every
   * small molecule that points at that reference.  Used for deduplication of chemical entries.
   */
  private class ChemInfoContainer {
    public SmallMoleculeRef smRef;
    public ChemStrs structure;
    public ChemicalStructure c;
    // This list of `metas` will become the xref metadata on the DB chemical entry.
    public List<SmallMolMetaData> metas = new LinkedList<>();

    public ChemInfoContainer(SmallMoleculeRef ref, ChemStrs chemStrs, ChemicalStructure chemicalStructure) {
      this.smRef = ref;
      this.structure = chemStrs;
      this.c = chemicalStructure;
    }

    /** Appends one small molecule's metadata to this container. */
    public void addSmallMolMetaData(SmallMolMetaData meta) {
      this.metas.add(meta);
    }
  }

  /**
   * Derives the chemical strings for a structure, falling back to a placeholder representation when the
   * regular derivation yields nothing.
   *
   * @param c the chemical structure to convert
   * @return the regularly derived ChemStrs, or whatever the fallback hack produces when that is null
   */
  private ChemStrs structureToChemStrs(ChemicalStructure c) {
    ChemStrs derived = getChemStrsFromChemicalStructure(c);
    if (derived != null) {
      return derived;
    }
    // Hack: put something in inchi, inchikey and smiles so that we do not end up losing the reactions that
    // have R groups in them.
    return hackAllowingNonSmallMolecule(c);
  }

  /**
   * Ensures a chemical is present in the DB and returns its DB id.
   *
   * If the InChI is already in the DB, only the Metacyc xref id/metadata are appended to the existing entry;
   * otherwise a new Chemical (with its initial references) is created and installed under the next available id.
   *
   * @param structure chemical strings for the molecule; when null nothing is written
   * @param c         the Metacyc chemical structure, whose local id is used as the xref id
   * @param metas     metadata entries to attach to the chemical's Metacyc xref
   * @return the DB id of the existing or newly installed chemical, or null if `structure` is null
   */
  private Long writeChemicalToDB(ChemStrs structure, ChemicalStructure c, List<SmallMolMetaData> metas) {
    if (structure == null) {
      return null;
    }

    // Do an indexed query to determine whether the chemical already exists in the DB.
    Long existingId = db.getExistingDBIdForInChI(structure.inchi);
    if (existingId != null) {
      /* The chemical already exists, so just add the xref id and metadata entries.  Mongo will do the heavy
       * lifting for us, so this should hopefully be fast. */
      String metacycId = c.getID().getLocal();
      BasicDBList xrefMetas = metaReferencesToDBList(metacycId, metas);
      db.appendChemicalXRefMetadata(
          structure.inchi,
          METACYC_OBJECT_MODEL_XREF_ID_PATH, metacycId, // Paths where the Metacyc xref fields should be added.
          METACYC_OBJECT_MODEL_XREF_METADATA_PATH, xrefMetas
      );
      return existingId;
    }

    // DB does not contain the chemical as yet: create and install.
    // TODO: if needed, we can optimize this by querying the DB count on construction and incrementing locally.
    Chemical fresh = new Chemical(-1L);
    fresh.setInchi(structure.inchi); // we compute our own InchiKey under setInchi (well, now only InChI!)
    fresh.setSmiles(structure.smiles);
    // Be sure to create the initial set of references in the initial object write to avoid another query.
    Chemical withRefs = addReferences(fresh, c, metas, originDB);
    Long assignedId = db.getNextAvailableChemicalDBid();
    db.submitToActChemicalDB(withRefs, assignedId);
    return assignedId;
  }

  /* Add a reaction to the DB based on a complete Catalysis.  This will extract the underlying Conversion and append
   * available sequence/organism data.  This is preferred over the Conversion variant of this function as we want the
   * extra data to appear in the DB.
   *
   * The reaction is deliberately written twice.  The sequence entries need a back pointer to the reaction, so the
   * reaction is first submitted to obtain its id, the sequence entries are then created against that id, and
   * finally the reaction is rewritten with the protein data that points at those sequences.
   *
   * Note that in brenda we do not do this back pointer dance from db.seq: there the actfamilies entry itself
   * carries the protein seq directly.  Not ideal.  TODO: FIX THAT. */
  private Reaction addReaction(Catalysis c, HashMap<String, Long> rdfID2MongoID, StepDirection pathwayStepDirection) {
    // Build the Reaction from the chemical rdfID -> mongodb id map and tag its data source.
    Reaction reaction = constructReaction(c, rdfID2MongoID, pathwayStepDirection);
    reaction.setDataSource(Reaction.RxnDataSource.METACYC);

    // First write: insert into act.actfamilies to obtain the reaction's id.
    int reactionId = db.submitToActReactionDB(reaction);

    // Create the catalyzing sequences (left = sequence ids, right = organism ids) and attach the resulting
    // protein info to the in-memory reaction.
    Pair<List<Long>, List<Long>> sequenceAndOrganismIds = createCatalyzingSequences(c, reaction, reactionId);
    List<Long> organismIds = sequenceAndOrganismIds.getRight();
    List<Long> sequenceIds = sequenceAndOrganismIds.getLeft();
    reaction.addProteinData(constructProteinInfo(c, organismIds, sequenceIds));

    for (Long organismId : organismIds) {
      reaction.addReference(Reaction.RefDataSource.METACYC, String.format("OrganismId:%d", organismId));
    }

    // Second write: persist the protein data and organism references.
    db.updateActReaction(reaction, reactionId);

    return reaction;
  }

  /* Add a Conversion to the DB without sequence or organism data: there's no organism/sequence information
   * available on Conversions, so the reaction is submitted and then updated under the id it was given. */
  private Reaction addReaction(Conversion c, HashMap<String, Long> rdfID2MongoID, StepDirection pathwayStepDirection) {
    Reaction reaction = constructReaction(c, rdfID2MongoID, pathwayStepDirection);
    reaction.setDataSource(Reaction.RxnDataSource.METACYC);

    int reactionId = db.submitToActReactionDB(reaction);
    db.updateActReaction(reaction, reactionId);
    return reaction;
  }

  /**
   * Builds the protein JSON attached to a reaction.
   *
   * @param c    catalysis whose direction (if any) is recorded under "catalysis_direction"
   * @param orgs organism ids, stored under "organisms"
   * @param seqs sequence ids, stored under "sequences"
   * @return a JSON object that also carries a fixed "datasource" of METACYC
   */
  private JSONObject constructProteinInfo(Catalysis c, List<Long> orgs, List<Long> seqs) {
    JSONObject info = new JSONObject();

    JSONArray organismIds = new JSONArray();
    for (Long organismId : orgs) {
      organismIds.put(organismId);
    }
    info.put("organisms", organismIds);

    JSONArray sequenceIds = new JSONArray();
    for (Long sequenceId : seqs) {
      sequenceIds.put(sequenceId);
    }
    info.put("sequences", sequenceIds);

    info.put("datasource", "METACYC");

    // A missing direction is passed on as a null value.
    CatalysisDirectionType direction = c.getDirection();
    String directionName = null;
    if (direction != null) {
      directionName = direction.toString();
    }
    info.put("catalysis_direction", directionName);

    return info;
  }

  /**
   * Converts metadata entries into a DB list, stamping each entry's DB object with the given id.
   *
   * @param id    value stored under the "id" key of every entry
   * @param metas metadata entries to convert
   * @return a list holding one DB object per entry, in iteration order
   */
  private BasicDBList metaReferencesToDBList(String id, List<SmallMolMetaData> metas) {
    BasicDBList converted = new BasicDBList();
    for (SmallMolMetaData entry : metas) {
      DBObject entryObj = entry.getDBObject();
      entryObj.put("id", id);
      converted.add(entryObj);
    }
    return converted;
  }

  /**
   * Records this Metacyc chemical's id and metadata in the chemical's xref entry for `originDB`.
   *
   * @param dbc      chemical to update (modified in place)
   * @param c        Metacyc structure whose local id is added to the xref "id" list
   * @param metas    metadata entries appended to the xref "meta" list
   * @param originDB which xref entry of the chemical to update
   * @return the same `dbc`, after its ref has been replaced
   */
  private Chemical addReferences(Chemical dbc, ChemicalStructure c, List<SmallMolMetaData> metas, Chemical.REFS originDB) {
    JSONObject xref = dbc.getRef(originDB);
    String chemID = c.getID().getLocal();

    JSONArray ids;
    if (xref != null) {
      // A ref exists, maybe from installing this exact same chem, or from a replicate chemical from another
      // organism.  Add this id to the list unless it is already there.
      ids = xref.has("id") ? (JSONArray) xref.get("id") : new JSONArray();
      boolean alreadyListed = false;
      for (int i = 0; i < ids.length(); i++) {
        alreadyListed |= ids.get(i).equals(chemID); // every element is inspected, as before
      }
      if (!alreadyListed) {
        ids.put(chemID);
      }
    } else {
      // Great, this db's ref is not already in place: start a new one holding just this id.
      xref = new JSONObject();
      ids = new JSONArray();
      ids.put(chemID);
    }

    // Install the id list into the xref.KEGG/METACYC field.
    xref.put("id", ids);

    // Merge the new metadata into whatever "meta" value is already present (if any).
    Object existingMeta = xref.has("meta") ? xref.get("meta") : null;
    JSONArray mergedMeta = addAllToExistingMetaList(chemID, existingMeta, metas);
    xref.put("meta", mergedMeta);

    // Update the chemical with the new ref and hand it back.
    dbc.putRef(originDB, xref);
    return dbc;
  }

  /**
   * Appends the DB objects of `metas` (each stamped with `id`) to an existing metadata list.
   *
   * @param id       value stored under the "id" key of every appended entry
   * @param existing the current "meta" value: null (a new list is started) or a JSONArray (appended to in place)
   * @param metas    metadata entries to append
   * @return the list with the new entries appended.  If `existing` is neither null nor a JSONArray, diagnostics
   *         are printed and the JVM is terminated via System.exit(-1).
   */
  private JSONArray addAllToExistingMetaList(String id, Object existing, List<SmallMolMetaData> metas) {
    JSONArray metaData = null;
    if (existing == null) {
      metaData = new JSONArray();
    } else if (existing instanceof JSONArray) {
      metaData = (JSONArray)existing;
    } else {
      // Guard the first-element lookup: with an empty (or missing) list an exception here would hide the
      // diagnostics below and pre-empt the intended abort.
      String firstMeta = (metas == null || metas.isEmpty()) ? "<none>" : String.valueOf(metas.get(0));
      System.out.println("SmallMolMetaDataList[0] = " + firstMeta);
      System.out.println("Existing Chemical.refs[Chemical.REFS.METACYC] not a list! = " + existing);
      System.out.println("It is of type " + existing.getClass().getSimpleName());
      System.out.println("Want to add SmallMolMetaData to list, but its not a list!");
      System.exit(-1);
      return null;
    }

    for (SmallMolMetaData meta : metas) {
      DBObject metaDBObject = meta.getDBObject();
      metaDBObject.put("id", id);
      metaData.put(metaDBObject);
    }
    return metaData;
  }

  /* Extract the conversion from a Catalysis object, and use the Catalysis + Conversion to construct a reaction.
   * Substrates, products and cofactors are looked up (in that order) through the Catalysis.  Note that
   * getConversion returns null when the Catalysis controls no conversion; that case is not guarded here. */
  private Reaction constructReaction(Catalysis c, HashMap<String, Long> toDBID, StepDirection pathwayStepDirection) {
    Conversion conversion = getConversion(c);
    Map<Resource, Stoichiometry> coefficients = conversion.getRawStoichiometry(this.src);

    return constructReactionHelper(
        conversion,
        toDBID,
        getReactants(c, toDBID, true, coefficients),   // substrates (left side)
        getReactants(c, toDBID, false, coefficients),  // products (right side)
        getCofactors(c, toDBID, coefficients),
        pathwayStepDirection);
  }

  /* If no Catalysis is available, extract the substrates/products/cofactors from a raw Conversion and build the
   * reaction from those. */
  private Reaction constructReaction(Conversion c, HashMap<String, Long> toDBID, StepDirection pathwayStepDirection) {
    Map<Resource, Stoichiometry> coefficients = c.getRawStoichiometry(this.src);

    List<Pair<Long, Integer>> leftSide = getReactants(c, toDBID, true, coefficients);
    List<Pair<Long, Integer>> rightSide = getReactants(c, toDBID, false, coefficients);
    List<Pair<Long, Integer>> cofactors = getCofactors(c, toDBID, coefficients);

    return constructReactionHelper(c, toDBID, leftSide, rightSide, cofactors, pathwayStepDirection);
  }

  /**
   * Assembles a Reaction from a Conversion and its already-mapped substrates, products and cofactors.
   *
   * The cofactors are passed to the Reaction as coenzymes; the substrate/product cofactor arrays are left empty.
   * The readable name is the conversion's standard name (HTML stripped) followed by its id, EC number(s),
   * spontaneity, direction, type, cofactor ids and stoichiometry.
   *
   * @param catalyzed            conversion supplying names, EC numbers, direction and type
   * @param toDBID               Metacyc id -> DB id map (not used by this method)
   * @param substratesPair       (DB id, coefficient) pairs for the substrates
   * @param productsPair         (DB id, coefficient) pairs for the products
   * @param cofactorsPair        (DB id, coefficient) pairs for the cofactors
   * @param pathwayStepDirection direction of the enclosing pathway step, passed through to the Reaction
   * @return the populated (not yet persisted) Reaction
   */
  private Reaction constructReactionHelper(Conversion catalyzed, HashMap<String, Long> toDBID,
                                           List<Pair<Long, Integer>> substratesPair,
                                           List<Pair<Long, Integer>> productsPair,
                                           List<Pair<Long, Integer>> cofactorsPair,
                                           StepDirection pathwayStepDirection) {
    String metacycURL = getMetaCycURL(catalyzed);
    Boolean isSpontaneous = catalyzed.getSpontaneous(); // BioPaxFile should guarantee this is non-null.
    Object rawDirection = catalyzed.getDir();
    Object rawType = catalyzed.getTyp();

    String ec = singletonSet2Str(catalyzed.getEc(), metacycURL);
    String spont = isSpontaneous ? "Spontaneous" : "";
    String dir = ""; // L->R, L<->R, or L<-R
    if (rawDirection != null) {
      dir = rawDirection.toString();
    }
    String typ = ""; // bioc_rxn, transport, or transport+bioc
    if (rawType != null) {
      typ = rawType.toString();
    }

    Long[] coenzymes = getLefts(cofactorsPair);

    // For now just write out the source RDFId as the identifier; later, we can additionally get the names of
    // reactants and products and make a s1 + s2 -> p1 string.
    String readable = rmHTML(catalyzed.getStandardName());
    readable += " (" + catalyzed.getID().getLocal() + ": " + ec + " " + spont + " " + dir + " " + typ
        + " cofactors:" + Arrays.toString(coenzymes)
        + " stoichiometry:" + catalyzed.getStoichiometry(this.src) + ")";

    Long[] substrates = getLefts(substratesPair);
    Long[] products = getLefts(productsPair);

    Reaction rxn = new Reaction(-1L, substrates, products, new Long[0], new Long[0], coenzymes, ec,
        catalyzed.getDir(), pathwayStepDirection, readable, Reaction.RxnDetailType.CONCRETE);

    for (Pair<Long, Integer> substrate : substratesPair) {
      rxn.setSubstrateCoefficient(substrate.getLeft(), substrate.getRight());
    }
    for (Pair<Long, Integer> product : productsPair) {
      rxn.setProductCoefficient(product.getLeft(), product.getRight());
    }

    rxn.addReference(Reaction.RefDataSource.METACYC, this.originDB + " " + this.originDBSubID);
    rxn.addReference(Reaction.RefDataSource.METACYC, metacycURL);
    if (isSpontaneous) {
      rxn.addReference(Reaction.RefDataSource.METACYC, "isSpontaneous");
    }

    return rxn;
  }

  /**
   * Projects a list of pairs onto their left elements.
   *
   * @param pairs the pairs to project
   * @return an array with each pair's left value, in list order
   */
  private Long[] getLefts(List<Pair<Long, Integer>> pairs) {
    Long[] lefts = new Long[pairs.size()];
    int next = 0;
    for (Pair<Long, Integer> pair : pairs) {
      lefts[next++] = pair.getLeft();
    }
    return lefts;
  }

  /**
   * Renders a set of EC numbers as a single string.
   *
   * @param ecnums   the EC numbers; a null set is treated the same as an empty one
   * @param metadata context for the caller's benefit (not used by this method)
   * @return "" when there are no EC numbers, the sole element when there is exactly one, and otherwise the
   *         set's own string form, e.g., [2.7.1.74 , 2.7.1.76 , 2.7.1.145] for
   *         http://www.metacyc.org/META/NEW-IMAGE?object=DEOXYADENOSINE-KINASE-RXN
   */
  private String singletonSet2Str(Set<String> ecnums, String metadata) {
    if (ecnums == null || ecnums.isEmpty()) {
      return "";
    }
    if (ecnums.size() == 1) {
      return ecnums.iterator().next();
    }
    return ecnums.toString();
  }

  /**
   * Strips superscript/subscript markup from a name and rewrites arrow characters and arrow HTML entities as
   * ASCII arrows.
   *
   * @param s the text to clean; must not be null
   * @return the cleaned text
   */
  private String rmHTML(String s) {
    // Tags are removed one after another in exactly this order, repeats included: removing one tag can leave
    // behind text that forms another, so the sequence itself is part of the behavior.
    final String[] tagsInOrder = {
        "<SUP>", "<sup>", "<SUP>", "<sup>", "</SUP>", "</sup>", "</SUP>", "</sup>",
        "<SUB>", "<sub>", "<SUB>", "<sub>", "</SUB>", "</sub>", "</SUB>", "</sub>"
    };
    // Arrow characters first, then their HTML entity forms.
    final String[][] arrowsInOrder = {
        {"→", "->"},
        {"←", "<-"},
        {"↔", "<->"},
        {"&rarr;", "->"},
        {"&larr;", "<-"},
        {"&harr;", "<->"}
    };

    String cleaned = s;
    for (String tag : tagsInOrder) {
      cleaned = cleaned.replace(tag, "");
    }
    for (String[] arrow : arrowsInOrder) {
      cleaned = cleaned.replace(arrow[0], arrow[1]);
    }
    return cleaned;
  }

  /**
   * Finds the Conversion controlled by a Catalysis.
   *
   * @param c the catalysis to inspect
   * @return the single controlled conversion, or null when there is none.  When more than one is found, the
   *         catalysis is printed and the JVM is terminated via System.exit(-1).
   */
  Conversion getConversion(Catalysis c) {
    List<NXT> controlledPath = Arrays.asList(NXT.controlled); // get the controlled Conversion
    Set<BPElement> controlled = this.src.traverse(c, controlledPath);

    switch (controlled.size()) {
      case 0:
        return null;
      case 1:
        return (Conversion) controlled.iterator().next();
      default:
        // size>1!!??
        System.out.println("More than one controlled conversion (abort):" + c.expandedJSON(this.src));
        System.exit(-1);
        return null;
    }
  }

  /**
   * Looks up the cofactors of a Catalysis.
   *
   * Conceptually cofactors = c.cofactors.smallmoleculeref.structure, but the lookup is done in two steps --
   * first the small molecule, then the structure associated with it -- because the small molecule is also
   * what the stoichiometry is looked up by.
   */
  List<Pair<Long, Integer>> getCofactors(Catalysis c, HashMap<String, Long> toDBID, Map<Resource, Stoichiometry> stoichiometry) {
    // Step 1: path from the Catalysis to its cofactor SmallMolecules.
    List<NXT> toSmallMolecule = Arrays.asList(
        NXT.cofactors
    );

    // Step 2: path from each SmallMolecule to its chemical structure.
    List<NXT> toStructure = Arrays.asList(
        NXT.ref,       // the SmallMoleculeRef
        NXT.structure  // the ChemicalStructure
    );

    return getMappedChems(c, toSmallMolecule, toStructure, toDBID, stoichiometry, false);
  }

  /* Get cofactors for a stand-alone Conversion when a Catalysis object is not available.  Raw conversions don't
   * reference cofactors, so this is always an empty list.  The shared list is immutable, which ensures it is
   * always empty. */
  private static final List<Pair<Long, Integer>> EMPTY_COFACTORS = Collections.emptyList();
  List<Pair<Long, Integer>> getCofactors(Conversion c, HashMap<String, Long> toDBID, Map<Resource, Stoichiometry> stoichiometry) {
    return EMPTY_COFACTORS;
  }

  // Path from a SmallMolecule to its structure: SmallMoleculeRef, then the structure itself.
  private static final List<NXT> STRUCT_PATH =
      Collections.unmodifiableList(Arrays.asList(NXT.ref, NXT.structure));
  // Alternative path for refs that hold multiple members (e.g., in transports) instead of the small mol
  // directly: SmallMoleculeRef, then its members, then each member's structure.
  private static final List<NXT> STRUCT_PATH_ALT =
      Collections.unmodifiableList(Arrays.asList(NXT.ref, NXT.members, NXT.structure));
  /**
   * Collects the chemicals on one side of the Conversion controlled by a Catalysis.
   *
   * Default case: substrates/products = c.controlled.left.smallmolecule.smallmoleculeref.structure.  The lookup
   * is done in two steps -- first the small molecule, then its structure -- because the small molecule is also
   * what the stoichiometry is looked up by.  A second lookup covers small molecule refs that contain multiple
   * members (e.g., in transports); this usually does not lead to reactant elements, but in the edge cases where
   * it does they are added after the default ones.
   *
   * @param left true for the left side of the conversion, false for the right side
   */
  List<Pair<Long, Integer>> getReactants(Catalysis c, HashMap<String, Long> toDBID, boolean left, Map<Resource, Stoichiometry> stoichiometry) {
    // Path to the small molecules; identical for both lookups.
    List<NXT> toSmallMolecules = Arrays.asList(
        NXT.controlled, // get the controlled Conversion
        left ? NXT.left : NXT.right // get the left or right SmallMolecules
    );

    // Default lookup: ref.structure.
    List<Pair<Long, Integer>> reactants = new ArrayList<Pair<Long, Integer>>(
        getMappedChems(c, toSmallMolecules, STRUCT_PATH, toDBID, stoichiometry, false));

    // Alternative lookup (notice the difference from the above: this is ref.members.structure).
    reactants.addAll(getMappedChems(c, toSmallMolecules, STRUCT_PATH_ALT, toDBID, stoichiometry, true));

    return reactants;
  }

  /**
   * Collects the chemicals on one side of a raw Conversion.  See getReactants(Catalysis c, ...) for
   * documentation on this function's behavior; the only difference is where the small molecules are found.
   *
   * @param left true for the left side of the conversion, false for the right side
   */
  List<Pair<Long, Integer>> getReactants(Conversion c, HashMap<String, Long> toDBID, boolean left, Map<Resource, Stoichiometry> stoichiometry) {
    // A raw Conversion doesn't have `controller`/`controlled` child nodes, only `left` and `right`, so the
    // same one-step path serves both lookups below.
    NXT side = left ? NXT.left : NXT.right;
    List<NXT> toSmallMolecules = Collections.singletonList(side);

    List<Pair<Long, Integer>> reactants = new ArrayList<Pair<Long, Integer>>();

    // SmallMolecule lookup works the same within a Conversion.
    List<Pair<Long, Integer>> direct = getMappedChems(c, toSmallMolecules, STRUCT_PATH, toDBID, stoichiometry, false);
    reactants.addAll(direct);

    // The alternative structure path is the same as for a Catalysis since we're looking at the left/right side
    // of the conversion.
    List<Pair<Long, Integer>> viaMembers = getMappedChems(c, toSmallMolecules, STRUCT_PATH_ALT, toDBID, stoichiometry, true);
    reactants.addAll(viaMembers);

    return reactants;
  }

  /**
   * Stoichiometry entries in raw Metacyc XML contain SmallMolecule objects that then contain ChemicalStructure objects.
   * Once the XML is parsed, stoichiometry coefficients are available via SmallMolecule ids.  The ChemicalStructure
   * objects, however, contain the chemical information we want to store in the DB.  To associate the substrates and
   * products of a reaction with their stoichiometric coefficients, we first traverse smmol_path from the reaction to
   * the SmallMolecules on one side, then traverse struct_path from each SmallMolecule to its ChemicalStructure:
   * <pre>Stoichiometry (with coefficient) <-> SmallMolecule <-> ChemicalStructure <-> DB ID.</pre>
   *
   * Every visited small molecule increments totalSmallMolecules.  A small molecule that resolves to more than one
   * structure contributes nothing to the result and increments ignoredMoleculesWithMultipleStructures (unless the
   * method aborts, see below).
   *
   * Note that a structure with no entry in toDBID is logged to stderr but is still added to the result with a null
   * DB id, and that the coefficient in a pair is null when no stoichiometry entry exists for the small molecule.
   *
   * @param catalysisOrConversion The Catalysis or Conversion (reaction) object whose substrates or products we're inspecting.
   * @param smmol_path A path to fetch the desired collection of small molecules from the reaction.
   * @param struct_path A path to fetch the chemical structures from the extracted small molecules.
   * @param toDBID A map from chemical structure id to DB id.
   * @param stoichiometry A map from small molecule id to Stoichiometry object that we'll use to extract coefficients.
   * @param expectedMultipleStructures Whether small molecules with several structures are anticipated on this path;
   *                                   if false, encountering one aborts with a RuntimeException instead of a warning.
   * @return A list of pairs of (DB id, stoichiometry coefficient) for the chemicals found via the specified path.
   * @throws RuntimeException if the element is neither a Catalysis nor a Conversion, or if a small molecule has
   *                          multiple structures while expectedMultipleStructures is false.
   */
  private List<Pair<Long, Integer>> getMappedChems(
      BPElement catalysisOrConversion, List<NXT> smmol_path, List<NXT> struct_path, HashMap<String, Long> toDBID,
      Map<Resource, Stoichiometry> stoichiometry, boolean expectedMultipleStructures) {
    /* TODO: since this is a private method, this check ought to be unnecessary (if we've written everything correctly).
     * Remove it once we're sure it's unnecessary. */
    boolean isCatalysis = catalysisOrConversion instanceof Catalysis;
    boolean isConversion = catalysisOrConversion instanceof Conversion;
    if (!isCatalysis && !isConversion) {
      throw new RuntimeException(String.format(
          "getMappedChems passed unexpected BPElement subclass %s with id %s",
          catalysisOrConversion.getClass(), catalysisOrConversion.getID()));
    }

    List<Pair<Long, Integer>> mapped = new ArrayList<Pair<Long, Integer>>();

    for (BPElement smallMolecule : this.src.traverse(catalysisOrConversion, smmol_path)) {
      // The coefficient is keyed on the small molecule, not on the structure it contains.
      Integer coefficient = getStoichiometry(smallMolecule.getID(), stoichiometry);
      Set<BPElement> structures = this.src.traverse(smallMolecule, struct_path);

      if (structures.size() <= 1) {
        for (BPElement structure : structures) {
          // A null structure can happen if the path led to a smallmoleculeref that is composed of other things
          // and does not have a structure of itself; such molecules are handled by querying other paths later.
          if (structure != null) {
            String structureId = structure.getID().getLocal();
            Long dbId = toDBID.get(structureId);
            if (dbId == null) {
              System.err.format("ERROR: Missing DB ID for %s\n", structureId);
            }
            mapped.add(Pair.of(dbId, coefficient));
          }
        }
      } else {
        if (!expectedMultipleStructures) {
          /* Abort if we find an unexpected molecule with multiple chemical structures.  If we don't anticipate these
           * appearing and we ignore them, then we may be incorrectly ignoring good data. */
          throw new RuntimeException(String.format(
              "SEVERE WARNING: small molecule %s has multiple chemical structures " +
              "when only one is expected; ignoring.\n", smallMolecule.getID())
          );
        }
        System.err.format("WARNING: small molecule %s has multiple chemical structures; ignoring.\n",
            smallMolecule.getID());
        ignoredMoleculesWithMultipleStructures++;
      }
      totalSmallMolecules++;
    }

    return mapped;
  }

  /**
   * Flattens a map of Stoichiometry objects into a map of their integer coefficients.
   *
   * @param st A map from resource to its Stoichiometry.
   * @return A new map with the same keys, each mapped to the int value of its stoichiometric coefficient.
   */
  private Map<Resource, Integer> tointvals(Map<Resource, Stoichiometry> st) {
    Map<Resource, Integer> coefficients = new HashMap<Resource, Integer>();
    for (Map.Entry<Resource, Stoichiometry> entry : st.entrySet()) {
      coefficients.put(entry.getKey(), entry.getValue().getCoefficient().intValue());
    }
    return coefficients;
  }

  /**
   * Looks up the integer stoichiometric coefficient for a resource.
   *
   * @param res The resource (small molecule id) to look up.
   * @param stoichiometry The map of all known stoichiometry entries.
   * @return The coefficient as an Integer, or null (after logging an error to stderr) if there is no entry.
   */
  private Integer getStoichiometry(Resource res, Map<Resource, Stoichiometry> stoichiometry) {
    Stoichiometry entry = stoichiometry.get(res);
    if (entry != null) {
      // Only the integer part of the coefficient is used.
      return entry.getCoefficient().intValue();
    }

    System.err.format("ERROR: missing stoichiometry entry for metacyc resource %s\n", res.getLocal());
    return null;
  }

  /**
   * Resolves an organism name to its DB id, consulting the in-memory cache first, then the DB, and finally
   * creating a new DB entry when the DB has none (a null or -1 id).  The result is written through to the cache.
   *
   * @param organismName The organism name to resolve.
   * @return The DB id associated with the organism name.
   */
  private Long getOrganismNameIdByNameFromDB(String organismName) {
    if (!this.organismNameToIdCache.containsKey(organismName)) {
      // Cache miss: ask the DB, creating the entry if the DB doesn't know this organism yet.
      Long resolvedId = db.getOrganismId(organismName);
      boolean missingInDb = resolvedId == null || resolvedId == -1;
      if (missingInDb) {
        resolvedId = db.submitToActOrganismNameDB(organismName);
      }
      this.organismNameToIdCache.put(organismName, resolvedId);
      return resolvedId;
    }

    return this.organismNameToIdCache.get(organismName);
  }

  /**
   * Extracts organism names from a BP element at some sub path, submits them to the DB, and returns a mapping of their
   * names to DB ids.  **Does not do anything with NCBI ids at this time**.
   *
   * Null elements on the path are skipped with a warning.  Elements that are not BioSource instances are still used,
   * with a warning.  All names of an element are collected, even when a BioSource carries a number of names other
   * than one (which is also reported as a warning).
   *
   * @param rootElement The root path from which to search.
   * @param path The sub path to search for organisms.
   * @return A map from organism name to organism name DB id.
   */
  private Map<String, Long> extractOrganismsAtPath(BPElement rootElement, List<NXT> path) {
    Set<String> collectedNames = new HashSet<>();

    for (BPElement candidate : this.src.traverse(rootElement, path)) {
      if (candidate == null) {
        System.err.format("WARNING: got null organism for %s\n", rootElement.getID());
        continue;
      }

      if (!(candidate instanceof BioSource)) {
        System.err.format("WARNING: found a non-BioSource organism (%s) for %s, using anyway\n",
            candidate.getID(), rootElement.getID());
        collectedNames.addAll(candidate.getName());
        continue;
      }

      BioSource source = (BioSource) candidate;
      // Assume only one name per BioSource entity; complain otherwise but keep every name.
      if (source.getName().size() != 1) {
        System.err.format("WARNING: found a BioSource with multiple names (%s): %s\n",
            source.getID(), StringUtils.join(source.getName(), ", "));
      }
      collectedNames.addAll(source.getName());
      // Ignore NCBI Taxonomy x-refs for now, as we don't have any use for them in our current model.
    }

    Map<String, Long> nameToId = new HashMap<>();
    for (String organismName : collectedNames) {
      nameToId.put(organismName, this.getOrganismNameIdByNameFromDB(organismName));
    }
    return nameToId;
  }

  // Placeholder organism name used when no organism could be extracted.
  private static final String DEFAULT_ORG_NAME = "Unknown";

  /**
   * Guarantees that callers always have at least one organism to work with.
   *
   * @param orgsToTest The organism name to DB id map to check.
   * @return The given map if it has any entries; otherwise a single-entry map for the default organism name.
   */
  private Map<String, Long> ensureNonEmptyOrganismSet(Map<String, Long> orgsToTest) {
    if (!orgsToTest.isEmpty()) {
      return orgsToTest;
    }
    Long defaultOrgId = this.getOrganismNameIdByNameFromDB(DEFAULT_ORG_NAME);
    return Collections.singletonMap(DEFAULT_ORG_NAME, defaultOrgId);
  }

  // Note: this is not code!  This is the path through the biopax schema to protein data.  Keep this around!
  // c.controller(type: Protein).proteinRef(type ProteinRNARef).sequence
  // c.controller(type: Complex).component(type: Protein) .. as above

  // Traversal path from a Catalysis to the sequence references of its controlling proteins.
  final List<NXT> proteinPath = Collections.unmodifiableList(Arrays.asList(NXT.controller, NXT.ref));
  // Traversal path from a Catalysis to the sequence references of the components of its controlling complexes.
  final List<NXT> complexPath = Collections.unmodifiableList(Arrays.asList(NXT.controller, NXT.components, NXT.ref));
  // Traversal path from a sequence reference to its organism(s).
  final List<NXT> organismSubPath = Collections.unmodifiableList(Collections.singletonList(NXT.organism));

  /**
   * Installs sequences for a reaction, collecting sequence and organism ids as it goes.  Sequence references are
   * gathered first from the proteins that control the reaction and then from the proteins that make up controlling
   * complexes.  A reference with no organism is attributed to the default organism.
   *
   * @param c The catalysis whose sequences to extract.
   * @param rxn The reaction object that will represent that catalysis.
   * @param rxnid The id of that reaction object.
   * @return A list of sequence ids and a list of organism ids (in that order) collected for the specified catalysis.
   *         Both lists are de-duplicated and sorted in ascending order.
   */
  Pair<List<Long>, List<Long>> createCatalyzingSequences(Catalysis c, Reaction rxn, long rxnid) {
    // Sorted sets keep the output order stable, for sanity's sake.
    Set<Long> sequenceIds = new TreeSet<>();
    Set<Long> organismIds = new TreeSet<>();

    // First the proteins that control the rxn directly, then the proteins inside controlling complexes.
    for (List<NXT> pathToSeqRefs : Arrays.asList(proteinPath, complexPath)) {
      for (BPElement seqRef : this.src.traverse(c, pathToSeqRefs)) {
        Map<String, Long> organismsByName =
            ensureNonEmptyOrganismSet(extractOrganismsAtPath(seqRef, organismSubPath));
        TreeSet<Long> orgIdsForSeq = new TreeSet<>(organismsByName.values());
        organismIds.addAll(orgIdsForSeq);
        sequenceIds.addAll(writeCatalyzingSequenceToDb(c, (ProteinRNARef) seqRef, rxn, rxnid, orgIdsForSeq));
      }
    }

    List<Long> seqList = new ArrayList<>(sequenceIds);
    List<Long> orgList = new ArrayList<>(organismIds);
    return Pair.of(seqList, orgList);
  }

  /**
   * Writes one sequence entry to the DB per organism for the given sequence reference, annotated with the
   * catalysis' control type (ACTIVATION/INHIBITION) and direction (L->R or R->L).
   *
   * @param c The catalysis that supplies the control type and direction annotations.
   * @param seqRef The sequence reference supplying the sequence, name, comments and database references.
   * @param rxn The reaction object the sequence catalyzes.
   * @param rxnid The id of that reaction object.
   * @param orgIds The DB ids of the organisms to associate with the sequence; must not be empty.
   * @return The DB ids of the written sequence entries, one per organism id, in the iteration order of orgIds.
   * @throws RuntimeException if orgIds is empty.
   */
  List<Long> writeCatalyzingSequenceToDb(Catalysis c, ProteinRNARef seqRef, Reaction rxn, long rxnid, Set<Long> orgIds) {
    // the Catalysis object has ACTIVATION/INHIBITION and L->R or R->L annotations
    // put them alongside the sequence that controls the Conversion
    org.biopax.paxtools.model.level3.ControlType actInhibit = c.getControlType();
    org.biopax.paxtools.model.level3.CatalysisDirectionType direction = c.getDirection();
    String seq = seqRef.getSeq();
    Set<String> comments = seqRef.getComments();
    String name = seqRef.getStandardName();
    Set<JSONObject> refs = toJSONObject(seqRef.getRefs()); // this contains things like UniProt accession#s, other db references etc.

    String ecnum = null;
    if (name != null) {
      Matcher ecnumMatcher = metacycStandardNameEcnum.matcher(name);
      // Sometimes more than 1 EC Number exists.
      // However, we only grab the first one for now to keep ecnum as a single value field.
      if (ecnumMatcher.find()) {
        ecnum = ecnumMatcher.group(1);
      }
    }

    if (orgIds.size() > 1) {
      // Terminate the line like every other warning so that subsequent log output is not glued onto this message.
      System.err.format("WARNING: found multiple organisms for sequence %s: %s\n",
          seqRef.getID(), StringUtils.join(orgIds, ", "));
    }
    if (orgIds.isEmpty()) {
      throw new RuntimeException(
          String.format("ERROR: no organisms found for sequence %s, should not be possible", seqRef.getID()));
    }

    // These annotations are the same for every organism, so compute them once; missing values are stored as "NULL".
    String dir = direction == null ? "NULL" : direction.toString();
    String actInh = actInhibit == null ? "NULL" : actInhibit.toString();

    List<Long> seqIds = new ArrayList<>(orgIds.size());
    for (Long orgId : orgIds) {
      SequenceEntry entry = MetacycEntry.initFromMetacycEntry(seq, orgId, name, ecnum, comments, refs, rxnid, rxn, actInh, dir);
      seqIds.add(Long.valueOf(entry.writeToDB(db, Seq.AccDB.metacyc)));
    }

    return seqIds;
  }

  /**
   * Resolves each resource against the source and expands it to JSON.
   *
   * @param resources The resources to resolve.
   * @return The set of expanded JSON objects for the resolved resources.
   */
  Set<JSONObject> toJSONObject(Set<Resource> resources) {
    Set<JSONObject> expanded = new HashSet<JSONObject>();
    for (Resource resource : resources) {
      BPElement resolved = this.src.resolve(resource);
      expanded.add(resolved.expandedJSON(this.src));
    }
    return expanded;
  }

  /**
   * Builds the MetaCyc URL for a conversion from the first unification x-ref whose id matches the expected
   * MetaCyc id pattern.
   *
   * @param c The conversion whose x-refs should be searched.
   * @return The MetaCyc URL (URI prefix followed by the unification id), or null if no x-ref matches.
   */
  String getMetaCycURL(Conversion c) {
    for (BPElement xref : this.src.resolve(c.getXrefs())) {
      if (!(xref instanceof Unification)) {
        continue;
      }

      // we dont check for the "DB" in the catalysis unification xref since there
      // is only one xref and that points directly to the metacyc ID
      String unifId = ((Unification) xref).getUnifID();
      // A unification may lack an id (small molecule x-refs are guarded the same way); skip it rather than crash.
      if (unifId != null && unifId.matches(this.METACYC_URI_IDS)) {
        return this.METACYC_URI_PREFIX + unifId;
      }
    }
    return null;
  }

  /**
   * Debugging output: prints the InChI of every small molecule whose structure could be converted, then the
   * expanded JSON of every catalysis, followed by summary counts, all to stdout.
   */
  public void writeStdout() {
    for (Resource moleculeId : smallmolecules.keySet()) {
      SmallMolecule molecule = (SmallMolecule) smallmolecules.get(moleculeId);
      SmallMoleculeRef moleculeRef = (SmallMoleculeRef) this.src.resolve(molecule.getSMRef());
      // The metadata itself is not printed; the call is retained because it terminates the process when it
      // encounters an unrecognized x-ref type.
      getSmallMoleculeMetaData(molecule, moleculeRef);
      ChemicalStructure structure = (ChemicalStructure) this.src.resolve(moleculeRef.getChemicalStructure());
      ChemStrs strs = getChemStrsFromChemicalStructure(structure);
      if (strs != null) {
        System.out.println(strs.inchi);
      }
    }

    // we go through each Catalysis and Modulation, both of which refer
    // to a controller (protein/complex) and controlled (reaction)
    // for each controlled reaction we pull up its Conversion (BioCRxn, Trans, Trans+BioCRxn)
    // Conversion has left, right and other details of the reaction

    for (Catalysis catalysis : enzyme_catalysis.values()) {
      System.out.println(catalysis.expandedJSON(this.src).toString(2));
    }

    final String banner = "******************************************************";
    final String origin = "From file: " + this.originDBSubID;

    System.out.println(banner);
    System.out.println(origin);
    System.out.println("Extracted " + smallmolecules.size() + " small molecule structures.");
    System.out.println();
    System.out.println(banner);
    System.out.println(origin);
    System.out.println("Extracted " + enzyme_catalysis.size() + " catalysis observations.");
    System.out.println();
    System.out.format("Chems: %d (fail inchi: %d)\n", smallmolecules.size(), fail_inchi);
  }

  /**
   * Gathers the metadata of a small molecule: its standard name, all names from both the molecule and its
   * reference, molecular weight, cellular location, MetaCyc URL and external database ids taken from the
   * reference's x-refs.
   *
   * Unification, Publication and Relationship x-refs are recorded; any other x-ref type is printed to stdout and
   * terminates the process with exit code -1.
   *
   * @param sm The small molecule.
   * @param smref The small molecule reference that sm points to.
   * @return The collected metadata.
   */
  private SmallMolMetaData getSmallMoleculeMetaData(SmallMolecule sm, SmallMoleculeRef smref) {
    Term locationTerm = (Term) this.src.resolve(sm.getCellularLocation());
    String cellLoc = null;
    if (locationTerm != null) {
      // getTerms() returns a Set<String>; flatten it to its string form.
      cellLoc = locationTerm.getTerms().toString();
    }

    Set<String> allNames = new HashSet<String>();
    allNames.addAll(smref.getName());
    allNames.addAll(sm.getName());

    String metacycURL = null;
    HashMap<String, String> externalIds = new HashMap<String, String>();
    for (BPElement xref : this.src.resolve(smref.getXrefs())) {
      if (xref instanceof Unification) {
        Unification unification = (Unification) xref;
        String unifDb = unification.getUnifDB();
        String unifId = unification.getUnifID();
        externalIds.put(unifDb, unifId);
        // Only ids from a database whose name ends in "yc" and that match the MetaCyc id pattern yield a URL.
        boolean dbNameEndsInYc = unifDb.endsWith("yc");
        if (dbNameEndsInYc && unifId != null && unifId.matches(this.METACYC_URI_IDS)) {
          metacycURL = this.METACYC_URI_PREFIX + unifId;
        }
      } else if (xref instanceof Publication) {
        Publication publication = (Publication) xref;
        externalIds.put(publication.dbid(), publication.citation());
      } else if (xref instanceof Relationship) {
        Relationship relationship = (Relationship) xref;
        externalIds.put(relationship.getRelnDB(), relationship.getRelnID());
      } else {
        System.out.println("Other xref:" + xref.expandedJSON(this.src).toString(2));
        System.exit(-1);
      }
    }

    // smref and sm should have duplicate standardName fields; the reference's one is used.
    String standardName = smref.getStandardName();
    return new SmallMolMetaData(
      standardName,
      allNames,
      smref.getMolecularWeight(),
      cellLoc,
      metacycURL,
      externalIds);
  }

  /**
   * Plain holder for the metadata of a small molecule, convertible to a DB document.
   */
  private class SmallMolMetaData {
    String standardName;
    Set<String> names;
    Float molweight;
    String cellularLoc;
    String metacycURL;
    HashMap<String, String> dbid;

    SmallMolMetaData(String s, Set<String> n, Float mw, String cellLoc, String url, HashMap<String, String> dbid) {
      this.standardName = s;
      this.names = n;
      this.molweight = mw;
      this.cellularLoc = cellLoc;
      this.metacycURL = url;
      this.dbid = dbid;
    }

    /**
     * Builds the document form of this metadata.  The location and URL keys are only present when known; each
     * external id becomes a {db, id} entry of the "refs" list.
     */
    private DBObject getDBObject() {
      DBObject document = new BasicDBObject();
      document.put("sname", standardName);
      document.put("names", names);
      if (cellularLoc != null) {
        document.put("loc", cellularLoc);
      }
      if (metacycURL != null) {
        document.put("url", metacycURL);
      }
      document.put("molw", molweight);

      BasicDBList references = new BasicDBList();
      for (Map.Entry<String, String> externalId : dbid.entrySet()) {
        BasicDBObject reference = new BasicDBObject();
        reference.put("db", externalId.getKey());
        reference.put("id", externalId.getValue());
        references.add(reference);
      }
      document.put("refs", references);
      return document;
    }

    @Override
    public String toString() {
      return this.getDBObject().toString();
    }
  }

  /**
   * Holder for the string representations of a chemical: InChI, InChI-Key and SMILES.  Any of them may be null.
   */
  private class ChemStrs {
    String inchi;
    String smiles;
    String inchikey;

    ChemStrs(String i, String ikey, String s) {
      this.inchi = i;
      this.inchikey = ikey;
      this.smiles = s;
    }
  }

  /**
   * Tries to find an InChI for a small molecule by looking up the ids of its Relationship x-refs in
   * uniqueKeyToInChImap.
   *
   * @param sm The small molecule whose x-refs should be searched.
   * @return The InChI of the first Relationship x-ref whose id is present in the map, or null if none is.
   * @throws RuntimeException if the small molecule has no x-ref set at all.
   */
  private String lookupInChIByXRefs(SmallMolecule sm) {
    Set<Resource> xrefs = sm.getXrefs();
    if (xrefs == null) {
      throw new RuntimeException("No x-refs for " + sm.getID());
    }

    for (Resource xref : xrefs) {
      BPElement bpe = this.src.resolve(xref);
      if (!(bpe instanceof Relationship)) {
        continue;
      }

      /* TODO: it's not clear how to link up the ontology name with the DB identifiers in these relationship objects.
       * For now we'll just look up by ID in the hash and hope that things work out okay. :-/
       */
      String lookupResult = this.uniqueKeyToInChImap.get(((Relationship) bpe).getRelnID());
      if (lookupResult != null) {
        // Just return the first one and bail; we didn't see multiple InChIs for one molecule in testing.
        return lookupResult;
      }
    }

    return null;
  }

  private int fail_inchi = 0; // logging statistics

  /**
   * Derives the string representations of a chemical structure.  The InChI stored on the structure is used when
   * present; otherwise the structure's CML is converted to an InChI through Indigo.  Only the InChI is populated in
   * the result; the InChI-Key and SMILES are left null.
   *
   * Every failed conversion (including a structure that has neither an InChI nor any structure data) increments
   * fail_inchi.
   *
   * @param c The chemical structure to convert.
   * @return The chemical strings, or null if no InChI could be obtained.
   */
  private ChemStrs getChemStrsFromChemicalStructure(ChemicalStructure c) {
    String inc = null, smiles = null, incKey = null;

    /* Always prefer InChI over CML if available.  The Metacyc-defined InChIs are more precise than what we get from
     * parsing CML (which seems to lack stereochemistry details). */
    if (c.getInChI() != null) {
      // TODO: ditch InChI-Key and SMILES, as they're never really used.
      return new ChemStrs(c.getInChI(), incKey, smiles);
    }
    /* Note: this assumes the structure is always CML, but the ChemicalStructure class also expects SMILES.
     * Do we see both in practice? */

    String rawStructure = c.getStructure();
    if (rawStructure == null) {
      // Neither an InChI nor structure data: count it as a failed conversion instead of dereferencing null.
      if (debugFails) System.out.format("Failed to get inchi for %s\n", c.getID());
      fail_inchi++;
      return null;
    }

    // Plain literal replacement; no regex is needed here.
    String cml = rawStructure.replace("atomRefs", "atomRefs2");
    // We have a CML description of the chemical structure.
    // Attempt to pass it through indigo to get the inchi
    // Then additionally pass it through consistentInChI
    // which in the integration step (as of the moment)
    // is a NOOP.
    try {
      IndigoObject mol = indigo.loadMolecule(cml);
      inc = indigoInchi.getInchi(mol);

      inc = ConsistentInChI.consistentInChI(inc, "MetaCyc install");
    } catch (Exception e) {
      // Best effort: any conversion failure is counted and reported to the caller as a null result.
      if (debugFails) System.out.format("Failed to get inchi for %s\n", c.getID());
      fail_inchi++;
      return null;
    }

    // TODO: later check if we need to compute the inchikey and
    // smiles or we can leave them null. It looks like leaving them
    // null does result in a right install output (CMLs are stuffed
    // into the SMILES field and inchikeys are computed downstream.
    // So it looks ok to leave them null.
    //
    // incKey = indigoInchi.getInchiKey(inc);
    // smiles = mol.canonicalSmiles();

    // there seem to be some valid cases of failures because the CML contains the
    // following, non small-molecule, entities (R groups, bigger mols, just names):
    // cat out | grep cml | grep -v "\[R1\]" | grep -v "\[R\]" | grep -v "RNA" | grep -v "a nucleobase" | grep -v "DNA" | grep -v "Protein" | grep -v "RPL3" | grep -v "Purine-Bases" | grep -v "ETR-Quinones" | grep -v "Deaminated-Amine-Donors" | grep -v "Release-factors" | grep -v Acceptor | grep -v "\[R2\]" | grep -v "Peptides" | grep -v "Siderophore" | grep -v "Lipopolysaccharides" | wc -l
    // but then there are some 115/1901 (ecocyc) that are valid when converted through
    // openbabel (obabel, although conversion to inchis always happens with warnings)
    // and we have sent these to the Indigo team.
    if (inc == null) {
      if (debugFails) System.out.println("Failed to get inchi:\n" + cml);
      fail_inchi++;
      return null;
    }

    return new ChemStrs(inc, incKey, smiles);
  }

  /**
   * Fabricates placeholder chemical strings for a structure that is not a real small molecule.  The fake InChI is
   * built from the origin DB, the origin DB sub-id and the structure's local id; the fake InChI-Key is derived from
   * the fake InChI; and the structure data is stored in the SMILES slot.
   *
   * @param c The chemical structure to fabricate strings for.
   * @return The placeholder chemical strings.
   */
  private ChemStrs hackAllowingNonSmallMolecule(ChemicalStructure c) {
    String placeholderInchi =
        "InChI=/FAKE/" + this.originDB + "/" + this.originDBSubID + "/" + c.getID().getLocal();
    String placeholderKey = "FAKEKEY/" + placeholderInchi;
    // install the CML inside SMILES
    return new ChemStrs(placeholderInchi, placeholderKey, c.getStructure());
  }

}