HMDBParser.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package act.installer;

import act.installer.pubchem.PubchemParser;
import act.server.MongoDB;
import act.shared.Chemical;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Triple;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jaxen.JaxenException;
import org.jaxen.XPath;
import org.jaxen.dom.DOMXPath;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Parses HMDB XML files (one metabolite per file), converts each into a {@link Chemical}, and submits the result to
 * the chemicals DB.
 *
 * Instances must be created through {@link Factory#makeParser(MongoDB)}, which also compiles the XPath expressions
 * and sets up the XML document builder.  The class can be run from the command line via {@link #main(String[])}.
 */
public class HMDBParser {
  private static final Logger LOGGER = LogManager.getFormatterLogger(HMDBParser.class);

  private static final String OPTION_INPUT_DIRECTORY = "i";
  private static final String OPTION_DB_HOST = "H";
  private static final String OPTION_DB_PORT = "p";
  private static final String OPTION_DB_NAME = "d";

  private static final String DEFAULT_DB_HOST = "localhost";
  private static final String DEFAULT_DB_PORT = "27017";
  private static final String DEFAULT_DB_NAME = "actv01";

  public static final String HELP_MESSAGE = StringUtils.join(new String[]{
      "This class parses HMDB XML files, converts them into chemical documents, and stores them in a DB",
  }, "");

  /* Built by a plain static helper rather than double-brace initialization, which would create an anonymous
   * ArrayList subclass just to populate the list. */
  public static final List<Option.Builder> OPTION_BUILDERS = buildOptionBuilders();

  /**
   * Assemble the command line option builders accepted by {@link #main(String[])}.
   * @return A mutable list of option builders, in the order they should appear in the help output.
   */
  private static List<Option.Builder> buildOptionBuilders() {
    List<Option.Builder> builders = new ArrayList<>();
    builders.add(Option.builder(OPTION_INPUT_DIRECTORY)
        .argName("input dir")
        .desc("The directory where the HMDB XML files live")
        .hasArg()
        .required()
        .longOpt("input-dir")
    );
    builders.add(Option.builder(OPTION_DB_HOST)
        .argName("hostname")
        .desc(String.format("The DB host to which to connect (default: %s)", DEFAULT_DB_HOST))
        .hasArg()
        .longOpt("db-host")
    );
    builders.add(Option.builder(OPTION_DB_PORT)
        .argName("port")
        .desc(String.format("The DB port to which to connect (default: %s)", DEFAULT_DB_PORT))
        .hasArg()
        .longOpt("db-port")
    );
    builders.add(Option.builder(OPTION_DB_NAME)
        .argName("name")
        .desc(String.format("The name of the DB to which to install the HMDB chemicals (default: %s)", DEFAULT_DB_NAME))
        .hasArg()
        .longOpt("db-name")
    );
    builders.add(Option.builder("h")
        .argName("help")
        .desc("Prints this help message")
        .longOpt("help")
    );
    return builders;
  }

  public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();
  static {
    HELP_FORMATTER.setWidth(100);
  }

  /* HMDB files all have five digits from 1 through 61388 as of the initial writing of this class.  I've allowed for an
   * extra digit in case the next release of the DB exceeds 100k metabolites.  We also log rejected files just in case.
   */
  private static final Pattern HMDB_FILE_REGEX = Pattern.compile("^HMDB\\d{5,6}\\.xml$");

  /* Represent the HMDB paths as enums to constrain the universe of extracted features to a fixed set of paths.
   * Most of the features are not sub-tree dependent, so we can just separate them into TEXT and NODES paths (i.e. paths
   * that return a single string or paths that return nodes containing either a string or a sub-tree that needs to be
   * re-parsed together).  For sub-trees requiring dependent parsing, where one path doesn't do the trick, we use the
   * L1/L2 convention adopted in PubchemParser:
   *   Structure: <feature name>_<level>[_<sub-feature or structure>]_<type>
   * L1 extracts a sub-tree, while L2 extracts the features of the sub-tree so they can be linked together.
   *
   * Note also the description of features we're *not* currently extracting.  There are other data in the HMDB entries
   * that may be useful at some point, but in the interest of time are being ignored for now. */
  private enum HMDB_XPATH {
    HMDB_ID_TEXT("/metabolite/accession/text()"),
    // Names
    PRIMARY_NAME_TEXT("/metabolite/name/text()"),
    IUPAC_NAME_TEXT("/metabolite/iupac_name/text()"),
    SYNONYMS_NODES("/metabolite/synonyms/synonym"),
    // Structures
    INCHI_TEXT("/metabolite/inchi/text()"),
    SMILES_TEXT("/metabolite/smiles/text()"),
    // Ontology
    ONTOLOGY_STATUS_TEXT("/metabolite/ontology/status/text()"),
    ONTOLOGY_ORIGINS_NODES("/metabolite/ontology/origins/origin"),
    ONTOLOGY_FUNCTIONS_NODES("/metabolite/ontology/functions/function"),
    ONTOLOGY_APPLICATIONS_NODES("/metabolite/ontology/applications/application"),
    ONTOLOGY_LOCATIONS_NODES("/metabolite/ontology/cellular_locations/cellular_location"),
    // Physiological locality
    LOCATIONS_FLUID_NODES("/metabolite/biofluid_locations/biofluid"),
    LOCATIONS_TISSUE_NODES("/metabolite/tissue_locations/tissue"),
    // Metabolic pathways
    PATHWAY_NAME_NODES("/metabolite/pathways/pathway/name"),
    // Diseases
    DISEASE_NAME_NODES("/metabolite/diseases/disease/name"),
    // External IDs
    METLIN_ID_TEXT("/metabolite/metlin_id/text()"),
    PUBCHEM_ID_TEXT("/metabolite/pubchem_compound_id/text()"),
    CHEBI_ID_TEXT("/metabolite/chebi_id/text()"),
    // Proteins
    PROTEIN_L1_NODES("/metabolite/protein_associations/protein"),
    PROTEIN_L2_NAME_TEXT("/protein/name/text()"),
    PROTEIN_L2_UNIPROT_ID_TEXT("/protein/uniprot_id/text()"),
    PROTEIN_L2_GENE_NAME_TEXT("/protein/gene_name/text()"),

    /* Features we're not extracting right now:
     * * Normal/abnormal concentrations in different fluids/tissues (too many different kinds of expression/units)
     * * Experimentally derived and predicted properties (many of the latter come from Chemaxon anyway)
     * * "specdb" ids, which represent NMR/MS2 data out there, not sure how useful this is right now
     * * Pathway details and ids, which hopefully are already captured via Metacyc
     * * Literature references, which we'd only inspect manually at present and we can always return to the source
     */
    ;

    // The XPath expression for this feature; fixed at construction.
    private final String path;

    HMDB_XPATH(String path) {
      this.path = path;
    }

    /**
     * @return The XPath expression string for this feature.
     */
    public String getPath() {
      return path;
    }

    // We rely on Jaxen because we've experienced performance problems using the built-in/xerces XPath implementation.
    /**
     * Compile this feature's path into a reusable Jaxen XPath object.
     * @return A compiled XPath that can be evaluated against DOM documents.
     * @throws JaxenException If the path expression cannot be compiled.
     */
    DOMXPath compile() throws JaxenException {
      return new DOMXPath(this.getPath());
    }
  }

  /**
   * Command line entry point: parses options, connects to the DB, and loads all HMDB XML files found in the
   * specified input directory.
   * @param args Command line arguments; see {@link #OPTION_BUILDERS} for the accepted options.
   * @throws Exception On any unrecoverable parsing or DB error.
   */
  public static void main(String[] args) throws Exception {
    // Parse the command line options
    Options opts = new Options();
    for (Option.Builder b : OPTION_BUILDERS) {
      opts.addOption(b.build());
    }

    CommandLine cl;
    try {
      CommandLineParser cliParser = new DefaultParser();
      cl = cliParser.parse(opts, args);
    } catch (ParseException e) {
      System.err.format("Argument parsing failed: %s\n", e.getMessage());
      // Report usage for this class (not PubchemParser, from which this block was originally copied).
      HELP_FORMATTER.printHelp(HMDBParser.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      System.exit(1);
      return; // Not reached at runtime; lets the compiler see that `cl` is always assigned below.
    }

    if (cl.hasOption("help")) {
      HELP_FORMATTER.printHelp(HMDBParser.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      return;
    }

    File inputDir = new File(cl.getOptionValue(OPTION_INPUT_DIRECTORY));
    if (!inputDir.isDirectory()) {
      System.err.format("Input directory at %s is not a directory\n", inputDir.getAbsolutePath());
      System.exit(1);
    }

    String dbName = cl.getOptionValue(OPTION_DB_NAME, DEFAULT_DB_NAME);
    String dbHost = cl.getOptionValue(OPTION_DB_HOST, DEFAULT_DB_HOST);

    // Reject a malformed port with a readable message instead of letting a NumberFormatException escape.
    String dbPortArg = cl.getOptionValue(OPTION_DB_PORT, DEFAULT_DB_PORT);
    Integer dbPort;
    try {
      dbPort = Integer.valueOf(dbPortArg);
    } catch (NumberFormatException e) {
      System.err.format("DB port must be an integer, but got: %s\n", dbPortArg);
      System.exit(1);
      return; // Not reached at runtime; lets the compiler see that `dbPort` is always assigned below.
    }

    LOGGER.info("Connecting to %s:%d/%s", dbHost, dbPort, dbName);
    MongoDB db = new MongoDB(dbHost, dbPort, dbName);
    HMDBParser parser = Factory.makeParser(db);

    LOGGER.info("Starting parser");
    parser.run(inputDir);
    LOGGER.info("Done");
  }

  // Compiled XPath expressions, keyed by the feature they extract.  Populated once in init().
  private final Map<HMDB_XPATH, XPath> xpaths = new HashMap<>();
  private final MongoDB db;
  // This is required for extracting locality-dependent features of sub-trees using XPath.
  private DocumentBuilder documentBuilder;

  /**
   * Create an uninitialized parser.  Callers should use {@link Factory#makeParser(MongoDB)}, which also calls
   * {@link #init()}.
   * @param db The DB into which extracted chemicals will be written.
   */
  protected HMDBParser(MongoDB db) {
    this.db = db;
  }

  /**
   * Compile all XPath expressions and create the XML document builder.  Must be called before {@link #run(File)} or
   * {@link #extractChemicalFromXMLDocument(Document)}.
   * @throws JaxenException If any of the XPath expressions fails to compile.
   * @throws ParserConfigurationException If a document builder cannot be created.
   */
  protected void init() throws JaxenException, ParserConfigurationException {
    for (HMDB_XPATH xpath : HMDB_XPATH.values()) {
      xpaths.put(xpath, xpath.compile());
    }

    // This bit pilfered from PubchemParser.java.
    // TODO: next time we use this, put it in a common super class (do it once, do it again, then do it right!).
    // NOTE(review): the factory is used with default settings; if HMDB files could ever come from an untrusted
    // source, consider disabling DTDs/external entities here to guard against XXE.
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    documentBuilder = factory.newDocumentBuilder();
  }


  // TODO: add some constraints on HMDB_XPATHs that allow us to programmatically check they're being applied correctly.
  /**
   * Get the text contents contained in a list of nodes.  Used for multi-valued fields that are siblings in the tree.
   * @param n A list of nodes whose text content should be extracted.
   * @return The text for each node.
   */
  private static List<String> extractNodesText(List<Node> n) {
    return n.stream().map(Node::getTextContent).collect(Collectors.toList());
  }

  /**
   * Fetch all nodes that match a path in the specified document.
   * @param xpath The path to use as a query.
   * @param doc The document to query.
   * @return The nodes that live at the specified path.
   * @throws JaxenException
   */
  @SuppressWarnings("unchecked") // Jaxen returns a raw List; our paths are evaluated over DOM documents only.
  private List<Node> getNodes(HMDB_XPATH xpath, Document doc) throws JaxenException {
    return (List<Node>) xpaths.get(xpath).selectNodes(doc); // No check, but guaranteed to return List<Node>.
  }

  /**
   * Extract the textual content for a set of sibling nodes appearing at some path in the specified document.
   * @param xpath The path to use as a query.
   * @param doc The document to query.
   * @return The textual content of the nodes that live at the specified path.
   * @throws JaxenException
   */
  private List<String> getTextFromNodes(HMDB_XPATH xpath, Document doc) throws JaxenException {
    return extractNodesText(getNodes(xpath, doc));
  }

  /**
   * Extract the string value found at a single-valued path in the specified document.
   * @param xpath The path to use as a query.
   * @param doc The document to query.
   * @return The string value at the path; presumably empty when nothing matches (callers also guard against null).
   * @throws JaxenException
   */
  private String getText(HMDB_XPATH xpath, Document doc) throws JaxenException {
    return xpaths.get(xpath).stringValueOf(doc);
  }

  /**
   * Extract (name, uniprot id, gene name) triples for every protein associated with the metabolite in a document.
   *
   * Note that this detaches the protein nodes from `doc` as a side effect (each node is adopted into its own
   * document), so it must run after any other queries that depend on those nodes.
   * @param doc A parsed HMDB XML doc.
   * @return One triple of name, uniprot id, and gene name per protein node, in document order.
   * @throws JaxenException
   */
  private List<Triple<String, String, String>> extractProteinAttributes(Document doc) throws JaxenException {
    List<Node> proteins = getNodes(HMDB_XPATH.PROTEIN_L1_NODES, doc);
    // Simple triples of name, uniprot id, gene name.
    List<Triple<String, String, String>> proteinAttributes = new ArrayList<>(proteins.size());

    for (Node n : proteins) {
      /* In order to run XPath on a sub-document, we have to extract the relevant nodes into their own document object.
       * If we try to run evaluate on `n` instead of this new document, we'll get matching paths for the original
       * document `doc` but not for the nodes we're looking at right now.  Very weird. */
      Document proteinDoc = documentBuilder.newDocument();
      proteinDoc.adoptNode(n);
      proteinDoc.appendChild(n);
      String name = getText(HMDB_XPATH.PROTEIN_L2_NAME_TEXT, proteinDoc);
      String uniprotId = getText(HMDB_XPATH.PROTEIN_L2_UNIPROT_ID_TEXT, proteinDoc);
      String geneName = getText(HMDB_XPATH.PROTEIN_L2_GENE_NAME_TEXT, proteinDoc);
      proteinAttributes.add(Triple.of(name, uniprotId, geneName));
    }
    return proteinAttributes;
  }

  /**
   * Convert an HMDB XML document into a Chemical object.  Expects one chemical per document.
   * @param doc A parsed HMDB XML doc.
   * @return The corresponding chemical to store in the DB, or null if the document has no InChI.
   * @throws JaxenException
   * @throws JSONException
   */
  protected Chemical extractChemicalFromXMLDocument(Document doc) throws JaxenException, JSONException {
    String hmdbId = getText(HMDB_XPATH.HMDB_ID_TEXT, doc);

    // Require an InChI if we're going to consume this molecule.  Check first so we don't extract features in vain.
    String inchi = getText(HMDB_XPATH.INCHI_TEXT, doc);
    if (inchi == null || inchi.isEmpty()) {
      LOGGER.warn("No InChI found for HMDB chemical %s, aborting", hmdbId);
      return null;
    }
    String smiles = getText(HMDB_XPATH.SMILES_TEXT, doc);

    String primaryName = getText(HMDB_XPATH.PRIMARY_NAME_TEXT, doc);
    String iupacName = getText(HMDB_XPATH.IUPAC_NAME_TEXT, doc);
    List<String> synonyms = getTextFromNodes(HMDB_XPATH.SYNONYMS_NODES, doc);

    String ontologyStatus = getText(HMDB_XPATH.ONTOLOGY_STATUS_TEXT, doc);
    List<String> ontologyOrigins = getTextFromNodes(HMDB_XPATH.ONTOLOGY_ORIGINS_NODES, doc);
    List<String> ontologyFunctions = getTextFromNodes(HMDB_XPATH.ONTOLOGY_FUNCTIONS_NODES, doc);
    List<String> ontologyApplications = getTextFromNodes(HMDB_XPATH.ONTOLOGY_APPLICATIONS_NODES, doc);
    List<String> ontologyLocations = getTextFromNodes(HMDB_XPATH.ONTOLOGY_LOCATIONS_NODES, doc);

    List<String> locationFluids = getTextFromNodes(HMDB_XPATH.LOCATIONS_FLUID_NODES, doc);
    List<String> locationTissues = getTextFromNodes(HMDB_XPATH.LOCATIONS_TISSUE_NODES, doc);

    List<String> pathwayNames = getTextFromNodes(HMDB_XPATH.PATHWAY_NAME_NODES, doc);

    List<String> diseaseNames = getTextFromNodes(HMDB_XPATH.DISEASE_NAME_NODES, doc);

    String metlinId = getText(HMDB_XPATH.METLIN_ID_TEXT, doc);
    String pubchemId = getText(HMDB_XPATH.PUBCHEM_ID_TEXT, doc);
    String chebiId = getText(HMDB_XPATH.CHEBI_ID_TEXT, doc);

    // Do this last: it removes the protein nodes from `doc`.
    List<Triple<String, String, String>> proteinAttributes = extractProteinAttributes(doc);

    // Assumption: when we reach this point there will always be an InChI.
    Chemical chem = new Chemical(inchi);
    chem.setSmiles(smiles);

    chem.setCanon(primaryName);

    /* One malformed external id shouldn't abort the load of an entire directory, so tolerate surrounding whitespace
     * and skip (with a warning) ids that aren't numeric. */
    String trimmedPubchemId = pubchemId == null ? "" : pubchemId.trim();
    if (!trimmedPubchemId.isEmpty()) {
      try {
        chem.setPubchem(Long.valueOf(trimmedPubchemId));
      } catch (NumberFormatException e) {
        LOGGER.warn("Ignoring non-numeric PubChem id '%s' for HMDB chemical %s", pubchemId, hmdbId);
      }
    }

    synonyms.forEach(chem::addSynonym);
    // Only record the IUPAC name when the entry actually has one, so we never add an empty synonym.
    if (iupacName != null && !iupacName.isEmpty()) {
      chem.addSynonym(iupacName); // TODO: is there a better place for this?
    }

    List<JSONObject> proteinsJson = proteinAttributes.stream()
        .map(t -> new JSONObject()
            .put("name", t.getLeft())
            .put("uniprot_id", t.getMiddle())
            .put("gene_name", t.getRight()))
        .collect(Collectors.toList());

    JSONObject meta = new JSONObject()
        .put("hmdb_id", hmdbId)
        .put("ontology", new JSONObject()
            .put("status", ontologyStatus)
            .put("origins", new JSONArray(ontologyOrigins))
            .put("functions", new JSONArray(ontologyFunctions))
            .put("applications", new JSONArray(ontologyApplications))
            .put("locations", new JSONArray(ontologyLocations))
        )
        .put("location", new JSONObject()
            .put("fluid", new JSONArray(locationFluids))
            .put("tissue", new JSONArray(locationTissues))
        )
        .put("pathway_names", new JSONArray(pathwayNames))
        .put("disease_names", new JSONArray(diseaseNames))
        .put("metlin_id", metlinId)
        .put("chebi_id", chebiId)
        .put("proteins", new JSONArray(proteinsJson));

    chem.putRef(Chemical.REFS.HMDB, meta);

    return chem;
  }

  /**
   * Find the files in a directory whose names look like HMDB XML files, logging any that don't.
   * This search is not recursive.
   * @param dir The directory to scan.
   * @return The matching files, sorted by file name.
   * @throws IOException If the contents of the directory cannot be listed.
   */
  protected SortedSet<File> findHMDBFilesInDirectory(File dir) throws IOException {
    // File.listFiles() signals an I/O error or a non-directory by returning null rather than throwing.
    File[] candidates = dir.listFiles();
    if (candidates == null) {
      throw new IOException(String.format("Unable to list contents of directory %s", dir.getAbsolutePath()));
    }

    // Sort for consistency + sanity.
    SortedSet<File> results = new TreeSet<>((a, b) -> a.getName().compareTo(b.getName()));
    for (File file : candidates) { // Do our own filtering so we can log rejects, of which we expect very few.
      if (HMDB_FILE_REGEX.matcher(file.getName()).matches()) {
        results.add(file);
      } else {
        LOGGER.warn("Found non-conforming HMDB file in directory %s: %s", dir.getAbsolutePath(), file.getName());
      }
    }
    return results;
  }

  /**
   * Extract all chemicals from HMDB XML files that live in the specified directory and save them in the DB.
   * Note that this search is not recursive: documents in sub-directories will be ignored.
   * @param inputDir The directory to scan for HMDB XML files.
   * @throws IOException If the directory or one of its files cannot be read.
   * @throws IllegalArgumentException If the input directory is invalid, or if a file cannot be parsed as XML or its
   *                                  features cannot be extracted.
   */
  public void run(File inputDir) throws IOException, IllegalArgumentException {
    if (inputDir == null || !inputDir.isDirectory()) {
      String msg = String.format("Unable to read input directory at %s",
          inputDir == null ? "null" : inputDir.getAbsolutePath());
      LOGGER.error(msg);
      // A bad argument, as documented; IllegalArgumentException is still a RuntimeException for existing callers.
      throw new IllegalArgumentException(msg);
    }

    SortedSet<File> files = findHMDBFilesInDirectory(inputDir);
    LOGGER.info("Found %d HMDB XML files in directory %s", files.size(), inputDir.getAbsolutePath());

    // Count what we actually store, since entries without an InChI are skipped.
    int loadedCount = 0;
    for (File file : files) {
      LOGGER.debug("Processing HMDB XML file %s", file.getAbsolutePath());

      /* Promote our XML-specific exceptions to generic IllegalArgumentExceptions to reduce error handling surface
       * area for the caller. */
      Document d;
      try {
        d = documentBuilder.parse(file);
      } catch (SAXException e) {
        String msg = String.format("Unable to parse XML file at %s: %s", file.getAbsolutePath(), e.getMessage());
        throw new IllegalArgumentException(msg, e);
      }

      /* Jaxen doesn't throw exceptions if it can't find a path, so a JaxenException here is completely unexpected.
       * It might mean corrupted XML or some unrecoverable XPath problem that we don't expect.  In any case, promote
       * the exception to the caller as it's unclear how we could deal with such an error here. */
      Chemical chem;
      try {
        chem = extractChemicalFromXMLDocument(d);
      } catch (JaxenException e) {
        String msg = String.format("Unable to extract features from XML file at %s: %s",
            file.getAbsolutePath(), e.getMessage());
        throw new IllegalArgumentException(msg, e);
      }

      // Not all HMDB entries contain an InChI; extraction returns null for those, and we skip them.
      if (chem == null) {
        LOGGER.warn("Unable to create chemical from file %s", file.getAbsolutePath());
        continue;
      }

      // submitToActChemicalDB creates or merges as necessary.
      Long id = db.getNextAvailableChemicalDBid();
      db.submitToActChemicalDB(chem, id);
      loadedCount++;
      LOGGER.debug("Submitted chemical %d to the DB", id);
    }
    LOGGER.info("Loaded %d HMDB chemicals into DB", loadedCount);
  }

  /**
   * Creates fully initialized {@link HMDBParser} instances.
   */
  public static class Factory {
    /**
     * Create a parser and initialize its XPath expressions and XML document builder.
     * @param db The DB into which extracted chemicals will be written.
     * @return A parser that is ready to run.
     * @throws RuntimeException If initialization fails, which indicates a programming error.
     */
    public static HMDBParser makeParser(MongoDB db) {
      HMDBParser parser = new HMDBParser(db);
      // Promote XML-specific exceptions from parser initialization to runtime exceptions, as they are definite bugs.
      try {
        parser.init();
      } catch (JaxenException e) {
        LOGGER.error("BUG: caught JaxenException on initialization, which means programmer error: %s",
            e.getMessage());
        throw new RuntimeException(e);
      } catch (ParserConfigurationException e) {
        LOGGER.error("BUG: caught ParserConfigurationException on initialization, which means programmer error: %s",
            e.getMessage());
        throw new RuntimeException(e);
      }

      return parser;
    }
  }
}