UmamiSearch.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.analysis.similarity;

import chemaxon.formats.MolExporter;
import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import chemaxon.license.LicenseManager;
import chemaxon.sss.SearchConstants;
import chemaxon.sss.search.MolSearch;
import chemaxon.sss.search.MolSearchOptions;
import chemaxon.struc.Molecule;
import chemaxon.util.MolHandler;
import com.act.utils.TSVParser;
import com.act.utils.TSVWriter;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * This class is based on Chris's substructure search from the biointerpretation branch.  The list of substructures
 * is hard-coded to find molecules that might contribute to umami-enhancement behavior.
 *
 * <p>Usage: call {@link #init()} once to build one substructure query per entry of {@link #UMAMI_INCHIS}, then call
 * {@link #matchVague(Molecule)} for each target molecule.  {@link #main(String[])} drives this over a TSV file that
 * has an {@code inchi} column, appending one score column per entry of {@link #UMAMI_HEADER_FIELDS}.
 *
 * <p>Instances are not safe for concurrent use: {@link #matchVague(Molecule)} re-targets the stored
 * {@link MolSearch} objects on every call.
 *
 * TODO: abstract the common parts of this and FattyAcidSearch into a shared base class.
 */
public class UmamiSearch {

  // These live as lists to keep them ordered; entry i here is reported under UMAMI_HEADER_FIELDS.get(i).
  public static final List<String> UMAMI_INCHIS = Collections.unmodifiableList(Arrays.asList(
      // These structures are from http://dx.doi.org/10.1016/B978-0-12-384947-2.00297-X.
      "InChI=1/C5H9NO4/c6-3(5(9)10)1-2-4(7)8/h3H,1-2,6H2,(H,7,8)(H,9,10)",
      "InChI=1/C5H9NO4.Na/c6-3(5(9)10)1-2-4(7)8;/h3H,1-2,6H2,(H,7,8)(H,9,10);/q;+1/p-1",
      "InChI=1/C5H9NO4.Na/c6-3(5(9)10)1-2-4(7)8;/h3H,1-2,6H2,(H,7,8)(H,9,10);/q;+1/p-2",
      "InChI=1/C5H9NO5/c6-4(5(10)11)2(7)1-3(8)9/h2,4,7H,1,6H2,(H,8,9)(H,10,11)",
      "InChI=1/C4H9NO5S.Na/c5-3(4(6)7)1-2-11(8,9)10;/h3H,1-2,5H2,(H,6,7)(H,8,9,10);/q;+1/p-1",
      "InChI=1/C4H7NO4.Na/c5-2(4(8)9)1-3(6)7;/h2H,1,5H2,(H,6,7)(H,8,9);/q;+1/p-1",
      "InChI=1/C5H8N2O4.Na/c6-4(5(9)10)2-1-3(8)7-11-2;/h2,4H,1,6H2,(H,7,8)(H,9,10);/q;+1/p-1",
      "InChI=1/C5H6N2O4.Na/c6-4(5(9)10)2-1-3(8)7-11-2;/h1,4H,6H2,(H,7,8)(H,9,10);/q;+1/p-1",
      "InChI=1S/C5H9NO4/c6-3(5(9)10)1-2-4(7)8/h3H,1-2,6H2,(H,7,8)(H,9,10)",
      "InChI=1S/C5H9NO4.Na/c6-3(5(9)10)1-2-4(7)8;/h3H,1-2,6H2,(H,7,8)(H,9,10);/q;+1/p-1/t3-;/m0./s1",
      // NOTE(review): this entry is byte-identical to the previous (sodium) one although its header field is
      // "monopotassium_glutamate"; presumably the potassium salt was intended -- confirm before changing.  Since
      // the per-instance maps are keyed by InChI, the two entries currently share one query and one score.
      "InChI=1S/C5H9NO4.Na/c6-3(5(9)10)1-2-4(7)8;/h3H,1-2,6H2,(H,7,8)(H,9,10);/q;+1/p-1/t3-;/m0./s1",
      "InChI=1S/2C5H9NO4.Ca/c2*6-3(5(9)10)1-2-4(7)8;/h2*3H,1-2,6H2,(H,7,8)(H,9,10);/q;;+2/p-2/t2*3-;/m00./s1",
      "InChI=1S/C5H9NO4.H3N/c6-3(5(9)10)1-2-4(7)8;/h3H,1-2,6H2,(H,7,8)(H,9,10);1H3",
      "InChI=1S/2C5H9NO4.Mg/c2*6-3(5(9)10)1-2-4(7)8;/h2*3H,1-2,6H2,(H,7,8)(H,9,10);/q;;+2/p-2",
      "InChI=1S/C10H14N5O8P/c11-10-13-7-4(8(18)14-10)12-2-15(7)9-6(17)5(16)3(23-9)1-22-24(19,20)21/h2-3,5-6,9,16-17H,1H2,(H2,19,20,21)(H3,11,13,14,18)/t3-,5-,6-,9-/m1/s1",
      "InChI=1S/C10H14N5O8P.2Na/c11-10-13-7-4(8(18)14-10)12-2-15(7)9-6(17)5(16)3(23-9)1-22-24(19,20)21;;/h2-3,5-6,9,16-17H,1H2,(H2,19,20,21)(H3,11,13,14,18);;/q;2*+1/p-2/t3-,5-,6-,9-;;/m1../s1",
      "InChI=1S/C10H14N5O8P.2K/c11-10-13-7-4(8(18)14-10)12-2-15(7)9-6(17)5(16)3(23-9)1-22-24(19,20)21;;/h2-3,5-6,9,16-17H,1H2,(H2,19,20,21)(H3,11,13,14,18);;/q;2*+1/p-2/t3-,5-,6-,9-;;/m1../s1",
      "InChI=1S/C10H14N5O8P.Ca/c11-10-13-7-4(8(18)14-10)12-2-15(7)9-6(17)5(16)3(23-9)1-22-24(19,20)21;/h2-3,5-6,9,16-17H,1H2,(H2,19,20,21)(H3,11,13,14,18);/q;+2/p-2/t3-,5-,6-,9-;/m1./s1",
      "InChI=1S/C10H13N4O8P/c15-6-4(1-21-23(18,19)20)22-10(7(6)16)14-3-13-5-8(14)11-2-12-9(5)17/h2-4,6-7,10,15-16H,1H2,(H,11,12,17)(H2,18,19,20)/t4-,6-,7-,10-/m1/s1",
      "InChI=1S/C10H13N4O8P.2Na/c15-6-4(1-21-23(18,19)20)22-10(7(6)16)14-3-13-5-8(14)11-2-12-9(5)17;;/h2-4,6-7,10,15-16H,1H2,(H,11,12,17)(H2,18,19,20);;/q;2*+1/p-2/t4-,6-,7-,10-;;/m1../s1",
      "InChI=1S/C10H13N4O8P.Ca/c15-6-4(1-21-23(18,19)20)22-10(7(6)16)14-3-13-5-8(14)11-2-12-9(5)17;/h2-4,6-7,10,15-16H,1H2,(H,11,12,17)(H2,18,19,20);/q;+2/p-2/t4-,6-,7-,10-;/m1./s1",
      "InChI=1S/C4H8O3/c5-3-1-2-4(6)7/h5H,1-3H2,(H,6,7)",
      "InChI=1S/C5H10O3/c6-4-2-1-3-5(7)8/h6H,1-4H2,(H,7,8)",
      "InChI=1S/C4H10O2S/c5-3-1-2-4-7-6/h5,7H,1-4H2",
      "InChI=1S/C3H8O2S/c4-2-1-3-6-5/h4,6H,1-3H2"
  ));

  // Hack to fix InChI canonicalization messing with smiles.  Keys are entries of UMAMI_INCHIS; when a key is
  // present, init() builds the query from the mapped SMILES instead of exporting one from the imported molecule.
  public static final Map<String, String> UMAMI_OVERRIDE_SMILES = buildOverrideSmiles();

  // Names for each structure, the first eight of which are based on the paper captions.
  public static final List<String> UMAMI_HEADER_FIELDS = Collections.unmodifiableList(Arrays.asList(
      "example_a",
      "example_b",
      "example_c",
      "example_d",
      "example_e",
      "example_f",
      "example_g",
      "example_h",
      "glutamic_acid",
      "msg",
      "monopotassium_glutamate",
      "calcium_diglutamate",
      "monoammonium_glutamate",
      "magnesium_diglutamate",
      "guanosine_monophosphate",
      "disodium_guanylate",
      "dipotassium_guanylate",
      "calcium_guanylate",
      "inosinic_acid",
      "disodium_inosinate",
      "calcium_inosinate",
      "chris_example_1",
      "chris_example_2",
      "chris_example_3",
      "chris_example_4"
  ));

  static {
    // main() walks the two lists in lock step, so fail at class-load time if they ever drift apart.
    if (UMAMI_INCHIS.size() != UMAMI_HEADER_FIELDS.size()) {
      throw new IllegalStateException(String.format(
          "UMAMI_INCHIS has %d entries but UMAMI_HEADER_FIELDS has %d",
          UMAMI_INCHIS.size(), UMAMI_HEADER_FIELDS.size()));
    }
  }

  // From https://docs.chemaxon.com/display/jchembase/Bond+specific+search+options.
  public static final MolSearchOptions SEARCH_OPTIONS = new MolSearchOptions(SearchConstants.SUBSTRUCTURE);
  static {
    SEARCH_OPTIONS.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL4);
  }

  // Largest fragment of each imported InChI; its atom count is the denominator of the match score.
  private final Map<String, Molecule> umamiMolecules = new HashMap<>(UMAMI_INCHIS.size());
  // SMILES used to build each query, in UMAMI_INCHIS order.  Written by init(); not read elsewhere in this class.
  private final List<String> umamiSmiles = new ArrayList<String>();
  // One pre-configured searcher per distinct InChI; matchVague() re-targets these on every call.
  private final Map<String, MolSearch> umamiSearches = new HashMap<>(UMAMI_INCHIS.size());

  /**
   * Builds the read-only InChI to SMILES override table.
   *
   * @return an unmodifiable map from InChI to the SMILES string that should be used as its query
   */
  private static Map<String, String> buildOverrideSmiles() {
    Map<String, String> overrides = new HashMap<>();
    overrides.put("InChI=1/C5H8N2O4.Na/c6-4(5(9)10)2-1-3(8)7-11-2;/h2,4H,1,6H2,(H,7,8)(H,9,10);/q;+1/p-1", "NC(C1CC(=O)NO1)C([O-])=O");
    overrides.put("InChI=1/C5H6N2O4.Na/c6-4(5(9)10)2-1-3(8)7-11-2;/h1,4H,6H2,(H,7,8)(H,9,10);/q;+1/p-1", "NC(C([O-])=O)C1=CC(O)=NO1");
    overrides.put("InChI=1S/C4H8O3/c5-3-1-2-4(6)7/h5H,1-3H2,(H,6,7)", "OCCCC(=O)[OH]");
    overrides.put("InChI=1S/C5H10O3/c6-4-2-1-3-5(7)8/h6H,1-4H2,(H,7,8)", "OCCCCC(=O)[OH]");
    overrides.put("InChI=1S/C4H10O2S/c5-3-1-2-4-7-6/h5,7H,1-4H2", "OCCCCS(=O)");
    overrides.put("InChI=1S/C3H8O2S/c4-2-1-3-6-5/h4,6H,1-3H2", "OCCCS(=O)");
    return Collections.unmodifiableMap(overrides);
  }

  /**
   * Returns the fragment with the most atoms.  Ties go to the fragment that appears first in the array.
   *
   * @param molecules candidate fragments; may be null or empty
   * @return the fragment with the highest atom count, or null if {@code molecules} is null or empty
   */
  public Molecule findLargestFragment(Molecule[] molecules) {
    if (molecules == null) {
      return null;
    }
    Molecule largest = null;
    for (Molecule m : molecules) {
      if (largest == null || largest.getAtomCount() < m.getAtomCount()) {
        largest = m;
      }
    }
    return largest;
  }

  /**
   * Imports every InChI in {@link #UMAMI_INCHIS}, keeps its largest fragment, and prepares a substructure
   * searcher for it using {@link #SEARCH_OPTIONS}.  Must be called before {@link #matchVague(Molecule)}.
   * Calling it again rebuilds all state from scratch.
   *
   * @throws IOException if a molecule cannot be imported or exported
   * @throws MolFormatException if an InChI or SMILES string cannot be parsed
   */
  public void init() throws IOException, MolFormatException {
    // Start from a clean slate so repeated calls do not accumulate duplicate SMILES entries.
    umamiMolecules.clear();
    umamiSmiles.clear();
    umamiSearches.clear();

    for (String inchi : UMAMI_INCHIS) {
      Molecule mol = MolImporter.importMol(inchi);
      // Salts import as several disconnected fragments; only the largest one is used for matching.
      Molecule largestFragment = findLargestFragment(mol.convertToFrags());
      umamiMolecules.put(inchi, largestFragment);

      String overrideSmiles = UMAMI_OVERRIDE_SMILES.get(inchi);
      final String smiles;
      if (overrideSmiles != null) {
        System.err.format("Using override smiles for %s\n", inchi);
        smiles = overrideSmiles;
      } else {
        smiles = MolExporter.exportToFormat(largestFragment, "smiles");
      }
      umamiSmiles.add(smiles);

      MolSearch ms = new MolSearch();
      ms.setSearchOptions(SEARCH_OPTIONS);
      // NOTE(review): the `true` flag presumably asks MolHandler to parse in query mode -- confirm in ChemAxon docs.
      ms.setQuery(new MolHandler(smiles, true).getMolecule());
      umamiSearches.put(inchi, ms);
    }
  }

  /**
   * Runs every prepared substructure query against {@code target} and scores each one.
   *
   * <p>The score for a query is the length of its longest hit divided by the atom count of the largest fragment
   * imported from the query's InChI, or 0 when there are no hits.  For InChIs that have an override SMILES the
   * query is built from the override while the denominator still comes from the InChI-derived fragment.
   *
   * <p>This re-targets the stored searchers, so it must not be called concurrently on one instance.
   *
   * @param target the molecule to search within
   * @return a map from query InChI to score; empty if {@link #init()} has not been called
   * @throws Exception if the underlying search fails
   */
  public Map<String, Double> matchVague(Molecule target) throws Exception {
    Map<String, Double> results = new HashMap<>();
    for (Map.Entry<String, MolSearch> entry : umamiSearches.entrySet()) {
      String inchi = entry.getKey();
      MolSearch searcher = entry.getValue();
      searcher.setTarget(target);

      int[][] hits = searcher.findAll();
      int longestHit = 0;
      if (hits != null) {
        for (int[] hit : hits) {
          if (hit.length > longestHit) {
            longestHit = hit.length;
          }
        }
      }

      int queryAtomCount = umamiMolecules.get(inchi).getAtomCount();
      // The cast forces floating-point division so partial matches yield a fraction rather than 0.
      results.put(inchi, (double) longestHit / queryAtomCount);
    }

    return results;
  }

  /**
   * Command-line entry point: scores every row of a TSV file against the umami queries.
   *
   * <p>Arguments: {@code args[0]} is the ChemAxon license file, {@code args[1]} the input TSV (which must have an
   * {@code inchi} column), and {@code args[2]} the output TSV.  Rows whose InChI is missing or cannot be imported
   * are reported on stderr and left out of the output.
   *
   * @param args license file, input TSV path, output TSV path
   * @throws Exception if parsing, searching, or writing fails; the failing input row number is printed first
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 3) {
      System.err.println("Usage: UmamiSearch <chemaxon-license-file> <input.tsv> <output.tsv>");
      System.exit(1);
    }
    LicenseManager.setLicenseFile(args[0]);

    TSVParser parser = new TSVParser();
    parser.parse(new File(args[1]));
    // Copy the header so that appending the score columns does not mutate the parser's own list.
    List<String> header = new ArrayList<>(parser.getHeader());
    header.addAll(UMAMI_HEADER_FIELDS);

    TSVWriter<String, String> writer = new TSVWriter<>(header);
    writer.open(new File(args[2]));

    try {
      UmamiSearch matcher = new UmamiSearch();
      matcher.init();
      int rowNum = 0;
      for (Map<String, String> row : parser.getResults()) {
        rowNum++;
        try {
          String inchi = row.get("inchi");
          if (inchi == null) {
            System.err.format("Skipping molecule %d due to missing inchi value\n", rowNum);
            continue;
          }

          Molecule target = null;
          try {
            target = MolImporter.importMol(inchi);
          } catch (Exception e) {
            System.err.format("Skipping molecule %d due to exception: %s\n", rowNum, e.getMessage());
            continue;
          }

          Map<String, Double> results = matcher.matchVague(target);
          for (int i = 0; i < UMAMI_INCHIS.size(); i++) {
            row.put(UMAMI_HEADER_FIELDS.get(i), String.format("%.3f", results.get(UMAMI_INCHIS.get(i))));
          }

          writer.append(row);
          // Flush after every row so partial output survives a failure on a later row.
          writer.flush();
        } catch (Exception e) {
          System.err.format("Exception on input line %d\n", rowNum);
          throw e;
        }
      }
    } finally {
      writer.close();
    }
  }
}