/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program. If not, see <http://www.gnu.org/licenses/>.  *
*                                                                        *
*************************************************************************/

package com.act.analysis.similarity;

import chemaxon.formats.MolExporter;
import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import chemaxon.license.LicenseManager;
import chemaxon.sss.SearchConstants;
import chemaxon.sss.search.MolSearch;
import chemaxon.sss.search.MolSearchOptions;
import chemaxon.struc.Molecule;
import chemaxon.util.MolHandler;
import com.act.utils.TSVWriter;
import com.act.utils.TSVParser;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * This class is based on Chris's substructure search from the biointerpretation branch. The list of substructures
 * is hard-coded to find molecules that might contribute to umami-enhancement behavior.
 *
 * <p>Typical use: construct an instance, call {@link #init()} once to build one substructure query per entry of
 * {@link #UMAMI_INCHIS}, then call {@link #matchVague(Molecule)} for every target molecule. {@link #main(String[])}
 * does exactly this for each row of a TSV file.
 *
 * <p>{@link #UMAMI_INCHIS} and {@link #UMAMI_HEADER_FIELDS} are parallel lists: the name at index {@code i}
 * labels the structure at index {@code i}, so the two must be kept at the same length and in the same order.
 *
 * TODO: abstract the common parts of this and FattyAcidSearch into a shared base class.
 */
public class UmamiSearch {
  // These live as lists to keep them ordered.
  public static final List<String> UMAMI_INCHIS = new ArrayList<>();
  static {
    // These structures are from http://dx.doi.org/10.1016/B978-0-12-384947-2.00297-X.
    UMAMI_INCHIS.add("InChI=1/C5H9NO4/c6-3(5(9)10)1-2-4(7)8/h3H,1-2,6H2,(H,7,8)(H,9,10)");
    UMAMI_INCHIS.add("InChI=1/C5H9NO4.Na/c6-3(5(9)10)1-2-4(7)8;/h3H,1-2,6H2,(H,7,8)(H,9,10);/q;+1/p-1");
    UMAMI_INCHIS.add("InChI=1/C5H9NO4.Na/c6-3(5(9)10)1-2-4(7)8;/h3H,1-2,6H2,(H,7,8)(H,9,10);/q;+1/p-2");
    UMAMI_INCHIS.add("InChI=1/C5H9NO5/c6-4(5(10)11)2(7)1-3(8)9/h2,4,7H,1,6H2,(H,8,9)(H,10,11)");
    UMAMI_INCHIS.add("InChI=1/C4H9NO5S.Na/c5-3(4(6)7)1-2-11(8,9)10;/h3H,1-2,5H2,(H,6,7)(H,8,9,10);/q;+1/p-1");
    UMAMI_INCHIS.add("InChI=1/C4H7NO4.Na/c5-2(4(8)9)1-3(6)7;/h2H,1,5H2,(H,6,7)(H,8,9);/q;+1/p-1");
    UMAMI_INCHIS.add("InChI=1/C5H8N2O4.Na/c6-4(5(9)10)2-1-3(8)7-11-2;/h2,4H,1,6H2,(H,7,8)(H,9,10);/q;+1/p-1");
    UMAMI_INCHIS.add("InChI=1/C5H6N2O4.Na/c6-4(5(9)10)2-1-3(8)7-11-2;/h1,4H,6H2,(H,7,8)(H,9,10);/q;+1/p-1");

    UMAMI_INCHIS.add("InChI=1S/C5H9NO4/c6-3(5(9)10)1-2-4(7)8/h3H,1-2,6H2,(H,7,8)(H,9,10)");
    UMAMI_INCHIS.add("InChI=1S/C5H9NO4.Na/c6-3(5(9)10)1-2-4(7)8;/h3H,1-2,6H2,(H,7,8)(H,9,10);/q;+1/p-1/t3-;/m0./s1");
    // Potassium salt.  This entry used to be a byte-for-byte copy of the sodium (msg) InChI above, which
    // contradicted its "monopotassium_glutamate" label and collapsed two entries into a single map key.
    // NOTE(review): InChI written by analogy with the sodium salt above; confirm against a reference database.
    UMAMI_INCHIS.add("InChI=1S/C5H9NO4.K/c6-3(5(9)10)1-2-4(7)8;/h3H,1-2,6H2,(H,7,8)(H,9,10);/q;+1/p-1/t3-;/m0./s1");
    UMAMI_INCHIS.add("InChI=1S/2C5H9NO4.Ca/c2*6-3(5(9)10)1-2-4(7)8;/h2*3H,1-2,6H2,(H,7,8)(H,9,10);/q;;+2/p-2/t2*3-;/m00./s1");
    UMAMI_INCHIS.add("InChI=1S/C5H9NO4.H3N/c6-3(5(9)10)1-2-4(7)8;/h3H,1-2,6H2,(H,7,8)(H,9,10);1H3");
    UMAMI_INCHIS.add("InChI=1S/2C5H9NO4.Mg/c2*6-3(5(9)10)1-2-4(7)8;/h2*3H,1-2,6H2,(H,7,8)(H,9,10);/q;;+2/p-2");

    UMAMI_INCHIS.add("InChI=1S/C10H14N5O8P/c11-10-13-7-4(8(18)14-10)12-2-15(7)9-6(17)5(16)3(23-9)1-22-24(19,20)21/h2-3,5-6,9,16-17H,1H2,(H2,19,20,21)(H3,11,13,14,18)/t3-,5-,6-,9-/m1/s1");
    UMAMI_INCHIS.add("InChI=1S/C10H14N5O8P.2Na/c11-10-13-7-4(8(18)14-10)12-2-15(7)9-6(17)5(16)3(23-9)1-22-24(19,20)21;;/h2-3,5-6,9,16-17H,1H2,(H2,19,20,21)(H3,11,13,14,18);;/q;2*+1/p-2/t3-,5-,6-,9-;;/m1../s1");
    UMAMI_INCHIS.add("InChI=1S/C10H14N5O8P.2K/c11-10-13-7-4(8(18)14-10)12-2-15(7)9-6(17)5(16)3(23-9)1-22-24(19,20)21;;/h2-3,5-6,9,16-17H,1H2,(H2,19,20,21)(H3,11,13,14,18);;/q;2*+1/p-2/t3-,5-,6-,9-;;/m1../s1");
    UMAMI_INCHIS.add("InChI=1S/C10H14N5O8P.Ca/c11-10-13-7-4(8(18)14-10)12-2-15(7)9-6(17)5(16)3(23-9)1-22-24(19,20)21;/h2-3,5-6,9,16-17H,1H2,(H2,19,20,21)(H3,11,13,14,18);/q;+2/p-2/t3-,5-,6-,9-;/m1./s1");

    UMAMI_INCHIS.add("InChI=1S/C10H13N4O8P/c15-6-4(1-21-23(18,19)20)22-10(7(6)16)14-3-13-5-8(14)11-2-12-9(5)17/h2-4,6-7,10,15-16H,1H2,(H,11,12,17)(H2,18,19,20)/t4-,6-,7-,10-/m1/s1");
    UMAMI_INCHIS.add("InChI=1S/C10H13N4O8P.2Na/c15-6-4(1-21-23(18,19)20)22-10(7(6)16)14-3-13-5-8(14)11-2-12-9(5)17;;/h2-4,6-7,10,15-16H,1H2,(H,11,12,17)(H2,18,19,20);;/q;2*+1/p-2/t4-,6-,7-,10-;;/m1../s1");
    UMAMI_INCHIS.add("InChI=1S/C10H13N4O8P.Ca/c15-6-4(1-21-23(18,19)20)22-10(7(6)16)14-3-13-5-8(14)11-2-12-9(5)17;/h2-4,6-7,10,15-16H,1H2,(H,11,12,17)(H2,18,19,20);/q;+2/p-2/t4-,6-,7-,10-;/m1./s1");

    UMAMI_INCHIS.add("InChI=1S/C4H8O3/c5-3-1-2-4(6)7/h5H,1-3H2,(H,6,7)");
    UMAMI_INCHIS.add("InChI=1S/C5H10O3/c6-4-2-1-3-5(7)8/h6H,1-4H2,(H,7,8)");
    UMAMI_INCHIS.add("InChI=1S/C4H10O2S/c5-3-1-2-4-7-6/h5,7H,1-4H2");
    UMAMI_INCHIS.add("InChI=1S/C3H8O2S/c4-2-1-3-6-5/h4,6H,1-3H2");
  }

  // Hack to fix InChI canonicalization messing with smiles.
  // Keys are entries of UMAMI_INCHIS; values are the SMILES used as the query instead of the exported one.
  public static final Map<String, String> UMAMI_OVERRIDE_SMILES = new HashMap<>();
  static {
    UMAMI_OVERRIDE_SMILES.put(
        "InChI=1/C5H8N2O4.Na/c6-4(5(9)10)2-1-3(8)7-11-2;/h2,4H,1,6H2,(H,7,8)(H,9,10);/q;+1/p-1",
        "NC(C1CC(=O)NO1)C([O-])=O");
    UMAMI_OVERRIDE_SMILES.put(
        "InChI=1/C5H6N2O4.Na/c6-4(5(9)10)2-1-3(8)7-11-2;/h1,4H,6H2,(H,7,8)(H,9,10);/q;+1/p-1",
        "NC(C([O-])=O)C1=CC(O)=NO1");
    UMAMI_OVERRIDE_SMILES.put("InChI=1S/C4H8O3/c5-3-1-2-4(6)7/h5H,1-3H2,(H,6,7)", "OCCCC(=O)[OH]");
    UMAMI_OVERRIDE_SMILES.put("InChI=1S/C5H10O3/c6-4-2-1-3-5(7)8/h6H,1-4H2,(H,7,8)", "OCCCCC(=O)[OH]");
    UMAMI_OVERRIDE_SMILES.put("InChI=1S/C4H10O2S/c5-3-1-2-4-7-6/h5,7H,1-4H2", "OCCCCS(=O)");
    UMAMI_OVERRIDE_SMILES.put("InChI=1S/C3H8O2S/c4-2-1-3-6-5/h4,6H,1-3H2", "OCCCS(=O)");
  }

  // Names for each structure, the first eight of which are based on the paper captions.
  // Index i here names the structure at index i of UMAMI_INCHIS; main() uses these as output column headers.
  public static final List<String> UMAMI_HEADER_FIELDS = new ArrayList<>();
  static {
    UMAMI_HEADER_FIELDS.add("example_a");
    UMAMI_HEADER_FIELDS.add("example_b");
    UMAMI_HEADER_FIELDS.add("example_c");
    UMAMI_HEADER_FIELDS.add("example_d");
    UMAMI_HEADER_FIELDS.add("example_e");
    UMAMI_HEADER_FIELDS.add("example_f");
    UMAMI_HEADER_FIELDS.add("example_g");
    UMAMI_HEADER_FIELDS.add("example_h");

    UMAMI_HEADER_FIELDS.add("glutamic_acid");
    UMAMI_HEADER_FIELDS.add("msg");
    UMAMI_HEADER_FIELDS.add("monopotassium_glutamate");
    UMAMI_HEADER_FIELDS.add("calcium_diglutamate");
    UMAMI_HEADER_FIELDS.add("monoammonium_glutamate");
    UMAMI_HEADER_FIELDS.add("magnesium_diglutamate");

    UMAMI_HEADER_FIELDS.add("guanosine_monophosphate");
    UMAMI_HEADER_FIELDS.add("disodium_guanylate");
    UMAMI_HEADER_FIELDS.add("dipotassium_guanylate");
    UMAMI_HEADER_FIELDS.add("calcium_guanylate");

    UMAMI_HEADER_FIELDS.add("inosinic_acid");
    UMAMI_HEADER_FIELDS.add("disodium_inosinate");
    UMAMI_HEADER_FIELDS.add("calcium_inosinate");

    UMAMI_HEADER_FIELDS.add("chris_example_1");
    UMAMI_HEADER_FIELDS.add("chris_example_2");
    UMAMI_HEADER_FIELDS.add("chris_example_3");
    UMAMI_HEADER_FIELDS.add("chris_example_4");
  }

  // From https://docs.chemaxon.com/display/jchembase/Bond+specific+search+options.
  // Substructure search with the vague bond level set to VAGUE_BOND_LEVEL4; see the page above for its meaning.
  public static final MolSearchOptions SEARCH_OPTIONS = new MolSearchOptions(SearchConstants.SUBSTRUCTURE);
  static {
    SEARCH_OPTIONS.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL4);
  }

  // InChI -> largest fragment of the molecule imported from that InChI (used as the score denominator).
  private final Map<String, Molecule> umamiMolecules = new HashMap<>(UMAMI_INCHIS.size());
  // SMILES actually used for each query, in UMAMI_INCHIS order.  Written by init(); not read in this class.
  private final List<String> umamiSmiles = new ArrayList<String>();
  // InChI -> prepared searcher whose query is that structure.
  private final Map<String, MolSearch> umamiSearches = new HashMap<>(UMAMI_INCHIS.size());

  /**
   * Picks the molecule with the highest atom count.
   *
   * @param molecules candidate fragments; may be null or empty
   * @return the element with the greatest {@code getAtomCount()} (the first such element on ties), or
   *         {@code null} when {@code molecules} is null or empty
   */
  public Molecule findLargestFragment(Molecule[] molecules) {
    if (molecules == null) {
      return null;
    }
    Molecule largest = null;
    for (Molecule m : molecules) {
      // Strict comparison keeps the earliest fragment when several share the maximum size.
      if (largest == null || largest.getAtomCount() < m.getAtomCount()) {
        largest = m;
      }
    }
    return largest;
  }

  /**
   * Builds one substructure searcher per entry of {@link #UMAMI_INCHIS}.
   *
   * <p>Each InChI is imported and split into fragments, and only the largest fragment is kept, so counter-ions
   * in salts do not take part in the search.  The query is created from a SMILES string: the hand-written one
   * in {@link #UMAMI_OVERRIDE_SMILES} when present, otherwise the SMILES exported from the largest fragment.
   *
   * @throws IOException if exporting a fragment to SMILES fails
   * @throws MolFormatException if an InChI or SMILES string cannot be parsed
   * @throws IllegalStateException if an imported structure yields no fragments
   */
  public void init() throws IOException, MolFormatException {
    for (String inchi : UMAMI_INCHIS) {
      Molecule mol = MolImporter.importMol(inchi);
      Molecule largestFragment = findLargestFragment(mol.convertToFrags());
      if (largestFragment == null) {
        // Fail here with context rather than with an unrelated NullPointerException further down.
        throw new IllegalStateException("No fragments found for umami structure " + inchi);
      }
      umamiMolecules.put(inchi, largestFragment);

      String smiles = UMAMI_OVERRIDE_SMILES.get(inchi);
      if (smiles != null) {
        System.err.format("Using override smiles for %s\n", inchi);
      } else {
        smiles = MolExporter.exportToFormat(largestFragment, "smiles");
      }
      umamiSmiles.add(smiles);

      MolSearch ms = new MolSearch();
      ms.setSearchOptions(SEARCH_OPTIONS);
      // NOTE(review): the boolean argument presumably selects query-mode parsing -- confirm against MolHandler docs.
      ms.setQuery(new MolHandler(smiles, true).getMolecule());
      umamiSearches.put(inchi, ms);
    }
  }

  /**
   * Scores a target molecule against every prepared umami substructure.
   *
   * <p>For each structure the score is the atom count of the longest hit returned by the search divided by
   * the atom count of that structure's largest fragment; it is {@code 0.0} when there are no hits.  The
   * searchers use {@link #SEARCH_OPTIONS}, applied in {@link #init()}.
   *
   * @param target the molecule to search in
   * @return a map from each InChI in {@link #UMAMI_INCHIS} to its score; empty if {@link #init()} has not run
   * @throws Exception if the underlying search fails
   */
  public Map<String, Double> matchVague(Molecule target) throws Exception {
    Map<String, Double> results = new HashMap<>();
    for (Map.Entry<String, MolSearch> entry : umamiSearches.entrySet()) {
      String inchi = entry.getKey();
      MolSearch searcher = entry.getValue();
      searcher.setTarget(target);

      int[][] hits = searcher.findAll();
      int longestHit = 0;
      if (hits != null) {
        for (int[] hit : hits) {
          if (hit.length > longestHit) {
            longestHit = hit.length;
          }
        }
      }

      int queryAtomCount = umamiMolecules.get(inchi).getAtomCount();
      results.put(inchi, (double) longestHit / (double) queryAtomCount);
    }
    return results;
  }

  /**
   * Command-line entry point: annotates every row of a TSV file with umami substructure scores.
   *
   * <p>Reads the input TSV, takes the molecule from each row's {@code inchi} column, and writes the row back
   * out with one extra column per entry of {@link #UMAMI_HEADER_FIELDS}, formatted with three decimals.  Rows
   * whose InChI cannot be imported are reported on stderr and left out of the output.
   *
   * @param args {@code args[0]}: ChemAxon license file; {@code args[1]}: input TSV; {@code args[2]}: output TSV
   * @throws IllegalArgumentException if fewer than three arguments are given
   * @throws Exception if parsing, searching or writing fails
   */
  public static void main(String[] args) throws Exception {
    if (args == null || args.length < 3) {
      throw new IllegalArgumentException(
          "Usage: UmamiSearch <chemaxon-license-file> <input-tsv> <output-tsv>");
    }
    LicenseManager.setLicenseFile(args[0]);

    TSVParser parser = new TSVParser();
    parser.parse(new File(args[1]));
    // Work on a copy so the parser's own header list is not modified (and need not be modifiable).
    List<String> header = new ArrayList<>(parser.getHeader());
    header.addAll(UMAMI_HEADER_FIELDS);
    TSVWriter<String, String> writer = new TSVWriter<>(header);
    writer.open(new File(args[2]));

    try {
      UmamiSearch matcher = new UmamiSearch();
      matcher.init();
      int rowNum = 0;
      for (Map<String, String> row : parser.getResults()) {
        rowNum++;
        try {
          String inchi = row.get("inchi");
          Molecule target = null;
          try {
            target = MolImporter.importMol(inchi);
          } catch (Exception e) {
            // Best effort: an unparseable molecule should not abort the whole run.
            System.err.format("Skipping molecule %d due to exception: %s\n", rowNum, e.getMessage());
            continue;
          }
          Map<String, Double> results = matcher.matchVague(target);
          for (int i = 0; i < UMAMI_INCHIS.size(); i++) {
            row.put(UMAMI_HEADER_FIELDS.get(i), String.format("%.3f", results.get(UMAMI_INCHIS.get(i))));
          }
          writer.append(row);
          // Flush per row so partial output survives a failure on a later row.
          writer.flush();
        } catch (Exception e) {
          System.err.format("Exception on input line %d\n", rowNum);
          throw e;
        }
      }
    } finally {
      writer.close();
    }
  }
}