/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.analysis.similarity;
import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import chemaxon.license.LicenseManager;
import chemaxon.sss.SearchConstants;
import chemaxon.sss.search.MolSearch;
import chemaxon.sss.search.MolSearchOptions;
import chemaxon.struc.Molecule;
import chemaxon.struc.MoleculeGraph;
import chemaxon.util.MolHandler;
import com.act.utils.TSVParser;
import com.act.utils.TSVWriter;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* This class is based on Chris's substructure search from com/act/biointerpretation/operators (see commit
* e7fc12d7d8017949d83c42aca276bcf1b76fa802). The list of substructures
* is hard-coded to find molecules that look like saccharides.
*
* TODO: abstract the common parts of this and FattyAcidSearch into a shared base class.
*/
public class SaccharideSearch {
  // These live as lists to keep them ordered.  They are wrapped as unmodifiable because they are shared public
  // constants.  Entry i of SACCHARIDE_SMARTS is reported under the column named by entry i of
  // SACCHARIDE_HEADER_FIELDS (see main), so the two lists must stay the same length and in the same order.
  public static final List<String> SACCHARIDE_SMARTS = Collections.unmodifiableList(Arrays.asList(
      "CO[C@H:4]1[C@H:3](O)[C@@H:2](O)[C@H:1](O)O[C@@H:5]1[CH2:6]O",
      "CO[C@@H:3]1[C@@H:2](O)[C@H:1](O)O[C@H:5]([CH2:6]O)[C@H:4]1O",
      "CO[CH2:6][C@H:5]1O[C@@H:1](O)[C@H:2](O)[C@@H:3](O)[C@@H:4]1O",
      "CO[C:1][C@@:2]1(O)O[C@H:5]([CH2:6]O)[C@@H:4](O)[C@@H:3]1O",
      "CO[CH2:6][C@H:5]1O[C@H:1](O)[C@H:2](O)[C@@H:3](O)[C@@H:4]1O",
      "CO[C@@H:2]1[C@@H:1](O)O[C@H:5]([CH2:6]O)[C@@H:4](O)[C@@H:3]1O",
      "CO[C@@H:3]1[C@H:2](O)[C@@H:1](O)O[C@H:5]([CH2:6]O)[C@H:4]1O",
      "CO[CH2:6][C@H:5]1O[C@H:1](O)[C@@H:2](O)[C@@H:3](O)[C@@H:4]1O",
      "CO[C@@H:3]1[C@@H:2](O)[C@H:1](O)O[C@H:5]([CH2:6]O)[C@@H:4]1O",
      "CO[C@@H:4]1[C@H:3](O)[C@@H:2](O)[C@H:1](O)O[C@@H:5]1[CH2:6]O",
      "CO[CH2:6][C@H:5]1O[C@@H:1](O)[C@H:2](O)[C@@H:3](O)[C@H:4]1O"
  ));

  // Names for each structure, the first eight of which are based on the paper captions.
  public static final List<String> SACCHARIDE_HEADER_FIELDS = Collections.unmodifiableList(Arrays.asList(
      "(1-->4)b-D-Glucopyranose",
      "(1-->3)b-D-Glucopyranose",
      "(1-->6)b-D-Glucopyranose",
      "(2-->1)b-D-Fructofuranose",
      "(1-->6)a-D-Glucopyranose",
      "(1-->2)a-D-Mannopyranose",
      "(1-->3)a-D-Mannopyranose",
      "(1-->6)a-D-Mannopyranose",
      "(1-->3)b-D-Galactopyranose",
      "(1-->4)b-D-Galactopyranose",
      "(1-->6)b-D-Galactopyranose"
  ));

  // Note: to relax these search criteria, see https://docs.chemaxon.com/display/jchembase/Bond+specific+search+options.
  // Maps each SMARTS string to the searcher configured with that string as its query; populated by init().
  private final Map<String, MolSearch> saccharideSearches = new HashMap<>(SACCHARIDE_SMARTS.size());

  /**
   * Builds one substructure searcher per entry of {@link #SACCHARIDE_SMARTS} and stores it keyed by its SMARTS
   * string.  Must be called before {@link #matchVague(Molecule)}; until then that method has no searchers to run
   * and returns an empty map.
   *
   * @throws IOException declared by the original signature; kept so existing callers still compile
   * @throws MolFormatException if a SMARTS string cannot be converted into a query molecule
   */
  public void init() throws IOException, MolFormatException {
    for (String smarts : SACCHARIDE_SMARTS) {
      MolSearchOptions searchOptions = new MolSearchOptions(SearchConstants.SUBSTRUCTURE);
      searchOptions.setStereoModel(SearchConstants.STEREO_MODEL_LOCAL);
      searchOptions.setStereoSearchType(SearchConstants.STEREO_EXACT);

      MolSearch ms = new MolSearch();
      ms.setSearchOptions(searchOptions);
      // NOTE(review): the `true` flag presumably asks MolHandler to parse the string in query mode -- confirm
      // against the MolHandler documentation.
      ms.setQuery(new MolHandler(smarts, true).getMolecule());
      saccharideSearches.put(smarts, ms);
    }
  }

  /**
   * Runs every configured saccharide search against {@code target} and reports, per SMARTS string, whether at
   * least one non-empty hit was found.  The searchers created by {@link #init()} are reused: each call sets
   * {@code target} as their current target.
   *
   * @param target the molecule to search for saccharide substructures
   * @return a map from SMARTS string to 1.0 when the search produced at least one non-empty hit, 0.0 otherwise
   * @throws Exception if the underlying search fails
   */
  public Map<String, Double> matchVague(Molecule target) throws Exception {
    Map<String, Double> results = new HashMap<>();
    for (Map.Entry<String, MolSearch> entry : saccharideSearches.entrySet()) {
      MolSearch searcher = entry.getValue();
      searcher.setTarget(target);
      int[][] hits = searcher.findAll();

      boolean foundHit = false;
      if (hits != null) {
        for (int[] hit : hits) {
          if (hit.length > 0) {
            foundHit = true;
            break;
          }
        }
      }
      // Normalize all non-zero values to 1.0, and all zero-ish values to 0.0.
      results.put(entry.getKey(), foundHit ? 1.0 : 0.0);
    }
    return results;
  }

  /** Returns true if any atom of {@code molecule} reports a chirality of {@code MoleculeGraph.PARITY_EITHER}. */
  private static boolean hasAmbiguousStereoChemistry(Molecule molecule) {
    for (int i = 0; i < molecule.getAtomCount(); i++) {
      if (molecule.getChirality(i) == MoleculeGraph.PARITY_EITHER) {
        return true;
      }
    }
    return false;
  }

  /**
   * Reads a TSV of molecules, runs the saccharide searches on the "inchi" column of each row, and writes the input
   * rows with one extra column per saccharide structure to an output TSV.  Rows whose InChI is missing or cannot
   * be imported, and rows whose molecule has ambiguous stereochemistry, are reported on stderr and left out of
   * the output.
   *
   * @param args {@code args[0]}: ChemAxon license file; {@code args[1]}: input TSV; {@code args[2]}: output TSV
   * @throws IllegalArgumentException if fewer than three arguments are supplied
   * @throws Exception on any parsing, searching, or writing failure
   */
  public static void main(String[] args) throws Exception {
    if (args == null || args.length < 3) {
      throw new IllegalArgumentException(
          "Usage: SaccharideSearch <chemaxon-license-file> <input-tsv> <output-tsv>");
    }

    LicenseManager.setLicenseFile(args[0]);

    TSVParser parser = new TSVParser();
    parser.parse(new File(args[1]));

    // Work on a copy so that the parser's own header list is never modified.
    List<String> header = new ArrayList<>(parser.getHeader());
    header.addAll(SACCHARIDE_HEADER_FIELDS);

    TSVWriter<String, String> writer = new TSVWriter<>(header);
    writer.open(new File(args[2]));

    try {
      SaccharideSearch matcher = new SaccharideSearch();
      matcher.init();

      int rowNum = 0;
      int ambiguousStereoChemistryCount = 0;
      for (Map<String, String> row : parser.getResults()) {
        rowNum++;
        try {
          String inchi = row.get("inchi");
          if (inchi == null) {
            // Without an InChI there is nothing to import; skip explicitly rather than relying on an exception.
            System.err.format("Skipping molecule %d due to missing inchi value\n", rowNum);
            continue;
          }

          Molecule target = null;
          try {
            target = MolImporter.importMol(inchi);
          } catch (Exception e) {
            System.err.format("Skipping molecule %d due to exception: %s\n", rowNum, e.getMessage());
            continue;
          }

          if (hasAmbiguousStereoChemistry(target)) {
            ambiguousStereoChemistryCount++;
            System.err.format("Molecule %s has ambiguous stereochemistry, skipping\n", row.get("id"));
            continue;
          }

          Map<String, Double> results = matcher.matchVague(target);
          // The SMARTS list and the header-field list are parallel: index i of one names index i of the other.
          for (int i = 0; i < SACCHARIDE_SMARTS.size(); i++) {
            row.put(SACCHARIDE_HEADER_FIELDS.get(i), String.format("%.3f", results.get(SACCHARIDE_SMARTS.get(i))));
          }

          writer.append(row);
          writer.flush();
        } catch (Exception e) {
          // Report which input line failed, then let the failure propagate.
          System.err.format("Exception on input line %d\n", rowNum);
          throw e;
        }
      }
      System.err.format("Molecules with ambiguous stereochemistry: %d\n", ambiguousStereoChemistryCount);
    } finally {
      writer.close();
    }
  }
}