/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.analysis.similarity;
import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import chemaxon.license.LicenseManager;
import chemaxon.sss.SearchConstants;
import chemaxon.sss.search.MolSearch;
import chemaxon.sss.search.MolSearchOptions;
import chemaxon.struc.Molecule;
import chemaxon.util.MolHandler;
import com.act.utils.TSVWriter;
import com.act.utils.TSVParser;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* This class is based on Chris's substructure search from com/act/biointerpretation/operators (see commit
* e7fc12d7d8017949d83c42aca276bcf1b76fa802). The list of substructures is hard-coded to find molecules that are based
* on C5-C12 fatty acids.
*
* TODO: abstract the common parts of this and UmamiSearch into a shared base class.
*/
public class FattyAcidSearch {
public static final List<String> ACIDS_5_THROUGH_12 = new ArrayList<String>() {{
add("[CH3][CH2][CH2][CH2]C(=O)O");
add("[CH3][CH2][CH2][CH2][CH2]C(=O)O");
add("[CH3][CH2][CH2][CH2][CH2][CH2]C(=O)O");
add("[CH3][CH2][CH2][CH2][CH2][CH2][CH2]C(=O)O");
add("[CH3][CH2][CH2][CH2][CH2][CH2][CH2][CH2]C(=O)O");
add("[CH3][CH2][CH2][CH2][CH2][CH2][CH2][CH2][CH2]C(=O)O");
add("[CH3][CH2][CH2][CH2][CH2][CH2][CH2][CH2][CH2][CH2]C(=O)O");
add("[CH3][CH2][CH2][CH2][CH2][CH2][CH2][CH2][CH2][CH2][CH2]C(=O)O");
}};
public static final Map<String, Double> ACIDS_5_THROUGH_12_LENGTH = new HashMap<String, Double>() {{
// Ignore hydrogens, just count other molecules. TODO: let Marvin do the counting.
put(ACIDS_5_THROUGH_12.get(0), 7.0);
put(ACIDS_5_THROUGH_12.get(1), 8.0);
put(ACIDS_5_THROUGH_12.get(2), 9.0);
put(ACIDS_5_THROUGH_12.get(3), 10.0);
put(ACIDS_5_THROUGH_12.get(4), 11.0);
put(ACIDS_5_THROUGH_12.get(5), 12.0);
put(ACIDS_5_THROUGH_12.get(6), 13.0);
put(ACIDS_5_THROUGH_12.get(7), 14.0);
}};
// From https://docs.chemaxon.com/display/jchembase/Bond+specific+search+options.
public static final MolSearchOptions SEARCH_OPTIONS = new MolSearchOptions(SearchConstants.SUBSTRUCTURE);
static {
SEARCH_OPTIONS.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL4);
}
private Map<String, MolSearch> fattyAcidSearches = new HashMap<>(ACIDS_5_THROUGH_12.size());
void init() throws MolFormatException {
for (String smarts : ACIDS_5_THROUGH_12) {
MolSearch ms = new MolSearch();
ms.setSearchOptions(SEARCH_OPTIONS);
ms.setQuery(new MolHandler(smarts, true).getMolecule());
fattyAcidSearches.put(smarts, ms);
}
}
public Map<String, Double> matchVague(Molecule target) throws Exception {
MolSearchOptions searchOptions = new MolSearchOptions(SearchConstants.SUBSTRUCTURE);
searchOptions.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL4);
Map<String, Double> results = new HashMap<>();
for (Map.Entry<String, MolSearch> entry : fattyAcidSearches.entrySet()) {
MolSearch searcher = entry.getValue();
searcher.setTarget(target);
/* This returns a list of atom-to-atom mappings identified by the searcher. If the entire substructure was found
* in the target molecule, the length of the array should match the number of atoms in the query. See
* https://www.chemaxon.com/jchem/doc/dev/java/api/index.html?chemaxon/sss/search/MolSearch.html
* for more details. */
int[][] hits = searcher.findAll();
int longestHit = 0;
if (hits != null) {
for (int i = 0; i < hits.length; i++) {
if (hits[i].length > longestHit) {
longestHit = hits[i].length;
}
}
}
results.put(entry.getKey(), Integer.valueOf(longestHit).doubleValue() /
ACIDS_5_THROUGH_12_LENGTH.get(entry.getKey()));
}
return results;
}
public static void main(String[] args) throws Exception {
LicenseManager.setLicenseFile(args[0]);
TSVParser parser = new TSVParser();
parser.parse(new File(args[1]));
List<String> header = parser.getHeader();
final List<String> fattyAcidHeaderFields = new ArrayList<String>() {{
add("fa_5");
add("fa_6");
add("fa_7");
add("fa_8");
add("fa_9");
add("fa_10");
add("fa_11");
add("fa_12");
}};
header.addAll(fattyAcidHeaderFields);
TSVWriter<String, String> writer = new TSVWriter<>(header);
writer.open(new File(args[2]));
try {
FattyAcidSearch matcher = new FattyAcidSearch();
matcher.init();
int rowNum = 0;
for (Map<String, String> row : parser.getResults()) {
rowNum++;
try {
String inchi = row.get("inchi");
Molecule target = null;
try {
target = MolImporter.importMol(inchi);
} catch (Exception e) {
System.err.format("Skipping molecule %d due to exception: %s\n", rowNum, e.getMessage());
continue;
}
Map<String, Double> results = matcher.matchVague(target);
for (int i = 0; i < ACIDS_5_THROUGH_12.size(); i++) {
row.put(fattyAcidHeaderFields.get(i), String.format("%.3f", results.get(ACIDS_5_THROUGH_12.get(i))));
}
writer.append(row);
writer.flush();
} catch (Exception e) {
System.err.format("Exception on input line %d\n", rowNum);
throw e;
}
}
} finally {
writer.close();
}
}
}