/* $Revision$ $Author$ $Date$
*
* Copyright (C) 2007 Rajarshi Guha <rajarshi@users.sourceforge.net>
*
* Contact: cdk-devel@lists.sourceforge.net
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package org.openscience.cdk.smiles.smarts;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.openscience.cdk.CDKConstants;
import org.openscience.cdk.annotations.TestClass;
import org.openscience.cdk.annotations.TestMethod;
import org.openscience.cdk.aromaticity.CDKHueckelAromaticityDetector;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.interfaces.IAtom;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IBond;
import org.openscience.cdk.interfaces.IRingSet;
import org.openscience.cdk.isomorphism.UniversalIsomorphismTester;
import org.openscience.cdk.isomorphism.matchers.IQueryAtom;
import org.openscience.cdk.isomorphism.matchers.QueryAtomContainer;
import org.openscience.cdk.isomorphism.matchers.smarts.HydrogenAtom;
import org.openscience.cdk.isomorphism.matchers.smarts.LogicalOperatorAtom;
import org.openscience.cdk.isomorphism.matchers.smarts.RecursiveSmartsAtom;
import org.openscience.cdk.isomorphism.mcss.RMap;
import org.openscience.cdk.ringsearch.AllRingsFinder;
import org.openscience.cdk.ringsearch.SSSRFinder;
import org.openscience.cdk.smiles.smarts.parser.SMARTSParser;
import org.openscience.cdk.smiles.smarts.parser.TokenMgrError;
import org.openscience.cdk.tools.ILoggingTool;
import org.openscience.cdk.tools.LoggingToolFactory;
import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
/**
* This class provides a easy to use wrapper around SMARTS matching
* functionality. <p/> User code that wants to do SMARTS matching should use
* this rather than using SMARTSParser (and UniversalIsomorphismTester)
* directly. Example usage would be
* <p/>
* <pre>
* SmilesParser sp = new SmilesParser(DefaultChemObjectBuilder.getInstance());
* IAtomContainer atomContainer = sp.parseSmiles("CC(=O)OC(=O)C");
* SMARTSQueryTool querytool = new SMARTSQueryTool("O=CO");
* boolean status = querytool.matches(atomContainer);
* if (status) {
* int nmatch = querytool.countMatches();
* List mappings = querytool.getMatchingAtoms();
* for (int i = 0; i < nmatch; i++) {
* List atomIndices = (List) mappings.get(i);
* }
* }
* </pre>
* <h3>Unsupported Features</h3>
* <ul>
* <li>Component level grouping
* <li>Stereochemistry
* <li>Reaction support
* </ul>
* <h3>SMARTS Extensions</h3>
*
* Currently the CDK supports the following SMARTS symbols, that are not described
* in the Daylight specification. However they are supported by other packages and
* are noted as such.
*
* <table border=1 cellpadding=3>
* <thead>
* <tr>
* <th>Symbol</th><th>Meaning</th><th>Default</th><th>Notes</th>
* </tr>
* </thead>
* <tbody>
* <tr>
* <td>Gx</td><td>Periodic group number</td><td>None</td><td>x must be specified and must be a number between
* 1 and 18. This symbol is supported by the MOE SMARTS implementation</td>
* <tr>
* <td>#X</td><td>Any non-carbon heavy element</td><td>None</td><td>This
* symbol is supported by the MOE SMARTS implementation</td>
* </tr>
* <tr>
* <td>^x</td><td>Any atom with the a specified hybridization state</td><td>None</td><td>x must be specified and
* should be between 1 and 8 (inclusive), corresponding to SP1, SP2, SP3, SP3D1, SP3D2
* SP3D3, SP3D4 and SP3D5. Supported by the OpenEye SMARTS implementation</td>
* </tr>
* </tbody>
* </table>
*
* <h3>Notes</h3>
* <ul>
* <li>As <a href="http://sourceforge.net/mailarchive/message.php?msg_name=4964F605.1070502%40emolecules.com">described</a>
* by Craig James the <code>h<n></code> SMARTS pattern should not be used. It was included in the Daylight spec for
* backwards compatibility. To match hydrogens, use the <code>H<n></cod> pattern.</li>
* <li>The wild card pattern (<code>*</code>) will not match hydrogens (explicit or implicit) unless an isotope is specified.
* In other words, <code>*</code> gives two hits against <code>C[2H]</code> but 1 hit against
* <code>C[H]</code>. This also means that
* it gives no hits against <code>[H][H]</code>. This is contrary to what is shown by Daylights
* <a href="http://www.daylight.com/daycgi_tutorials/depictmatch.cgi">depictmatch</a> service, but is based on this
* <a href="https://sourceforge.net/mailarchive/message.php?msg_name=4964FF9D.3040004%40emolecules.com">discussion</a>.
* A work around to get <code>*</code> to match <code>[H][H]</code> is to write it in the form <code>[1H][1H]</code>.
* <p>
* It's not entirely clear what the behavior of * should be with respect to hydrogens.
* it is possible that the code will be updated so that <code>*</code> will not match <i>any</i> hydrogen in the future.</li>
* <li>The {@link org.openscience.cdk.aromaticity.CDKHueckelAromaticityDetector} only considers single rings and
* two fused non-spiro rings. As a result, it does not properly detect aromaticity in polycyclic systems such
* as <code>[O-]C(=O)c1ccccc1c2c3ccc([O-])cc3oc4cc(=O)ccc24</code>. Thus SMARTS patterns that depend on proper
* aromaticity detection may not work correctly in such polycyclic systems</li>
* </ul>
*
* @author Rajarshi Guha
* @cdk.created 2007-04-08
* @cdk.module smarts
* @cdk.githash
* @cdk.keyword SMARTS
* @cdk.keyword substructure search
* @cdk.bug 1760973
* @cdk.bug 1761027
*/
@TestClass("org.openscience.cdk.smiles.smarts.SMARTSQueryToolTest")
public class SMARTSQueryTool {
private static ILoggingTool logger =
LoggingToolFactory.createLoggingTool(SMARTSQueryTool.class);
private String smarts;
private IAtomContainer atomContainer = null;
private QueryAtomContainer query = null;
private List<List<Integer>> matchingAtoms = null;
public SMARTSQueryTool(String smarts) throws CDKException {
this.smarts = smarts;
try {
initializeQuery();
} catch (TokenMgrError error) {
throw new CDKException("Error parsing SMARTS", error);
}
}
/**
* Returns the current SMARTS pattern being used.
*
* @return The SMARTS pattern
*/
@TestMethod("testQueryTool")
public String getSmarts() {
return smarts;
}
/**
* Set a new SMARTS pattern.
*
* @param smarts The new SMARTS pattern
* @throws CDKException if there is an error in parsing the pattern
*/
@TestMethod("testQueryTool, testQueryToolResetSmart")
public void setSmarts(String smarts) throws CDKException {
this.smarts = smarts;
initializeQuery();
}
/**
* Perform a SMARTS match and check whether the query is present in the
* target molecule. <p/> This function simply checks whether the query
* pattern matches the specified molecule. However the function will also,
* internally, save the mapping of query atoms to the target molecule
* <p>
* <b>Note</b>: This method performs a simple caching scheme, by comparing the current
* molecule to the previous molecule by reference. If you repeatedly match
* different SMARTS on the same molecule, this method will avoid initializing (
* ring perception, aromaticity etc.) the
* molecule each time. If however, you modify the molecule between such multiple
* matchings you should use the other form of this method to force initialization.
*
* @param atomContainer The target moleculoe
* @return true if the pattern is found in the target molecule, false
* otherwise
* @throws CDKException if there is an error in ring, aromaticity or isomorphism
* perception
* @see #getMatchingAtoms()
* @see #countMatches()
* @see #matches(org.openscience.cdk.interfaces.IAtomContainer, boolean)
*/
public boolean matches(IAtomContainer atomContainer) throws CDKException {
return matches(atomContainer, false);
}
/**
* Perform a SMARTS match and check whether the query is present in the
* target molecule. <p/> This function simply checks whether the query
* pattern matches the specified molecule. However the function will also,
* internally, save the mapping of query atoms to the target molecule
*
* @param atomContainer The target moleculoe
* @param forceInitialization If true, then the molecule is initialized (ring perception, aromaticity etc).
* If false, the molecule is only initialized if it is different (in terms of object reference)
* than one supplied in a previous call to this method.
* @return true if the pattern is found in the target molecule, false
* otherwise
* @throws CDKException if there is an error in ring, aromaticity or isomorphism
* perception
* @see #getMatchingAtoms()
* @see #countMatches()
* @see #matches(org.openscience.cdk.interfaces.IAtomContainer)
*/
@TestMethod("testQueryTool, testQueryToolSingleAtomCase, testQuery")
public boolean matches(IAtomContainer atomContainer, boolean forceInitialization) throws CDKException {
if (this.atomContainer == atomContainer) {
if (forceInitialization) initializeMolecule();
} else {
this.atomContainer = atomContainer;
initializeMolecule();
}
// First calculate the recursive smarts
initializeRecursiveSmarts(this.atomContainer);
// lets see if we have a single atom query
if (query.getAtomCount() == 1) {
// lets get the query atom
IQueryAtom queryAtom = (IQueryAtom) query.getAtom(0);
matchingAtoms = new ArrayList<List<Integer>>();
for (IAtom atom : this.atomContainer.atoms()) {
if (queryAtom.matches(atom)) {
List<Integer> tmp = new ArrayList<Integer>();
tmp.add(this.atomContainer.getAtomNumber(atom));
matchingAtoms.add(tmp);
}
}
} else {
List bondMapping = UniversalIsomorphismTester.getSubgraphMaps(this.atomContainer, query);
matchingAtoms = getAtomMappings(bondMapping, this.atomContainer);
}
return matchingAtoms.size() != 0;
}
/**
* Returns the number of times the pattern was found in the target molecule.
* <p/> This function should be called after
* {@link #matches(org.openscience.cdk.interfaces.IAtomContainer)}. If not,
* the results may be undefined.
*
* @return The number of times the pattern was found in the target molecule
*/
@TestMethod("testQueryTool")
public int countMatches() {
return matchingAtoms.size();
}
/**
* Get the atoms in the target molecule that match the query pattern. <p/>
* Since there may be multiple matches, the return value is a List of List
* objects. Each List object contains the indices of the atoms in the target
* molecule, that match the query pattern
*
* @return A List of List of atom indices in the target molecule
*/
@TestMethod("testQueryTool")
public List<List<Integer>> getMatchingAtoms() {
return matchingAtoms;
}
/**
* Get the atoms in the target molecule that match the query pattern. <p/>
* Since there may be multiple matches, the return value is a List of List
* objects. Each List object contains the unique set of indices of the atoms in the target
* molecule, that match the query pattern
*
* @return A List of List of atom indices in the target molecule
*/
@TestMethod("testUniqueQueries")
public List<List<Integer>> getUniqueMatchingAtoms() {
List<List<Integer>> ret = new ArrayList<List<Integer>>();
for (List<Integer> atomMapping : matchingAtoms) {
Collections.sort(atomMapping);
// see if this sequence of atom indices is present
// in the return container
boolean present = false;
for (List<Integer> r : ret) {
if (r.size() != atomMapping.size()) continue;
Collections.sort(r);
boolean matches = true;
for (int i = 0; i < atomMapping.size(); i++) {
int index1 = atomMapping.get(i);
int index2 = r.get(i);
if (index1 != index2) {
matches = false;
break;
}
}
if (matches) {
present = true;
break;
}
}
if (!present) ret.add(atomMapping);
}
return ret;
}
/**
* Prepare the target molecule for analysis. <p/> We perform ring perception
* and aromaticity detection and set up the appropriate properties. Right
* now, this function is called each time we need to do a query and this is
* inefficient.
*
* @throws CDKException if there is a problem in ring perception or aromaticity
* detection, which is usually related to a timeout in the ring
* finding code.
*/
private void initializeMolecule() throws CDKException {
// Code copied from
// org.openscience.cdk.qsar.descriptors.atomic.AtomValenceDescriptor;
Map<String, Integer> valencesTable = new HashMap<String, Integer>();
valencesTable.put("H", 1);
valencesTable.put("Li", 1);
valencesTable.put("Be", 2);
valencesTable.put("B", 3);
valencesTable.put("C", 4);
valencesTable.put("N", 5);
valencesTable.put("O", 6);
valencesTable.put("F", 7);
valencesTable.put("Na", 1);
valencesTable.put("Mg", 2);
valencesTable.put("Al", 3);
valencesTable.put("Si", 4);
valencesTable.put("P", 5);
valencesTable.put("S", 6);
valencesTable.put("Cl", 7);
valencesTable.put("K", 1);
valencesTable.put("Ca", 2);
valencesTable.put("Ga", 3);
valencesTable.put("Ge", 4);
valencesTable.put("As", 5);
valencesTable.put("Se", 6);
valencesTable.put("Br", 7);
valencesTable.put("Rb", 1);
valencesTable.put("Sr", 2);
valencesTable.put("In", 3);
valencesTable.put("Sn", 4);
valencesTable.put("Sb", 5);
valencesTable.put("Te", 6);
valencesTable.put("I", 7);
valencesTable.put("Cs", 1);
valencesTable.put("Ba", 2);
valencesTable.put("Tl", 3);
valencesTable.put("Pb", 4);
valencesTable.put("Bi", 5);
valencesTable.put("Po", 6);
valencesTable.put("At", 7);
valencesTable.put("Fr", 1);
valencesTable.put("Ra", 2);
valencesTable.put("Cu", 2);
valencesTable.put("Mn", 2);
valencesTable.put("Co", 2);
// do all ring perception
AllRingsFinder arf = new AllRingsFinder();
IRingSet allRings;
try {
allRings = arf.findAllRings(atomContainer);
} catch (CDKException e) {
logger.debug(e.toString());
throw new CDKException(e.toString(), e);
}
// sets SSSR information
SSSRFinder finder = new SSSRFinder(atomContainer);
IRingSet sssr = finder.findEssentialRings();
for (IAtom atom : atomContainer.atoms()) {
// add a property to each ring atom that will be an array of
// Integers, indicating what size ring the given atom belongs to
// Add SSSR ring counts
if (allRings.contains(atom)) { // it's in a ring
atom.setFlag(CDKConstants.ISINRING, true);
// lets find which ring sets it is a part of
List<Integer> ringsizes = new ArrayList<Integer>();
IRingSet currentRings = allRings.getRings(atom);
int min = 0;
for (int i = 0; i < currentRings.getAtomContainerCount(); i++) {
int size = currentRings.getAtomContainer(i).getAtomCount();
if (min > size) min = size;
ringsizes.add(size);
}
atom.setProperty(CDKConstants.RING_SIZES, ringsizes);
atom.setProperty(CDKConstants.SMALLEST_RINGS, sssr.getRings(atom));
} else {
atom.setFlag(CDKConstants.ISINRING, false);
}
// determine how many rings bonds each atom is a part of
int hCount;
if (atom.getHydrogenCount() == CDKConstants.UNSET) hCount = 0;
else hCount = atom.getHydrogenCount();
List<IAtom> connectedAtoms = atomContainer.getConnectedAtomsList(atom);
int total = hCount + connectedAtoms.size();
for (IAtom connectedAtom : connectedAtoms) {
if (connectedAtom.getSymbol().equals("H")) {
hCount++;
}
}
atom.setProperty(CDKConstants.TOTAL_CONNECTIONS, total);
atom.setProperty(CDKConstants.TOTAL_H_COUNT, hCount);
if (valencesTable.get(atom.getSymbol()) != null) {
int formalCharge = atom.getFormalCharge() == CDKConstants.UNSET ? 0 : atom.getFormalCharge();
atom.setValency(valencesTable.get(atom.getSymbol()) - formalCharge);
}
}
for (IBond bond : atomContainer.bonds()) {
if (allRings.getRings(bond).getAtomContainerCount() > 0) {
bond.setFlag(CDKConstants.ISINRING, true);
}
}
for (IAtom atom : atomContainer.atoms()) {
List<IAtom> connectedAtoms = atomContainer.getConnectedAtomsList(atom);
int counter = 0;
IAtom any;
for (IAtom connectedAtom : connectedAtoms) {
any = connectedAtom;
if (any.getFlag(CDKConstants.ISINRING)) {
counter++;
}
}
atom.setProperty(CDKConstants.RING_CONNECTIONS, counter);
}
// check for atomaticity
try {
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(atomContainer);
CDKHueckelAromaticityDetector.detectAromaticity(atomContainer);
} catch (CDKException e) {
logger.debug(e.toString());
throw new CDKException(e.toString(), e);
}
}
/**
* Initializes recursive smarts atoms in the query.
*
* We loop over the SMARTS atoms in the query and associate the
* target molecule with each of the SMARTS atoms that need it
*
* @param atomContainer
* @throws CDKException
*/
private void initializeRecursiveSmarts(IAtomContainer atomContainer) throws CDKException {
for (IAtom atom : query.atoms()) {
initializeRecursiveSmartsAtom(atom, atomContainer);
}
}
/**
* Recursively initializes recursive smarts atoms
*
* @param atom
* @param atomContainer
* @throws CDKException
*/
private void initializeRecursiveSmartsAtom(IAtom atom, IAtomContainer atomContainer) throws CDKException {
if (atom instanceof LogicalOperatorAtom) {
initializeRecursiveSmartsAtom(((LogicalOperatorAtom) atom).getLeft(), atomContainer);
if (((LogicalOperatorAtom) atom).getRight() != null) {
initializeRecursiveSmartsAtom(((LogicalOperatorAtom) atom).getRight(), atomContainer);
}
} else if (atom instanceof RecursiveSmartsAtom) {
((RecursiveSmartsAtom) atom).setAtomContainer(atomContainer);
} else if (atom instanceof HydrogenAtom) {
((HydrogenAtom) atom).setAtomContainer(atomContainer);
}
}
private void initializeQuery() throws CDKException {
matchingAtoms = null;
query = SMARTSParser.parse(smarts);
}
private List<List<Integer>> getAtomMappings(List bondMapping, IAtomContainer atomContainer) {
List<List<Integer>> atomMapping = new ArrayList<List<Integer>>();
// loop over each mapping
for (Object aBondMapping : bondMapping) {
List list = (List) aBondMapping;
List<Integer> tmp = new ArrayList<Integer>();
IAtom atom1 = null;
IAtom atom2 = null;
// loop over this mapping
for (Object aList : list) {
RMap map = (RMap) aList;
int bondID = map.getId1();
// get the atoms in this bond
IBond bond = atomContainer.getBond(bondID);
atom1 = bond.getAtom(0);
atom2 = bond.getAtom(1);
Integer idx1 = atomContainer.getAtomNumber(atom1);
Integer idx2 = atomContainer.getAtomNumber(atom2);
if (!tmp.contains(idx1)) tmp.add(idx1);
if (!tmp.contains(idx2)) tmp.add(idx2);
}
if (tmp.size() > 0) atomMapping.add(tmp);
// If there is only one bond, check if it matches both ways.
if (list.size() == 1 && atom1.getAtomicNumber() == atom2.getAtomicNumber()) {
List<Integer> tmp2 = new ArrayList<Integer>();
tmp2.add(tmp.get(0));
tmp2.add(tmp.get(1));
atomMapping.add(tmp2);
}
}
return atomMapping;
}
}