/* $Revision$ $Author$ $Date$ * * Copyright (C) 2002-2007 Christoph Steinbeck <steinbeck@users.sf.net> * * Contact: cdk-devel@lists.sourceforge.net * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License * as published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * All we ask is that proper credit is given for our work, which includes * - but is not limited to - adding the above copyright notice to the beginning * of your source code files, and to any copyright notice that you may distribute * with programs based on this work. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. */ package org.openscience.cdk.fingerprint; import org.openscience.cdk.CDKConstants; import org.openscience.cdk.annotations.TestClass; import org.openscience.cdk.annotations.TestMethod; import org.openscience.cdk.aromaticity.CDKHueckelAromaticityDetector; import org.openscience.cdk.exception.CDKException; import org.openscience.cdk.graph.PathTools; import org.openscience.cdk.interfaces.IAtom; import org.openscience.cdk.interfaces.IAtomContainer; import org.openscience.cdk.interfaces.IBond; import org.openscience.cdk.interfaces.IPseudoAtom; import org.openscience.cdk.ringsearch.AllRingsFinder; import org.openscience.cdk.tools.ILoggingTool; import org.openscience.cdk.tools.LoggingToolFactory; import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; import org.openscience.cdk.tools.periodictable.PeriodicTable; import java.util.ArrayList; import java.util.BitSet; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; /** * Generates a fingerprint for a given AtomContainer. Fingerprints are * one-dimensional bit arrays, where bits are set according to a the * occurrence of a particular structural feature (See for example the * Daylight inc. theory manual for more information). Fingerprints allow for * a fast screening step to exclude candidates for a substructure search in a * database. They are also a means for determining the similarity of chemical * structures. <p> * * A fingerprint is generated for an AtomContainer with this code: <pre> * Molecule molecule = new Molecule(); * BitSet fingerprint = Fingerprinter.getFingerprint(molecule); * fingerprint.size(); // returns 1024 by default * fingerprint.length(); // returns the highest set bit * </pre> <p> * * The FingerPrinter assumes that hydrogens are explicitly given! Furthermore, * if pseudo atoms or atoms with malformed symbols are present, their atomic * number is taken as one more than the last element currently supported in * {@link org.openscience.cdk.tools.periodictable.PeriodicTable}. * * <font color="#FF0000">Warning: The aromaticity detection for this * FingerPrinter relies on AllRingsFinder, which is known to take very long * for some molecules with many cycles or special cyclic topologies. Thus, * the AllRingsFinder has a built-in timeout of 5 seconds after which it * aborts and throws an Exception. If you want your SMILES generated at any * expense, you need to create your own AllRingsFinder, set the timeout to a * higher value, and assign it to this FingerPrinter. In the vast majority of * cases, however, the defaults will be fine. </font> <p> * * <font color="#FF0000">Another Warning : The daylight manual says: * "Fingerprints are not so definite: if a fingerprint indicates a pattern is * missing then it certainly is, but it can only indicate a pattern's presence * with some probability." In the case of very small molecules, the * probability that you get the same fingerprint for different molecules is * high. </font> * </p> * * @author steinbeck * @cdk.created 2002-02-24 * @cdk.keyword fingerprint * @cdk.keyword similarity * @cdk.module standard * @cdk.githash */ @TestClass("org.openscience.cdk.fingerprint.FingerprinterTest") public class Fingerprinter implements IFingerprinter { /** The default length of created fingerprints. */ public final static int DEFAULT_SIZE = 1024; /** The default search depth used to create the fingerprints. */ public final static int DEFAULT_SEARCH_DEPTH = 8; private int size; private int searchDepth; static int debugCounter = 0; private static ILoggingTool logger = LoggingToolFactory.createLoggingTool(Fingerprinter.class); private static final Map<String, String> queryReplace = new HashMap<String, String>() { private static final long serialVersionUID = 1L; { put("Cl", "X"); put("Br", "Z"); put("Si", "Y"); put("As", "D"); put("Li", "L"); put("Se", "E"); put("Na", "G"); put("Ca", "J"); put("Al", "A"); } }; /** * Creates a fingerprint generator of length <code>DEFAULT_SIZE</code> * and with a search depth of <code>DEFAULT_SEARCH_DEPTH</code>. */ public Fingerprinter() { this(DEFAULT_SIZE, DEFAULT_SEARCH_DEPTH); } public Fingerprinter(int size) { this(size, DEFAULT_SEARCH_DEPTH); } /** * Constructs a fingerprint generator that creates fingerprints of * the given size, using a generation algorithm with the given search * depth. * * @param size The desired size of the fingerprint * @param searchDepth The desired depth of search */ public Fingerprinter(int size, int searchDepth) { this.size = size; this.searchDepth = searchDepth; } /** * Generates a fingerprint of the default size for the given AtomContainer. * * @param container The AtomContainer for which a Fingerprint is generated * @param ringFinder An instance of * {@link org.openscience.cdk.ringsearch.AllRingsFinder} * @exception CDKException if there is a timeout in ring or aromaticity * perception * @return A {@link BitSet} representing the fingerprint */ @TestMethod("testGetFingerprint_IAtomContainer") public BitSet getFingerprint(IAtomContainer container, AllRingsFinder ringFinder) throws CDKException { int position = -1; logger.debug("Entering Fingerprinter"); logger.debug("Starting Aromaticity Detection"); long before = System.currentTimeMillis(); AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(container); CDKHueckelAromaticityDetector.detectAromaticity(container); long after = System.currentTimeMillis(); logger.debug("time for aromaticity calculation: " + (after - before) + " milliseconds"); logger.debug("Finished Aromaticity Detection"); BitSet bitSet = new BitSet(size); int[] hashes = findPathes(container, searchDepth); for (int hash : hashes) { position = new java.util.Random(hash).nextInt(size); bitSet.set(position); } return bitSet; } /** * Generates a fingerprint of the default size for the given AtomContainer. * *@param container The AtomContainer for which a Fingerprint is generated */ @TestMethod("testGetFingerprint_IAtomContainer") public BitSet getFingerprint(IAtomContainer container) throws CDKException { return getFingerprint(container, null); } /** * Get all paths of lengths 0 to the specified length. * * This method will find all paths upto length N starting from each * atom in the molecule and return the unique set of such paths. * * @param container The molecule to search * @param searchDepth The maximum path length desired * @return A Map of path strings, keyed on themselves */ protected int[] findPathes(IAtomContainer container, int searchDepth) { List<StringBuffer> allPaths = new ArrayList<StringBuffer>(); Map<IAtom,Map<IAtom, IBond>> cache = new HashMap<IAtom, Map<IAtom,IBond>>(); for (IAtom startAtom : container.atoms()) { List<List<IAtom>> p = PathTools.getPathsOfLengthUpto(container, startAtom, searchDepth); for (List<IAtom> path : p) { StringBuffer sb = new StringBuffer(); IAtom x = path.get(0); // TODO if we ever get more than 255 elements, this will // fail maybe we should use 0 for pseudo atoms and // malformed symbols? if (x instanceof IPseudoAtom) sb.append((char) PeriodicTable.getElementCount() + 1); else { Integer atnum = PeriodicTable.getAtomicNumber(x.getSymbol()); if (atnum != null) sb.append(convertSymbol(x.getSymbol())); else sb.append((char) PeriodicTable.getElementCount() + 1); } for (int i = 1; i < path.size(); i++) { final IAtom[] y = {path.get(i)}; Map<IAtom, IBond> m = cache.get( x ); final IBond[] b = { m != null ? m.get( y[0] ) : null }; if ( b[0] == null ) { b[0] = container.getBond(x, y[0]); cache.put( x, new HashMap<IAtom, IBond>() { {put(y[0], b[0]); } } ); } sb.append(getBondSymbol(b[0])); sb.append(convertSymbol(y[0].getSymbol())); x = y[0]; } // we store the lexicographically lower one of the // string and its reverse StringBuffer revForm = new StringBuffer(sb); revForm.reverse(); if (sb.toString().compareTo(revForm.toString()) <= 0) allPaths.add(sb); else allPaths.add(revForm); } } // now lets clean stuff up Set<String> cleanPath = new HashSet<String>(); for (StringBuffer s : allPaths) { String s1 = s.toString().trim(); if (s1.equals("")) continue; if (cleanPath.contains(s1)) continue; String s2 = s.reverse().toString().trim(); if (cleanPath.contains(s2)) continue; cleanPath.add(s2); } // convert paths to hashes int[] hashes = new int[cleanPath.size()]; int i= 0; for (String s: cleanPath) hashes[i++] = s.hashCode(); return hashes; } private String convertSymbol(String symbol) { String returnSymbol = queryReplace.get( symbol ); return returnSymbol == null ? symbol : returnSymbol; } /** * Gets the bondSymbol attribute of the Fingerprinter class * *@param bond Description of the Parameter *@return The bondSymbol value */ protected String getBondSymbol(IBond bond) { String bondSymbol = ""; if (bond.getFlag(CDKConstants.ISAROMATIC)) { bondSymbol = ":"; } else if (bond.getOrder() == IBond.Order.SINGLE) { bondSymbol = "-"; } else if (bond.getOrder() == IBond.Order.DOUBLE) { bondSymbol = "="; } else if (bond.getOrder() == IBond.Order.TRIPLE) { bondSymbol = "#"; } return bondSymbol; } @TestMethod("testGetSearchDepth") public int getSearchDepth() { return searchDepth; } @TestMethod("testGetSize") public int getSize() { return size; } }