/* $Revision: 11674 $ $Author: rajarshi $ $Date: 2008-07-20 22:05:08 -0400 (Sun, 20 Jul 2008) $
 *
 * Copyright (C) 2008 Rajarshi Guha <rajarshi@users.sourceforge.net>
 *
 * Contact: cdk-devel@lists.sourceforge.net
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 * All we ask is that proper credit is given for our work, which includes
 * - but is not limited to - adding the above copyright notice to the beginning
 * of your source code files, and to any copyright notice that you may distribute
 * with programs based on this work.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 */
package org.openscience.cdk.fingerprint;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Iterator;
import java.util.List;

import org.openscience.cdk.CDKConstants;
import org.openscience.cdk.annotations.TestClass;
import org.openscience.cdk.annotations.TestMethod;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.graph.ConnectivityChecker;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IBond;
import org.openscience.cdk.interfaces.IMoleculeSet;
import org.openscience.cdk.interfaces.IRingSet;
import org.openscience.cdk.ringsearch.AllRingsFinder;
import org.openscience.cdk.smiles.smarts.SMARTSQueryTool;
import org.openscience.cdk.tools.ILoggingTool;
import org.openscience.cdk.tools.LoggingToolFactory;

/**
 * This fingerprinter generates 166 bit MACCS keys.
 * <p>
 * The SMARTS patterns for each of the features were taken from
 * <a href="http://www.rdkit.org"> RDKit</a>. However given that there is no
 * official and explicit listing of the original key definitions, the results
 * of this implementation may differ from others.
 * <p>
 * This class assumes that aromaticity perception and atom typing have been
 * performed prior to generating the fingerprint.
 * <p>
 * <b>Note</b> Currently bits 1 and 44 are completely ignored since the RDKit
 * defs do not provide a definition and I can't find an official description
 * of them.
 *
 * @author Rajarshi Guha
 * @cdk.created 2008-07-23
 * @cdk.keyword fingerprint
 * @cdk.keyword similarity
 * @cdk.module fingerprint
 * @cdk.githash
 */
@TestClass("org.openscience.cdk.fingerprint.MACCSFingerprinterTest")
public class MACCSFingerprinter implements IFingerprinter {

    private static final ILoggingTool logger =
            LoggingToolFactory.createLoggingTool(MACCSFingerprinter.class);

    /** Class path location of the key definition file. */
    private static final String KEY_DEFINITIONS = "org/openscience/cdk/fingerprint/data/maccs.txt";

    /** Number of leading lines of the definition file that are skipped before parsing. */
    private static final int HEADER_LINES = 32;

    /** Number of key definitions the file must provide. */
    private static final int EXPECTED_KEY_COUNT = 166;

    /** SMARTS placeholder marking a key that has no pattern and is skipped in the SMARTS pass. */
    private static final String UNDEFINED_SMARTS = "?";

    /** Zero-based index of bit 125 (aromatic ring count &gt; 1). */
    private static final int AROMATIC_RINGS_BIT = 124;

    /** Zero-based index of bit 166 (more than one connected component). */
    private static final int FRAGMENTS_BIT = 165;

    /** Key definitions in file order, or <code>null</code> if they could not be loaded. */
    private final MaccsKey[] keys;

    /**
     * Creates a fingerprinter and loads the key definitions from the class path.
     * <p>
     * A failure to load the definitions does not make construction fail; it is
     * logged, {@link #getSize()} then returns 0 and
     * {@link #getFingerprint(IAtomContainer)} throws a {@link CDKException}.
     */
    @TestMethod("testFingerprint")
    public MACCSFingerprinter() {
        MaccsKey[] loaded = null;
        try {
            loaded = readKeyDef();
        } catch (IOException e) {
            logger.error("Could not read the MACCS key definitions: " + e.getMessage());
            logger.debug(e);
        } catch (CDKException e) {
            logger.error("Invalid MACCS key definitions: " + e.getMessage());
            logger.debug(e);
        }
        this.keys = loaded;
    }

    /**
     * Calculates the substructure fingerprint for the given AtomContainer.
     * <p>
     * Bit <code>i</code> of the result corresponds to the key definition at
     * index <code>i</code>. A key with a count of 0 is set when its SMARTS
     * pattern matches at all; otherwise it is set when the number of unique
     * matches is greater than the count. Keys whose pattern is "?" are skipped
     * in that pass; of those, bits 125 and 166 are evaluated by hand.
     *
     * @param atomContainer the structure to fingerprint
     * @return the fingerprint
     * @throws CDKException if the key definitions could not be loaded when this
     *                      object was created, or if a CDK tool used during the
     *                      evaluation fails
     */
    @TestMethod("testFingerprint,testfp2")
    public BitSet getFingerprint(IAtomContainer atomContainer) throws CDKException {
        if (keys == null) throw new CDKException("Could not setup key definitions");

        BitSet fingerPrint = new BitSet(keys.length);
        setSmartsBits(atomContainer, fingerPrint);

        // at this point we have skipped the entries whose pattern is "?"
        // (bits 1,44,125,166) so let's try and do those features by hand

        // bit 125 aromatic ring count > 1
        if (hasMultipleAromaticRings(atomContainer)) fingerPrint.set(AROMATIC_RINGS_BIT, true);

        // bit 166 (*).(*)
        IMoleculeSet part = ConnectivityChecker.partitionIntoMolecules(atomContainer);
        if (part.getMoleculeCount() > 1) fingerPrint.set(FRAGMENTS_BIT, true);

        return fingerPrint;
    }

    /**
     * Returns the number of bits in the fingerprint.
     *
     * @return the number of loaded key definitions, or 0 if they could not be loaded
     */
    @TestMethod("getsize")
    public int getSize() {
        if (keys != null)
            return keys.length;
        else
            return 0;
    }

    /**
     * Sets the bit of every key that is defined by a SMARTS pattern and is
     * satisfied by the given structure.
     */
    private void setSmartsBits(IAtomContainer atomContainer, BitSet fingerPrint) throws CDKException {
        // the initial pattern is only a placeholder, it is replaced before every match
        SMARTSQueryTool sqt = new SMARTSQueryTool("C");
        for (int i = 0; i < keys.length; i++) {
            String smarts = keys[i].getSmarts();
            if (UNDEFINED_SMARTS.equals(smarts)) continue;

            sqt.setSmarts(smarts);
            if (!sqt.matches(atomContainer)) continue;

            int count = keys[i].getCount();
            if (count == 0) {
                fingerPrint.set(i, true);
            } else {
                List<List<Integer>> matches = sqt.getUniqueMatchingAtoms();
                if (matches.size() > count) fingerPrint.set(i, true);
            }
        }
    }

    /**
     * Reports whether more than one of the rings found by {@link AllRingsFinder}
     * consists solely of bonds flagged as aromatic.
     */
    private boolean hasMultipleAromaticRings(IAtomContainer atomContainer) throws CDKException {
        AllRingsFinder ringFinder = new AllRingsFinder();
        IRingSet rings = ringFinder.findAllRings(atomContainer);
        int ringCount = 0;
        for (int i = 0; i < rings.getAtomContainerCount(); i++) {
            if (isAromaticRing(rings.getAtomContainer(i))) ringCount++;
            // two are enough, no need to look at the remaining rings
            if (ringCount > 1) return true;
        }
        return false;
    }

    /** Reports whether every bond of the given ring carries the aromaticity flag. */
    private boolean isAromaticRing(IAtomContainer ring) {
        for (IBond bond : ring.bonds()) {
            if (!bond.getFlag(CDKConstants.ISAROMATIC)) return false;
        }
        return true;
    }

    /**
     * Reads the key definitions from the class path resource.
     * <p>
     * The first {@value #HEADER_LINES} lines are skipped. Of every following
     * line only the part before the first '|' is used; it is split on
     * whitespace, the second field is taken as the SMARTS pattern and the third
     * as the count. Blank lines are ignored.
     *
     * @return the key definitions in file order
     * @throws IOException  if the resource cannot be found or read
     * @throws CDKException if a line is malformed or the number of definitions
     *                      is not {@value #EXPECTED_KEY_COUNT}
     */
    private MaccsKey[] readKeyDef() throws IOException, CDKException {
        InputStream ins = this.getClass().getClassLoader().getResourceAsStream(KEY_DEFINITIONS);
        // getResourceAsStream signals a missing resource with null rather than an exception
        if (ins == null) throw new IOException("Could not find resource " + KEY_DEFINITIONS);

        List<MaccsKey> keyList = new ArrayList<MaccsKey>(EXPECTED_KEY_COUNT);
        BufferedReader reader = new BufferedReader(new InputStreamReader(ins));
        try {
            for (int i = 0; i < HEADER_LINES; i++) reader.readLine();

            // now process the keys
            String line;
            while ((line = reader.readLine()) != null) {
                String trimmed = line.trim();
                if (trimmed.length() == 0) continue;
                keyList.add(parseKey(trimmed));
            }
        } finally {
            // closing the reader also closes the underlying resource stream
            reader.close();
        }

        if (keyList.size() != EXPECTED_KEY_COUNT)
            throw new CDKException("Found " + keyList.size() + " keys during setup. Should be 166");
        return keyList.toArray(new MaccsKey[keyList.size()]);
    }

    /**
     * Parses a single, already trimmed and non-empty definition line.
     *
     * @throws CDKException if the line has fewer than three fields or the count
     *                      is not an integer
     */
    private static MaccsKey parseKey(String line) throws CDKException {
        int bar = line.indexOf('|');
        String data = bar >= 0 ? line.substring(0, bar) : line;
        // split on runs of whitespace so that aligned columns do not yield empty fields
        String[] toks = data.trim().split("\\s+");
        if (toks.length < 3)
            throw new CDKException("Malformed MACCS key definition: " + line);
        try {
            return new MaccsKey(toks[1], Integer.parseInt(toks[2]));
        } catch (NumberFormatException e) {
            CDKException wrapped = new CDKException("Invalid count in MACCS key definition: " + line);
            wrapped.initCause(e);
            throw wrapped;
        }
    }

    /**
     * A single key definition: a SMARTS pattern and the number of unique
     * matches that has to be exceeded (0 meaning that any match suffices).
     */
    private static final class MaccsKey {
        private final String smarts;
        private final int count;

        private MaccsKey(String smarts, int count) {
            this.smarts = smarts;
            this.count = count;
        }

        public String getSmarts() {
            return smarts;
        }

        public int getCount() {
            return count;
        }
    }
}