// $Id: MolFpFactory.java 3501 2009-10-29 16:01:09Z nguyenda $ package gov.nih.ncgc.util; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import chemaxon.struc.Molecule; import chemaxon.util.MolHandler; /** * Factory for generating molecular fingerprint */ public class MolFpFactory { final static Map<Integer, MolFpFactory> cache = new ConcurrentHashMap<Integer, MolFpFactory>(); /** * default fingerprint configuration */ static private final int FP_SIZE = 16; static private final int FP_BITS = 2; static private final int FP_DEPTH = 6; final private Cache<Molecule, int[]> fps = new Cache<Molecule, int[]>(); private int size, bits, depth; protected MolFpFactory (int size, int bits, int depth) { this.size = size; this.bits = bits; this.depth = depth; } public int[] generate (Molecule mol) { int[] fp = fps.get(mol); if (fp == null) { synchronized (this) { MolHandler mh = new MolHandler (mol); fp = mh.generateFingerprintInInts(size, bits, depth); fps.put(mol, fp); } } return fp; } // similarity metric public double tanimotoSim (Molecule a, Molecule b) { return tanimotoSim (generate (a), generate (b)); } public static double tanimotoSim (int[] fpa, int[] fpb) { if (fpa.length != fpb.length) { throw new IllegalArgumentException ("Arrays are not of the same size"); } int c = 0, a = 0, b = 0; for (int i = 0; i < fpa.length; ++i) { c += ChemUtil.countBits(fpa[i] & fpb[i]); a += ChemUtil.countBits(fpa[i]); b += ChemUtil.countBits(fpb[i]); } return (double)c/(a + b - c); } public double tanimotoDist (Molecule a, Molecule b) { return tanimotoDist (generate (a), generate (b)); } public static double tanimotoDist (int[] fpa, int[] fpb) { return 1. - tanimotoSim (fpa, fpb); } public double euclidean (Molecule a, Molecule b) { return euclidean (generate (a), generate (b)); } public static double euclidean (int[] fpa, int[] fpb) { if (fpa.length != fpb.length) { throw new IllegalArgumentException ("Arrays are not of the same size"); } int c = 0, a = 0, b = 0; for (int i = 0; i < fpa.length; ++i) { c += ChemUtil.countBits(fpa[i] & fpb[i]); a += ChemUtil.countBits(fpa[i]); b += ChemUtil.countBits(fpb[i]); } return Math.sqrt(a + b - 2.*c); } // similarity metric public double diceSim (Molecule a, Molecule b) { return diceSim (generate (a), generate (b)); } public static double diceSim (int[] fpa, int[] fpb) { if (fpa.length != fpb.length) { throw new IllegalArgumentException ("Arrays are not of the same size"); } int c = 0, a = 0, b = 0; for (int i = 0; i < fpa.length; ++i) { c += ChemUtil.countBits(fpa[i] & fpb[i]); a += ChemUtil.countBits(fpa[i]); b += ChemUtil.countBits(fpb[i]); } return (double)2.*c/(a + b); } // similarity metric public double cosineSim (Molecule a, Molecule b) { return cosineSim (generate (a), generate (b)); } public static double cosineSim (int[] fpa, int[] fpb) { if (fpa.length != fpb.length) { throw new IllegalArgumentException ("Arrays are not of the same size"); } int c = 0, a = 0, b = 0; for (int i = 0; i < fpa.length; ++i) { c += ChemUtil.countBits(fpa[i] & fpb[i]); a += ChemUtil.countBits(fpa[i]); b += ChemUtil.countBits(fpb[i]); } return (double)c/Math.sqrt(a * b); } // distance metric public double hammingDist (Molecule a, Molecule b) { return hammingDist (generate (a), generate (b)); } public static double hammingDist (int[] fpa, int[] fpb) { if (fpa.length != fpb.length) { throw new IllegalArgumentException ("Arrays are not of the same size"); } double dist = 0.; for (int i = 0; i < fpa.length; ++i) { dist += ChemUtil.countBits(fpa[i] ^ fpb[i]); } return dist; } // distance metric public double jaccardDist (Molecule a, Molecule b) { return jaccardDist (generate (a), generate (b)); } public static double jaccardDist (int[] fpa, int[] fpb) { if (fpa.length != fpb.length) { throw new IllegalArgumentException ("Arrays are not of the same size"); } int d = 0, s = 0; for (int i = 0; i < fpa.length; ++i) { d += ChemUtil.countBits(fpa[i] ^ fpb[i]); s += ChemUtil.countBits(fpa[i] & fpb[i]); } return (double)d / (s + d); } // distance metric public double rogersTanimotoDist (Molecule a, Molecule b) { return rogersTanimotoDist (generate (a), generate (b)); } public static double rogersTanimotoDist (int[] fpa, int[] fpb) { if (fpa.length != fpb.length) { throw new IllegalArgumentException ("Arrays are not of the same size"); } int d = 0, s = 0, c = 0; for (int i = 0; i < fpa.length; ++i) { d += ChemUtil.countBits(fpa[i] ^ fpb[i]); s += ChemUtil.countBits(fpa[i] & fpb[i]); c += ChemUtil.countBits(~fpa[i] & ~fpb[i]); } return (double)2.*d/(s + c + 2.*d); } public int getNumInts () { return size; } public int getNumOnes () { return bits; } public int getNumEdges () { return depth; } public static MolFpFactory getInstance () { return getInstance (FP_SIZE, FP_BITS, FP_DEPTH); } public static MolFpFactory getInstance (int size, int bits, int depth) { int config = (size << 24) | (bits << 16) | depth; MolFpFactory factory = cache.get(config); if (factory == null) { synchronized (MolFpFactory.class) { factory = new MolFpFactory (size, bits, depth); cache.put(config, factory); } } return factory; } }