package gov.nih.ncgc.bard.resourcemgr.precomp; import gov.nih.ncgc.bard.capextract.CAPUtil; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.util.ArrayList; import org.eclipse.jetty.util.log.Log; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import chemaxon.formats.MolFormatException; import chemaxon.struc.Molecule; import chemaxon.util.MolHandler; public class CompoundSimilarityWorker { private static Logger logger = LoggerFactory.getLogger(CompoundSimilarityWorker.class); static double tanimoto (int[] fp1, int[] fp2) { int a = 0, b = 0, c = 0; for (int i = 0; i < fp1.length; ++i) { c += Integer.bitCount(fp1[i] & fp2[i]); a += Integer.bitCount(fp1[i]); b += Integer.bitCount(fp2[i]); } return (double)c/(a+b-c); } static double tanimoto (long[] fp1, long[] fp2) { int a = 0, b = 0, c = 0; for (int i = 0; i < fp1.length; ++i) { c += Long.bitCount(fp1[i] & fp2[i]); a += Long.bitCount(fp1[i]); b += Long.bitCount(fp2[i]); } return (double)c/(a+b-c); } /** * Refreshes all druglike and druglike cids for ALL compounds in the compound table. * * @param dbURL * @throws SQLException * @throws MolFormatException */ public void refreshDruglikeInCompound(String dbURL) throws SQLException, MolFormatException { logger.info("Starting update of druglike in compound in:"+dbURL); Connection conn = CAPUtil.connectToBARD(dbURL); int maxRings = 10; Statement stmt = conn.createStatement(); ResultSet rs = stmt.executeQuery("select b.iso_smiles, b.cid from compound_annot a, compound b where a.val like '%human approved drug%' and a.cid = b.cid"); ArrayList <long []> drugFps = new ArrayList <long []>(); ArrayList <Long> drugCIDs = new ArrayList <Long>(); MolHandler mh; int [] fpArr; long [] fpArrUnsigned; while(rs.next()) { drugCIDs.add(rs.getLong(2)); mh = new MolHandler(rs.getString(1)); mh.aromatize(); //aromatize BEFORE fingerprinting fpArr = mh.generateFingerprintInInts(16,2,6); fpArrUnsigned = new long[fpArr.length]; for(int i = 0; i < fpArr.length; i++) { fpArrUnsigned[i] = convertToUnsignedInt(fpArr[i]); } drugFps.add(fpArrUnsigned); // drugFps.add(fpArr); mh = null; } logger.info("Druglike: Have fp for all approved drugs. Drug cid cnt: "+drugFps.size()); rs.close(); rs = stmt.executeQuery("select count(cid) from compound"); long totalCmpCnt = 0l; if(rs.next()) { totalCmpCnt = rs.getLong(1); } rs.close(); long nullDruglikeCnt = 0l; rs = stmt.executeQuery("select count(cid) from compound"); if(rs.next()) { nullDruglikeCnt = rs.getLong(1); } logger.info("Druglike: compound count: "+totalCmpCnt); logger.info("Druglike: compound count to find sim to drugs: "+nullDruglikeCnt); rs.close(); int batchSize = 100000; int batchCnt = ((int)(nullDruglikeCnt/batchSize)) + 1; long currPos = 0l; logger.info("Druglike: Batch size 100000, number of batches:"+batchCnt); String sqlSelectBase = "select cid, iso_smiles from compound "; String sqlSelect = ""; double maxTaniSim; double taniSim; double [][] cidMaxArray; int cnt = 0; int [] fp; Molecule mol; int rings; long totCnt = 0; long closestCid = 0; int drugCnt = 0; double closestDrugCID; boolean testbreak = false; PreparedStatement updatePS = conn.prepareStatement("update compound set druglike = ?, druglike_cid = ? where cid = ?"); for(int batch = 0; batch < batchCnt; batch++) { if(testbreak) { break; } logger.info("Druglike: Starting batch: "+(batch + 1)); //get a batch of cids and smiles sqlSelect = sqlSelectBase + "limit " + currPos + ", " + batchSize; rs = stmt.executeQuery(sqlSelect); //for each cid, get the min tanimoto, break if tanimoto is 1.000 cidMaxArray = new double [batchSize][3]; cnt = 0; while(rs.next()) { //get the cid for the current compound cidMaxArray[cnt][0] = rs.getDouble(1); closestCid = (long)cidMaxArray[cnt][0]; //generate the fp mh = new MolHandler(rs.getString(2)); mh.aromatize(); //aromatize BEFORE fingerprinting mol = mh.getMolecule(); rings = mol.getEdgeCount() - mol.getAtomCount() + 1; //skip highly ringed structures if(rings < maxRings) { fp = mh.generateFingerprintInInts(16,2,6); fpArrUnsigned = new long[fp.length]; for(int i = 0; i < fp.length; i++) { fpArrUnsigned[i] = convertToUnsignedInt(fp[i]); } mh = null; //find max sim maxTaniSim = 0; drugCnt = 0; closestDrugCID = 0; for(long [] drugFp : drugFps) { taniSim = CompoundSimilarityWorker.tanimoto(drugFp, fpArrUnsigned); // taniSim = CompoundSimilarityWorker.tanimoto(drugFp, fp); if(taniSim > maxTaniSim) { maxTaniSim = taniSim; closestDrugCID = (double)(drugCIDs.get((int)drugCnt)); } if(maxTaniSim == 1.0) { closestDrugCID = (double)(drugCIDs.get((int)drugCnt)); break; } drugCnt++; } } else { //highly ringed, expensive fp, set low sim, 0 cid. maxTaniSim = 0.0; closestDrugCID = 0; } cidMaxArray[cnt][1] = maxTaniSim; cidMaxArray[cnt][2] = closestDrugCID; cnt++; } //close the result set rs.close(); //we have similarities for the batch cnt = 0; for(int i = 0; i < cidMaxArray.length ; i++) { cnt++; totCnt++; // update druglike max tanimoto updatePS.setDouble(1, cidMaxArray[i][1]); updatePS.setLong(2, (long)cidMaxArray[i][2]); // add cid to indicate record to update updatePS.setLong(3, (long)cidMaxArray[i][0]); updatePS.addBatch(); if(batch == 0 && i < 10) { logger.info("unsigned test cid ="+cidMaxArray[i][0]+" tanimoto="+cidMaxArray[i][1]+" close_cid="+cidMaxArray[i][2]); } if(cnt % 10000 == 0) { updatePS.executeBatch(); conn.commit(); } if(totCnt % 1000000 == 0) { logger.info("Druglike: total update cnt: "+totCnt); //test break if(totCnt == 2000000) { updatePS.executeBatch(); conn.commit(); testbreak = true; break; } } } updatePS.executeBatch(); conn.commit(); //update current position currPos += (batchSize + 1); } logger.info("Finished refresh of druglike in compound table in:"+dbURL); } public static long convertToUnsignedInt(int input) { return input & 0xFFFFFFFFL; } public static void main(String [] args) { CompoundSimilarityWorker worker = new CompoundSimilarityWorker(); String dbURL = "jdbc:mysql://bohr.fast.ncats.nih.gov:3306/bard3?zeroDateTimeBehavior=convertToNull"; // try { // worker.refreshDruglikeInCompound(dbURL); // } catch (MolFormatException e) { // e.printStackTrace(); // } catch (SQLException e) { // e.printStackTrace(); // } MolHandler mh; try { mh = new MolHandler("CC1CN(C(C2=C1OC3=C2C=C(C=C3)N)C)C.C(=CC(=O)O)C(=O)O"); mh.aromatize(); //aromatize BEFORE fingerprinting int [] fp1 = mh.generateFingerprintInInts(16,2,6); mh = new MolHandler("CN(C)CC1=CC2=C(O1)CC(NC2)CC3=CC=CC=C3"); mh.aromatize(); //aromatize BEFORE fingerprinting int [] fp2 = mh.generateFingerprintInInts(16,2,6); System.out.println(CompoundSimilarityWorker.tanimoto(fp2, fp1)); } catch (MolFormatException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }