package gov.nih.ncgc.bard.resourcemgr.extresource.pubchem;
import gov.nih.ncgc.bard.resourcemgr.BardDBUtil;
import gov.nih.ncgc.bard.resourcemgr.util.BardResourceFetch;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Properties;
import java.util.logging.Logger;
public class BardCompoundPubchemExtrasLoader {
static final private Logger logger =
Logger.getLogger(BardCompoundPubchemExtrasLoader.class.getName());
static final String SQLInsertIntoCIDSID = "insert into cid_sid (cid, sid, rel_type) values (?,?,?)";
static final String SQLInsertIntoTempCIDSID = "insert into temp_cid_sid (cid, sid, rel_type) values (?,?,?)";
static final String SQLReplaceIntoCIDSID = "replace into cid_sid (cid, sid, rel_type) values (?,?,?)";
static final String SQLSelectMatchCIDSIDTYPE = "select cid from cid_sid where rel_type =? and cid = ? and sid = ?";
static final String SQLInsertCompoundSynonymIntoSynonym = "insert into synonyms (id, type, syn) values (?,?,?)";
static final String SQLCreateTempCIDSID = "create table if not exists temp_cid_sid like cid_sid";
PreparedStatement checkCIDSIDMatchPS;
private Connection conn;
private String databaseUrl;
private String driverClassName;
private String user;
private String pw;
private String dir = "/ifs/prod/braistedjc/db_scripts/pubchem_CID_SID/CID-SID";
public BardCompoundPubchemExtrasLoader() { }
public BardCompoundPubchemExtrasLoader(String databaseURL, String driverClassName) {
this.databaseUrl = databaseURL;
this.driverClassName = driverClassName;
}
/**
* Deprecated: This is OK for an initial load but not for generalized maintenance.
* This loads the sid to cid mappings
*/
public void loadCIDSID() {
connect(databaseUrl, driverClassName, "bard_manager", "bard_manager");
try {
BufferedReader br = new BufferedReader(new FileReader(dir));
conn.setAutoCommit(false);
//prepare the match PS
checkCIDSIDMatchPS = conn.prepareStatement(this.SQLSelectMatchCIDSIDTYPE);
PreparedStatement ps = conn.prepareStatement(SQLInsertIntoCIDSID);
String line;
String [] toks;
long insertCount = 0;
boolean executed = false;
while((line = br.readLine()) != null) {
toks = line.split("\t");
executed = false;
if(toks.length != 3)
continue;
ps.setLong(1, Long.parseLong(toks[0]));
ps.setLong(2, Long.parseLong(toks[1]));
ps.setInt(3, Integer.parseInt(toks[2]));
ps.addBatch();
insertCount++;
if(insertCount % 50000 == 0) {
ps.executeBatch();
conn.commit();
executed = true;
logger.info("Insert Count = "+insertCount);
}
}
if(!executed)
ps.executeBatch();
conn.commit();
conn.close();
br.close();
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
/**
* This loads the sid to cid mappings, reports on the number of mappings reviewed.
* This uses replace syntax to maintain unique cid, sid, type mappings.
* The use of replace syntax requires a unique key on cid, sid, and type.
*
*/
public long updateCIDSID(Properties dbManagerProps) {
long newMapCnt = 0;
long initCnt = 0;
try {
initCnt = BardDBUtil.getTableRowCount("cid_sid");
conn = BardDBUtil.connect();
String cidsidGZIPPath = dbManagerProps.getProperty("bard.filepath.pubchem.cidsid.gzip");
String cidsidPath = cidsidGZIPPath.replace(".gz", "");
BardResourceFetch.gunzipFile(cidsidGZIPPath, cidsidPath);
BufferedReader br = new BufferedReader(new FileReader(cidsidPath));
conn.setAutoCommit(false);
//prepare the match PS
checkCIDSIDMatchPS = conn.prepareStatement(this.SQLSelectMatchCIDSIDTYPE);
PreparedStatement ps = conn.prepareStatement(SQLInsertIntoCIDSID);
String line;
String [] toks;
long insertCount = 0;
long sid, cid;
int type;
boolean executed = false;
while((line = br.readLine()) != null) {
toks = line.split("\t");
executed = false;
if(toks.length != 3)
continue;
cid = Long.parseLong(toks[0]);
sid = Long.parseLong(toks[1]);
type = Integer.parseInt(toks[2]);
//make sure it doesn't exist... costly but alternative is a unique key across all three fields.
if(!this.haveCIDSIDMapping(type, cid, sid)) {
ps.setLong(1, cid);
ps.setLong(2, sid);
ps.setInt(3, type);
ps.addBatch();
insertCount++;
if(insertCount % 50000 == 0) {
ps.executeBatch();
conn.commit();
executed = true;
logger.info("Insert Count = "+insertCount);
}
}
}
//execute any remaining batches
ps.executeBatch();
conn.commit();
conn.close();
br.close();
newMapCnt = BardDBUtil.getTableRowCount("cid-sid") - initCnt;
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return newMapCnt;
}
/**
* This loads the sid to cid mappings, reports on the number of mappings reviewed.
* This uses replace syntax to maintain unique cid, sid, type mappings.
* The use of replace syntax requires a unique key on cid, sid, and type.
*
*/
public long rebuildCIDSID(Properties dbManagerProps) {
long newMapCnt = 0;
long initCnt = 0;
try {
initCnt = BardDBUtil.getTableRowCount("cid_sid");
conn = BardDBUtil.connect();
String cidsidGZIPPath = dbManagerProps.getProperty("bard.filepath.pubchem.cidsid.gzip");
String cidsidPath = cidsidGZIPPath.replace(".gz", "");
BardResourceFetch.gunzipFile(cidsidGZIPPath, cidsidPath);
//make the temp_cid_sid
Statement stmt = conn.createStatement();
stmt.execute(this.SQLCreateTempCIDSID);
stmt.execute("truncate table temp_cid_sid");
BufferedReader br = new BufferedReader(new FileReader(cidsidPath));
conn.setAutoCommit(false);
//prepare the match PS
PreparedStatement ps = conn.prepareStatement(SQLInsertIntoTempCIDSID);
String line;
String [] toks;
long insertCount = 0;
long sid, cid;
int type;
boolean executed = false;
while((line = br.readLine()) != null) {
toks = line.split("\t");
executed = false;
if(toks.length != 3)
continue;
cid = Long.parseLong(toks[0]);
sid = Long.parseLong(toks[1]);
type = Integer.parseInt(toks[2]);
ps.setLong(1, cid);
ps.setLong(2, sid);
ps.setInt(3, type);
ps.addBatch();
insertCount++;
if(insertCount % 50000 == 0) {
ps.executeBatch();
conn.commit();
executed = true;
logger.info("Insert Count = "+insertCount);
}
}
//execute any remaining batches
ps.executeBatch();
conn.commit();
conn.close();
br.close();
newMapCnt = BardDBUtil.getTableRowCount("temp-cid-sid") - initCnt;
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return newMapCnt;
}
/**
* This loads the sid to cid mappings, reports on the number of mappings reviewed.
* This uses replace syntax to maintain unique cid, sid, type mappings.
* The use of replace syntax requires a unique key on cid, sid, and type.
*
*/
public long rebuildCIDSIDViaTempLoad(Properties dbManagerProps) {
long deltaMapCnt = 0;
long initCnt = 0;
long newMapCnt = 0;
try {
initCnt = BardDBUtil.getTableRowCount("cid_sid");
conn = BardDBUtil.connect();
String cidsidGZIPPath = dbManagerProps.getProperty("bard.filepath.pubchem.cidsid.gzip");
String cidsidPath = cidsidGZIPPath.replace(".gz", "");
BardResourceFetch.gunzipFile(cidsidGZIPPath, cidsidPath);
//make the temp_cid_sid
Statement stmt = conn.createStatement();
stmt.execute(this.SQLCreateTempCIDSID);
stmt.execute("truncate table temp_cid_sid");
//load the temp table
stmt.execute("load data infile '"+cidsidPath+"' into table temp_cid_sid (cid, sid, rel_type)");
newMapCnt = BardDBUtil.getTableRowCount("temp_cid_sid");
deltaMapCnt = newMapCnt - initCnt;
//swap tables iff larger or nearly the same size. Allow from some contraction due to deleted substances.
BardDBUtil.swapTempTableToProductionIfPassesSizeDelta("temp_cid_sid", "cid_sid", 0.98);
conn.close();
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return deltaMapCnt;
}
/**
* This loads the compound synonyms, reports on the number of synonyms added.
*/
public long updateCompoundSynonyms(Properties dbManagerProps) {
long newMapCnt = 0;
long initCnt = 0;
try {
initCnt = BardDBUtil.getTableRowCount("synonyms");
conn = BardDBUtil.connect();
String cidsynGZIPPath = dbManagerProps.getProperty("bard.filepath.pubchem.cidfilteredsynonyms.gzip");
String cidsynPath = cidsynGZIPPath.replace(".gz", "");
BardResourceFetch.gunzipFile(cidsynGZIPPath, cidsynPath);
BufferedReader br = new BufferedReader(new FileReader(cidsynPath));
conn.setAutoCommit(false);
PreparedStatement ps = conn.prepareStatement(SQLInsertCompoundSynonymIntoSynonym);
String line;
String [] toks;
long insertCount = 0;
long cid;
int type = 1;
String syn;
boolean executed = false;
while((line = br.readLine()) != null) {
toks = line.split("\t");
executed = false;
if(toks.length != 3)
continue;
cid = Long.parseLong(toks[0].trim());
syn = toks[1].trim();
//make sure it doesn't exist... costly but alternative is a unique key across all three fields.
if(!this.haveCompoundSynonym(type, cid, syn)) {
ps.setLong(1, cid);
ps.setInt(2, type);
ps.setString(3, syn);
ps.addBatch();
insertCount++;
if(insertCount % 50000 == 0) {
ps.executeBatch();
conn.commit();
executed = true;
logger.info("Insert Count = "+insertCount);
}
}
}
//execute any remaining batches
ps.executeBatch();
conn.commit();
conn.close();
br.close();
newMapCnt = BardDBUtil.getTableRowCount("synonyms") - initCnt;
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return newMapCnt;
}
/**
* Precondition that the connection fields is initialized
* @param cid
* @param sid
* @param type
* @return
* @throws SQLException
*/
private boolean haveCIDSIDMapping(int type, long cid, long sid) throws SQLException {
boolean match = false;
checkCIDSIDMatchPS.setInt(1, type);
checkCIDSIDMatchPS.setLong(2, cid);
checkCIDSIDMatchPS.setLong(3, sid);
if(checkCIDSIDMatchPS.executeQuery().next()) {
match = true;
}
return match;
}
private boolean haveCompoundSynonym(int type, long cid, String syn) throws SQLException {
boolean match = false;
checkCIDSIDMatchPS.setInt(1, type);
checkCIDSIDMatchPS.setLong(2, cid);
checkCIDSIDMatchPS.setString(3, syn);
if(checkCIDSIDMatchPS.executeQuery().next()) {
match = true;
}
return match;
}
private void connect(String dataBaseUrl, String driverClassName, String user, String pw) {
try {
Class.forName(driverClassName);
conn = DriverManager.getConnection(dataBaseUrl, user, pw);
} catch (SQLException e) {
// TODO Auto-generated catch block
System.out.println("Error connecting! Exiting.");
e.printStackTrace();
System.exit(1);
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
System.out.println("Can't resolve driver class:"+driverClassName);
e.printStackTrace();
System.exit(1);
}
}
public static void main(String [] args) {
String url = null, driver = null, user = null, pw = null;
// if(args.length != 4) {
// System.out.println("Error: Parameter list is incorrect. \n" +
// "Parameters: url=\'<db_url>\' driver=\'<qualified_driver_class_name>\' user=\'<user_name>\' pw=\'<user_password>\' ");
// System.exit(1);
// }
//
// String [] toks;
// for(String arg : args) {
// toks = arg.split("=");
// if(toks.length != 2) {
// System.out.println("Error: Parameter list is incorrect. \n" +
// "Parameters: url=\'<db_url>\' driver=\'<qualified_driver_class_name>\' user=\'<user_name>\' pw=\'<user_password>\' ");
// System.exit(1);
// }
//
// if(toks[0].equals("url")) {
// url = toks[1];
// } else if(toks[0].equals("driver")) {
// driver = toks[1];
// } else if(toks[0].equals("user")) {
// user = toks[1];
// } else if(toks[0].equals("pw")) {
// pw = toks[1];
// }
// }
//
// if(url == null || driver == null || user == null || pw == null) {
// System.out.println("Error: Missing parameter. Check keys and values.\n" +
// "Parameters: url=\'<db_url>\' driver=\'<qualified_driver_class_name>\' user=\'<user_name>\' pw=\'<user_password>\' ");
// System.exit(1);
// }
url = "jdbc:mysql://protein.nhgri.nih.gov:3306/bard2?zeroDateTimeBehavior=convertToNull";
driver = "com.mysql.jdbc.Driver";
BardCompoundPubchemExtrasLoader r = new BardCompoundPubchemExtrasLoader(url, driver);
r.loadCIDSID();
}
}