package gov.nih.ncgc.bard.resourcemgr.extresource.pubchem;
import ftp.FtpBean;
import ftp.FtpException;
import ftp.FtpListResult;
import gov.nih.ncgc.bard.resourcemgr.BardDBUtil;
import gov.nih.ncgc.bard.resourcemgr.BardExtResourceLoader;
import gov.nih.ncgc.bard.resourcemgr.BardExternalResource;
import gov.nih.ncgc.bard.resourcemgr.IBardExtResourceLoader;
import gov.nih.ncgc.bard.resourcemgr.util.BardResourceFetch;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.Date;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Hashtable;
import java.util.Properties;
import java.util.Vector;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;
import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import chemaxon.struc.Molecule;
/**
* This class loads or updates the substance table and synonyms table.
* First time batch loading requires download of all substance SDF file archives from pubmed and use of the full constructor
* that specifies db parameters and file path.
*
* The call to batchLoadSubstances() will load the table using the file path specified in the constructor.
* The process partitions the file names and moves them into a temp directory in batches for extraction and processing.
*
* *****************
*
* The class is also used for automatic updates of the sybstance and synonyms table.
* Use the default no argument constructor. This assumes the loading is into bard2 and that the files have been fetched from
* pubmed ftp and placed in the bard-scratch area.
*
* The call is to batchUpdateSubstance(Properties dbManagerProps)
*
* This option uses the replace syntax to either update if the primary key exists or insert if it's a new primary key entry.
*
* @author braistedjc
*
*/
public class PubchemSubstanceLoader extends BardExtResourceLoader implements IBardExtResourceLoader {
static final private Logger logger =
Logger.getLogger(PubchemSubstanceLoader.class.getName());
private String SERVICE_KEY_FULL_LOAD = "SUBSTANCE-REFRESH-FULL";
private String SERVICE_KEY_DAILY_LOAD = "SUBSTANCE-REFRESH-DAILY";
private String SERVICE_KEY_SPECIFIC_LOAD = "SUBSTANCE-REFRESH-SPECIFIC-DIR";
private static String sidKey = "PUBCHEM_SUBSTANCE_ID";
private static String dataSourceNameKey = "PUBCHEM_EXT_DATASOURCE_NAME";
private static String dataSourceRegidKey = "PUBCHEM_EXT_DATASOURCE_REGID";
private static String patentIDKey = "PUBCHEM_PATENT_ID";
private static String substanceURLKey = "PUBCHEM_EXT_SUBSTANCE_URL";
private static String substanceSynonymKey = "PUBCHEM_SUBSTANCE_SYNONYM";
private Hashtable <String, String> sourceNames;
//handle batches of 25 files at a time
private int fileBatchSize = 20;
//db connection
private Connection conn;
private Connection connUpdate;
//molecule sdf file parser
private MolImporter mi;
//patent id array
String [] patentArr;
//sql statements these are schema specific, perhpas a constant's class could old static versions of these test fields
private String insertSubstanceSQL = "insert into temp_substance (sid, dep_regid, source_name, substance_url, patent_ids)" +
" values (?,?,?,?,?)";
private String insertSynonymsSQL = "insert into synonyms (id, type, syn) values (?,?,?)";
//sql statements these are schema specific, perhpas a constant's class could old static versions of these test fields
private String replaceSubstanceSQL = "replace into substance (sid, dep_regid, source_name, substance_url, patent_ids)" +
" values (?,?,?,?,?)";
private String updateSubstanceDatesSQL = "update substance set deposited=?, updated=? where sid=?";
private String replaceSynonymsSQL = "replace into synonyms (id, type, syn) values (?,?,?)";
private String checkSynonymExistsSQL = "select id from synonyms where type = ? and id = ? and syn = ?";
//prepared statements
private PreparedStatement insertSubstancePS, insertSynonymsPS, replaceSubstancePS, replaceSynonymsPS, checkSynonymExistsPS,
updateSubstanceDatePS;
private String getSubstanceCountSQL = "select count(sid) from substance";
private Statement stmt;
//connection and processing global variables
private String dbURL, driver, user, pw;
private long molCount, synonymCount;
private String path;
private String tempPath;
private BardExternalResource sdfResource;
private BardExternalResource extrasResource;
public PubchemSubstanceLoader() { }
public PubchemSubstanceLoader(String dbURL, String driver, String user, String pw, String path) {
this.dbURL = dbURL;
this.driver = driver;
this.user = user;
this.pw = pw;
this.path = path;
this.tempPath = path+"/temp/";
this.molCount = 0;
synonymCount = 0;
}
public boolean load() {
boolean loaded = false;
path = service.getLocalResPath();
tempPath = path + "/Substance/temp/";
log.info("SUBSTANCE SERVICE KEY= ["+service.getServiceKey()+"]");
try {
if(service.getServiceKey().contains("SUBSTANCE-REFRESH-FULL")) {
//Manage FULL load using this code. Relies on temp_* tables.
//fetchResources();
//log.info("Entering Batch Load to Temp Tables");
//batchLoadSubstances();
} else {
log.info("Entering Batch Update to Substance and Synonym Tables");
fetchResources();
batchUpdateSubstances();
}
//update substance dates
log.info("Entering Substance Dates (deposited updated) update");
updateSubstanceDates();
}
catch (IOException e) {
e.printStackTrace();
return false;
} catch (FtpException e) {
e.printStackTrace();
return false;
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (SQLException e) {
e.printStackTrace();
}
return loaded;
}
@Override
public String getLoadStatusReport() {
// TODO Auto-generated method stub
return null;
}
private boolean fetchResources() throws IOException, FtpException {
boolean haveFiles = false;
//evaluate the kind of load
String commandKey = service.getServiceKey();
FtpBean ftp = new FtpBean();
ArrayList <BardExternalResource> resources = service.getExtResources();
if(resources == null || resources.size() < 2)
return false;
sdfResource = resources.get(0);
extrasResource = resources.get(1);
//if sdf resource isn't and is extras, then swap order
if(!sdfResource.getResourceKey().equals("SUBSTANCE_SDF_GZ_FILES")) {
BardExternalResource swapResource = sdfResource;
sdfResource = extrasResource;
extrasResource = swapResource;
}
String dest = service.getLocalResPath()+"/Substance/";
log.info("Local Resource for download = "+dest);
//clear dest directory
File destDir = new File(dest);
String [] files = destDir.list();
File file;
for(String fileName : files) {
file = new File(destDir+"/"+fileName);
if(file.isFile()) {
file.delete();
}
}
//connect
ftp.ftpConnect(sdfResource.getResourceServer(), sdfResource.getResourceUserName(), sdfResource.getResourcePassword());
if(commandKey.contains(SERVICE_KEY_FULL_LOAD)) {
//update the whole thing
ftp.setDirectory(sdfResource.getResourcePath());
FtpListResult list = ftp.getDirectoryContent();
while (list.next() ) {
ftp.getBinaryFile(list.getName(), dest+"/"+list.getName());
}
haveFiles = true;
} else if(commandKey.contains(SERVICE_KEY_DAILY_LOAD)) {
//fetch today's latest resources
ftp.setDirectory(sdfResource.getResourcePath());
FtpListResult list = ftp.getDirectoryContent();
//collect the data directories
ArrayList<String> dateList = new ArrayList<String>();
while (list.next()) {
dateList.add(list.getName());
}
String [] dateArr = new String[dateList.size()];
for(int i = 0; i < dateArr.length; i++) {
dateArr[i] = dateList.get(i);
}
Arrays.sort(dateArr);
String dateDir = dateArr[dateArr.length-1];
ftp.setDirectory(sdfResource.getResourcePath()+"/"+dateDir+"/SDF");
list = ftp.getDirectoryContent();
while (list.next() ) {
ftp.getBinaryFile(list.getName(), dest+"/"+list.getName());
}
haveFiles = true;
} else if(commandKey.contains(SERVICE_KEY_SPECIFIC_LOAD)) {
log.info("SPECIFIC SUBSTANCE LOAD FETCHING FILES");
//fetch all resources within the external resource directory
ftp.setDirectory(sdfResource.getResourcePath());
FtpListResult list = ftp.getDirectoryContent();
while (list.next() ) {
ftp.getBinaryFile(list.getName(), dest+"/"+list.getName());
}
haveFiles=true;
}
//get extras
dest = service.getLocalResPath() + "/Substance-Extras/";
ftp.setDirectory(extrasResource.getResourcePath());
ftp.getBinaryFile("SID-Date.gz", dest+"SID-Date.gz");
ftp.getBinaryFile("Source-Names", dest+"Source-Names");
ftp.close();
return haveFiles;
}
public long batchUpdateSubstances() {
long substanceCount = 0;
long substanceUpdateCount = 0;
try {
parseSourceNames(path+"/Substance-Extras/Source-Names");
//get connection
conn = BardDBUtil.connect(service.getDbURL());
conn.setAutoCommit(false);
substanceCount = getSubstanceCount();
//prepare sql statements update/replace statement and exists query
prepareSQLUpdateStatements();
stmt = conn.createStatement();
//make a vector of file name batches to process
Vector <Vector<String>> fileBatches = partitionFiles(path+"/Substance/");
//iterate over file batches
for(Vector<String> files: fileBatches) {
copyAndUnzipFilesToTemp(files);
updateSubstances();
this.deleteFilesInTemp();
}
//final inserts
replaceSubstancePS.executeBatch();
insertSynonymsPS.executeBatch();
logger.info("load complete");
conn.commit();
//get new size
substanceUpdateCount = getSubstanceCount() - substanceCount;
conn.close();
logger.info("closed connection");
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return substanceUpdateCount;
}
/**
* Loads compound files found at path supplied in constructor
* and expects .sdf.gz files (sdf files in gzip format).
* @throws IOException
*/
public void batchLoadSubstances() {
try {
this.parseSourceNames(path+"Substance-Extras/Source-Names");
//get connection
conn = BardDBUtil.connect(service.getDbURL());
conn.setAutoCommit(false);
//prepare sql statements
prepareSQLInsertStatements();
Statement stmt = conn.createStatement();
stmt.execute("create table if not exists temp_substance like substance");
stmt.execute("truncate temp_substance");
stmt.execute("create table if not exists temp_synonyms like synonyms");
stmt.execute("truncate temp_synonyms");
stmt.close();
stmt = null;
conn.commit();
//make a vector of file name batches to process
Vector <Vector<String>> fileBatches = partitionFiles(path+"/Substance/");
//iterate over file batches
for(Vector<String> files: fileBatches) {
copyAndUnzipFilesToTemp(files);
loadSubstances();
this.deleteFilesInTemp();
}
//final inserts
insertSubstancePS.executeBatch();
insertSynonymsPS.executeBatch();
logger.info("load complete");
conn.commit();
conn.close();
logger.info("closed connection");
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
private void connect() {
try {
Class.forName(driver);
conn = DriverManager.getConnection(dbURL, user, pw);
conn.setAutoCommit(false);
connUpdate = DriverManager.getConnection(dbURL, user, pw);
connUpdate.setAutoCommit(false);
} catch (ClassNotFoundException e) {
e.printStackTrace();
logger.warning("Error: No driver class found for driver="+driver);
} catch (SQLException e) {
e.printStackTrace();
}
}
private void prepareSQLInsertStatements() {
try {
insertSubstancePS = conn.prepareStatement(insertSubstanceSQL);
insertSynonymsPS = conn.prepareStatement(insertSynonymsSQL);
} catch (SQLException e) {
e.printStackTrace();
}
}
private void prepareSQLUpdateStatements() {
try {
replaceSubstancePS = conn.prepareStatement(replaceSubstanceSQL);
replaceSynonymsPS = conn.prepareStatement(replaceSynonymsSQL);
checkSynonymExistsPS = conn.prepareStatement(checkSynonymExistsSQL);
insertSynonymsPS = conn.prepareStatement(insertSynonymsSQL);
} catch (SQLException e) {
e.printStackTrace();
}
}
public void loadSubstances() throws SQLException {
File directory = new File(tempPath);
String [] files = directory.list();
int fileCount = 1;
for(String fileName : files) {
if(fileName.endsWith(".sdf")) {
loadSubstance(tempPath+fileName);
fileCount++;
}
logger.info("loaded file "+fileCount+" name= "+fileName);
}
}
public void updateSubstances() throws SQLException {
File directory = new File(tempPath);
String [] files = directory.list();
int fileCount = 1;
for(String fileName : files) {
if(fileName.endsWith(".sdf")) {
replaceSubstance(tempPath+fileName);
fileCount++;
}
logger.info("loaded file "+fileCount+" name= "+fileName);
}
}
private void loadSubstance(String filePath) {
String sid = null;
String sourceName;
String displaySourceName;
String regID;
String patentID;
String substanceURL;
String substanceSynonym;
try {
InputStream is = new FileInputStream(filePath);
MolImporter mi = new MolImporter (is);
for (Molecule mol = new Molecule (); mi.read(mol); ) {
molCount++;
sid = sourceName = regID = patentID = substanceURL = displaySourceName = substanceSynonym = null;
sid = mol.getProperty(sidKey).trim();
sourceName = mol.getProperty(dataSourceNameKey);
regID = mol.getProperty(dataSourceRegidKey);
patentID = mol.getProperty(patentIDKey);
substanceURL = mol.getProperty(substanceURLKey);
if(sid != null) {
this.insertSubstancePS.setLong(1, Long.parseLong(sid));
} else {
//move to next substance
continue;
}
if(regID != null) {
this.insertSubstancePS.setString(2, regID);
} else {
this.insertSubstancePS.setNull(2, java.sql.Types.VARCHAR);
}
if(sourceName != null) {
//check for name mapping
displaySourceName = this.sourceNames.get(sourceName);
if(displaySourceName != null)
this.insertSubstancePS.setString(3, displaySourceName);
else
this.insertSubstancePS.setString(3, sourceName);
} else {
this.insertSubstancePS.setNull(3, java.sql.Types.VARCHAR);
}
if(substanceURL != null) {
this.insertSubstancePS.setString(4, substanceURL);
} else {
this.insertSubstancePS.setNull(4, java.sql.Types.VARCHAR);
}
if(patentID != null) {
this.insertSubstancePS.setString(5, patentID);
} else {
this.insertSubstancePS.setNull(5, java.sql.Types.VARCHAR);
}
//add the batch
insertSubstancePS.addBatch();
//now handle synonyms
substanceSynonym = mol.getProperty(substanceSynonymKey);
if(sid != null && substanceSynonym != null) {
insertSynonyms(Long.parseLong(sid), substanceSynonym);
}
//execute on every 100
if(molCount % 100 == 0) {
insertSubstancePS.executeBatch();
}
//commit on every 1000
if(molCount % 1000 == 0) {
conn.commit();
logger.info("Subst Commit Total ="+molCount);
}
}
conn.commit();
is.close();
} catch (MolFormatException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (NumberFormatException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
private void replaceSubstance(String filePath) {
String sid = null;
String sourceName;
String displaySourceName;
String regID;
String patentID;
String substanceURL;
String substanceSynonym;
try {
InputStream is = new FileInputStream(filePath);
MolImporter mi = new MolImporter (is);
for (Molecule mol = new Molecule (); mi.read(mol); ) {
molCount++;
sid = sourceName = regID = patentID = substanceURL = displaySourceName = substanceSynonym = null;
sid = mol.getProperty(this.sidKey).trim();
sourceName = mol.getProperty(this.dataSourceNameKey);
regID = mol.getProperty(this.dataSourceRegidKey);
patentID = mol.getProperty(this.patentIDKey);
substanceURL = mol.getProperty(this.substanceURLKey);
if(sid != null) {
replaceSubstancePS.setLong(1, Long.parseLong(sid));
} else {
//move to next substance
continue;
}
if(regID != null) {
replaceSubstancePS.setString(2, regID);
} else {
replaceSubstancePS.setNull(2, java.sql.Types.VARCHAR);
}
if(sourceName != null) {
//check for name mapping
displaySourceName = this.sourceNames.get(sourceName);
if(displaySourceName != null)
replaceSubstancePS.setString(3, displaySourceName);
else
replaceSubstancePS.setString(3, sourceName);
} else {
replaceSubstancePS.setNull(3, java.sql.Types.VARCHAR);
}
if(substanceURL != null) {
replaceSubstancePS.setString(4, substanceURL);
} else {
replaceSubstancePS.setNull(4, java.sql.Types.VARCHAR);
}
if(patentID != null) {
//limit patentID's to 25 in db.
patentArr = patentID.split("\n");
if(patentArr.length <= 25) {
replaceSubstancePS.setString(5, patentID);
} else {
patentID = "";
for (int i = 0; i < 25; i++) {
patentID += patentArr[i].trim() + "\n";
}
replaceSubstancePS.setString(5, patentID);
}
} else {
replaceSubstancePS.setNull(5, java.sql.Types.VARCHAR);
}
//add the batch
replaceSubstancePS.addBatch();
//now handle synonyms
substanceSynonym = mol.getProperty(substanceSynonymKey);
if(sid != null && substanceSynonym != null) {
updateSynonyms(Long.parseLong(sid), substanceSynonym);
}
//execute on every 100
if(molCount % 100 == 0) {
replaceSubstancePS.executeBatch();
}
//commit on every 1000
if(molCount % 1000 == 0) {
conn.commit();
logger.info("Subst Commit Total ="+molCount);
}
}
replaceSubstancePS.executeBatch();
this.replaceSynonymsPS.executeBatch();
conn.commit();
is.close();
} catch (MolFormatException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (NumberFormatException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
private void insertSynonyms(long sid, String synStr) throws SQLException {
String [] toks = synStr.split("\n");
for(String syn : toks) {
synonymCount++;
this.insertSynonymsPS.setLong(1, sid);
this.insertSynonymsPS.setLong(2, 0);
this.insertSynonymsPS.setString(3, syn.trim());
this.insertSynonymsPS.addBatch();
if(synonymCount % 100 == 0) {
insertSynonymsPS.executeBatch();
}
if(synonymCount % 1000 == 0) {
conn.commit();
}
}
}
private void updateSynonyms(long sid, String synStr) throws SQLException {
String [] toks = synStr.split("\n");
//need to delete the synonyms first
stmt.execute("delete from synonyms where id ="+sid+" and type=0");
for(String syn : toks) {
synonymCount++;
this.insertSynonymsPS.setLong(1, sid);
this.insertSynonymsPS.setLong(2, 0);
this.insertSynonymsPS.setString(3, syn.trim());
this.insertSynonymsPS.addBatch();
if(synonymCount % 100 == 0) {
insertSynonymsPS.executeBatch();
}
if(synonymCount % 1000 == 0) {
conn.commit();
}
}
}
/**
* Checks the table for the existence of a synonym.
* Only use for small updates.
* @return
* @throws SQLException
*/
public boolean doesSynonymExist(long id, long type, String syn) throws SQLException {
this.checkSynonymExistsPS.setLong(1, type);
this.checkSynonymExistsPS.setLong(2, id);
this.checkSynonymExistsPS.setString(3, syn);
ResultSet rs = this.checkSynonymExistsPS.executeQuery();
boolean exists = (rs.next());
rs.close();
return exists;
}
public void parseSourceNames(String sourceNameFileName) throws IOException {
sourceNames = new Hashtable <String, String>();
BufferedReader br = new BufferedReader(new FileReader(sourceNameFileName));
String line = "";
String [] toks;
while((line = br.readLine()) != null) {
toks = line.split("\t");
if(toks.length > 1) {
sourceNames.put(toks[0].trim(), toks[1].trim());
logger.info(toks[0]+" ==> " + toks[1]);
}
}
br.close();
}
private boolean isInteger(String s) {
char [] cArr = s.toCharArray();
for(char c : cArr) {
if(!Character.isDigit(c))
return false;
}
return true;
}
public void copyAndUnzipFilesToTemp(Vector <String> fileNameV) {
File file, tempFile;
GZIPInputStream zis;
String tempFileName;
byte [] buffer = new byte[1024];
int len;
FileOutputStream fos;
for(String fileName: fileNameV) {
System.out.println("file="+fileName);
file = new File(path+"/Substance/"+fileName);
if(!file.isDirectory()) {
tempFileName = tempPath+fileName;
tempFileName = tempFileName.substring(0, tempFileName.lastIndexOf(".gz"));
tempFile = new File(tempFileName);
try {
zis = new GZIPInputStream(new FileInputStream(file));
fos = new FileOutputStream(tempFile);
while( (len = zis.read(buffer)) > 0 ) {
fos.write(buffer, 0, len);
}
fos.flush();
fos.close();
zis.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
}
public void deleteFilesInTemp() {
File dir = new File(tempPath);
String [] fileNames = dir.list();
File file;
for(String fileName : fileNames) {
file = new File(tempPath+fileName);
file.delete();
System.out.println("delete file + "+file.getPath());
}
logger.info("Deleted Temp Files");
}
public void closeConnections() {
try {
System.out.println("close conn and factory");
if(conn != null && !conn.isClosed())
conn.close();
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public Vector <Vector<String>> partitionFiles(String path) throws FileNotFoundException {
File dir = new File(path);
String [] files = dir.list();
Vector <Vector<String>> fileBatches = new Vector <Vector<String>> ();
Vector <String> fileVector = new Vector <String>();
int fileCount = 0;
boolean partialBatch = true;
for(String fileName : files) {
if(fileName.endsWith(".gz")) {
fileVector.add(fileName);
fileCount++;
partialBatch = true;
if(fileCount % fileBatchSize == 0) {
fileBatches.add(fileVector);
fileVector = new Vector <String>();
partialBatch = false;
}
}
}
if(partialBatch) {
fileBatches.add(fileVector);
}
return fileBatches;
}
public boolean updateSubstanceDates() throws ClassNotFoundException, SQLException {
log.info("Starting Update of Substance Dates");
boolean updated = false;
conn = BardDBUtil.connect(service.getDbURL());
conn.setAutoCommit(false);
Connection conn2 = BardDBUtil.connect(service.getDbURL());
updateSubstanceDatePS = conn.prepareStatement(updateSubstanceDatesSQL);
extrasResource.getFileName();
String extrasDirPath = service.getLocalResPath() + "/Substance-Extras/";
//gunzip the date file
log.info("Unzipping SID-Dates.gz");
try {
BardResourceFetch.gunzipFile(extrasDirPath+"/SID-Date.gz", extrasDirPath+"/SID-Date");
} catch (FileNotFoundException e) {
e.printStackTrace();
log.info("SID-Date file not found in "+extrasDirPath);
return false;
} catch (IOException e) {
e.printStackTrace();
log.info("SID-Date update failed. IOException on SID-Date file unzip.");
return false;
}
log.info("Preparing temp_sid_date table.");
Statement stmt = conn2.createStatement();
stmt.execute("create table if not exists temp_sid_date (sid bigint(20), deposited date, updated date)");
stmt.execute("truncate temp_sid_date");
log.info("Loading data into temp_sid_date.");
stmt.execute("load data infile '"+extrasDirPath+"SID-Date' into table temp_sid_date");
log.info("Fisished data load into temp_sid_date / Starting update of substance.");
stmt.setFetchSize(Integer.MIN_VALUE);
ResultSet rs = stmt.executeQuery("select * from temp_sid_date");
long sidCnt = 0;
while(rs.next()) {
updateSubstanceDatePS.setDate(1, rs.getDate(2));
updateSubstanceDatePS.setDate(2, rs.getDate(3));
updateSubstanceDatePS.setLong(3, rs.getLong(1));
updateSubstanceDatePS.addBatch();
sidCnt++;
if(sidCnt % 100 == 0) {
updateSubstanceDatePS.executeBatch();
conn.commit();
}
if(sidCnt % 1000000 == 0) {
log.info("SID Date update count = "+sidCnt);
}
}
updateSubstanceDatePS.executeBatch();
conn.commit();
rs.close();
conn.close();
conn2.close();
updated = true;
// BufferedReader br = new BufferedReader(new FileReader(extrasDirPath+"SID-Date"));
//br to read OR load file into temp_subs_dates and query from table.
//load table might be faster for getting data into the db but requires a query/update.
//update statement, 110 million times?
// String s1 = "select sid, deposited, updated from temp_subst_dates";
// String s2 = "update substance set deposited = ?, updated = ? where sid = ?";
//
// connect();
//
// try {
//
// PreparedStatement selectPS = conn.prepareStatement(s1);
// selectPS.setFetchSize(Integer.MIN_VALUE);
//
// PreparedStatement updatePS = connUpdate.prepareStatement(s2);
// //updatePS.setFetchSize(Integer.MIN_VALUE);
//
// ResultSet rs = selectPS.executeQuery();
//
// long sid;
// Date deposited, updated;
// long updateCount = 0;
// long start = System.currentTimeMillis();
//
// while (rs.next()) {
// updateCount++;
//
// sid = rs.getLong(1);
// deposited = rs.getDate(2);
// updated = rs.getDate(3);
//
// updatePS.setLong(3, sid);
// updatePS.setDate(1, deposited);
// updatePS.setDate(2, updated);
//
// updatePS.addBatch();
//
// if(updateCount % 10000 == 0) {
// updatePS.executeBatch();
// conn.commit();
// // updatePS.clearBatch();
// }
//
// if(updateCount % 100000 == 0) {
// //conn.commit();
// System.out.println("count=" + updateCount + " " + (System.currentTimeMillis()-start)/1000);
// }
// }
// updatePS.executeUpdate();
// conn.commit();
// } catch (SQLException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
return updated;
}
private long getSubstanceCount() throws SQLException {
long size = 0;
Statement stmt = conn.createStatement();
ResultSet rs = stmt.executeQuery(getSubstanceCountSQL);
if(rs.next())
size = rs.getLong(1);
return size;
}
public static void main(String [] args) {
// if(args.length != 3) {
// logger.warning("Need paramters: path=<path_to_sdf_files> url=<database_url> driver=<driver_name>");
// System.exit(1);
// }
String arg, path, dbURL, driver, user, pw;
path = dbURL = driver = user = pw = null;
pw = "bard_manager";
user = "bard_manager";
driver = "com.mysql.jdbc.Driver";
dbURL = "jdbc:mysql://protein.nhgri.nih.gov:3306/bard2?zeroDateTimeBehavior=convertToNull";
path = "/ifs/prod/braistedjc/db_scripts/pubchem_substance";
String [] toks;
// for(int i = 0; i < args.length; i++) {
// arg = args[i];
// toks = arg.split("=");
// if(toks.length != 2) {
// logger.warning("Need paramters: path=<path_to_sdf_files> url=<database_url> driver=<driver_name>");
// logger.warning("Make sure \'=\' is used for each parameter");
// System.exit(1);
// }
// if(toks[0].equals("path")) {
// path = toks[1].trim();
// } else if(toks[0].equals("url")) {
// dbURL = toks[1].trim();
// } else if(toks[0].equals("driver")) {
// driver = toks[1].trim();
// }
// }
// if(path == null || dbURL == null || driver == null ||user == null || pw == null) {
// logger.warning("Have a null param. Need paramters: path=<path_to_sdf_files> url=<database_url> driver=<driver_name>");
// System.exit(1);
// }
//dbURL = "jdbc:mysql://protein.nhgri.nih.gov:3306/bard2?zeroDateTimeBehavior=convertToNull"
//driver = "com.mysql.jdbc.Driver"
PubchemSubstanceLoader loader = new PubchemSubstanceLoader(dbURL, driver, user, pw, path);
//loader.batchLoadSubstances();
//loader.updateSubstanceDates();
//MLBDSubstanceLoader loader = new MLBDSubstanceLoader();
//loader.fp();
//MLBDCompoundLoader loader = new MLBDCompoundLoader();
// loader.batchLoadCompoundFp();
System.exit(0);
}
}