package gov.nih.ncgc.bard.resourcemgr.extresource.kegg;
import gov.nih.ncgc.bard.resourcemgr.BardDBUtil;
import gov.nih.ncgc.bard.resourcemgr.BardExtResourceLoader;
import gov.nih.ncgc.bard.resourcemgr.IBardExtResourceLoader;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Vector;
import java.util.logging.Logger;
public class BardKeggLoader extends BardExtResourceLoader implements IBardExtResourceLoader {
private final static Logger logger = Logger.getLogger(BardKeggLoader.class.getName());
private String insertKeggDisease = "insert into temp_kegg_gene2disease (gene_id, disease_names, disease_id, disease_category)" +
" values (?,?,?,?)";
private Connection conn;
private PreparedStatement insertKeggDiseasePS;
@Override
public boolean load() {
boolean haveFile = fetchExternalResource();
boolean loaded = false;
if(haveFile) {
log.info("Fetched KEGG Medicus Gene to Disease Resource");
loadKeggDisease();
log.info("KEGG load is complete.");
loaded = true;
} else {
log.warning("Failed to retrieve external resource for KEGG disease");
log.warning("Service: "+service.getServiceKey());
log.warning("Check stdout log for service details.");
}
return loaded;
}
@Override
public String getLoadStatusReport() {
// TODO Auto-generated method stub
return null;
}
/**
* Maps kegg diseases to genes
*/
public long loadKeggDisease() {
long tableSize = 0;
try {
conn = BardDBUtil.connect(service.getDbURL());
conn.setAutoCommit(false);
String localKeggFilePath = service.getLocalResPath();
tableSize = BardDBUtil.getTableRowCount("kegg_gene2disease", service.getDbURL());
File keggDiseaseFile = new File(localKeggFilePath+"/disease");
if(!keggDiseaseFile.exists() || !keggDiseaseFile.isFile()) {
logger.warning("ERROR: KEGG Disease File is Not Found.");
return -1;
}
//make and truncate temp_kegg_gene2disease
Statement stmt = conn.createStatement();
stmt.execute("create table if not exists temp_kegg_gene2disease like kegg_gene2disease");
stmt.execute("truncate table temp_kegg_gene2disease");
//load temp
loadTempKeggDisease(keggDiseaseFile);
logger.info("TEMP KEGG DISEASE FILE LOADED");
//swap table if it passess the delta, new table is > 0.9 of the original
BardDBUtil.swapTempTableToProductionIfPassesSizeDelta(
"temp_kegg_gene2disease", "kegg_gene2disease", 0.9, service.getDbURL());
logger.info("RECREATED KEGG_GENE2DISEASE");
//table growth
tableSize = BardDBUtil.getTableRowCount("kegg_gene2disease") - tableSize;
conn.close();
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return tableSize;
}
private void loadTempKeggDisease(File keggFile) throws IOException, SQLException, ClassNotFoundException {
this.insertKeggDiseasePS = conn.prepareStatement(this.insertKeggDisease);
BufferedReader br = new BufferedReader(new FileReader(keggFile));
String line;
String [] toks;
String diseaseID = "";
String names = "";
String genes = null;
String category = "";
String description = "";
String currField = "";
Vector <String> geneIDs;
int diseaseCount = 0;
long geneCount = 0;
logger.info("Loading KEGG Disease: File Parse");
while((line = br.readLine()) != null) {
if(line.startsWith(" ")) {
if(currField.equals("NAME")) {
names += line.trim();
} else if(currField.equals("GENE")) {
genes += line.trim()+";";
}
} else {
currField = "";
if(line.startsWith("ENTRY")) {
currField = "ENTRY";
toks = line.split("[\\s]+");
if(toks.length < 2) {
System.out.println("bad id parse");
continue;
}
diseaseID = toks[1];
} else if(line.startsWith("NAME")) {
currField = "NAME";
line = line.replace("including:", "");
names = line.substring(4).trim();
} else if(line.startsWith("GENE")) {
currField = "GENE";
genes = line.substring(4).trim()+";";
} else if(line.startsWith("CATEGORY")) {
currField = "CATEGORY";
category = line.substring(9).trim();
} else if(line.startsWith("DESCRIPTION")) {
currField = "DESCRIPTION";
description = line.substring(11).trim();
} else if(line.startsWith("///")) {
diseaseCount++;
//end of record
//if there are no associated genes, continue with next line
if(genes == null)
continue;
//parse genes
geneIDs = parseGenes(genes);
long geneIDNum;
for(String geneID: geneIDs) {
geneCount++;
try {
geneIDNum = Long.parseLong(geneID.trim());
} catch (Exception e) {
e.printStackTrace();
continue;
}
insertKeggDiseasePS.setLong(1, geneIDNum);
insertKeggDiseasePS.setString(2, names);
insertKeggDiseasePS.setString(3, diseaseID);
insertKeggDiseasePS.setString(4, category);
insertKeggDiseasePS.addBatch();
if(geneCount % 135 == 0) {
insertKeggDiseasePS.executeBatch();
conn.commit();
}
}
//parse symbols
//set values
// if(diseaseCount % 12 == 0) {
// System.out.println(
// // "ID: " + diseaseID + "\n" +
// // "Names " + names + "\n" +
// // "Desc: " + description + "\n" +
// // "Category " + category + "\n" +
// diseaseCount + " Genes " + genes //+ "\n"
// // "*************************************"
// );
// Vector <String> g = parseGenes(genes);
// System.out.println("Gene Count = "+g.size());
// System.out.print("Genes: ");
// for (String gene : g)
// System.out.print(gene+ " ");
// System.out.println();
//
// Vector <String> s = parseGeneSymbols(genes);
// System.out.println("Symbol Count = "+s.size());
// System.out.print("Symbols: ");
// for (String symbol : s)
// System.out.print(symbol+ " ");
// System.out.println();
//
// }
//insert
//clear current values
toks=null;
names = "";
diseaseID = names = genes = category = description = null;
currField = "";
}
}
}
insertKeggDiseasePS.executeBatch();
conn.commit();
}
private Vector <String> parseGenes (String geneStr) {
Vector <String> genes = new Vector <String> ();
String [] toks = geneStr.split("[;]+");
int index;
String gene;
String [] geneToks;
for(String tok: toks) {
index = -1;
tok = tok.trim();
index = tok.indexOf("[HSA:");
if(index == -1)
continue;
index += 5;
gene = tok.substring(index, tok.indexOf(']'));
geneToks = gene.split("[\\s]+");
for(String g : geneToks)
genes.add(g);
}
return genes;
}
private Vector <String> parseGeneSymbols (String geneStr) {
Vector <String> symbols = new Vector <String> ();
String [] toks = geneStr.split("[;]+");
int index;
String symbol;
for(String tok: toks) {
index = -1;
tok = tok.trim();
if(tok.startsWith("(")) {
index = tok.indexOf(")");
if(index == -1)
continue;
index++;
tok = tok.substring(index).trim();
//System.out.println("paren tok: "+ tok);
}
symbol = tok.split("[\\s]+")[0];
symbols.add(symbol);
}
return symbols;
}
private void countKeggDiseaseGenes(File keggFile) throws IOException {
BufferedReader br = new BufferedReader(new FileReader(keggFile));
String line;
String [] toks;
String diseaseID = "";
String names = "";
String genes = null;
String category = "";
String description = "";
String currField = "";
Vector <String> geneIDs;
int diseaseCount = 0;
long geneCount = 0;
logger.info("Loading KEGG Disease: File Parse");
while((line = br.readLine()) != null) {
if(line.startsWith(" ")) {
if(currField.equals("NAME")) {
names += line.trim();
} else if(currField.equals("GENE")) {
genes += line.trim()+";";
}
} else {
currField = "";
if(line.startsWith("ENTRY")) {
currField = "ENTRY";
toks = line.split("[\\s]+");
if(toks.length < 2) {
System.out.println("bad id parse");
continue;
}
diseaseID = toks[1];
} else if(line.startsWith("NAME")) {
currField = "NAME";
line = line.replace("including:", "");
names = line.substring(4).trim();
} else if(line.startsWith("GENE")) {
currField = "GENE";
genes = line.substring(4).trim()+";";
} else if(line.startsWith("CATEGORY")) {
currField = "CATEGORY";
category = line.substring(9).trim();
} else if(line.startsWith("DESCRIPTION")) {
currField = "DESCRIPTION";
description = line.substring(11).trim();
} else if(line.startsWith("///")) {
diseaseCount++;
//end of record
//if there are no associated genes, continue with next line
if(genes == null)
continue;
//parse genes
geneIDs = parseGenes(genes);
geneCount += geneIDs.size();
//parse symbols
//set values
// if(diseaseCount % 12 == 0) {
// System.out.println(
// // "ID: " + diseaseID + "\n" +
// // "Names " + names + "\n" +
// // "Desc: " + description + "\n" +
// // "Category " + category + "\n" +
// diseaseCount + " Genes " + genes //+ "\n"
// // "*************************************"
// );
// Vector <String> g = parseGenes(genes);
// System.out.println("Gene Count = "+g.size());
// System.out.print("Genes: ");
// for (String gene : g)
// System.out.print(gene+ " ");
// System.out.println();
//
// Vector <String> s = parseGeneSymbols(genes);
// System.out.println("Symbol Count = "+s.size());
// System.out.print("Symbols: ");
// for (String symbol : s)
// System.out.print(symbol+ " ");
// System.out.println();
//
// }
//insert
//clear current values
toks=null;
names = "";
diseaseID = names = genes = category = description = null;
currField = "";
}
}
}
logger.info("KEGG Gene Count = "+geneCount);
}
/**
* Maps kegg diseases to genes
*/
public void insertOrUpdateKeggDisease(String keggDiseaseFilePath) {
}
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
BardKeggLoader loader = new BardKeggLoader();
File file = new File("C:/Putty/disease");
try {
loader.countKeggDiseaseGenes(file);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// try {
// loader.loadKeggDisease(file);
// } catch (IOException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
}
}