package gov.nih.ncgc.bard.resourcemgr.extresource.uniprot;
import gov.nih.ncgc.bard.resourcemgr.BardDBUtil;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
/**
 * Reads a gzip-compressed, line-oriented UniProt text file and writes one row per
 * record into the {@code protein_target} table.
 *
 * <p>Records are delimited by a line containing only {@code //}. Within a record the
 * loader looks at lines starting with {@code ID}, {@code AC}, {@code DE RecName:},
 * {@code DR GeneID;} and {@code OX } to extract the status, primary accession, name,
 * gene id and taxonomy id. Cross-reference ({@code DR}) lines for GeneID, RefSeq,
 * EMBL and PDB are additionally passed to {@link #processRelatedAcc(String, String)}.
 *
 * <p>Instances keep the open connection and statements in fields while a load is
 * running, so a single instance must not run two loads at the same time.
 */
public class BardUniprotProteinTargetLoader {

    private final static Logger logger = Logger.getLogger(BardUniprotProteinTargetLoader.class.getName());

    /** Pending rows are sent to the database and committed every this many records. */
    private static final int BATCH_SIZE = 100;

    /** A progress message is logged every this many records. */
    private static final int LOG_INTERVAL = 1000;

    /** Database names on {@code DR} lines that are forwarded to {@link #processRelatedAcc}. */
    private static final String[] RELATED_ACC_SOURCES = {"GeneID", "RefSeq", "EMBL", "PDB"};

    private static final String SQL_REPLACE_INTO_PROTEIN_TARGET =
            "replace into protein_target (accession, gene_id, name, taxid, uniprot_status)"
            + " values (?,?,?,?,?)";

    /** Prepared during a load but not executed yet; no code currently binds or runs it. */
    private static final String SQL_INSERT_ACC_TO_PRIMARY_ACC =
            "insert into uniprot_acc2primary_acc (primary_acc, acc) values (?,?)";

    private Connection conn;
    private PreparedStatement insertTargetPS;
    private PreparedStatement insertAccToPrimaryAcc;

    /** Number of records added to the insert batch during the current load. */
    private long targetCnt;

    // Patterns are compiled once in the constructor. They stay package-visible and
    // non-final, exactly as before, so any same-package code that touches them keeps working.
    Pattern refseqPattern;
    Pattern genbankPattern;
    Pattern geneIDPattern;
    /** Last matcher created by {@link #processRelatedAcc}; kept as a field for compatibility. */
    Matcher matcher;

    /**
     * Creates a loader and compiles the accession patterns used by
     * {@link #processRelatedAcc(String, String)}.
     */
    public BardUniprotProteinTargetLoader() {
        // a non-digit followed by "P_", e.g. the "NP_" in NP_001193725.1
        refseqPattern = Pattern.compile("\\DP_");
        // three non-digits followed by a digit, e.g. the "CAA3" in CAA39849.1
        genbankPattern = Pattern.compile("\\D{3}\\d");
        geneIDPattern = Pattern.compile("\\d+");
    }

    /**
     * Loads every record of the given gzipped file into {@code protein_target}.
     *
     * <p>Rows are committed in batches of {@value #BATCH_SIZE}, so a failure part-way
     * through leaves the already committed batches in the table; only the uncommitted
     * tail is rolled back. Failures are logged rather than thrown for the exception
     * types listed below; other runtime exceptions (for example from a malformed line)
     * still propagate. In every case the file, statements and connection are closed
     * before the method returns.
     *
     * @param uniprotFilePath path of the gzip-compressed input file
     * @param dbUrl           JDBC URL handed to {@code BardDBUtil.connect}
     * @param dbDriverName    JDBC driver class name handed to {@code BardDBUtil.connect}
     */
    public void loadUniprotToProteinTarget(String uniprotFilePath, String dbUrl, String dbDriverName) {
        FileInputStream fileIn = null;
        BufferedReader reader = null;
        boolean completed = false;
        try {
            // NOTE(review): credentials are hard-coded here; consider moving them to configuration.
            conn = BardDBUtil.connect(dbUrl, dbDriverName, "bard_manager", "bard_manager");
            conn.setAutoCommit(false);

            insertTargetPS = conn.prepareStatement(SQL_REPLACE_INTO_PROTEIN_TARGET);
            insertAccToPrimaryAcc = conn.prepareStatement(SQL_INSERT_ACC_TO_PRIMARY_ACC);
            targetCnt = 0;

            // Keep a handle on the raw stream so it is closed even if the GZIP header is invalid.
            fileIn = new FileInputStream(uniprotFilePath);
            reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(fileIn)));

            // Each accession is inserted at most once per load.
            Set<String> seenAccessions = new HashSet<String>();
            List<String> recordLines = new ArrayList<String>();

            String line;
            while ((line = reader.readLine()) != null) {
                if (line.trim().equals("//")) {
                    loadRecord(recordLines, seenAccessions);
                    recordLines.clear();
                } else {
                    recordLines.add(line);
                }
            }

            // Flush whatever is left from the last partial batch.
            insertTargetPS.executeBatch();
            conn.commit();
            completed = true;
        } catch (ClassNotFoundException e) {
            logger.log(Level.SEVERE, "Could not load database driver " + dbDriverName, e);
        } catch (SQLException e) {
            logger.log(Level.SEVERE, "Database error after " + targetCnt + " records", e);
        } catch (FileNotFoundException e) {
            logger.log(Level.SEVERE, "Input file not found: " + uniprotFilePath, e);
        } catch (NumberFormatException e) {
            logger.log(Level.SEVERE, "Non-numeric gene id or taxonomy id after " + targetCnt
                    + " records; load aborted", e);
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Error reading " + uniprotFilePath, e);
        } finally {
            closeStream(reader);
            closeStream(fileIn);
            closeStatement(insertTargetPS);
            closeStatement(insertAccToPrimaryAcc);
            closeConnection(conn, !completed);
            insertTargetPS = null;
            insertAccToPrimaryAcc = null;
            conn = null;
        }
    }

    /**
     * Parses the lines of one record and, if its accession has not been seen in this
     * load, adds it to the insert batch.
     *
     * @param recordLines    the lines between two {@code //} delimiters
     * @param seenAccessions accessions already batched in this load; updated in place
     * @throws SQLException if binding, batching or committing fails
     */
    private void loadRecord(List<String> recordLines, Set<String> seenAccessions) throws SQLException {
        String acc = "";
        String status = "";
        String name = "";
        String geneid = "";
        String taxid = "";
        boolean haveAcc = false;

        for (String aline : recordLines) {
            if (aline.startsWith("ID")) {
                status = aline.split("\\s+")[2].replace(";", "");
            } else if (aline.startsWith("AC")) {
                // only the first accession on the first AC line is used
                if (!haveAcc) {
                    acc = aline.split(";")[0].trim().replace("AC ", "");
                    haveAcc = true;
                }
            } else if (aline.startsWith("DE RecName:")) {
                name = aline.split("=")[1].replace(";", "");
            } else if (aline.startsWith("DR GeneID;")) {
                geneid = aline.split(";")[1].trim();
            } else if (aline.startsWith("OX ")) {
                taxid = aline.trim().replace("OX NCBI_TaxID=", "").replace(";", "");
            }

            if (isRelatedAccLine(aline)) {
                processRelatedAcc(acc, aline);
            }
        }

        if (!haveAcc || acc.length() == 0) {
            // Without an accession there is nothing meaningful to key the row on.
            logger.warning("Skipping record without an AC line (" + recordLines.size() + " lines)");
            return;
        }
        if (!seenAccessions.add(acc)) {
            // accession already handled in this load
            return;
        }

        // column order: accession, gene_id, name, taxid, uniprot_status
        insertTargetPS.setString(1, acc);
        if (geneid.equals("")) {
            insertTargetPS.setNull(2, java.sql.Types.INTEGER);
        } else {
            insertTargetPS.setLong(2, Long.parseLong(geneid));
        }
        insertTargetPS.setString(3, name);
        if (taxid.equals("")) {
            insertTargetPS.setNull(4, java.sql.Types.INTEGER);
        } else {
            insertTargetPS.setLong(4, Long.parseLong(taxid));
        }
        insertTargetPS.setString(5, status);
        insertTargetPS.addBatch();

        targetCnt++;
        if (targetCnt % BATCH_SIZE == 0) {
            insertTargetPS.executeBatch();
            conn.commit();
        }
        if (targetCnt % LOG_INTERVAL == 0) {
            logger.info("Load Count="+targetCnt);
        }
    }

    /**
     * Tells whether a line is a {@code DR} cross-reference whose database name (the
     * text between the line code and the first {@code ;}) is one of
     * {@link #RELATED_ACC_SOURCES}.
     */
    private static boolean isRelatedAccLine(String line) {
        if (!line.startsWith("DR ")) {
            return false;
        }
        String rest = line.substring(3).trim();
        int semicolon = rest.indexOf(';');
        String database = (semicolon < 0 ? rest : rest.substring(0, semicolon)).trim();
        for (String source : RELATED_ACC_SOURCES) {
            if (source.equals(database)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Inspects one cross-reference line and prints the related accessions it recognises
     * to standard output, one per line, as {@code <uniprotAcc> <accession> <type>}.
     *
     * <p>Everything up to the first space is treated as the line code and discarded; the
     * remainder is split on {@code ;}. The first field is the database name:
     * <ul>
     *   <li>{@code EMBL} - every later field matching three non-digits followed by a
     *       digit is printed with type {@code EMBL-GenBank};</li>
     *   <li>{@code RefSeq} - every later field containing a non-digit followed by
     *       {@code P_} is printed with type {@code RefSeq};</li>
     *   <li>anything else (including {@code PDB} and {@code GeneID}) is currently ignored.</li>
     * </ul>
     * A {@code null} line or a line without any space is ignored.
     *
     * @param uniprotAcc accession of the record the line belongs to
     * @param line       the full line, including its leading line code
     */
    public void processRelatedAcc(String uniprotAcc, String line) {
        if (line == null) {
            return;
        }
        int firstSpace = line.indexOf(' ');
        if (firstSpace < 0) {
            // no separator between line code and content, so there is nothing to parse
            return;
        }

        String[] fields = line.substring(firstSpace).trim().split(";");
        for (int i = 0; i < fields.length; i++) {
            fields[i] = fields[i].trim();
        }
        String database = fields[0];

        if (database.equals("EMBL")) {
            // e.g. DR EMBL; X56494; CAA39849.1; -; Genomic_DNA.
            for (int i = 1; i < fields.length; i++) {
                matcher = genbankPattern.matcher(fields[i]);
                if (matcher.find()) {
                    System.out.println(uniprotAcc+" "+fields[i]+" "+"EMBL-GenBank");
                }
            }
        } else if (database.equals("RefSeq")) {
            // e.g. DR RefSeq; NP_001193725.1; NM_001206796.1.
            for (int i = 1; i < fields.length; i++) {
                matcher = refseqPattern.matcher(fields[i]);
                if (matcher.find()) {
                    System.out.println(uniprotAcc+" "+fields[i]+" "+database);
                }
            }
        }
        // PDB  (e.g. DR PDB; 1ZJH; X-ray; 2.20 A; A=3-530.) and
        // GeneID (e.g. DR GeneID; 5315; -.) are recognised but not handled yet.
    }

    /** Closes a stream or reader, logging instead of throwing; {@code null} is ignored. */
    private static void closeStream(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            logger.log(Level.WARNING, "Failed to close input", e);
        }
    }

    /** Closes a statement, logging instead of throwing; {@code null} is ignored. */
    private static void closeStatement(PreparedStatement statement) {
        if (statement == null) {
            return;
        }
        try {
            statement.close();
        } catch (SQLException e) {
            logger.log(Level.WARNING, "Failed to close statement", e);
        }
    }

    /**
     * Closes a connection, optionally rolling back first so no transaction is left
     * open at close time. Errors are logged, not thrown; {@code null} is ignored.
     */
    private static void closeConnection(Connection connection, boolean rollbackFirst) {
        if (connection == null) {
            return;
        }
        if (rollbackFirst) {
            try {
                connection.rollback();
            } catch (SQLException e) {
                logger.log(Level.WARNING, "Failed to roll back uncommitted rows", e);
            }
        }
        try {
            connection.close();
        } catch (SQLException e) {
            logger.log(Level.WARNING, "Failed to close connection", e);
        }
    }
}