/**
* VolumeMarkupDbAccessor.java
*
* Description : This performs all the database access needed by the VolumeMarkup
* Version : 1.0
* @author : Partha Pratim Sanyal
* Created on : September 11, 2009
*
* Modification History :
* Date | Version | Author | Comments
*
* Confidentiality Notice :
* This software is the confidential and,
* proprietary information of The University of Arizona.
*/
package fna.db;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.HashSet;
import org.apache.log4j.Logger;
import fna.parsing.ApplicationUtilities;
import fna.parsing.ParsingException;
public class VolumeMarkupDbAccessor {
//public static String PRONOUN ="all|each|every|some|few|individual|both|other";
//public static String NUMBERS = "zero|one|ones|first|two|second|three|third|thirds|four|fourth|fourths|quarter|five|fifth|fifths|six|sixth|sixths|seven|seventh|sevenths|eight|eighths|eighth|nine|ninths|ninth|tenths|tenth";
//public static String CHARACTERS ="lengths|length|lengthed|width|widths|widthed|heights|height|character|characters|distribution|distributions|outline|outlines|profile|profiles|feature|features|form|forms|mechanism|mechanisms|nature|natures|shape|shapes|shaped|size|sizes|sized";
//public static String PREPOSITION ="above|across|after|along|around|as|at|before|beneath|between|beyond|by|for|from|in|into|near|of|off|on|onto|out|outside|over|than|throughout|toward|towards|up|upward|with|without";
//public static String CLUSTERSTRINGS = "group|groups|clusters|cluster|arrays|array|series|fascicles|fascicle|pairs|pair|rows|number|numbers|\\d+";
//public static String SUBSTRUCTURESTRINGS = "part|parts|area|areas|portion|portions";
//public static String STOP ="a|about|above|across|after|along|also|although|amp|an|and|are|as|at|be|because|become|becomes|becoming|been|before|being|beneath|between|beyond|but|by|ca|can|could|did|do|does|doing|done|for|from|had|has|have|hence|here|how|if|in|into|inside|inward|is|it|its|may|might|more|most|near|no|not|of|off|on|onto|or|out|outside|outward|over|should|so|than|that|the|then|there|these|this|those|throughout|to|toward|towards|up|upward|was|were|what|when|where|which|why|with|within|without|would";
/**
* @param args
*/
private static final Logger LOGGER = Logger.getLogger(VolumeMarkupDbAccessor.class);
private static String url = ApplicationUtilities.getProperty("database.url");
private String tablePrefix = null ;
private String glossarytable;
private Connection conn = null;
public VolumeMarkupDbAccessor(String dataPrefix, String glossarytable){
this.tablePrefix = dataPrefix;
this.glossarytable = glossarytable;
try {
Class.forName(ApplicationUtilities.getProperty("database.driverPath"));
conn = DriverManager.getConnection(url);
} catch (Exception e) {
LOGGER.error("Couldn't find Class in MainFormDbAccessor" + e);
e.printStackTrace();
}
}
/**
* check for unmatched brackets too.
*/
// private void fillInWords(){
// try {
// Statement stmt = conn.createStatement();
// ResultSet rs = null;
// File sourcefolder = new File(ApplicationUtilities.getProperty("DESCRIPTIONS"));
// File[] flist = sourcefolder.listFiles();
// int total = flist.length;
// for(int i= 0; i < flist.length; i++){
// BufferedReader reader = new BufferedReader(new FileReader(flist[i]));
// String line = null;
// StringBuffer sb = new StringBuffer();
// while ((line = reader.readLine()) != null) {
// line = line.replaceAll(System.getProperty("line.separator"), " ");
// sb.append(line);
// }
// reader.close();
// String text = sb.toString();
// text = text.toLowerCase();
// text = text.replaceAll("<[^<]+?>", " ");
// text = text.replaceAll("\\d", " ");
// text = text.replaceAll("\\(", " ( ");
// text = text.replaceAll("\\)", " ) ");
// text = text.replaceAll("\\[", " [ ");
// text = text.replaceAll("\\]", " ] ");
// text = text.replaceAll("\\{", " { ");
// text = text.replaceAll("\\}", " } ");
// text = text.replaceAll("\\s+", " ").trim();
// String[] words = text.split("\\s+");
// int lround = 0;
// int lsquare = 0;
// int lcurly = 0;
// int inbracket = 0;
// for(int j = 0; j < words.length; j++){
// String w = words[j].trim();
// if(w.compareTo("(")==0) lround++;
// else if(w.compareTo(")")==0) lround--;
// else if(w.compareTo("[")==0) lsquare++;
// else if(w.compareTo("]")==0) lsquare--;
// else if(w.compareTo("{")==0) lcurly++;
// else if(w.compareTo("}")==0) lcurly--;
// else{
// w = w.replaceAll("[^-a-z]", " ").trim();
// if(w.matches(".*?\\w.*")){
// if(lround+lsquare+lcurly > 0){
// inbracket = 1;
// }else{
// inbracket = 0;
// }
// int count = 1;
// rs = stmt.executeQuery("select word, count, inbrackets from "+this.tablePrefix+"_allwords where word='"+w+"'");
// if(rs.next()){ //normal word exist
// count += rs.getInt("count");
// inbracket *= rs.getInt("inbrackets");
// }
// stmt.execute("delete from "+this.tablePrefix+"_allwords where word ='"+w+"'");
// stmt.execute("insert into "+this.tablePrefix+"_allwords (word, count, inbrackets) values('"+w+"', "+count+","+inbracket+")");
// }
// }
// }
// //listener.progress(5+i*45/total);
// }
// rs.close();
// stmt.close();
// } catch (Exception e) {
// LOGGER.error("Problem in VolumeDehyphenizer:fillInWords", e);
// e.printStackTrace();
// }
// }
/**
* display learned new structures in structures subtab in step 4 (perl markup) for curation.
* @param tagList
* @throws ParsingException
* @throws SQLException
*/
public ArrayList<String> structureTags4Curation(List <String> tagList) throws ParsingException, SQLException {
PreparedStatement stmt = null;
ResultSet rs = null;
try {
String filter1 = "";
String filter2 = "";
String filter3 = "";
Statement stmt1 = conn.createStatement();
ResultSet rs1 = stmt1.executeQuery("show tables");
while(rs1.next()){
if(rs1.getString(1).compareToIgnoreCase(ApplicationUtilities.getProperty("NONEQTABLE"))==0){
filter1 = " and tag not in (select word from "+ ApplicationUtilities.getProperty("NONEQTABLE")+") ";
filter2 = " and plural not in (select word from "+ ApplicationUtilities.getProperty("NONEQTABLE")+") ";
filter3 = " and word not in (select word from "+ ApplicationUtilities.getProperty("NONEQTABLE")+") ";
}
}
rs1.close();
stmt1.close();
String sql = "select distinct tag as structure from "+this.tablePrefix+"_sentence where tag != 'unknown' and tag is not null and tag not like '% %' " +
filter1 +
"union select distinct plural as structure from "+this.tablePrefix+"_singularplural"+","+ this.tablePrefix+"_sentence where singular=tag "+
filter2 +
"order by structure";
stmt = conn.prepareStatement(sql);
rs = stmt.executeQuery();
String structurelist = "";
while (rs.next()) {
String tag = rs.getString("structure");
structurelist += "'"+tag+"',";
populateCurationList(tagList, tag); //select tags for curation
}
//terms that will be shown as "structures" should not be shown as "descriptors"
structurelist = structurelist.replaceFirst(",$", "");
stmt.execute("update "+this.tablePrefix+"_"+ApplicationUtilities.getProperty("POSTABLE")+" set saved_flag = 'red' where word in ("+structurelist+")");
sql = "select distinct word from "+this.tablePrefix+"_"+ApplicationUtilities.getProperty("POSTABLE")+" where pos in ('p', 's', 'n') and saved_flag !='red' "+
filter3+/*" and word in (select word from "+this.tablePrefix+"_"+ApplicationUtilities.getProperty("ALLWORDS")+") "+*/
" order by word";
stmt = conn.prepareStatement(sql);
rs = stmt.executeQuery();
while (rs.next()) {
String tag = rs.getString("word");
populateCurationList(tagList, tag); //select tags for curation
}
ArrayList<String> words = deduplicateSort(tagList);
return words;
} catch (SQLException sqlexe) {
LOGGER.error("Couldn't update sentence table in VolumeMarkupDbAccessor:updateData", sqlexe);
sqlexe.printStackTrace();
throw new ParsingException("Error Accessing the database" , sqlexe);
} finally {
if (rs != null) {
rs.close();
}
if (stmt != null) {
stmt.close();
}
}
}
private ArrayList<String> deduplicateSort(List<String> tagList) {
HashSet<String> set = new HashSet<String>(tagList);
String[] sorted = set.toArray(new String[]{});
Arrays.sort(sorted);
ArrayList<String> results = new ArrayList<String>();
for(int i=0; i<sorted.length; i++){
results.add(sorted[i]);
}
return results;
}
/**
* display unknown terms in morestructure/moredescriptor subtabs
* in step 4 (perl markup) for curation.
* @param curationList
* @throws ParsingException
* @throws SQLException
*/
public ArrayList<String> contentTerms4Curation(List <String> curationList, ArrayList<String> structures, ArrayList<String> characters) throws ParsingException, SQLException {
PreparedStatement stmt = null;
ResultSet rs = null;
try {
String filter = "";
Statement stmt1 = conn.createStatement();
ResultSet rs1 = stmt1.executeQuery("show tables");
while(rs1.next()){
if(rs1.getString(1).compareToIgnoreCase(ApplicationUtilities.getProperty("NONEQTABLE"))==0){
filter = " and word not in (select word from "+ ApplicationUtilities.getProperty("NONEQTABLE")+") ";
}
}
rs1.close();
stmt1.close();
String sql = "select word from "+this.tablePrefix+"_"+ApplicationUtilities.getProperty("ALLWORDS")+
//" where count>=3 and inbrackets=0 and word not like '%\\_%' and " +
" where word not like '%\\_shaped' and word not like '%\\_size' and " +
" word not in (select word from "+ this.tablePrefix+"_"+ApplicationUtilities.getProperty("POSTABLE")+" where saved_flag='red')"+
filter +
//" and word not in (select word from "+ this.tablePrefix+"_"+ApplicationUtilities.getProperty("WORDROLESTABLE")+") " +
"order by word";
//add all words except those has been shown in the structure/descriptor list and in the gloss
stmt = conn.prepareStatement(sql);
rs = stmt.executeQuery();
while (rs.next()) {
String word = rs.getString("word");
if(!structures.contains(word) && !characters.contains(word)){
populateCurationList(curationList, word);
}
}
return this.deduplicateSort(curationList);
} catch (SQLException sqlexe) {
LOGGER.error("Couldn't update sentence table in VolumeMarkupDbAccessor:contentTerms4Curation", sqlexe);
sqlexe.printStackTrace();
throw new ParsingException("Error Accessing the database" , sqlexe);
} finally {
if (rs != null) {
rs.close();
}
if (stmt != null) {
stmt.close();
}
}
}
/**
* if word in glossary, add it to wordroles
* if not in glossary, add to curationList
* @param curationList
* @param word
*/
private void populateCurationList(List<String> curationList, String word) {
try{
Statement stmt = conn.createStatement();
ResultSet rs = stmt.executeQuery("select category from "+this.glossarytable+" where term ='"+word+"'");
if(rs.next()){
String cat = rs.getString("category");
if(cat.matches("("+ApplicationUtilities.getProperty("STRUCTURECATEGORYPATTERNINGLOSSARY")+")")){
add2WordRolesTable(word, "op");
}else{
add2WordRolesTable(word, "c");
}
}else{
curationList.add(word);
}
}catch(Exception sqlexe){
LOGGER.error("Couldn't update sentence table in VolumeMarkupDbAccessor:populateCurationList", sqlexe);
sqlexe.printStackTrace();
}
}
/**
* load descriptor subtab in step 4 (perl markup)
* @return
* @throws SQLException
*/
public ArrayList<String> descriptorTerms4Curation() throws SQLException {
ArrayList<String> words = new ArrayList<String>();
PreparedStatement stmt = null;
ResultSet rset = null;
try {
String filter = "";
Statement stmt1 = conn.createStatement();
ResultSet rs1 = stmt1.executeQuery("show tables");
while(rs1.next()){
if(rs1.getString(1).compareToIgnoreCase(ApplicationUtilities.getProperty("NONEQTABLE"))==0){
filter = " and word not in (select word from "+ ApplicationUtilities.getProperty("NONEQTABLE")+") ";
}
}
rs1.close();
stmt1.close();
String sql = "select word from "+this.tablePrefix+"_"+ApplicationUtilities.getProperty("POSTABLE")+" where pos=? and saved_flag !='red' "+
filter/*+" and word in (select word from "+this.tablePrefix+"_"+ApplicationUtilities.getProperty("ALLWORDS")+") "*/+
" order by word";
//stmt = conn.prepareStatement("select word from "+this.tablePrefix+"_"+ApplicationUtilities.getProperty("POSTABLE")+" where pos=? and word not in (select distinct term from "+this.glossarytable+")");
stmt = conn.prepareStatement(sql);
stmt.setString(1, "b");
rset = stmt.executeQuery();
if (rset != null) {
while(rset.next()){
populateDescriptorList(words, rset.getString("word"));
}
}
words = deduplicateSort(words);
return words;
} catch (SQLException exe) {
LOGGER.error("Error in getting words as descriptors: " +
"mainFormDbAccessor.descriptorTerms4Curation", exe);
exe.printStackTrace();
} finally {
if(rset != null) {
rset.close();
}
if(stmt != null) {
stmt.close();
}
}
return words;
}
/**
* w is put into words only if
* it is a word
* it is not a pronoun, a stopword, a preposition, an adv (-ly), or -shaped
* it is not in the glossary
*
* if it is in the glossary, get its role from glossary and save it in wordroles table.
* @param words
* @param w
*/
private void populateDescriptorList(ArrayList<String> words, String w) {
if(w.matches(".*?\\w.*")){
String wc = w;
if(w.indexOf("-")>=0 || w.indexOf("_")>=0){
String[] ws = w.split("[_-]");
w = ws[ws.length-1];
}
try {
Statement stmt = conn.createStatement();
ResultSet rset = stmt.executeQuery("select category from "+this.glossarytable+" where term ='"+w+"'");
if(rset.next()){//in glossary
String cat = rset.getString(1);
if(cat.matches("\\b(STRUCTURE|SUBSTANCE|PLANT|nominative|structure)\\b")){
add2WordRolesTable(wc, "os");
}else{
add2WordRolesTable(wc, "c");
}
}else{ //not in glossary
words.add(wc);
}
} catch (SQLException exe) {
LOGGER.error("Error in VolumeMarkupDbAccess.populateDescriptorList", exe);
exe.printStackTrace();
}
}
}
/**
*
* @param w
* @param role
*/
private void add2WordRolesTable(String w, String role) {
try {
Statement stmt = conn.createStatement();
ResultSet rs = stmt.executeQuery("select * from "+this.tablePrefix+"_"+ApplicationUtilities.getProperty("WORDROLESTABLE")+" where word='"+w+"' and semanticrole='"+role+"'");
if(!rs.next()){
stmt.execute("insert into "+this.tablePrefix+"_"+ApplicationUtilities.getProperty("WORDROLESTABLE")+"(word, semanticrole) values ('"+w+"','"+role+"')");
}
} catch (SQLException exe) {
LOGGER.error("Error in adding a word to wordroles table" +
"mainFormDbAccessor.Add2WordRolesTable", exe);
exe.printStackTrace();
}
}
/**
* called also by "load last project"
* @return
* @throws SQLException
*/
public ArrayList<String> getSavedDescriptorWords() throws SQLException {
ArrayList<String> words = new ArrayList<String>();
PreparedStatement stmt = null;
ResultSet rset = null;
try {
//Populate descriptor Hong TODO 5/23/11
//stmt = conn.prepareStatement("select word from "+this.tablePrefix+"_wordpos4parser where pos=? and word not in (select distinct term from "+this.glossarytable+") and saved_flag not in ('green','red')");
stmt = conn.prepareStatement("select word from "+this.tablePrefix+"_"+ApplicationUtilities.getProperty("POSTABLE")+" where pos=? and word not in (select word from "+this.tablePrefix+"_"+ApplicationUtilities.getProperty("WORDROLESTABLE")+") and saved_flag not in ('red')");
stmt.setString(1, "b");
rset = stmt.executeQuery();
if (rset != null) {
while(rset.next()){
words.add(rset.getString("word"));
}
}
} catch (SQLException exe) {
LOGGER.error("Error in getting words as descriptors: " +
"mainFormDbAccessor.getDescriptorWords", exe);
} finally {
if(rset != null) {
rset.close();
}
if(stmt != null) {
stmt.close();
}
}
return words;
}
public ArrayList<ArrayList<String>> getUnSavedDescriptorWords() throws SQLException {
ArrayList<String> words = new ArrayList<String>();
ArrayList<String> flag = new ArrayList<String>();
ArrayList<ArrayList<String>> wordsAndFlag = new ArrayList<ArrayList<String>>();
PreparedStatement stmt = null;
ResultSet rset = null;
try {
//Populate descriptor Hong TODO 5/23/11
//stmt = conn.prepareStatement("select word from "+this.tablePrefix+"_wordpos4parser where pos=? and word not in (select distinct term from "+this.glossarytable+") and saved_flag not in ('green','red')");
stmt = conn.prepareStatement("select word,saved_flag from "+this.tablePrefix+"_"+ApplicationUtilities.getProperty("POSTABLE")+" where pos=? and word not in (select word from "+this.tablePrefix+"_"+ApplicationUtilities.getProperty("WORDROLESTABLE")+")");
stmt.setString(1, "b");
rset = stmt.executeQuery();
if (rset != null) {
while(rset.next()){
words.add(rset.getString("word"));
flag.add(rset.getString("saved_flag"));
}
}
} catch (SQLException exe) {
LOGGER.error("Error in getting words as descriptors: " +
"mainFormDbAccessor.getUnSavedDescriptorWords", exe);
exe.printStackTrace();
} finally {
if(rset != null) {
rset.close();
}
if(stmt != null) {
stmt.close();
}
}
wordsAndFlag.add(words);
wordsAndFlag.add(flag);
return wordsAndFlag;
}
public static void main(String[] args)throws Exception {
// TODO Auto-generated method stub
//System.out.println(DriverManager.getConnection(url));
}
}