package fna.parsing.character;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import org.apache.log4j.Logger;
import fna.parsing.ApplicationUtilities;
import fna.parsing.DeHyphenizerCorrected;
import fna.parsing.MainForm;
/**
 * Compares learned terms with what is in the glossary and produces a
 * plain-text report.
 *
 * <p>Constructing an instance checks that the required tables exist, rebuilds
 * the {@code glossstructures} and {@code learnedstructures} helper tables and
 * runs the de-hyphenizer over the learned structures. {@link #report()} then
 * queries those tables and formats the comparison.
 *
 * <p>All instances share one static JDBC connection, which is opened lazily
 * and never closed by this class. Database failures are logged and otherwise
 * swallowed, so a report may contain {@code -1} counts or empty sections when
 * a query could not be executed.
 *
 * @author hongcui
 */
public class LearnedTermsReport {
    private static final String gtablename = "fnaglossary";
    private static final String stablename = "learnedstates";
    private static final String otablename1 = "sentence";
    private static final String otablename2 = "wordpos";
    private static final String otablename = "learnedstructures";
    private static final String gstablename = "glossstructures";
    /** SQL value list of the glossary categories that are treated as structures. */
    private static final String STRUCTURE_CATEGORIES =
            "('STRUCTURE / SUBSTANCE','STRUCTURE', 'CHARACTER', 'FEATURE', 'SUBSTANCE', 'PLANT', 'nominative')";
    private static final Logger LOGGER = Logger.getLogger(LearnedTermsReport.class);
    /** Shared connection; created by the first constructor call that succeeds. */
    static private Connection conn = null;

    private String database;
    private ArrayList<String> overlappedstructures = new ArrayList<String>();
    private ArrayList<String> newstructures = new ArrayList<String>();
    private ArrayList<String> modifiedstructures = new ArrayList<String>();
    private ArrayList<String> overlappedstates = new ArrayList<String>();
    private ArrayList<String> newstates = new ArrayList<String>();
    private ArrayList<String> modifiedstates = new ArrayList<String>();
    private HashSet<String> learnedstructures = new HashSet<String>();
    private HashSet<String> learnedstates = new HashSet<String>();
    private Hashtable<String, String> donestates = new Hashtable<String, String>();

    /**
     * Verifies the required tables and prepares the helper tables used by
     * {@link #report()}.
     *
     * <p>If any of the learned-state, glossary, sentence or word-pos tables is
     * missing, a message is written to {@code System.err} and the JVM is
     * terminated with exit code 1. Any other failure is logged and the
     * constructor returns normally.
     *
     * @param database name of the corpus, used only as a label in the report text
     */
    public LearnedTermsReport(String database) {
        this.database = database;
        try {
            if (conn == null) {
                String url = ApplicationUtilities.getProperty("database.url");
                conn = DriverManager.getConnection(url);
            }
            HashSet<String> tables = listTables();
            // The checks are performed in the same order as the messages were
            // historically emitted, so the first missing table wins.
            requireTable(tables.contains(stablename), "Learned state table does not exist! Program exists.");
            requireTable(tables.contains(gtablename), "Glossary table does not exist! Program exists.");
            requireTable(tables.contains(otablename1), "Learned organ table does not exist! Program exists.");
            requireTable(tables.contains(otablename2), "Learned pos table does not exist! Program exists.");
            //create a table holding singular structure terms from the glossary
            createGlossStructureTable();
            //make another table with learned singular organ names.
            createLearnedStructureTable();
            DeHyphenizerCorrected dh = new DeHyphenizerCorrected(ApplicationUtilities.getProperty("database.name"),
                    otablename, "structure", null, "_", MainForm.dataPrefixCombo.getText().replaceAll("-", "_"), null);//TODO: replace last null with glossary
            dh.deHyphen();
        } catch (Exception e) {
            LOGGER.error("Exception in LearnedTermsReport constructor", e);
        }
    }

    /**
     * Builds the comparison report between the glossary and the learned terms.
     *
     * <p>The structure/state comparison lists are rebuilt on every call, so the
     * method may be invoked repeatedly without accumulating duplicate entries.
     * The sections on glossary terms not seen in the text are not part of the
     * report; the helpers that compute them are retained but not called.
     *
     * @return the report text, using the platform line separator
     */
    public String report() {
        StringBuilder sb = new StringBuilder();
        String ls = System.getProperty("line.separator");
        //Glossary
        sb.append("Comparison between FNA Glossary and Learned Terms" + ls);
        sb.append("Note: in the report, structure is defined to include any terms in either of the following categories: 'STRUCTURE / SUBSTANCE','STRUCTURE', 'CHARACTER', 'FEATURE', 'SUBSTANCE', 'PLANT'" + ls);
        sb.append("FNA Glossary Info:" + ls);
        sb.append("FNA Glossary All Structure Count: " + getAllStructureCount() + ls);
        sb.append("FNA Glossary All Character Count: " + getAllCharacterCount() + ls);

        //organ/structure names
        sb.append(ls + ":::::::::::::::::::::::::::::::::::::" + ls);
        sb.append("Structures Learned from " + database + ":" + ls);
        sb.append("Learned Structure Count: " + getLearnedStructuresCount() + ls);
        compareStructureTerms();
        sb.append("\t Learned Structure Names Overlap with Glossary: " + this.overlappedstructures.size() + ls);
        sb.append("\t Learned Modified Structure Names: " + this.modifiedstructures.size() + ls);
        appendNames(sb, modifiedstructures, ls);
        sb.append("\t Learned Structure Names NOT in Glossary: " + this.newstructures.size() + ls);
        appendNames(sb, newstructures, ls);

        //character states
        sb.append(ls + ":::::::::::::::::::::::::::::::::::::" + ls);
        sb.append("States Learned from " + database + ":" + ls);
        sb.append("Learned States Count: " + getLearnedStatesCount() + ls);
        compareStateTerms();
        sb.append("\t Learned State Names Overlap with Glossary: " + this.overlappedstates.size() + ls);
        sb.append("\t Learned Modified State Names: " + this.modifiedstates.size() + ls);
        appendNames(sb, modifiedstates, ls);
        sb.append("\t Learned State Names NOT in Glossary: " + this.newstates.size() + ls);
        appendNames(sb, newstates, ls);

        statesAssignedCharacters();
        sb.append("\t Learned State Assigned Characters: " + this.donestates.size() + ls);
        for (String name : donestates.keySet()) {
            sb.append("\t\t " + name + " is a type of " + donestates.get(name) + ls);
        }
        return sb.toString();
    }

    /** Appends one doubly-indented line per name to the report buffer. */
    private static void appendNames(StringBuilder sb, ArrayList<String> names, String ls) {
        for (String name : names) {
            sb.append("\t\t " + name + ls);
        }
    }

    /** Prints the message and terminates the JVM with status 1 when a required table is absent. */
    private static void requireTable(boolean present, String message) {
        if (!present) {
            System.err.println(message);
            System.exit(1);
        }
    }

    /** Returns the names reported by {@code show tables} on the shared connection. */
    private static HashSet<String> listTables() throws SQLException {
        HashSet<String> tables = new HashSet<String>();
        Statement stmt = null;
        ResultSet rs = null;
        try {
            stmt = conn.createStatement();
            rs = stmt.executeQuery("show tables");
            while (rs.next()) {
                tables.add(rs.getString(1));
            }
        } finally {
            closeQuietly(rs, stmt);
        }
        return tables;
    }

    /** Closes the given JDBC resources, logging (not propagating) any failure; either argument may be null. */
    private static void closeQuietly(ResultSet rs, Statement stmt) {
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException e) {
                LOGGER.warn("Failed to close ResultSet in LearnedTermsReport", e);
            }
        }
        if (stmt != null) {
            try {
                stmt.close();
            } catch (SQLException e) {
                LOGGER.warn("Failed to close Statement in LearnedTermsReport", e);
            }
        }
    }

    /**
     * Prepares {@code sql} on the shared connection and binds {@code params}
     * as strings, in order, to its {@code ?} placeholders. The caller owns
     * (and must close) the returned statement.
     */
    private static PreparedStatement prepare(String sql, String... params) throws SQLException {
        PreparedStatement ps = conn.prepareStatement(sql);
        boolean bound = false;
        try {
            for (int i = 0; i < params.length; i++) {
                ps.setString(i + 1, params[i]);
            }
            bound = true;
            return ps;
        } finally {
            if (!bound) {
                // Binding failed: do not leak the half-initialised statement.
                closeQuietly(null, ps);
            }
        }
    }

    /**
     * Reports whether the query yields at least one row.
     *
     * @param caller method name used in the log message when the query fails
     * @return {@code true} if a row exists; {@code false} if none or on failure
     */
    private boolean hasRow(String caller, String sql, String... params) {
        PreparedStatement ps = null;
        ResultSet rs = null;
        try {
            ps = prepare(sql, params);
            rs = ps.executeQuery();
            return rs.next();
        } catch (Exception e) {
            LOGGER.error("Exception in LearnedTermsReport " + caller, e);
            return false;
        } finally {
            closeQuietly(rs, ps);
        }
    }

    /**
     * Returns the first column of the first row as an int.
     *
     * @param caller method name used in the log message when the query fails
     * @return the value, or {@code -1} when there is no row or the query fails
     */
    private int queryInt(String caller, String sql, String... params) {
        PreparedStatement ps = null;
        ResultSet rs = null;
        try {
            ps = prepare(sql, params);
            rs = ps.executeQuery();
            return rs.next() ? rs.getInt(1) : -1;
        } catch (Exception e) {
            LOGGER.error("Exception in LearnedTermsReport " + caller, e);
            return -1;
        } finally {
            closeQuietly(rs, ps);
        }
    }

    /** Returns the first column of every row of the query; elements may be null. */
    private ArrayList<String> queryStrings(String sql, String... params) throws SQLException {
        ArrayList<String> values = new ArrayList<String>();
        PreparedStatement ps = null;
        ResultSet rs = null;
        try {
            ps = prepare(sql, params);
            rs = ps.executeQuery();
            while (rs.next()) {
                values.add(rs.getString(1));
            }
        } finally {
            closeQuietly(rs, ps);
        }
        return values;
    }

    /**
     * Recreates the {@code glossstructures} table from the glossary rows whose
     * category is a structure category, whose status is not 'learned' and whose
     * term is not listed as a plural form in {@code termforms}.
     */
    private void createGlossStructureTable() {
        Statement stmt = null;
        try {
            stmt = conn.createStatement();
            stmt.execute("drop table if exists " + gstablename);
            stmt.execute("create table if not exists " + gstablename + " as select term from " + gtablename
                    + " where category in " + STRUCTURE_CATEGORIES
                    + " and status !='learned' and term not in (select distinct term2 from termforms where type ='pl')");
        } catch (Exception e) {
            LOGGER.error("Exception in LearnedTermsReport createGlossStructureTable", e);
        } finally {
            closeQuietly(null, stmt);
        }
    }

    /**
     * Empties (creating if necessary) the {@code learnedstructures} table and
     * refills it with one row per distinct modifier/tag pair of the sentence
     * table, stored as {@code "modifier tag"} trimmed. Rows whose tag is null
     * or "unknown" are skipped; a null or literal "NULL" modifier is treated
     * as empty.
     */
    private void createLearnedStructureTable() {
        Statement stmt = null;
        ResultSet rs = null;
        PreparedStatement insert = null;
        try {
            stmt = conn.createStatement();
            stmt.execute("create table if not exists " + otablename + "(structure varchar(100))");
            stmt.execute("delete from " + otablename);
            rs = stmt.executeQuery("select distinct modifier, tag from " + otablename1);
            // One bound statement for all rows: values containing quotes can no
            // longer break the SQL, and no per-row statement is leaked.
            insert = conn.prepareStatement("insert into " + otablename + " values (?)");
            while (rs.next()) {
                String tag = rs.getString("tag");
                if (tag == null || tag.equals("unknown")) {
                    continue;
                }
                String modifier = rs.getString("modifier");
                if (modifier == null || modifier.equals("NULL")) {
                    modifier = "";
                }
                insert.setString(1, (modifier + " " + tag).trim());
                insert.executeUpdate();
            }
        } catch (Exception e) {
            LOGGER.error("Exception in LearnedTermsReport createLearnedStructureTable", e);
        } finally {
            closeQuietly(rs, stmt);
            closeQuietly(null, insert);
        }
    }

    /**
     * Returns the distinct glossary terms, restricted by category, that do not
     * occur as a substring of any lower-cased sentence. The term itself is not
     * lower-cased. Null sentences and null terms are ignored.
     *
     * @param categoryOperator {@code "in"} or {@code "NOT in"}, applied to the structure category list
     */
    private ArrayList<String> termsNotInSentences(String categoryOperator) throws SQLException {
        // Lower-case every sentence once instead of once per glossary term.
        ArrayList<String> sents = new ArrayList<String>();
        for (String sentence : queryStrings("select sentence from " + otablename1)) {
            if (sentence != null) {
                sents.add(sentence.toLowerCase());
            }
        }
        ArrayList<String> unused = new ArrayList<String>();
        String query = "select distinct term from " + gtablename + " where category " + categoryOperator + " "
                + STRUCTURE_CATEGORIES;
        for (String term : queryStrings(query)) {//TODO: match singular with pl.
            if (term == null) {
                continue;
            }
            boolean used = false;
            for (String sent : sents) {
                if (sent.indexOf(term) >= 0) {
                    used = true;
                    break;
                }
            }
            if (!used) {
                unused.add(term);
            }
        }
        return unused;
    }

    /**
     * Finds glossary structure terms that are not seen in any sentence and
     * then prunes that list using terms sharing the same definition.
     *
     * <p>NOTE(review): the pruning removes every term that has at least one
     * other term with the same definition, whether or not that other term is
     * itself unused. This mirrors the existing behaviour; confirm that it is
     * the intended rule before relying on the result.
     *
     * @return the remaining unused terms; possibly empty, never null
     */
    @SuppressWarnings("unused")
    private ArrayList<String> unusedStructures() {
        ArrayList<String> unused = new ArrayList<String>();
        PreparedStatement ps = null;
        try {
            unused = termsNotInSentences("in");
            HashSet<String> unusedLookup = new HashSet<String>(unused);
            HashSet<String> toremove = new HashSet<String>();
            ps = conn.prepareStatement("select term from " + gtablename
                    + " where definition in (select definition from " + gtablename + " where term = ?)");
            for (String term : unused) {
                ArrayList<String> terms = new ArrayList<String>();
                boolean used = false;
                ps.setString(1, term);
                ResultSet rs = ps.executeQuery();
                try {
                    while (rs.next()) {
                        String tprime = rs.getString("term");
                        if (tprime == null) {
                            continue;
                        }
                        tprime = tprime.trim();
                        terms.add(tprime);
                        if (!term.equals(tprime) && !unusedLookup.contains(tprime)) {
                            used = true;
                        }
                    }
                } finally {
                    closeQuietly(rs, null);
                }
                // Removes a single occurrence only, as before.
                terms.remove(term);
                if (used) {
                    toremove.add(term);
                }
                toremove.addAll(terms);
            }
            unused.removeAll(toremove);
        } catch (Exception e) {
            LOGGER.error("Exception in LearnedTermsReport unusedStructures", e);
        } finally {
            closeQuietly(null, ps);
        }
        return unused;
    }

    /**
     * Finds glossary terms outside the structure categories that are not seen
     * in any sentence.
     *
     * @return the unused terms; empty if none or if the lookup failed
     */
    @SuppressWarnings("unused")
    private ArrayList<String> unusedStates() {
        try {
            return termsNotInSentences("NOT in");
        } catch (Exception e) {
            LOGGER.error("Exception in LearnedTermsReport unusedStates", e);
            return new ArrayList<String>();
        }
    }

    /**
     * Loads term/category pairs of glossary rows with status 'learned' into
     * {@code donestates}. Rows with a null term or category are skipped
     * because {@link Hashtable} rejects nulls.
     */
    private void statesAssignedCharacters() {
        PreparedStatement ps = null;
        ResultSet rs = null;
        try {
            ps = prepare("select term, category from " + gtablename + " where status ='learned'");
            rs = ps.executeQuery();
            while (rs.next()) {
                String term = rs.getString("term");
                String category = rs.getString("category");
                if (term != null && category != null) {
                    donestates.put(term, category);
                }
            }
        } catch (Exception e) {
            LOGGER.error("Exception in LearnedTermsReport statesAssignedCharacters", e);
        } finally {
            closeQuietly(rs, ps);
        }
    }

    /**
     * Sorts every learned structure into overlapped (exact glossary match),
     * modified (its last word matches a glossary structure) or new. The three
     * result lists are cleared first so repeated calls do not duplicate entries.
     */
    private void compareStructureTerms() {
        overlappedstructures.clear();
        modifiedstructures.clear();
        newstructures.clear();
        for (String name : learnedstructures) {
            String[] parts = name.split(" ");
            if (matchInGlossStructure(name)) {
                this.overlappedstructures.add(name);
            } else if (findInGlossStructure(parts[parts.length - 1])) {
                this.modifiedstructures.add(name);
            } else {
                this.newstructures.add(name);
            }
        }
    }

    /**
     * Sorts every learned state into overlapped (exact glossary match),
     * modified (its last word matches a glossary state) or new. The three
     * result lists are cleared first so repeated calls do not duplicate entries.
     */
    private void compareStateTerms() {
        overlappedstates.clear();
        modifiedstates.clear();
        newstates.clear();
        for (String name : learnedstates) {
            String[] parts = name.split(" ");
            if (matchInGlossStates(name)) {
                this.overlappedstates.add(name);
            } else if (findInGlossStates(parts[parts.length - 1])) {
                this.modifiedstates.add(name);
            } else {
                this.newstates.add(name);
            }
        }
    }

    /**
     * Adds the trimmed, non-null rows of the learned-structure table to
     * {@code learnedstructures}.
     *
     * @return the number of distinct learned structures collected so far
     */
    private int getLearnedStructuresCount() {
        try {
            for (String structure : queryStrings("select structure from " + otablename)) {
                if (structure != null) {
                    learnedstructures.add(structure.trim());
                }
            }
        } catch (Exception e) {
            LOGGER.error("Exception in LearnedTermsReport getLearnedStructuresCount", e);
        }
        return learnedstructures.size();
    }

    /**
     * Adds the distinct, non-null states of the learned-state table to
     * {@code learnedstates}.
     *
     * @return the number of distinct learned states collected so far
     */
    private int getLearnedStatesCount() {
        try {
            for (String state : queryStrings("select distinct state from " + stablename)) {
                if (state != null) {
                    learnedstates.add(state);
                }
            }
        } catch (Exception e) {
            LOGGER.error("Exception in LearnedTermsReport getLearnedStatesCount", e);
        }
        return learnedstates.size();
    }

    /** @return the number of distinct glossary categories, or -1 on failure */
    private int getAllCharacterCount() {
        return queryInt("getAllCharacterCount", "select count(distinct category) from " + gtablename);
    }

    //state
    /** @return the number of glossary terms outside the structure categories, or -1 on failure */
    @SuppressWarnings("unused")
    private int getAllStateCount() {
        return queryInt("getAllStateCount",
                "select count(term) from " + gtablename + " where category not in " + STRUCTURE_CATEGORIES);
    }

    /**
     * @param status glossary status value to filter on
     * @return the number of glossary terms outside the structure categories with that status, or -1 on failure
     */
    @SuppressWarnings("unused")
    private int getAllStateCount(String status) {
        return queryInt("getAllStateCount",
                "select count(term) from " + gtablename + " where category not in " + STRUCTURE_CATEGORIES
                        + " and status = ?",
                status);
    }

    /** Matches the whole term against non-learned glossary states. */
    private boolean matchInGlossStates(String term) {
        return hasRow("matchInGlossStates",
                "select term from " + gtablename + " where term = ? and category not in " + STRUCTURE_CATEGORIES
                        + " and status !='learned'",
                term);
    }

    /**
     * Matches the main word against non-learned glossary states: the glossary
     * term must equal it or end with a space followed by it. {@code main} is
     * used inside a LIKE pattern, so any SQL wildcard it contains stays active.
     */
    private boolean findInGlossStates(String main) {
        return hasRow("findInGlossStates",
                "select term from " + gtablename + " where (term like ? or term = ?) and category not in "
                        + STRUCTURE_CATEGORIES + " and status !='learned'",
                "% " + main, main);
    }

    //structure
    /** @return the number of rows in the glossary structure table, or -1 on failure */
    private int getAllStructureCount() {
        return queryInt("getAllStructureCount", "select count(term) from " + gstablename);
    }

    /** Matches the whole term against the glossary structure table: single to single. */
    private boolean matchInGlossStructure(String term) {
        return hasRow("matchInGlossStructure", "select term from " + gstablename + " where term = ?", term);
    }

    /**
     * Matches the main word against the glossary structure table: the
     * glossary term must equal it or end with a space followed by it.
     * {@code main} is used inside a LIKE pattern, so any SQL wildcard it
     * contains stays active.
     */
    private boolean findInGlossStructure(String main) {
        return hasRow("findInGlossStructure",
                "select term from " + gstablename + " where term like ? or term = ?",
                "% " + main, main);
    }

    /** Reports whether any glossary term contains {@code term} as a substring (LIKE semantics). */
    @SuppressWarnings("unused")
    private boolean stringMatchInGloss(String term) {
        return hasRow("stringMatchInGloss", "select term from " + gtablename + " where term like ?",
                "%" + term + "%");
    }

    /**
     * Builds the report for the "fnav5_corpus" label and prints it to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        LearnedTermsReport ltr = new LearnedTermsReport("fnav5_corpus");
        System.out.println(ltr.report());
    }
}