package fna.parsing.state;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.eclipse.swt.custom.StyledText;
import org.eclipse.swt.widgets.Display;
import org.eclipse.swt.widgets.Text;
/**
 * Collects pairs of character-state terms that an author joins with "or"/"to"
 * and records them in the inherited state matrix.
 *
 * <p>Rules of thumb encoded here:
 * <ul>
 * <li>Do NOT treat a plain comma list such as "imbricate, lanceolate or ovate" as related
 *     states: authors commonly enumerate different characters in one list.</li>
 * <li>Treat only states connected by or/to, such as /{elliptic} to {oblong}/ or {ovate},
 *     {glabrous} or /{villous} to {tomentose}/, clasping or short_decurrent.</li>
 * <li>Watch out for the "adv or/to adv state" pattern, such as
 *     {thinly} to {densely} {arachnoid_tomentose}.</li>
 * <li>Watch out for the preposition "to", as in "reduced to".</li>
 * </ul>
 *
 * <p>Example sentence with several connectors:
 * {distalmost} {linear} to {narrowly} {elliptic} , {bractlike} , {spinulose} to {irregularly} {dentate} or {shallowly} {lobed} .
 *
 * @author hongcui
 *
 */
@SuppressWarnings({ "unused","static-access" })
public class StateCollectorTest extends StateCollector {
    /** Cheap pre-check: does the sentence contain a standalone "to" or "or" at all? */
    private static final Pattern CONNECTOR = Pattern.compile("\\b(to|or)\\b");
    /** One or more {state} tokens, a connector, one or more {state} tokens; '-' is allowed for {dark-c-brown}. */
    private static final Pattern STATE_CONJUNCTION = Pattern.compile("(?:\\{[\\w-]+}\\s)+\\s*(or|to)\\s*(?:\\{[\\w-]+\\}\\s*)+");
    /** Text right after a match that starts (after optional blanks) with , ; . or : */
    private static final Pattern SEGMENT_END = Pattern.compile("\\s*[,;\\.:].*");
    /** A token ending in "ed"/"ing" followed later by " to ", e.g. "{reduced} to ...". */
    private static final Pattern PARTICIPLE_TO = Pattern.compile(".*?(ed|ing)}.*? to .*");
    /** A term made of digits only. */
    private static final Pattern DIGITS = Pattern.compile("\\d+");

    /** When true, a pair is only added if {@link #notInGlossary(String, String)} accepts it. */
    private final boolean filtered;
    /** Cache of glossary look-ups, keyed by "term1#term2". */
    private final Hashtable<String, Boolean> checkedtermpairs = new Hashtable<String, Boolean>();

    /**
     * Creates a collector; all arguments except {@code filtered} are handed to the superclass.
     *
     * @param filtered whether pairs are checked against the glossary before being added
     */
    public StateCollectorTest(Connection conn, String tableprefix, boolean filtered, String glosstable, Display display, StyledText charLog) {
        super(conn, tableprefix, glosstable, display, charLog);
        this.filtered = filtered;
    }

    /**
     * Creates a collector with a list of known states; all arguments except
     * {@code filtered} are handed to the superclass.
     *
     * @param filtered whether pairs are checked against the glossary before being added
     */
    public StateCollectorTest(Connection conn, String tableprefix, ArrayList<String> knownstates, boolean filtered, String glosstable, Display display, StyledText charLog) {
        super(conn, tableprefix, knownstates, glosstable, display, charLog);
        this.filtered = filtered;
    }

    /**
     * Announces the save and asks the state matrix to persist itself through
     * {@code save2MySQL}, using this collector's connection and table prefix.
     */
    public void saveStates(){
        this.showOutputMessage("System is saving character state terms to database ...");
        statematrix.save2MySQL(this.conn, this.tableprefix, "termsuser", "termspassword");
    }

    /**
     * Announces the grouping step, runs {@code Grouping()} on the state matrix and
     * writes it out with {@code output2GraphML()}.
     *
     * @return whatever {@code output2GraphML()} returns (presumably the number of
     *         GraphML files written; confirm against StateMatrix)
     */
    public int grouping4GraphML(){
        this.showOutputMessage("System is grouping character state terms ...");
        statematrix.Grouping();
        return statematrix.output2GraphML();
    }

    /**
     * Finds "{state} or/to {state}" conjunctions in a marked-up sentence and adds the
     * resulting state pairs to the state matrix. Relies on the {c} mark-up and on to/or.
     *
     * <p>After each match the search resumes just behind the connector rather than behind
     * the whole match, so chained connectors ("a to b or c") yield overlapping matches.
     *
     * @param source identifier of the sentence, stored with every pair added
     * @param sent   the marked-up sentence text
     */
    protected void parseSentence(String source, String sent){
        // glue hyphenated states back into one token: {dark}-c-{brown} -> {dark-c-brown}
        sent = sent.replaceAll("\\}-\\{", "-").replaceAll("\\}-c-\\{", "-c-");
        if(!CONNECTOR.matcher(sent).find()){
            return;
        }
        System.out.println("from sent ["+source+"]:"+sent);
        Matcher m1 = STATE_CONJUNCTION.matcher(sent);
        while(m1.find()){
            String mstring = m1.group();
            // resume one character after the connector ("or|to" plus the following blank)
            int resume = m1.end(1) + 1;
            // look at (up to) the next five characters to see whether the segment ends here
            int followEnd = Math.min(m1.end() + 5, sent.length());
            boolean endofseg = SEGMENT_END.matcher(sent.substring(m1.end(), followEnd)).matches();

            String matched = mstring.toLowerCase().replaceFirst("^[\\s,]*", "").replaceAll("[{}]", "");
            matched = split(matched, endofseg).replaceAll("-c-", "-");
            // ignore "reduced to", but take "reduced or"
            if(matched.length() > 0 && !PARTICIPLE_TO.matcher(mstring).matches()){
                add2matrix(matched, source);
                System.out.println("\t====::"+matched);
            }
            sent = sent.substring(resume);
            m1.reset(sent);
        }
    }

    /**
     * Reduces a conjunction such as "distalmost linear to narrowly elliptic" to a
     * comma-separated list with one state word per term.
     * <ol>
     * <li>"thinly to/or densely arachnoid_tomentose": left alone, no need to capture degrees.</li>
     * <li>"distalmost linear to/or narrowly elliptic": yields "linear,elliptic".</li>
     * </ol>
     * For every term but the last, the last non-adverb word is taken. For the last term the
     * first non-adverb word is taken, unless {@code endofseg} is true, in which case the last
     * non-adverb word is taken.
     *
     * @param conjunction lower-cased text without braces, terms separated by " to " / " or "
     * @param endofseg    true if the conjunction is directly followed by punctuation
     * @return the states joined by ',', or "" when fewer than two states were found
     */
    private String split(String conjunction, boolean endofseg){
        System.out.println("########### from :"+conjunction);
        String[] terms = conjunction.split("\\s+(to|or)\\s+");
        if(terms.length == 0){ // input consisted of a connector only
            return "";
        }
        StringBuilder csv = new StringBuilder();
        int count = 0;
        int last = terms.length - 1;
        for(int i = 0; i <= last; i++){
            boolean scanFromEnd = i < last || endofseg;
            String state = pickStateWord(terms[i].trim(), scanFromEnd);
            if(state != null){
                if(count > 0){
                    csv.append(',');
                }
                csv.append(state);
                count++;
            }
        }
        // at least two states are needed to form a conjunction
        return count > 1 ? csv.toString() : "";
    }

    /**
     * Returns the first non-empty, non-adverb word of a term, scanning from the
     * end or from the start.
     *
     * @param term        whitespace-separated words
     * @param scanFromEnd true to scan right-to-left, false to scan left-to-right
     * @return the word found, or null if every word is empty or an adverb
     */
    private String pickStateWord(String term, boolean scanFromEnd){
        String[] words = term.split("\\s+");
        int step = scanFromEnd ? -1 : 1;
        for(int j = scanFromEnd ? words.length - 1 : 0; j >= 0 && j < words.length; j += step){
            if(words[j].length() > 0 && !isAdv(words[j])){
                return words[j];
            }
        }
        return null;
    }

    /**
     * Asks WordNet (through WordNetWrapper) whether a word is an adverb.
     * A word ending in "ly" counts as an adverb when the stem without "ly" is an
     * adjective, or when stem+"e" passes {@code isAdv()}. Otherwise the word counts
     * as an adverb when its most likely part of speech is "adv".
     *
     * @param word the word to test
     * @return true if the word is judged to be an adverb
     */
    protected boolean isAdv(String word){
        if(word.endsWith("ly")){
            String stem = word.substring(0, word.length() - 2);
            WordNetWrapper wnw1 = new WordNetWrapper(stem);
            WordNetWrapper wnw2 = new WordNetWrapper(stem+"e"); // e.g. "truly" -> "true"
            // NOTE(review): the second test asks isAdv() of stem+"e"; isAdj() may have been intended - confirm.
            if(wnw1.isAdj() || wnw2.isAdv()){
                System.out.println(word + " is an adv");
                return true;
            }
        }
        WordNetWrapper wnw = new WordNetWrapper(word);
        String pos = wnw.mostlikelyPOS();
        if(pos != null && pos.compareTo("adv") == 0){
            System.out.println(word + " is an adv"); // report the word that was tested, not its stem
            return true;
        }
        return false;
    }

    /**
     * Adds every unordered pair of terms from a comma-separated list to the state matrix
     * with score 1. Pairs containing an all-digit term are skipped; when filtering is on,
     * pairs rejected by {@link #notInGlossary(String, String)} are skipped as well.
     *
     * @param refined list of the format a,b,c,d
     * @param source  identifier of the sentence the terms came from
     */
    protected void add2matrix(String refined, String source){
        String[] alist = refined.split(",");
        for(int i = 0; i < alist.length; i++){
            for(int j = i+1; j < alist.length; j++){
                int score = 1; //absent or erect
                // looked up anew for every pair: an earlier addPair may have registered the state
                State s1 = statematrix.getStateByName(alist[i]);
                if(s1 == null){
                    s1 = new State(alist[i]);
                }
                State s2 = statematrix.getStateByName(alist[j]);
                if(s2 == null){
                    s2 = new State(alist[j]);
                }
                // cheap check first so numeric terms never trigger a glossary query
                if(DIGITS.matcher(s1.getName()).matches() || DIGITS.matcher(s2.getName()).matches()){
                    continue;
                }
                if(this.filtered && !notInGlossary(s1.getName(), s2.getName())){
                    continue;
                }
                statematrix.addPair(s1, s2, score, source);
            }
        }
    }

    /**
     * Checks the glossary table for a category shared by both terms. Results are cached
     * per term pair (in either order). If the query fails, the stack trace is printed,
     * true is returned and nothing is cached.
     *
     * @param term1 first term
     * @param term2 second term
     * @return false iff term1 and term2's categories overlap in glossary.
     */
    private boolean notInGlossary(String term1, String term2) {
        String key = term1+"#"+term2;
        Boolean cached = this.checkedtermpairs.get(key);
        if(cached == null){
            cached = this.checkedtermpairs.get(term2+"#"+term1);
        }
        if(cached != null){
            return cached.booleanValue();
        }
        PreparedStatement stmt = null;
        ResultSet rs = null;
        try{
            // terms come from corpus text, so they are bound as parameters; only the table name is concatenated
            stmt = conn.prepareStatement("select * from "+super.glosstable+
                    " where term = ? and category in (select category from "+super.glosstable+
                    " where term = ?)");
            stmt.setString(1, term1);
            stmt.setString(2, term2);
            rs = stmt.executeQuery();
            boolean noOverlap = !rs.next();
            this.checkedtermpairs.put(key, Boolean.valueOf(noOverlap));
            return noOverlap;
        }catch (Exception e){
            e.printStackTrace();
            return true;
        }finally{
            closeQuietly(rs, stmt);
        }
    }

    /** Closes the result set and then the statement, printing (not propagating) any SQLException. */
    private static void closeQuietly(ResultSet rs, Statement stmt){
        if(rs != null){
            try{
                rs.close();
            }catch(SQLException e){
                e.printStackTrace();
            }
        }
        if(stmt != null){
            try{
                stmt.close();
            }catch(SQLException e){
                e.printStackTrace();
            }
        }
    }

    /**
     * Ad-hoc driver: opens a MySQL connection, then collects, saves and groups states
     * for the "fnav19" table prefix, unfiltered, with glossary table "fnaglossaryfixed".
     *
     * @param args not used
     */
    public static void main(String[] args) {
        //to use the result from unsupervisedclausemarkup, change wordpos table to wordroles (word, semanticroles) where semanticroles in (c, os, op)
        Connection conn = null;
        String database="";
        String username="";
        String password="";
        try{
            Class.forName("com.mysql.jdbc.Driver");
            String URL = "jdbc:mysql://localhost/"+database+"?user="+username+"&password="+password;
            conn = DriverManager.getConnection(URL);
        }catch(Exception e){
            e.printStackTrace();
        }
        // NOTE(review): if the connection attempt failed, conn is still null here and is passed on as-is.
        StateCollectorTest sct = new StateCollectorTest(conn, "fnav19", false, "fnaglossaryfixed", null, null); /*using learned semanticroles only*/
        sct.collect();
        sct.saveStates();
        sct.grouping4GraphML();
    }
}