package fna.parsing.state;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.eclipse.swt.custom.StyledText;
import org.eclipse.swt.widgets.Display;
import fna.charactermarkup.ChunkedSentence;
/**
* First run the dehyphenizer, then run unsupervised.pl.
* Run on untagged sentences/originalsent.
*/
/**
* This class should only gather state information, to be used as a base for further reasoning.
* It should not involve any glossary or other ontology; those belong to the further reasoning.
* Use the simple, list, to, and tolist patterns to gather states, and record the co-occurrence of the states in a matrix that can easily be used for further reasoning.
* The matrix holds co-occurrence scores for each pair of states.
*
*/
/**
* Changes:
* Patterns: remove "stop" from start, keep "stop" in the end [when to remove after stop? consider states learned vs. displayed. learned should include only the key term, displayed needs to show constraints and modifiers.]
* to make sure states extracted are of the organ, not related parts.
* Stop: add new stop words:
* a|above|after|all|almost|along|amp|an|and|are|as|at
* |be|because|been|before|being|beneath|between|beyond|but|by|ca|can|could|did|do|does|doing|done|
* |each|even|from|has|had|have|here|how|if|in|into|is|it|its
* |less|may|might|more|most|much|near|not|of|often|on|over|should|so|some|sometimes|should
* |than|that|the|then|there|these|this|those|toward|towards
* |was|well|were|what|when|why|with|without|would
*
* |few|frequently
* |occasionally|often|rarely|somewhat|throughout|very";
*
* False to-patterns:"reduced to", "to form", "appressed to", "in contrast to", "similar to"
* "confined to", "equal to", "perpendicular to", "dissimilar to","lobed to", "divided to", "invisible to"
* "adherent to", "according to", "proximal to"/"distal to", "to touch", "fused to", "attached to"
* "formed by", "axillary to", "back to back", "articulated to"
*
* In Bootstrap: allow different characters in a group. e.g lacking or yellowish to light orangeish
*
* numbers: convert to NUM with state number.
*
* comparative: > equal to or slightly shorter than <
* special -ly words: mealy, scaly, prickly,
*
* adv to/or adv adj patterns: sparsely to much branched => record only adj branched.
*
* OR patterns: or not: scabrous or not. =>record only adj scabrous.
* more or less
* ca.: , to ca. x m times or just ca.
*
* list pattern: , "or meeting <"=>match single seg, most cases are fine, this one is bad.
*
*preposition phrases: on rock
*
*tag sentence: should organ names only be tagged when they appear at the beginning of a sentence? e.g. , rounded or with single <groove> [ or change simple pattern?]
*markup character: may need to merge saved stategroups and checked states. eg. > entire or with 3 broad ,
*/
public class StateCollector {
/** JDBC connection shared by all instances; every constructor call overwrites it. */
static protected Connection conn = null;
//static protected String database = null;
//static protected String username = "termsuser";
//static protected String password = "termspassword";
/** Regex fragment for one token: a run of word characters followed by exactly one whitespace character. */
static protected String word = "(?:[\\w_]+\\s)";
//static public String stop ="a|about|above|across|after|along|also|although|amp|an|and|are|as|at|be|because|become|becomes|becoming|been|before|being|beneath|between|beyond|but|by|ca|can|could|did|do|does|doing|done|for|from|had|has|have|hence|here|how|if|in|into|inside|inward|is|it|its|may|might|more|most|near|no|not|of|off|on|onto|or|out|outside|outward|over|should|so|than|that|the|then|there|these|this|those|throughout|to|toward|towards|up|upward|was|were|what|when|where|which|why|with|within|without|would";
//static public String stop ="a|above|above|across|after|along|also|amp|an|and|are|as|at|be|because|become|becomes|becoming|been|before|being|beneath|between|beyond|but|by|ca|can|could|did|do|does|doing|done|each|even|few|frequently|from|had|has|have|here|how|if|in|into|is|it|its|less|may|might|more|most|much|near|not|occasionally|of|off|often|on|onto|or|over|rarely|should|so|some|sometimes|somewhat|soon|than|that|the|then|there|these|this|those|throughout|to|toward|towards|up|upward|very|was|well|were|what|when|where|which|why|with|without|would";
/** Alternation of phrases containing "to" that are not state ranges (e.g. "reduced to", "up to"). Not referenced by the code of this class. */
static protected String tophrases="articulated to|adnate to|connate to|to ca|reduced to|to form|appressed to|in contrast to|similar to|confined to|equal to|perpendicular to|dissimilar to|lobed to|divided to|invisible to|adherent to|according to|proximal to|distal to|to touch|fused to|attached to|axillary to|back to back|restricted to|ankylosed to|anterior to|attaching to|close to|complementary to|connected to|continuing to|difficult to|extending to|extended to|extend to|joined to|leading to|limited to|posterior to|prior to|tending to|tend to|tends to|tendency to|up to|widening to";
/** Alternation of phrases containing "or" that are not state alternatives. Not referenced by the code of this class. */
static protected String orphrases="more or less";
/*
 * The four patterns below share one shape:
 *   group 1 - the term(s) before the keyword, each introduced by start-of-string, ',' or '>' plus a space;
 *   keyword - "or " / ", or " / "to " / ", to ";
 *   group 2 - one or more tokens after the keyword (reluctant);
 *   lookahead - stops at end of string, one of , ; : . <, or a word from ChunkedSentence.stop.
 * simple and to take exactly one leading term; list and tolist take zero or more.
 * The "to" variants only accept tokens made of lowercase letters and underscores.
 */
static protected String simple = "((?:(?:^|,|>) "+word+"))or ("+word+"{1,}?)"+"(?=$|,|;|:|\\.|<|\\b(?:"+ChunkedSentence.stop+")\\b)"; //a or b
static protected String list = "((?:(?:^|,|>) "+word+")*), or ("+word+"{1,}?)"+"(?=$|,|;|:|\\.|<|\\b(?:"+ChunkedSentence.stop+")\\b)"; //a, b, c, or e f g
static protected String to = "((?:(?:^|,|>) (?:[_a-z]+\\s)))to ((?:[_a-z]+\\s){1,}?)"+"(?=$|,|;|:|\\.|<|\\b(?:"+ChunkedSentence.stop+")\\b)";
static protected String tolist ="((?:(?:^|,|>) (?:[_a-z]+\\s))*), to ((?:[_a-z]+\\s){1,}?)"+"(?=$|,|;|:|\\.|<|\\b(?:"+ChunkedSentence.stop+")\\b)";
/** Matrix that receives every pair of co-occurring states found by the do* methods. */
protected StateMatrix statematrix = null;
/** Sentences to parse, keyed by source; filled in collect() from SentenceOrganStateMarker.markSentences(). */
protected Hashtable<String, String> sentences = null;
/** Prefix of the database tables used by collect() (prefix + "_learnedstates", prefix + "_sentence"). */
protected String tableprefix = null;
/** Glossary table name; only passed through to StateMatrix and SentenceOrganStateMarker. */
protected String glosstable = null;
//protected String organnames = null;
/** SWT display used by showOutputMessage() to run the log update. */
protected Display display;
/** Text widget to which showOutputMessage() appends progress messages. */
protected StyledText charLog;
/** Never read or modified by the code of this class. */
protected boolean marked = false;
/**
 * Creates a collector whose state matrix is built without any previously known states.
 *
 * @param conn        database connection; also stored in the static {@code conn} field
 * @param tableprefix prefix of the tables this collector works on
 * @param glosstable  glossary table name handed to the state matrix and the sentence marker
 * @param display     SWT display used to post log messages
 * @param charLog     text widget that receives log messages
 */
StateCollector(Connection conn, String tableprefix, String glosstable, Display display, StyledText charLog){
    // The matrix is built first, as before; the remaining assignments are independent of each other.
    this.statematrix = new StateMatrix(conn, tableprefix, glosstable);
    StateCollector.conn = conn;
    this.tableprefix = tableprefix;
    this.glosstable = glosstable;
    this.charLog = charLog;
    this.display = display;
}
/**
 * Creates a collector whose state matrix may be seeded with already known states.
 *
 * @param conn        database connection; also stored in the static {@code conn} field
 * @param tableprefix prefix of the tables this collector works on
 * @param knownstates states to import into the matrix; when {@code null} an unseeded
 *                    matrix is created instead
 * @param glosstable  glossary table name handed to the state matrix and the sentence marker
 * @param display     SWT display used to post log messages
 * @param charLog     text widget that receives log messages
 */
StateCollector(Connection conn, String tableprefix, ArrayList<String> knownstates, String glosstable, Display display, StyledText charLog){
    if(knownstates != null){
        StateImporter si = new StateImporter(knownstates);
        this.statematrix = new StateMatrix(conn, tableprefix, si.getStates(), glosstable);
    }else{
        // Previously the matrix stayed null here and every later parse step failed with a
        // NullPointerException; fall back to the same matrix the other constructor builds.
        this.statematrix = new StateMatrix(conn, tableprefix, glosstable);
    }
    this.tableprefix = tableprefix;
    StateCollector.conn = conn;
    this.glosstable = glosstable;
    this.display = display;
    this.charLog = charLog;
}
/**
 * Resets the learned-state tables, marks the sentences and parses them into state groups.
 *
 * Steps: (1) create and empty {@code <prefix>_learnedstates} and blank the
 * {@code charsegment} column of {@code <prefix>_sentence}; (2) obtain the marked sentences
 * from a {@code SentenceOrganStateMarker}; (3) run {@link #parseSentences()}.
 * Failures in steps 1 and 2 are printed and do not propagate; if step 2 leaves no
 * sentences available, step 3 is skipped.
 */
public void collect(){
    // read sentences in the database.sentencetable generated by unsupervised.pl
    // create statematrix
    Statement stmt = null;
    try{
        stmt = conn.createStatement();
        // Table names cannot be bound as parameters, so the prefix is concatenated; it must be trusted.
        stmt.execute("create table if not exists "+this.tableprefix+"_learnedstates (state varchar(100) NOT NULL PRIMARY KEY, count int(4))");
        stmt.execute("delete from "+this.tableprefix+"_learnedstates");
        //stmt.execute("create table if not exists "+this.tableprefix+"_markedsentence (source varchar(100) NOT NULL PRIMARY KEY, markedsent text, rmarkedsent text)");
        //ResultSet rs = stmt.executeQuery("select * from "+this.tableprefix+"_markedsentence");
        //if(rs.next()){this.marked = true;}
        stmt.execute("update "+this.tableprefix+"_sentence set charsegment =''");
    }catch(Exception e){
        e.printStackTrace();
    }finally{
        // Release the statement even when one of the updates failed.
        if(stmt != null){
            try{
                stmt.close();
            }catch(Exception e){
                e.printStackTrace();
            }
        }
    }
    SentenceOrganStateMarker sosm = new SentenceOrganStateMarker(StateCollector.conn, this.tableprefix, this.glosstable, true, display, charLog);//tag organ names
    try {
        this.sentences = sosm.markSentences();
    } catch (Exception e) {
        e.printStackTrace();
    }
    if(this.sentences == null){
        // Marking failed and nothing was loaded earlier: there is nothing to parse.
        return;
    }
    parseSentences();//create StateGroups
    //System.out.println(statematrix.toString());
}
/**
 * Runs {@link #parseSentence(String, String)} on every entry of {@code sentences}
 * (source mapped to marked sentence), so that the or/to patterns in each sentence are
 * recorded as state pairs in the state matrix.
 * Does nothing when no sentences have been loaded.
 */
protected void parseSentences(){
    if(sentences == null){
        // No sentences were loaded; avoid a NullPointerException.
        return;
    }
    this.showOutputMessage("System is parsing sentences for character terms ...");
    for(Enumeration<String> en = sentences.keys(); en.hasMoreElements(); ){
        String source = en.nextElement();
        parseSentence(source, sentences.get(source));
    }
}
/**
 * Repeatedly applies the simple, list, to and tolist extractors to one sentence until a
 * full round leaves the sentence unchanged. Each extractor removes the segment it matched.
 *
 * @param source identifier of the sentence, forwarded to the extractors
 * @param sent   the marked sentence text
 */
protected void parseSentence(String source, String sent){
    String before;
    do{
        before = sent;
        //sent = doSynonyms(source, sent);
        sent = doToList(source, doTo(source, doList(source, doSimple(source, sent))));
    }while(!before.equals(sent)); // another round only if something was consumed
}
/**
 * Handles the first "a , b , to c" segment of a sentence: every term of the segment is
 * paired with every later term in the state matrix, and the segment is removed.
 *
 * The leading list is split on commas BEFORE normalization, because normalize() strips
 * commas and would otherwise merge the whole list into a single term. Terms that become
 * empty after normalization are skipped. If any remaining term fails
 * isValidStateString(), nothing is recorded, but the segment is still removed.
 *
 * @param source identifier of the sentence, stored with each recorded pair
 * @param sent   sentence text to scan
 * @return {@code sent} without the first matching segment, or {@code sent} unchanged if none matched
 */
protected String doToList(String source, String sent){
    Matcher m = Pattern.compile(StateCollector.tolist).matcher(sent);
    if(!m.find()){
        return sent;
    }
    System.out.println( "\t"+m.group());
    String t1 = normalize(m.group(1)); // only used for the log line below
    String t2 = normalize(m.group(2));
    ArrayList<String> alist = new ArrayList<String>();
    boolean valid = isValidStateString(t2);
    String[] rawterms = m.group(1).split("\\s*,\\s*");
    for(int i = 0; valid && i < rawterms.length; i++){
        String term = normalize(rawterms[i]);
        if(term.length() == 0){
            continue; // leading delimiter or a term made only of stripped words
        }
        valid = isValidStateString(term);
        if(valid){
            alist.add(term);
        }
    }
    if(valid){
        if(t2.length() > 0){
            alist.add(t2);
        }
        for(int i = 0; i<alist.size(); i++){
            for(int j = i+1; j<alist.size(); j++){
                // pairs that involve the first term score -1, all others +1
                int score = i==0? -1 : 1; //absent or erect
                State s1 = statematrix.getStateByName(alist.get(i));
                State s2 = statematrix.getStateByName(alist.get(j));
                s1 = s1 == null? new State(alist.get(i)) : s1;
                s2 = s2 == null? new State(alist.get(j)) : s2;
                statematrix.addPair(s1, s2, score, source);
            }
        }
        System.out.println(t1+" and "+t2+" are in the same group [tolist] in ["+sent+"]\n");
    }
    // Reuse the compiled matcher; it removes the same first match.
    return m.replaceFirst("");
}
/**
 * Handles the first "a to b" segment of a sentence: the term before "to" and the
 * term(s) after it are paired with each other in the state matrix, and the segment is
 * removed.
 *
 * Both sides are normalized first. Terms that normalize to the empty string (for example
 * a lone adverb ending in "ly") are skipped instead of being recorded as a state with an
 * empty name. If either side fails isValidStateString(), nothing is recorded, but the
 * segment is still removed.
 *
 * @param source identifier of the sentence, stored with each recorded pair
 * @param sent   sentence text to scan
 * @return {@code sent} without the first matching segment, or {@code sent} unchanged if none matched
 */
protected String doTo(String source, String sent){
    Matcher m = Pattern.compile(StateCollector.to).matcher(sent);
    if(!m.find()){
        return sent;
    }
    System.out.println( "\t"+m.group());
    String t1 = normalize(m.group(1));
    String t2 = normalize(m.group(2));
    if(isValidStateString(t2) && isValidStateString(t1)){/**TODO: move this check to function group**/
        ArrayList<String> list = new ArrayList<String>();
        if(t1.length() > 0){
            list.add(t1);
        }
        System.out.print("["+t1+"] ");
        String[] t2s = t2.split("\\b(to|or)\\b");
        for(int i = 0; i<t2s.length; i++){
            String term = normalize(t2s[i]);
            if(term.length() > 0){
                list.add(term);
            }
            System.out.print("["+t2s[i]+"] ");
        }
        System.out.println(" are in the same group [to] in ["+sent+"]\n");
        for(int i = 0; i<list.size(); i++){
            for(int j = i+1; j<list.size(); j++){
                // pairs that involve the first term score -1, all others +1
                int score = i==0? -1 : 1; //absent or erect
                State s1 = statematrix.getStateByName(list.get(i));
                State s2 = statematrix.getStateByName(list.get(j));
                s1 = s1 == null? new State(list.get(i)) : s1;
                s2 = s2 == null? new State(list.get(j)) : s2;
                statematrix.addPair(s1, s2, score, source);
            }
        }
    }
    // Reuse the compiled matcher; it removes the same first match.
    return m.replaceFirst("");
}
/**
 * Handles the first "a , b , or c" segment of a sentence: every term of the segment is
 * paired with every later term in the state matrix, and the segment is removed.
 *
 * The leading list is split on commas and each piece is normalized and validated on its
 * own. Validating the raw group as a whole rejected every list with more than one leading
 * term, because its commas are space-padded and were counted as words. Pieces that are
 * empty after normalization (such as the one before a leading comma) are skipped. If any
 * remaining term fails isValidStateString(), nothing is recorded, but the segment is
 * still removed.
 *
 * @param source identifier of the sentence, stored with each recorded pair
 * @param sent   sentence text to scan
 * @return {@code sent} without the first matching segment, or {@code sent} unchanged if none matched
 */
protected String doList(String source, String sent){
    Matcher m = Pattern.compile(StateCollector.list).matcher(sent);
    if(!m.find()){
        return sent;
    }
    System.out.println( "\t"+m.group());
    String t1 = m.group(1);
    String t2 = m.group(2);
    String last = normalize(t2);
    ArrayList<String> alist = new ArrayList<String>();
    boolean valid = isValidStateString(last);
    String[] rawterms = t1.split("\\s*,\\s*");
    for(int i = 0; valid && i < rawterms.length; i++){
        String term = normalize(rawterms[i]);
        if(term.length() == 0){
            continue; // leading delimiter or a term made only of stripped words
        }
        valid = isValidStateString(term);
        if(valid){
            alist.add(term);
        }
    }
    if(valid){
        if(last.length() > 0){
            alist.add(last);
        }
        System.out.println(t1+" and "+t2+" are in the same group [list] in ["+sent+"]\n");
        for(int i = 0; i<alist.size(); i++){
            for(int j = i+1; j<alist.size(); j++){
                // pairs that involve the first term score -1, all others +1
                int score = i==0? -1 : 1; //absent or erect
                State s1 = statematrix.getStateByName(alist.get(i));
                State s2 = statematrix.getStateByName(alist.get(j));
                s1 = s1 == null? new State(alist.get(i)) : s1;
                s2 = s2 == null? new State(alist.get(j)) : s2;
                statematrix.addPair(s1, s2, score, source);
            }
        }
    }
    // Reuse the compiled matcher; it removes the same first match.
    return m.replaceFirst("");
}
/**
 * Handles the first "a or b" segment of a sentence: the term before "or" and the
 * term(s) after it are paired with each other in the state matrix, and the segment is
 * removed.
 *
 * Both sides are normalized first. Terms that normalize to the empty string (for example
 * a lone adverb ending in "ly") are skipped instead of being recorded as a state with an
 * empty name. If either side fails isValidStateString(), nothing is recorded, but the
 * segment is still removed.
 *
 * Watch out for "internodes glabrous or midstem ones slightly scabrous ."
 *
 * @param source identifier of the sentence, stored with each recorded pair
 * @param sent   sentence text to scan
 * @return {@code sent} without the first matching segment, or {@code sent} unchanged if none matched
 */
protected String doSimple(String source, String sent){
    Matcher m = Pattern.compile(StateCollector.simple).matcher(sent);
    if(!m.find()){
        return sent;
    }
    System.out.println("\t"+m.group());
    String t1 = normalize(m.group(1));
    String t2 = normalize(m.group(2));
    if(isValidStateString(t2) && isValidStateString(t1)){/*TODO move this checkpoint to function group*/
        ArrayList<String> list = new ArrayList<String>();
        if(t1.length() > 0){
            list.add(t1);
        }
        System.out.print("["+t1+"] ");
        String[] t2s = t2.split("\\b(to|or)\\b");
        for(int i = 0; i<t2s.length; i++){
            String term = normalize(t2s[i]);
            if(term.length() > 0){
                list.add(term);
            }
            System.out.print("["+t2s[i]+"] ");
        }
        for(int i = 0; i<list.size(); i++){
            for(int j = i+1; j<list.size(); j++){
                // pairs that involve the first term score -1, all others +1
                int score = i==0? -1 : 1; //absent or erect
                State s1 = statematrix.getStateByName(list.get(i));
                State s2 = statematrix.getStateByName(list.get(j));
                s1 = s1 == null? new State(list.get(i)) : s1;
                s2 = s2 == null? new State(list.get(j)) : s2;
                statematrix.addPair(s1, s2, score, source);
            }
        }
        System.out.println(" are in the same group [simple] in ["+sent+"]\n");
    }
    // Reuse the compiled matcher; it removes the same first match.
    return m.replaceFirst("");
}
/**
 * Checks that a state string has a typical length: after splitting it on commas and on
 * the whole words "or", "and" and "to", no section may contain more than two
 * whitespace-separated tokens.
 *
 * Commas are matched together with any surrounding whitespace. The previous pattern
 * required a word character on both sides of the comma, so it never split "a , b".
 *
 * @param statestring the candidate state text; may be {@code null}
 * @return {@code false} for {@code null}, for a string that is exactly one digit, or when
 *         a section has more than two tokens; {@code true} otherwise (including for the
 *         empty string)
 */
protected boolean isValidStateString(String statestring) {
    /*if(statestring.matches(".*?\\b("+this.organnames+")\\b.*")){
        return false;
    }*/
    if(statestring == null || statestring.matches("\\d")){
        return false;
    }
    String[] sections = statestring.split("\\s*,\\s*|\\b(?:or|and|to)\\b");
    for(int i = 0; i < sections.length; i++){
        if(sections[i].trim().split("\\s+").length > 2){
            return false;
        }
    }
    return true;
}
/*TODO
* protected String doSynonyms(int sentid, String sent){
Pattern synonymsp = Pattern.compile(CharacterLearner.synonyms);
Matcher m = synonymsp.matcher(sent);
if(m.find()){
String seg = sent.substring(m.start(), m.end());
System.out.println("\t"+seg);
String save = m.group(2);
String t1 = normalize(m.group(1));
String t2 = normalize(m.group(3));
String[] terms = t2.split("\\s*(\\bor\\b|,)\\s*");
List<String> list = Arrays.asList(terms);
list.add(t1);
//if(list.size() > 1){
group(list, sentid, seg);
System.out.println("[t1] and [t2] are in the same group [syn]");
//}else{
//System.out.println("[t1] and [t2] were not put in the same group [syn]");
//}
save = save.replaceAll("\\[", "\\[").replaceAll("\\]", "\\]");
sent = sent.replaceFirst(" "+save, "");
}
return sent;
}*/
/**
 * Reduces a matched text fragment to a bare, lowercase state term.
 *
 * In order: removes the characters {@code < > , ; .}; removes every whole word listed in
 * {@code ChunkedSentence.stop}; removes digit runs; removes every token of lowercase
 * letters/underscores that ends in "ly" (this also hits adjectives such as "scaly");
 * rewrites a leading "to_" / "or_" to "to" / "or"; collapses whitespace; trims; and
 * lowercases.
 *
 * @param sent text to clean; may be {@code null}
 * @return the cleaned text, or {@code null} if {@code sent} was {@code null}
 */
protected String normalize(String sent){
    if(sent == null){
        return null;
    }
    String cleaned = sent.replaceAll("[<>,;.]", "");
    cleaned = cleaned.replaceAll("\\b("+ChunkedSentence.stop+")\\b", "");
    cleaned = cleaned.replaceAll("\\d+", "");
    cleaned = cleaned.replaceAll("\\b[_a-z]*?ly\\b","");
    cleaned = cleaned.replaceFirst("^to_", "to");
    cleaned = cleaned.replaceFirst("^or_", "or");
    // whitespace clean-up: squeeze runs, then strip both ends
    cleaned = cleaned.replaceAll("\\s+", " ");
    cleaned = cleaned.replaceFirst("^\\s+", "");
    cleaned = cleaned.replaceFirst("\\s+$", "");
    return cleaned.trim().toLowerCase();
}
/*
* NOT USED
protected String collectStateNames(){
StringBuffer tags = new StringBuffer();
try{
Statement stmt = conn.createStatement();
ResultSet rs = stmt.executeQuery("select distinct state from learnedstates");
while(rs.next()){
String tag = rs.getString("state");
if(tag == null){continue;}
tags.append(tag+"|");
}
}catch(Exception e){
e.printStackTrace();
}
return tags.toString()+glossary.getAllCharacters();
}*/
/**
 * Appends one line to the character log widget, running the update through
 * {@code display.syncExec}.
 *
 * @param message text to append; a newline is added after it
 */
protected void showOutputMessage(final String message) {
    final String line = message+"\n";
    Runnable appendToLog = new Runnable() {
        public void run() {
            charLog.append(line);
        }
    };
    display.syncExec(appendToLog);
}
/**
 * Manual entry point: loads the MySQL JDBC driver and opens a connection to the database
 * named by the (empty by default) local settings. The collector calls are left commented
 * out. Errors are printed, and the connection is closed before returning.
 *
 * @param args unused
 */
public static void main(String[] args) {
    //StateCollector sc = new StateCollector("test_asist09ont");
    Connection conn = null;
    String database="";
    String username="";
    String password="";
    try{
        Class.forName("com.mysql.jdbc.Driver");
        String URL = "jdbc:mysql://localhost/"+database+"?user="+username+"&password="+password;
        conn = DriverManager.getConnection(URL);
        //StateCollector sc = new StateCollector(conn, "fnav19", "fnaglossaryfixed");
        //sc.collect();
    }catch(Exception e){
        e.printStackTrace();
    }finally{
        // The connection was previously left open when main returned.
        if(conn != null){
            try{
                conn.close();
            }catch(Exception e){
                e.printStackTrace();
            }
        }
    }
}
}