package fna.parsing.character;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import fna.parsing.ApplicationUtilities;
import fna.parsing.DeHyphenizerCorrected;
import fna.parsing.Learn2Parse;
/**
* Changes:
* Patterns: remove "stop" from start, keep "stop" in the end [when to remove after stop? consider states learned vs. displayed. learned should include only the key term, displayed needs to show constraints and modifiers.]
* to make sure states extracted are of the organ, not related parts.
* Stop: add new stop words:
* a|above|after|all|almost|along|amp|an|and|are|as|at
* |be|because|been|before|being|beneath|between|beyond|but|by|ca|can|could|did|do|does|doing|done|
* |each|even|from|has|had|have|here|how|if|in|into|is|it|its
* |less|may|might|more|most|much|near|not|of|often|on|over|should|so|some|sometimes|should
* |than|that|the|then|there|these|this|those|toward|towards
* |was|well|were|what|when|why|with|without|would
*
* |few|frequently
* |occasionally|often|rarely|somewhat|throughout|very";
*
* False to-patterns:"reduced to", "to form", "appressed to", "in contrast to", "similar to"
* "confined to", "equal to", "perpendicular to", "dissimilar to","lobed to", "divided to", "invisible to"
* "adherent to", "according to", "proximal to"/"distal to", "to touch", "fused to", "attached to"
* "formed by", "axillary to", "back to back"
*
* In Bootstrap: allow different characters in a group. e.g lacking or yellowish to light orangish
*
* numbers: convert to NUM with state number.
*
* comparative: > equal to or slightly shorter than <
* special -ly words: mealy, scaly, prickly,
*
* adv to/or adv adj patterns: sparsely to much branched => record only adj branched.
*
* OR patterns: or not: scabrous or not. =>record only adj scabrous.
* more or less
* ca.: , to ca. x m times or just ca.
*
* list pattern: , "or meeting <"=>match single seg, most cases are fine, this one is bad.
*
*preposition phrases: on rock
*
*tag sentence: should organ names only be tagged when they appear at the beginning of a sentence? e.g. , rounded or with single <groove> [ or change simple pattern?]
*markup character: may need to merge saved stategroups and checked states. eg. > entire or with 3 broad ,
*/
public class CharacterLearner implements Learn2Parse{
// log4j logger shared by all instances of this class.
private static final Logger LOGGER = Logger.getLogger(CharacterLearner.class);
// Loads the JDBC driver class named by the "database.driverPath" application
// property once, when this class is initialized.
static {
try {
Class.forName(ApplicationUtilities.getProperty("database.driverPath"));
} catch (ClassNotFoundException e) {
// The failure is logged and printed but not rethrown, so class
// initialization still completes when the driver class is missing.
LOGGER.error("Couldn't find Class in CharacterLearner" + e);
e.printStackTrace();
}
}
// Database connection shared by every instance; opened lazily by the constructor.
static private Connection conn = null;
// Database name passed to the most recently constructed instance.
static private String database = null;
// One "word": a run of word characters/underscores followed by a single whitespace character.
static private String word = "(?:[\\w_]+\\s)";
// A number: a digit followed by one or more characters that are not lowercase letters.
static private String num = "\\d[^a-z]+"; //5 pairs. 0 . 5 - 3 mm
// Stop words as a regex alternation; stripped by normalize() and used as a right-hand
// delimiter (lookahead) in the or/to patterns below.
static public String stop = "a|above|after|all|almost|along|amp|an|and|are|as|at|be|because|been|before|being|beneath|between|beyond|but|by|ca|can|could|did|do|does|doing|done|each|even|from|has|had|have|here|how|if|in|into|is|it|its|less|may|might|more|most|much|near|not|of|often|on|over|should|so|some|sometimes|should|than|that|the|then|there|these|this|those|toward|towards|was|well|were|what|when|why|with|without|would|few|frequently|occasionally|often|rarely|somewhat|throughout|very";
// "to" phrases that are not state ranges; markSentences() masks them via hide() before parsing.
static private String tophrases="connate to|to ca|reduced to|to form|appressed to|in contrast to|similar to|confined to|equal to|perpendicular to|dissimilar to|lobed to|divided to|invisible to|adherent to|according to|proximal to|distal to|to touch|fused to|attached to|axillary to|back to back";
// "or" phrases that are not state alternatives; masked the same way as tophrases.
static private String orphrases="more or less";
// 1-3 words followed by a bracketed gloss "[ ... ]" of at most 30 characters.
// Groups: 1 = the words, 2 = the whole bracket expression, 3 = the text inside the brackets.
static private String synonyms ="(?:>|^|,|"+stop+") ("+word+"{1,3})(\\[ (.{1,30}) \\])";
//static private String simple = "((?:(?:^|,|>|"+stop+") "+word+"))or ("+word+"{1,})"; //a or b
// "a or b": group 1 = delimiter (start, ',' or '>') plus one word, group 2 = the words after "or",
// matched lazily up to end of input, punctuation, '<' or a stop word (lookahead, not consumed).
static private String simple = "((?:(?:^|,|>) "+word+"))or ("+word+"{1,}?)"+"(?=$|,|;|:|\\.|<|"+stop+")"; //a or b
//static private String simple = "((?:(?:^|,|>|"+stop+") "+word+"))or ("+word+"{1,}?)"+"(?:$|,|;|:|\\.|<|"+stop+")"; //a or b
//static private String list = "((?:(?:^|,|>|"+stop+") "+word+")*), or ("+word+"{1,})"; //a, b, c, or e f g
// "a, b, c, or e f g": group 1 = the delimiter-separated one-word items, group 2 = the words after ", or".
static private String list = "((?:(?:^|,|>) "+word+")*), or ("+word+"{1,}?)"+"(?=$|,|;|:|\\.|<|"+stop+")"; //a, b, c, or e f g
//static private String list = "((?:(?:^|,|>|"+stop+") "+word+")*), or ("+word+"{1,}?)"+"(?:$|,|;|:|\\.|<|"+stop+")"; //a, b, c, or e f g
//static private String to = "((?:(?:^|,|>|"+stop+") (?:[_a-z]+\\s)))to ((?:[_a-z]+\\s){1,})";
// "a to b": same shape as simple, but words are restricted to lowercase letters and underscores.
static private String to = "((?:(?:^|,|>) (?:[_a-z]+\\s)))to ((?:[_a-z]+\\s){1,}?)"+"(?=$|,|;|:|\\.|<|"+stop+")";
//static private String to = "((?:(?:^|,|>|"+stop+") (?:[_a-z]+\\s)))to ((?:[_a-z]+\\s){1,}?)"+"(?:$|,|;|:|\\.|<|"+stop+")";
//static private String tolist ="((?:(?:^|,|>|"+stop+") (?:[_a-z]+\\s))*), to ((?:[_a-z]+\\s){1,})";
//static private String tolist ="((?:(?:^|,|>|"+stop+") (?:[_a-z]+\\s))*), to ((?:[_a-z]+\\s){1,}?)"+"(?:$|,|;|:|\\.|<|"+stop+")";
// "a, b, to c": same shape as list, with ", to" as the connector and lowercase/underscore words.
static private String tolist ="((?:(?:^|,|>) (?:[_a-z]+\\s))*), to ((?:[_a-z]+\\s){1,}?)"+"(?=$|,|;|:|\\.|<|"+stop+")";
// Every distinct StateGroup found so far, in discovery order; handed to Bootstrap.
private ArrayList<StateGroup> stategroups = null;
// sentid -> sentence text with organ names wrapped in <>, filled by markSentences().
private Hashtable<Integer, String> sentences = null;
// NOTE(review): never assigned in this class (the assignment in the constructor is
// commented out), so it is passed on as null — confirm the collaborators accept that.
private Glossary glossary = null;
// StateGroup.toString() -> group, used to count repeated groups instead of re-adding them.
private Hashtable<String, StateGroup> groups = null;
// Regex alternation of learned state names, built by collectStateNames().
private String statespatterns = "";
// Regex alternation of organ names, built by collectOrganNames().
private String organnames = null;
// Prefix prepended to every table name this class reads or writes.
private String tablePrefix = "";
/**
 * Runs the whole learning pipeline for one corpus: collects organ names, tags
 * them in every sentence, extracts or/to state groups, bootstraps characters,
 * de-hyphenates the learned states and finally builds the state-name pattern.
 *
 * On the first construction only (while the shared connection is still null)
 * the connection is opened, the {@code <prefix>_learnedstates} table is
 * created/emptied and {@code charsegment} is cleared in {@code <prefix>_sentence}.
 * Setup failures are logged and the pipeline continues, as before.
 *
 * @param database    database name, stored statically and passed to the collaborators
 * @param tablePrefix prefix of all tables used for this corpus
 */
public CharacterLearner(String database, String tablePrefix) {
    // read sentences in the database.sentencetable generated by unsupervised.pl
    // create StateGroups
    // bootstrap StateGroups
    this.tablePrefix = tablePrefix;
    CharacterLearner.database = database;
    this.groups = new Hashtable<String, StateGroup>();
    Statement stmt = null;
    try{
        if(conn == null){
            String URL = ApplicationUtilities.getProperty("database.url");
            conn = DriverManager.getConnection(URL);
            stmt = conn.createStatement();
            stmt.execute("create table if not exists "+this.tablePrefix+"_learnedstates (state varchar(100) NOT NULL PRIMARY KEY, count int(4))");
            stmt.execute("delete from "+this.tablePrefix+"_learnedstates");
            stmt.execute("update "+this.tablePrefix+"_sentence set charsegment =''");
        }
    }catch(Exception e){
        LOGGER.error("Exception in CharacterLearner constructor" + e);
        e.printStackTrace();
    }finally{
        // The setup statement used to be left open for the life of the connection.
        if(stmt != null){
            try{
                stmt.close();
            }catch(SQLException ignored){
                // best effort: nothing useful can be done if close fails
            }
        }
    }
    //glossary is created in VolumeDehyphenizer
    // this.glossary = new Glossary(new File(Registry.ConfigurationDirectory + "FNAGloss.txt"), true, this.database, this.tablePrefix);
    this.stategroups = new ArrayList<StateGroup>();
    this.sentences = new Hashtable<Integer, String>();
    this.organnames = collectOrganNames();
    markSentences();//tag organ names
    parseSentences();//create StateGroups
    bootstrap();//infer characters
    DeHyphenizerCorrected dh = new DeHyphenizerCorrected(database, this.tablePrefix+"_learnedstates", "state", "count", "_", this.tablePrefix, this.glossary);
    dh.deHyphen();
    this.statespatterns = collectStateNames(); //create character patterns
}
/*
 * Bootstraps the collected state groups: hands them, together with the
 * glossary and the database name, to a Bootstrap instance and runs it.
 */
private void bootstrap(){
    new Bootstrap(stategroups, glossary, database).go();
}
/**
 * Writes one XML description file per row of {@code <prefix>_fileclauselink}.
 * Rows are read in result-set order; each file receives the clauses from the
 * clause after the previous row's {@code endindex} up to its own
 * {@code endindex}. The file name and content are also echoed to stdout.
 * Failures are logged and stop the remaining files from being written.
 */
@SuppressWarnings("unused")
private void assembleDescription(){
    Statement stmt = null;
    try{
        stmt = conn.createStatement();
        ResultSet rs = stmt.executeQuery("select filename, endindex from "+this.tablePrefix+"_fileclauselink");
        int start = 0;
        while(rs.next()){
            String filename = rs.getString("filename");
            int end = rs.getInt("endindex");
            System.out.println("output "+filename);
            String content = getDescription(start, end);
            System.out.println(content);
            BufferedWriter out = new BufferedWriter(new FileWriter(filename));
            try{
                out.write(content);
            }finally{
                // close even when the write fails, so the file handle is not leaked
                out.close();
            }
            start = end+1;
        }
    }catch (Exception e){
        LOGGER.error("Exception in CharacterLearner assembleDescription" + e);
        e.printStackTrace();
    }finally{
        if(stmt != null){
            try{
                stmt.close();
            }catch(SQLException ignored){
                // best effort: nothing useful can be done if close fails
            }
        }
    }
}
/**
 * Returns the marked-up XML description of one source file.
 *
 * The file's last clause index is looked up in {@code <prefix>_fileclauselink};
 * its first clause is the one after the largest smaller {@code endindex}
 * (or clause 0 when there is none).
 *
 * @param filename file name as stored in {@code <prefix>_fileclauselink}
 * @return a list holding the single description string, or an empty list when
 *         the file is unknown or an error occurred (errors are logged)
 */
public ArrayList<String> getMarkedDescription(String filename){
    ArrayList<String> results = new ArrayList<String>();
    PreparedStatement ps = null;
    try{
        // filename comes from the caller, so it is bound as a parameter instead of
        // being concatenated into the SQL text.
        ps = conn.prepareStatement("select endindex from "+this.tablePrefix+"_fileclauselink where filename = ?");
        ps.setString(1, filename);
        ResultSet rs = ps.executeQuery();
        if(rs.next()){
            int end = rs.getInt("endindex");
            ps.close();
            ps = conn.prepareStatement("select endindex from "+this.tablePrefix+"_fileclauselink where endindex < ? order by endindex desc");
            ps.setInt(1, end);
            rs = ps.executeQuery();
            int start = 0;
            if(rs.next()){
                start = rs.getInt("endindex")+1;
            }
            results.add(getDescription(start, end));
        }
    }catch (Exception e){
        LOGGER.error("Exception in CharacterLearner getMarkedDescription" + e);
        e.printStackTrace();
    }finally{
        if(ps != null){
            try{
                ps.close();
            }catch(SQLException ignored){
                // best effort: nothing useful can be done if close fails
            }
        }
    }
    return results;
}
/**
 * Builds the XML description for clauses {@code start..end} (inclusive) of
 * {@code <prefix>_clause}. Each clause becomes one element whose name is
 * {@code modifier_tag} (or just the tag), with leading digits and non-word
 * characters removed, and whose attributes are the stored attribute string.
 * Braces are stripped from the result and phrases masked by hide() are restored.
 *
 * @param start first clause id
 * @param end   last clause id
 * @return the assembled XML text
 * @throws SQLException if a query fails or a clause id in the range has no row
 */
private String getDescription(int start, int end) throws SQLException {
    StringBuilder content = new StringBuilder("<?xml version=\"1.0\"?><description>");
    // One statement for the whole range, closed when done (previously one
    // statement and result set per clause were created and never closed).
    Statement stmt = conn.createStatement();
    try{
        for(int i = start; i <= end; i++){
            ResultSet rs = stmt.executeQuery("select clause, tag, attributes, modifier from "+this.tablePrefix+"_clause where clauseid="+i);
            try{
                if(!rs.next()){
                    // Previously surfaced as a driver-specific "empty result set" error.
                    throw new SQLException("No row in "+this.tablePrefix+"_clause for clauseid="+i);
                }
                String sent = rs.getString("clause");
                sent = sent==null? "" : sent.trim();
                String atts = rs.getString("attributes");
                atts = atts==null? "" : atts.trim();
                String modifier = rs.getString("modifier");
                modifier = modifier==null? "" : modifier.trim();
                modifier = modifier.replaceAll("\\s+", "_");
                String tag = rs.getString("tag");
                tag = tag==null? "" : tag.trim();
                tag = tag.replaceFirst("\\b(2n|n|x)\\b", "chromosomes");
                tag = tag.replaceAll("\\s+", "_");
                // A modifier stored as the literal text "null" counts as absent.
                String name = (modifier.length() != 0 && !modifier.equals("null")) ? modifier+"_"+tag : tag;
                name = name.replaceAll("^\\d+\\s*", "").replaceAll("\\W", "");
                content.append('<').append(name);
                if(atts.length() != 0){
                    content.append(' ').append(atts);
                }
                content.append('>').append(sent).append("</").append(name).append('>');
            }finally{
                rs.close();
            }
        }
    }finally{
        stmt.close();
    }
    content.append("</description>");
    return unhide(content.toString().replaceAll("[}{]", ""));
}
/**
 * Builds the XML attribute string for one sentence/clause.
 *
 * Attributes are collected in four passes: sizes (numbers followed by a length
 * unit), counts (remaining numbers), state groups recorded in charsegment, and
 * any learned/glossary state still present in the text. Multiple values for the
 * same attribute are joined with ";". The result lists the attributes sorted by
 * name as name='value', separated by single spaces.
 *
 * @param sent        the sentence text; matched numbers and segments are removed from a local copy
 * @param charsegment "text#groupExpression" entries separated by ";", or null/empty
 * @return the attribute string, empty when nothing was found
 */
private String generateAttributes(String sent, String charsegment){
// Debugging anchor left in the code: prints a blank line for this one sentence.
if(sent.compareTo("or perennial ;") == 0){
System.out.println();
}
String attributes = "";
//TODO sort atts,
//TODO no duplicated atts are allowed in an xml tag
//TODO deal with comparisons between two organs.
Hashtable<String, String> atts = new Hashtable<String, String>(); //collect attributes then sort them alphabetically
//deal with numbers:size
Pattern p = Pattern.compile("(.*?) ("+num+")(cm|mm|m|dm|meters|meter)\\b(.*)");
Matcher m = p.matcher(sent);
while(m.find()){
String value = m.group(2).trim()+ " "+m.group(3);
if(atts.get("size") == null){
atts.put("size", value);
}else{
atts.put("size", atts.get("size")+";"+value);
}
// Drop the matched number and unit, then rescan what is left.
sent = m.group(1)+m.group(4);
m = p.matcher(sent);
}
//deal with numbers:count
p = Pattern.compile("(.*?) ("+num+")(.*)");
m = p.matcher(sent);
while(m.find()){
String value = m.group(2).replaceAll("\\W+$", "");
// Values containing '/' are removed from the text but not recorded as counts.
if (value.indexOf('/')<0){
if(atts.get("count") == null){
atts.put("count", value);
}else{
atts.put("count", atts.get("count")+";"+value);
}
}
sent = m.group(1)+m.group(3);
m = p.matcher(sent);
}
if(charsegment != null && charsegment.compareTo("")!=0){
String[] segs = charsegment.split(";");
for(int i = 0; i < segs.length; i++){
// NOTE(review): an entry without '#' makes parts[1] throw
// ArrayIndexOutOfBoundsException, and an expression missing from groups
// makes sg null — confirm charsegment is always well-formed here.
String[] parts = segs[i].split("#");
String text = parts[0];
String exp = parts[1];
sent = sent.replace(text, " ");
StateGroup sg = (StateGroup)groups.get(exp);
// Attribute name: the group's most frequent category up to the first '#',
// with whitespace turned into underscores.
String att = sg.mostFreqCategory().replaceFirst("#.*","").replaceAll("\\s+", "_");
if(att.compareTo("") != 0){
String value = text;//TODO to or patterns
if(atts.get(att) == null){
atts.put(att, value);
}else{
atts.put(att, atts.get(att)+";"+value);
}
}
}
}
//TODO deal with negations
// A state name, optionally preceded by not/rarely/barely/seldom and at most one more word.
Pattern pattern = Pattern.compile("((?:(?:not|rarely|barely|seldom) (?:\\w+ )?)?\\b("+statespatterns+")\\b)");
m = pattern.matcher(sent);
while(m.find()){
String state = m.group(2);
String value = m.group(1);
ArrayList<?> chars = Glossary.getCharacter(state);
if(chars.size() >0){
Iterator<?> it = chars.iterator();
String att = "";
// Several candidate characters are joined into one attribute name with "_or_".
while(it.hasNext()){
att += ((String)it.next()).replaceAll("\\s+", "_")+"_or_";
}
att = att.replaceFirst("_or_$", "");
if(atts.get(att) == null){
atts.put(att, value);
}else{
atts.put(att, atts.get(att)+";"+value);
}
}
}
//sort atts
Set<String> keys = atts.keySet();
String[] keyarray = (String[])keys.toArray(new String[]{});
Arrays.sort(keyarray);
for(int i = 0; i<keyarray.length; i++){
String att = keyarray[i]+"='"+(String)atts.get(keyarray[i])+"'";
attributes += att+" ";
}
return attributes.trim();
}
/**
 * Runs parseSentence over every tagged sentence held in {@code sentences},
 * in the order the Hashtable enumerates its keys. Each or/to pattern found is
 * stored as a StateGroup by the methods parseSentence delegates to.
 */
private void parseSentences(){
    for(Enumeration<Integer> ids = sentences.keys(); ids.hasMoreElements(); ){
        Integer id = ids.nextElement();
        parseSentence(id.intValue(), sentences.get(id));
    }
}
/**
 * Repeatedly applies the simple, list, to and to-list extractors (in that
 * order) to one sentence until a full pass leaves the text unchanged. Each
 * extractor removes the segment it matched, so every pass works on less text.
 *
 * @param sentid id of the sentence, forwarded to the extractors
 * @param sent   sentence text with organ names tagged
 */
private void parseSentence(int sentid, String sent){
    String before;
    do{
        before = sent;
        // Debugging anchor kept from the original: prints a blank line for this phrase.
        if(sent.indexOf("glabrous or floccose to tomentose or lanate")>=0){
            System.out.println();
        }
        //sent = doSynonyms(sentid, sent);
        sent = doToList(sentid, doTo(sentid, doList(sentid, doSimple(sentid, sent))));
    }while(!before.equals(sent));
}
/**
 * Extracts the first "a, b, to c" pattern from the sentence, records its
 * terms as one state group and returns the sentence without the matched text.
 *
 * Fixes over the previous version: the list part is split on commas before
 * normalization (normalize() strips commas, so the old split never separated
 * the items), and the contradictory "were not put in the same group" message
 * that was printed after every match is gone.
 *
 * @param sentid id of the sentence the group is recorded against
 * @param sent   sentence text
 * @return the sentence with the matched segment removed, or unchanged when nothing matched
 */
private String doToList(int sentid, String sent){
    Matcher m = Pattern.compile(CharacterLearner.tolist).matcher(sent);
    if(!m.find()){
        return sent;
    }
    String seg = m.group();
    System.out.println( "\t"+seg);
    ArrayList<String> alist = new ArrayList<String>();
    for(String item : m.group(1).split("\\s*,\\s*")){
        alist.add(normalize(item)); // empty results are skipped later by group()
    }
    String t2 = normalize(m.group(2));
    alist.add(t2);
    group(alist, sentid, seg);
    System.out.println("["+normalize(m.group(1))+"] and ["+t2+"] are in the same group [tolist]");
    // Remove exactly the text that was matched above.
    return sent.substring(0, m.start()) + sent.substring(m.end());
}
/**
 * Extracts the first "a to b" pattern from the sentence. The left term and
 * every piece of the right side (split on the words "to"/"or") are normalized
 * and recorded as one state group; progress is echoed to stdout.
 *
 * @param sentid id of the sentence the group is recorded against
 * @param sent   sentence text
 * @return the sentence with the first match removed, or unchanged when nothing matched
 */
private String doTo(int sentid, String sent){
    Matcher m = Pattern.compile(CharacterLearner.to).matcher(sent);
    if(!m.find()){
        return sent;
    }
    String seg = m.group();
    System.out.println( "\t"+seg);
    String left = normalize(m.group(1));
    String right = normalize(m.group(2));
    ArrayList<String> states = new ArrayList<String>();
    states.add(left);
    System.out.print("["+left+"] ");
    for(String piece : right.split("\\b(to|or)\\b")){
        states.add(normalize(piece));
        System.out.print("["+piece+"] ");
    }
    group(states, sentid, seg);
    System.out.println(" are in the same group [to]\n");
    return sent.replaceFirst(CharacterLearner.to, "");
}
/**
 * Extracts the first "a, b, c, or d" pattern from the sentence. The list part
 * is split on commas; when it has three or more items the first one is dropped
 * (conservative: it may belong to a different character, e.g.
 * "sessile, rhombic, lanceolate, or oblanceolate"). The remaining items plus
 * the part after "or" are normalized and recorded as one state group.
 *
 * @param sentid id of the sentence the group is recorded against
 * @param sent   sentence text
 * @return the sentence with the first match removed, or unchanged when nothing matched
 */
private String doList(int sentid, String sent){
    Matcher m = Pattern.compile(CharacterLearner.list).matcher(sent);
    if(!m.find()){
        return sent;
    }
    String seg = m.group();
    System.out.println( "\t"+seg);
    String head = m.group(1);
    String tail = m.group(2);
    String[] items = head.split("\\s*,\\s*");
    int first = items.length >= 3 ? 1 : 0;
    ArrayList<String> states = new ArrayList<String>();
    for(int i = first; i < items.length; i++){
        states.add(normalize(items[i]));
    }
    states.add(normalize(tail));
    group(states, sentid, seg);
    System.out.println ("["+head+"] and ["+tail+"] are in the same group [list]\n");
    return sent.replaceFirst(CharacterLearner.list, "");
}
/**
 * Extracts the first "a or b" pattern from the sentence. When the normalized
 * right side is shorter than 30 characters, the left term and every piece of
 * the right side (split on the words "to"/"or") are recorded as one state
 * group. The matched text is removed whether or not a group was recorded.
 *
 * @param sentid id of the sentence the group is recorded against
 * @param sent   sentence text
 * @return the sentence with the first match removed, or unchanged when nothing matched
 */
private String doSimple(int sentid, String sent){
    Matcher m = Pattern.compile(CharacterLearner.simple).matcher(sent);
    if(!m.find()){
        return sent;
    }
    String seg = m.group();
    System.out.println("\t"+seg);
    String left = normalize(m.group(1));
    String right = normalize(m.group(2));
    if(right.length() < 30){
        ArrayList<String> states = new ArrayList<String>();
        states.add(left);
        System.out.print("["+left+"] ");
        for(String piece : right.split("\\b(to|or)\\b")){
            states.add(normalize(piece));
            System.out.print("["+piece+"] ");
        }
        group(states, sentid, seg);
        System.out.println(" are in the same group [simple]\n");
    }
    return sent.replaceFirst(CharacterLearner.simple, "");
}
/**
 * Extracts the first "term [ gloss ]" pattern: the bracketed gloss is split on
 * commas and the word "or", and its pieces are grouped with the preceding
 * term. The bracket expression (with the space before it) is then removed
 * from the sentence.
 *
 * Fixes over the previous version:
 * - the term list is a real ArrayList (adding to the fixed-size Arrays.asList
 *   view threw UnsupportedOperationException);
 * - the gloss is split before normalization, because normalize() strips commas;
 * - the bracket text is removed literally via Pattern.quote (the old escaping
 *   was a no-op, so "[ ... ]" was interpreted as a character class);
 * - the log line prints the actual terms instead of the literal "[t1]"/"[t2]".
 *
 * @param sentid id of the sentence the group is recorded against
 * @param sent   sentence text
 * @return the sentence without the bracket expression, or unchanged when nothing matched
 */
@SuppressWarnings("unused")
private String doSynonyms(int sentid, String sent){
    Matcher m = Pattern.compile(CharacterLearner.synonyms).matcher(sent);
    if(!m.find()){
        return sent;
    }
    String seg = m.group();
    System.out.println("\t"+seg);
    String bracketed = m.group(2);
    String t1 = normalize(m.group(1));
    ArrayList<String> list = new ArrayList<String>();
    for(String piece : m.group(3).split("\\s*(\\bor\\b|,)\\s*")){
        list.add(normalize(piece)); // empty results are skipped later by group()
    }
    list.add(t1);
    group(list, sentid, seg);
    System.out.println("["+t1+"] and ["+normalize(m.group(3))+"] are in the same group [syn]");
    return sent.replaceFirst(Pattern.quote(" "+bracketed), "");
}
/**
 * Records a set of co-occurring terms as one StateGroup.
 *
 * Each term is split on the whole words "or"/"to"; pieces that are empty or
 * contain an organ name are skipped, the rest are counted in the
 * learned-states table and added to the group. A group whose string form was
 * seen before only has its counter incremented. The matched segment is then
 * appended, as "segment#group", to the sentence's {@code charsegment} column
 * (entries separated by ";").
 *
 * Fixes over the previous version: the split regex now carries the same word
 * boundaries as its guard (it used to cut inside words such as "tomentose" or
 * "orbicular"), and the database access uses bound parameters and closes its
 * statements.
 *
 * @param terms      normalized candidate state terms
 * @param clauseid   sentid of the sentence the segment came from
 * @param matchedseg the raw text matched by the or/to pattern
 */
private void group(List<String> terms, int clauseid, String matchedseg){
    StateGroup g = new StateGroup();
    for(String rawTerm : terms){
        String term = rawTerm.trim();
        String[] pieces = new String[]{ term };
        if(term.matches(".*?\\b(or|to)\\b.*")){
            pieces = term.split("\\s*\\b(or|to)\\b\\s*");
        }
        for(int i = 0; i < pieces.length; i++){
            if(pieces[i].length() != 0 && !pieces[i].matches(".*?\\b("+this.organnames+")\\b.*")){
                String t = add2LearnedStates(pieces[i]);
                g.addState(new State(t, glossary));
            }
        }
    }
    String exp = g.toString();
    if(exp.length() == 0){
        return;
    }
    if(this.groups.containsKey(exp)){
        this.groups.get(exp).increment();
    }else{
        stategroups.add(g); //duplicates will not be added
        this.groups.put(exp, g);
    }
    matchedseg = matchedseg.replaceAll("[><;,\\.]", "").trim();
    matchedseg = matchedseg+"#"+exp;
    PreparedStatement ps = null;
    try{
        ps = conn.prepareStatement("select charsegment from "+this.tablePrefix+"_sentence where sentid = ?");
        ps.setInt(1, clauseid);
        ResultSet rs = ps.executeQuery();
        if(rs.next()){
            String previous = rs.getString("charsegment");
            if(previous != null && previous.length() != 0){
                matchedseg = previous+";"+matchedseg; // seg#exp;seg#exp
            }
        }
        ps.close();
        // Bound parameter: the segment is free text and may contain quote characters.
        ps = conn.prepareStatement("update "+this.tablePrefix+"_sentence set charsegment = ? where sentid = ?");
        ps.setString(1, matchedseg);
        ps.setInt(2, clauseid);
        ps.executeUpdate();
    }catch (Exception e){
        LOGGER.error("Exception in CharacterLearner group" + e);
        e.printStackTrace();
    }finally{
        if(ps != null){
            try{
                ps.close();
            }catch(SQLException ignored){
                // best effort: nothing useful can be done if close fails
            }
        }
    }
}
/**
 * Counts one occurrence of a state term in {@code <prefix>_learnedstates}:
 * increments the stored count when the term exists, otherwise inserts it with
 * count 1. Database errors are logged and do not affect the return value.
 *
 * The term is bound as a statement parameter (it used to be concatenated into
 * quoted SQL, which breaks on terms containing an apostrophe) and the
 * statements are closed after use.
 *
 * @param term the state term to count
 * @return the term, unchanged
 */
private String add2LearnedStates(String term){
    String table = this.tablePrefix+"_learnedstates";
    PreparedStatement ps = null;
    try{
        ps = conn.prepareStatement("select state, count from "+table+" where state = ?");
        ps.setString(1, term);
        ResultSet rs = ps.executeQuery();
        if(rs.next()){
            int count = rs.getInt("count")+1;
            ps.close();
            ps = conn.prepareStatement("update "+table+" set count = ? where state = ?");
            ps.setInt(1, count);
            ps.setString(2, term);
        }else{
            ps.close();
            ps = conn.prepareStatement("insert into "+table+" values(?, 1)");
            ps.setString(1, term);
        }
        ps.executeUpdate();
    }catch (Exception e){
        LOGGER.error("Exception in CharacterLearner add2LearnedStates" + e);
        e.printStackTrace();
    }finally{
        if(ps != null){
            try{
                ps.close();
            }catch(SQLException ignored){
                // best effort: nothing useful can be done if close fails
            }
        }
    }
    return term;
}
/**
 * Reduces a matched text fragment to its bare state term(s).
 *
 * In order: removes the characters {@code < > , ; .}, whole-word stop words,
 * digit runs and lowercase words ending in "ly"; rewrites a leading "to_" or
 * "or_" to "to"/"or"; collapses whitespace; trims; lower-cases.
 *
 * @param sent the fragment, may be null
 * @return the normalized text, or null when the input was null
 */
private String normalize(String sent){
    if(sent == null){
        return null;
    }
    String text = sent.replaceAll("[<>,;.]", "");
    text = text.replaceAll("\\b("+stop+")\\b", "");
    text = text.replaceAll("\\d+", "");
    text = text.replaceAll("\\b[_a-z]*?ly\\b","");
    text = text.replaceFirst("^to_", "to");
    text = text.replaceFirst("^or_", "or");
    text = text.replaceAll("\\s+", " ");
    text = text.replaceFirst("^\\s+", "");
    text = text.replaceFirst("\\s+$", "");
    return text.trim().toLowerCase();
}
/**
 * Builds the regex alternation of all known state names: every distinct
 * non-null state in {@code <prefix>_learnedstates}, each followed by "|",
 * with the result of Glossary.getAllCharacters() appended at the end.
 * Database errors are logged; whatever was collected so far is still used.
 * The statement is now closed after use.
 *
 * @return the alternation string used to build state-matching patterns
 */
private String collectStateNames(){
    StringBuffer tags = new StringBuffer();
    Statement stmt = null;
    try{
        stmt = conn.createStatement();
        ResultSet rs = stmt.executeQuery("select distinct state from "+this.tablePrefix+"_learnedstates");
        while(rs.next()){
            String tag = rs.getString("state");
            if(tag != null){
                tags.append(tag).append("|");
            }
        }
    }catch(Exception e){
        LOGGER.error("Exception in CharacterLearner collectStateNames" + e);
        e.printStackTrace();
    }finally{
        if(stmt != null){
            try{
                stmt.close();
            }catch(SQLException ignored){
                // best effort: nothing useful can be done if close fails
            }
        }
    }
    return tags.toString()+Glossary.getAllCharacters();
}
/**
 * Builds the regex alternation of organ names from three sources, in order:
 * glossary terms of category STRUCTURE/SUBSTANCE/PLANT, the distinct sentence
 * tags, and the words marked with pos "p" in {@code <prefix>_wordpos}.
 *
 * Fixes over the previous version: duplicates are detected for every entry
 * (the old "|tag|" test could never match the first one), null or empty names
 * are skipped, the result never ends in "|" (previously an error part-way
 * through, or an empty result, left a dangling separator or raised an
 * exception), and the statement is closed.
 *
 * @return names joined by "|", or an empty string when none were found
 */
private String collectOrganNames(){
    // The leading "|" is a sentinel so that "|name|" also finds the first entry.
    StringBuffer tags = new StringBuffer("|");
    Statement stmt = null;
    try{
        stmt = conn.createStatement();
        ResultSet rs = stmt.executeQuery("select distinct term from "+this.tablePrefix+"_fnaglossary where category in ('STRUCTURE', 'SUBSTANCE', 'PLANT')");
        while(rs.next()){
            appendOrganName(tags, rs.getString("term"));
        }
        rs = stmt.executeQuery("select distinct tag from "+this.tablePrefix+"_sentence");
        while(rs.next()){
            appendOrganName(tags, rs.getString("tag"));
        }
        //find pl. form
        rs = stmt.executeQuery("select word from "+this.tablePrefix+"_wordpos where pos = \"p\"");
        while(rs.next()){
            String plural = rs.getString("word");
            appendOrganName(tags, plural == null ? null : plural.trim());
        }
    }catch(Exception e){
        LOGGER.error("Exception in CharacterLearner collectOrganNames" + e);
        e.printStackTrace();
    }finally{
        if(stmt != null){
            try{
                stmt.close();
            }catch(SQLException ignored){
                // best effort: nothing useful can be done if close fails
            }
        }
    }
    // Strip the sentinel and the separator that follows the last name.
    String joined = tags.substring(1);
    if(joined.endsWith("|")){
        joined = joined.substring(0, joined.length()-1);
    }
    return joined;
}

/** Appends name plus "|" to tags unless name is null, empty or already present. */
private static void appendOrganName(StringBuffer tags, String name){
    if(name == null || name.length() == 0 || tags.indexOf("|"+name+"|") >= 0){
        return;
    }
    tags.append(name).append("|");
}
/**
 * Reads every sentence from the <prefix>_sentence table, wraps each organ name
 * it contains in <>, and stores the result in this.sentences keyed by sentid.
 *
 * Before tagging, text in (), {} and [] is removed and the known false
 * "to"/"or" phrases are masked by hide() so the or/to patterns cannot match them.
 * Any exception is logged and ends the loop, leaving later sentences unmarked.
 */
private void markSentences(){
try{
// Lazily-prefixed match: group 1 = text before the first organ name, group 2 = the name, group 3 = the rest.
Pattern tagsp = Pattern.compile("(.*?)\\b("+this.organnames+")\\b(.*)", Pattern.CASE_INSENSITIVE);
//now mark sentence one by one, add marked sentences in this.sentences
//break sentence into meaningful clauses (each with a marked subject)
//create a clause table to save the clauses
//tracking the relation between filenames and clauses, saving this info in
//a new table fileclauselink (filename, endindex of the last clause in the file).
/*moved to sentences2clauses
stmt.execute("create table if not exists "+this.tablePrefix+"_clause (clauseid int(11) not null primary key, tag varchar(150), modifier varchar(150), clause varchar(500), charsegment varchar(250), attributes varchar(500))");
stmt.execute("delete from "+this.tablePrefix+"_clause");
stmt.execute("create table if not exists "+this.tablePrefix+"_fileclauselink (filename varchar(200) not null primary key, endindex int(11))");
stmt.execute("delete from "+this.tablePrefix+"_fileclauselink");
*/
Statement stmt = conn.createStatement();
ResultSet rs = stmt.executeQuery("select count(sentid) from "+this.tablePrefix+"_sentence");
rs.next();
int total = rs.getInt(1);
int sentid = 0;
String[] tos = CharacterLearner.tophrases.split("\\|");
String[] ors = CharacterLearner.orphrases.split("\\|");
// NOTE(review): the loop visits ids 0..total-1, so it presumably relies on sentids
// being contiguous from 0 — confirm against the code that fills the table.
while(sentid < total){
//rs = stmt.executeQuery("select tag, modifier, sentence from "+this.tablePrefix+"_sentence where sentid="+sentid+""); Hong: 10/17/09
rs = stmt.executeQuery("select tag, modifier, originalsent from "+this.tablePrefix+"_sentence where sentid="+sentid+"");
if(rs.next()){
// NOTE(review): a null originalsent throws NullPointerException on the next
// statement, which the outer catch turns into "stop processing all sentences".
String sent = (String)rs.getString("originalsent"); //Partha 10/17/09
//String sent = (String)rs.getString("sentence");
// Drop parenthesized, braced and bracketed text.
sent = sent.replaceAll("\\([^)]*\\)", ""); ///Hong 10/17/09 added 3 lines
sent = sent.replaceAll("\\{[^}]*\\}", "");
sent = sent.replaceAll("\\[[^]]*\\]", "");
//sent = sent.replaceAll(this.num, "NUM "); //all numbers => NUM
// Mask the false to/or phrases (spaces become '*'); unhide() restores them later.
if(sent.matches(".*?("+CharacterLearner.tophrases+").*")){
sent = hide(tos, sent);
}
if(sent.matches(".*?("+CharacterLearner.orphrases+").*")){
sent = hide(ors, sent);
}
String taggedsent = "";
/*String[] sts = sent.split("\\s*,\\s*");
Matcher m = null;
for(int i = 0; i< sts.length; i++){
if(i!=0){ sts[i] = " , "+sts[i];}
m = p.matcher(sts[i]);
if(m.matches()){ //tag the first mentioning of an organ in a sentence
taggedsent += m.group(1)+"<"+m.group(2)+">"+m.group(3);
}else{
taggedsent +=sts[i];
}
}*/
// Tag organ names left to right: emit the prefix and the tagged name, then
// continue on the remainder until no organ name is left.
Matcher m = tagsp.matcher(sent);
while(m.matches()){
taggedsent += m.group(1)+"<"+m.group(2)+">";
sent = m.group(3);
m = tagsp.matcher(sent);
}
taggedsent +=sent;
/* seg clauses should be done later, after the learning of states.
Pattern p = Pattern.compile(", (\\w+)? ?(<.*?>)");//the word after , should not be connectors such as "or"
Matcher m2 = p.matcher(taggedsent);
int start = 0;
while(m2.find()){
if(m2.group(1)==null || m2.group(1).compareTo("or") != 0){//the word after , should not be connectors such as "or"
int end = m2.start(); //this ends a clause
String taggedclause = taggedsent.substring(start, end+1);
addClause(sentid, sentid+offset, tag, modifier, taggedclause, false);
offset++;
start = end+1;
modifier = m2.group(1);
tag = m2.group(2).replaceAll("[<>]", "");
}
}
String taggedclause = taggedsent.substring(start);
addClause(sentid, sentid+offset, tag, modifier, taggedclause, true);
*/
sentences.put(new Integer(sentid), taggedsent); //do this in addClause
}
sentid++;
//System.out.println(sentid);
}
}catch (Exception e){
LOGGER.error("Exception in CharacterLearner markSentences" + e);
e.printStackTrace();
}
}
/**
 * Masks each given phrase in the text by replacing the whitespace inside the
 * phrase with '*', so later word-based patterns no longer see it as separate
 * words. Each phrase is applied as a regular expression.
 *
 * @param phrases phrases to mask
 * @param str     text to mask them in
 * @return the text with every occurrence of every phrase masked
 */
private String hide(String[] phrases, String str){
    String masked = str;
    for(String phrase : phrases){
        masked = masked.replaceAll(phrase, phrase.replaceAll("\\s+", "*"));
    }
    return masked;
}
/**
 * Reverses hide(): turns every '*' back into a space and trims the result.
 *
 * @param str text possibly containing masked phrases
 * @return the text with all '*' replaced by spaces, trimmed
 */
private String unhide(String str){
    return str.replace('*', ' ').trim();
}
/**
 * Creates (if missing) and empties the {@code <prefix>_clause} and
 * {@code <prefix>_fileclauselink} tables. Errors are logged; the statements
 * after a failing one are not executed. The statement is now closed after use.
 */
private void createClauseTables(){
    String clauseTable = this.tablePrefix+"_clause";
    String linkTable = this.tablePrefix+"_fileclauselink";
    String[] setup = {
        "create table if not exists "+clauseTable+" (clauseid int(11) not null primary key, tag varchar(150), modifier varchar(150), clause varchar(500), charsegment varchar(500), attributes varchar(500))",
        "delete from "+clauseTable,
        "create table if not exists "+linkTable+" (filename varchar(200) not null primary key, endindex int(11))",
        "delete from "+linkTable
    };
    Statement stmt = null;
    try{
        stmt = conn.createStatement();
        for(int i = 0; i < setup.length; i++){
            stmt.execute(setup[i]);
        }
    }catch(Exception e){
        LOGGER.error("Exception in CharacterLearner createClauseTables" + e);
        e.printStackTrace();
    }finally{
        if(stmt != null){
            try{
                stmt.close();
            }catch(SQLException ignored){
                // best effort: nothing useful can be done if close fails
            }
        }
    }
}
/**
 * Wraps every known state name in the sentence in {} (case-insensitive match
 * against statespatterns). States that end up separated only by "} {" are
 * merged into a single braced run.
 *
 * NOTE(review): if statespatterns contains an empty alternative the pattern can
 * match without consuming text — confirm that cannot happen, since the loop
 * only ends when the remainder no longer matches.
 *
 * @param sent sentence with organ names tagged, e.g. <leaves> basal.
 * @return the sentence with states tagged
 */
private String tagStates(String sent){
    Pattern statePattern = Pattern.compile("(.*?)\\b("+this.statespatterns+")\\b(.*)", Pattern.CASE_INSENSITIVE);
    StringBuilder tagged = new StringBuilder();
    String rest = sent;
    for(Matcher m = statePattern.matcher(rest); m.matches(); m = statePattern.matcher(rest)){
        tagged.append(m.group(1)).append("{").append(m.group(2)).append("}");
        rest = m.group(3);
    }
    tagged.append(rest);
    return tagged.toString().replaceAll("\\} \\{", " ");
}
/**
 * Reads tag, modifier and charsegment of one sentence from
 * {@code <prefix>_sentence}.
 *
 * A missing row is now reported with a warning instead of a driver exception,
 * and the statement is closed after use.
 *
 * @param sentid id of the sentence
 * @return a 3-element array {tag, modifier, charsegment}; entries are null when
 *         the column is null, the row is missing, or the query failed
 */
private String[] getInfo(int sentid){
    String[] info = new String[3];
    Statement stmt = null;
    try{
        stmt = conn.createStatement();
        ResultSet rs = stmt.executeQuery("select tag, modifier, charsegment from "+this.tablePrefix+"_sentence where sentid="+sentid+"");
        if(rs.next()){
            info[0] = rs.getString("tag");
            info[1] = rs.getString("modifier");
            info[2] = rs.getString("charsegment");
        }else{
            LOGGER.warn("No row in "+this.tablePrefix+"_sentence for sentid="+sentid);
        }
    }catch (Exception e){
        LOGGER.error("Exception in CharacterLearner getInfo" + e);
        e.printStackTrace();
    }finally{
        if(stmt != null){
            try{
                stmt.close();
            }catch(SQLException ignored){
                // best effort: nothing useful can be done if close fails
            }
        }
    }
    return info;
}
/**
 * Splits a charsegment string ("text#exp" entries separated by ";") into the
 * leading run of entries whose text occurs in the given clause and everything
 * after that run. The run stops at the first entry whose text is not found.
 *
 * @param taggedclause the clause text to look the entry texts up in
 * @param charsegment  the ";"-separated entries, may be null or blank
 * @return [0] the segment of charsegment matching this taggedclause, [1] the rest
 */
private String[] splitCharSegment(String taggedclause, String charsegment){
    String[] splits = {"",""};
    if(charsegment == null || charsegment.trim().length() == 0){
        return splits;
    }
    String[] segs = charsegment.split(";");
    // Index of the first entry that does not belong to this clause.
    int boundary = 0;
    while(boundary < segs.length && taggedclause.indexOf(segs[boundary].split("#")[0]) >= 0){
        boundary++;
    }
    StringBuilder matching = new StringBuilder();
    StringBuilder remaining = new StringBuilder();
    for(int k = 0; k < segs.length; k++){
        (k < boundary ? matching : remaining).append(segs[k]).append(";");
    }
    // Each non-empty buffer ends with exactly one surplus separator.
    if(matching.length() > 0){
        matching.setLength(matching.length()-1);
    }
    if(remaining.length() > 0){
        remaining.setLength(remaining.length()-1);
    }
    splits[0] = matching.toString();
    splits[1] = remaining.toString();
    return splits;
}
/**
 * Breaks every marked sentence into clauses and stores them via addClause().
 *
 * A new clause starts at each ", " that is followed by an optional {state} run
 * and an organ tag <...>; that state run becomes the new clause's modifier and
 * the organ its tag. Clause ids are sentid plus the number of extra clauses
 * created so far, and the sentence's charsegment entries are divided among its
 * clauses with splitCharSegment().
 */
private void sentences2clauses(){
createClauseTables();
int total = sentences.size();
// Number of additional clauses created so far; shifts every later clause id.
int offset = 0;
for(int sentid = 0; sentid < total; sentid++){
// Debugging anchor left in the code: prints a blank line for sentence 567.
if(sentid == 567){
System.out.println();
}
// NOTE(review): returns null when no sentence was stored under this id, which
// would fail inside tagStates — confirm ids are contiguous from 0.
String taggedsent = (String)sentences.get(new Integer(sentid));
taggedsent = tagStates(taggedsent);
String[] info = getInfo(sentid);
String tag = info[0];
String modifier = info[1];
String charsegment = info[2]; //TODO split charsegment among clauses.
// group 1 = optional {state} run before the organ, group 2 = the <organ> tag.
Pattern p = Pattern.compile(", (\\{[^{]*?\\})? ?(<\\w*?>)");
Matcher m2 = p.matcher(taggedsent);
int start = 0;
while(m2.find()){
//if(m2.group(1)==null ){
int end = m2.start(); //this ends a clause
// The clause keeps its closing comma (end is the comma's index).
String taggedclause = taggedsent.substring(start, end+1);
taggedclause = taggedclause.replaceAll("[}{]", "");
String[] segs = splitCharSegment(taggedclause, charsegment);
charsegment = segs[1];
addClause(sentid, sentid+offset, tag, modifier, taggedclause, segs[0], false);
offset++;
start = end+1;
// Tag and modifier for the NEXT clause come from this match; the modifier
// still carries its braces here.
modifier = m2.group(1)==null? "" : m2.group(1);
tag = m2.group(2).replaceAll("[<>]", "");
//}
}
// The remainder is the sentence's last clause and receives the leftover charsegment.
String taggedclause = taggedsent.substring(start);
taggedclause = taggedclause.replaceAll("[}{]", "");
addClause(sentid, sentid+offset, tag, modifier, taggedclause, charsegment, true);
}
}
/**
 * Inserts one clause into {@code <prefix>_clause}; for the last clause of a
 * sentence that ends a file (per {@code <prefix>_sentinfile}) it also records
 * the file's last clause id in {@code <prefix>_fileclauselink}.
 *
 * All values are bound as statement parameters (clause text containing an
 * apostrophe used to break the concatenated SQL) and statements are closed.
 * A null tag or modifier is still stored as the text "null", as before.
 *
 * @param sentid       id of the sentence the clause came from
 * @param clauseid     id to store the clause under
 * @param tag          organ tag of the clause
 * @param modifier     modifier of the tag
 * @param taggedclause clause text, possibly still containing <> organ markup
 * @param charsegment  "text#exp" entries for this clause; null or "null" means none
 * @param lastclause   true when this is the final clause of its sentence
 */
private void addClause(int sentid, int clauseid, String tag, String modifier, String taggedclause, String charsegment, boolean lastclause){
    //remove <> from taggedclause before put it in the clause table
    //remove 2nd and later sets of <> from taggedclause before put into sentences (renamed to clauses) <pollen><grains> a b c d e ...
    Pattern p = Pattern.compile("^([^>]*?)> <(.*)");
    Matcher m = p.matcher(taggedclause);
    if(m.matches()){
        taggedclause = m.group(1)+"@"+m.group(2);
    }
    // Marker substitutions kept exactly as before: their order decides what
    // happens to literal '#' and '@' characters in the clause text.
    String tmp = taggedclause.replaceFirst("<", "#").replaceFirst(">", "##");
    tmp = tmp.replaceAll("[<>]", "");
    tmp = tmp.replaceFirst("##", ">");
    tmp = tmp.replaceFirst("#", "<");
    tmp = tmp.replaceFirst("@", "> <");
    //sentences.put(new Integer(clauseid), tmp);
    tmp = tmp.replaceAll("[<>]", "");
    charsegment = charsegment==null || charsegment.trim().compareToIgnoreCase("null")==0 ? "" : charsegment;
    charsegment = charsegment.trim();
    PreparedStatement ps = null;
    try{
        ps = conn.prepareStatement("insert into "+this.tablePrefix+"_clause (clauseid, tag, modifier, clause, charsegment) values(?, ?, ?, ?, ?)");
        ps.setInt(1, clauseid);
        ps.setString(2, String.valueOf(tag));
        ps.setString(3, String.valueOf(modifier));
        ps.setString(4, tmp);
        ps.setString(5, charsegment);
        ps.executeUpdate();
        if(lastclause){
            ps.close();
            ps = conn.prepareStatement("select filename from "+this.tablePrefix+"_sentinfile where endindex = ?");
            ps.setInt(1, sentid);
            ResultSet rs = ps.executeQuery();
            if(rs.next()){
                String fname = rs.getString("filename");
                ps.close();
                ps = conn.prepareStatement("insert into "+this.tablePrefix+"_fileclauselink values (?, ?)");
                ps.setString(1, fname);
                ps.setInt(2, clauseid);
                ps.executeUpdate();
            }
        }
    }catch (Exception e){
        LOGGER.error("Exception in CharacterLearner addClause" + e);
        e.printStackTrace();
    }finally{
        if(ps != null){
            try{
                ps.close();
            }catch(SQLException ignored){
                // best effort: nothing useful can be done if close fails
            }
        }
    }
}
/**
 * Command-line entry point. Currently does nothing: the sample driver code
 * below is commented out.
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
// TODO Auto-generated method stub
/* String gfile = "C://Documents and Settings//hongcui//Desktop//WorkFeb2008//FNA//FNAGloss.txt";
CharacterLearner cl = new CharacterLearner("fnav5_corpus", "fna");
cl.markupCharState();
cl.assembleDescription();
cl.getMarkedDescription("1.xml");*/
}
}