/**
 *
 */
package fna.parsing.state;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.eclipse.swt.custom.StyledText;
import org.eclipse.swt.widgets.Display;

import outputter.knowledge.TermOutputerUtilities;
import fna.charactermarkup.ChunkedSentence;
import fna.charactermarkup.Utilities;
import fna.parsing.ApplicationUtilities;
import conceptmapping.*;

/**
 * Marks organ names with {@code <...>} and state (character) terms with
 * <code>{...}</code> in the sentences of the {@code <tableprefix>_sentence} table and
 * stores the result in {@code <tableprefix>_markedsentence}.
 *
 * @author hongcui
 * last stable version: 653
 * this version: try to find additional nouns from unknown words, and mark them with <>.
 */
@SuppressWarnings("unused")
public class SentenceOrganStateMarker {
    /** source -> "modifier##tag##text" before marking, source -> marked sentence afterwards. */
    private Hashtable<String, String> sentences = new Hashtable<String, String>();
    private Connection conn = null;
    private boolean marked = false;
    private boolean fixadjnn = false;
    private int fixedcount = 0;
    private Hashtable<String, String> adjnounsent = null;
    private String adjnounslist = "";
    private String organnames = null;
    private String statenames = null;
    private String tableprefix = null;
    private String glosstable = null;
    private String colors = null;
    /** Lazily compiled from {@link #colors}; see {@link #stringColors(String)}. */
    private Pattern colorp = null;

    // don't introduce () in compoundprep as it is used in a reg exp
    public static String compoundprep =
          "adaxial to|abaxial to|axial to|herniat(?:ed|ing|es|e) into|ipsilateral to|displaced to"
        + "|herniat(?:ed|ing|es|e) out of|specificity to|altered number of|deviation towards?|deviation towards?"
        + "|misalign(?:ed|ing|s)? towards?|misalign(?:ed|ing|s)? away from|ha(?:s|ve|ving) extra parts of"
        + "|lack(?:s|ing)? all parts of|ha(?:s|ve|ving) fewer parts of|decreased sensitivity to|increased sensitivity to"
        + "|sensitivity to|proportionality to|lack(?:s|ing)? of|detach(?:es|ed|ing)? from|susceptibility toward"
        + "|reflex angle to|obtuse angle to|convex angle to|adjacent to|locat(?:ed|es|ing) in|insoluble in|soluble in"
        + "|parallel to|articulat(?:ed|es|ing|e) with|increased sensitivity towards?"
        + "|broadly articulat(?:ed|es|ing|e) with|tightly articulat(?:ed|es|ing|e) with|articulat(?:ed|es|ing|e) with"
        + "|decreased sensitivity towards?|separat(?:ed|ing|es|e) from|responsive to|unresponsive to|far from"
        + "|susceptible towards?|insusceptible towards?|anterodorsal to|posterodorsal to|anteroventral to|posteroventral to"
        + "|concentration of|sensitivity towards?|respons(?:ing|es|e) to|resistant(?:ing|ed|s)? to|ha(?:s|ve|ving) number of"
        + "|overlap(?:ped|ping|s)? with|in contact with|proximal to|ventral to|lateral to|hyporesponsive to|medial to"
        + "|hyperresponsive to|increased tolerance to|diagonal to|decreased tolerance to|basal to|cauline to"
        + "|sensitive towards?|insensitive towards?|tolerant to|dorsal to|decreased resistance to|increased resistance to"
        + "|resistance to|protrud(?:ing|ed|es|e) into|protrud(?:ing|ed|es|e) out of|anterior to|posterior to"
        + "|decreased susceptibility toward|attach(?:ed|ing|es) to|increased susceptibility toward|distal to"
        + "|misalign(?:ed|ing|s)? with|align(?:ed|ing|s)? with|decreased variability of|increased variability of"
        + "|variability of|posteromedial to|orient(?:ed|ing|s)? towards|multifocal to|divergent from|perpendicular to"
        + "|interlock(?:ed|ing|s)? with|level(?:ing|ed|s)? with|unfus(?:ing|ed|es|e) from|fus(?:ing|ed|es|e) with"
        + "|insert(?:ing|ed|s)? into|right angle to|dissociat(?:ing|ed|es|e) from|left side of|right side of"
        + "|according to|ahead of|along with|apart from|as for|aside from|as per|as to as well as|away from"
        + "|because of|but for|by means of|close to|composed of|consist(?:s|ed)? of|contrary to|depending on"
        + "|due to|except for|equal to|forward of|further to|in addition to|in association to|in association with"
        + "|in between|in case of|in combination with|in face of|in favour of|in front of|in lieu of|in spite of"
        + "|instead of|in view of|near to|next to|on account of|on behalf of|on board|on to|on top of|opposite to"
        + "|other than|out of|outside of|owing to|preparatory to|prior to|regardless of|relative to|save for"
        + "|subequal to|together with|unequal to|up against|up until|vis-a-vis|with reference to|with regards? to"
        + "|in left side of|in right side of|connect(?:ing|ed|s)? to|extend(?:ing|ed|s)? to|deep to"
        + "|develop(?:ing|ed|s)? from|enclos(?:ing|es|e)|extend(?:ing|ed|s)? from|in anterior side of"
        + "|in distal side of|in lateral side of|in median plane of|in posterior side of|in proximal side of"
        + "|locat(?:ed|ing|e) in|overlap(?:s)?|part of|pass(?:ing|es)? through|surrounded by|surround(?:s|ing)?|vicinity of";
    public static Pattern compreppattern = Pattern.compile("(.*?)\\b("+compoundprep+")\\b(.*)");

    /** Matches "{dorsal} and {anal} <fins>"-like coordinations; see {@link #markASentence}. */
    private static final Pattern SHARED_ORGAN_PATTERN =
        Pattern.compile("(.*?)<?\\{(\\w+)\\}>? and ([^\\d]*) (<\\w+> *)+(.*)");
    /** Matches the last "}-word" in a sentence; see {@link #markthis}. */
    private static final Pattern HYPHENATED_STATE_PATTERN = Pattern.compile("(.*\\}-)(\\w+)(.*)");

    private String ignoredstrings = "if at all|at all|as well (?!as)|i\\s*\\.\\s*e\\s*\\.|means of";
    private Display display;
    private StyledText charLog;
    private String termprefix = "basi|hypo";
    private Connection con;
    private String url;
    private boolean printCompoundPP = false;
    private Pattern organp;
    private Pattern statep;

    /**
     * Recreates the {@code _markedsentence} table, loads and normalizes every non-empty
     * sentence from {@code _sentence}, collects the adjective-noun modifiers, and compiles
     * the organ and state patterns used by {@link #markASentence}.
     *
     * Database errors are printed and swallowed, as before, so a partially initialized
     * marker can result from a failing connection.
     *
     * @param conn        open database connection; not closed by this class
     * @param tableprefix prefix of the project tables
     * @param glosstable  name of the glossary table
     * @param fixadjnn    whether adjective-noun modifiers should be rewritten by fixInner
     * @param display     SWT display for progress messages, may be null
     * @param charLog     SWT text widget for progress messages, may be null
     */
    public SentenceOrganStateMarker(Connection conn, String tableprefix, String glosstable, boolean fixadjnn, Display display, StyledText charLog) {
        this.display = display;
        this.charLog = charLog;
        this.tableprefix = tableprefix;
        this.conn = conn;
        this.glosstable = glosstable;
        this.fixadjnn = fixadjnn;

        Statement setup = null;
        try{
            setup = conn.createStatement();
            setup.execute("drop table if exists "+this.tableprefix+"_markedsentence");
            setup.execute("create table if not exists "+this.tableprefix+"_markedsentence (sentid int(11)NOT NULL Primary Key, source varchar(100) , markedsent text, rmarkedsent text, type varchar(20))");
            colors = this.colorsFromGloss();
        }catch(Exception e){
            e.printStackTrace();
        }finally{
            closeQuietly(setup);
        }

        //preparing...
        this.adjnounsent = new Hashtable<String, String>(); //tag -> adjnoun (e.g. inner)
        ArrayList<String> adjnouns = new ArrayList<String>();//all adjnouns
        Statement stmt = null;
        ResultSet rs = null;
        try{
            stmt = conn.createStatement();
            //Ditto sentences are left as they are: merging them into the previous sentence
            //had the drawback of attaching the nearest organ as the subject of the ditto sentence.
            rs = stmt.executeQuery("select source, modifier, tag, sentence, originalsent from "+this.tableprefix+"_sentence order by sentid desc");
            while(rs.next()){//read sent in in reversed order
                String rawsent = rs.getString("sentence");
                if(rawsent == null){
                    continue; //a null sentence used to abort loading of all remaining rows
                }
                String sent = rawsent.trim();
                if(sent.length()!=0){
                    String text = normalizeSentenceText(sent);
                    text = rs.getString("modifier")+"##"+rs.getString("tag")+"##"+text;
                    sentences.put(rs.getString("source"), text);
                }
            }
            closeQuietly(rs);

            //collect adjnouns
            rs = stmt.executeQuery("SELECT distinct modifier FROM "+this.tableprefix+"_sentence s where modifier != \"\" and tag like \"[%\"");
            while(rs.next()){
                String modifier = rs.getString(1).replaceAll("\\[.*?\\]", "").trim();
                //an empty entry would add an empty alternative to the adjnoun regular expression
                if(modifier.length() > 0){
                    adjnouns.add(modifier);
                }
            }
            closeQuietly(rs);

            //collect sentences that need adj-nn disambiguation
            rs = stmt.executeQuery("SELECT source, tag, modifier FROM "+this.tableprefix+"_sentence s where modifier != \"\" and tag like \"[%\"");
            while(rs.next()){
                //read the modifier column by name: column 2 of this query is the tag
                String modifier = rs.getString("modifier").replaceAll("\\[.*?\\]", "").trim();
                String tag = rs.getString("tag");
                adjnounsent.put(tag, modifier);//tag: [phyllary]
            }
        }catch(Exception e){
            e.printStackTrace();
        }finally{
            closeQuietly(rs);
            closeQuietly(stmt);
        }

        //longest-last sort order reversed, so that longer alternatives are tried first
        Collections.sort(adjnouns);
        StringBuilder list = new StringBuilder(this.adjnounslist);
        for(int i = adjnouns.size()-1; i>=0; i--){
            list.append(adjnouns.get(i)).append("|");
        }
        this.adjnounslist = list.toString();
        this.adjnounslist = this.adjnounslist.trim().length()==0? null :
            "[<{]*"+this.adjnounslist.replaceFirst("\\|$", "").replaceAll("\\|+", "|").replaceAll("\\|", "[}>]*|[<{]*").replaceAll(" ", "[}>]* [<{]*")+"[}>]*";

        this.organnames = collectOrganNames();
        this.statenames = collectStateNames();
        this.organp = Pattern.compile("(.*?)\\b("+organnames+")\\b(.*)", Pattern.CASE_INSENSITIVE);
        this.statep = Pattern.compile("(.*?)\\b("+statenames+")\\b(.*)", Pattern.CASE_INSENSITIVE);
    }

    /**
     * Applies the text normalizations that precede marking: strips B/N/O/M tags, joins
     * color phrases and compound prepositions, and rewrites a few unit/abbreviation forms.
     *
     * @param sent trimmed, non-empty sentence text
     * @return the normalized text
     */
    private String normalizeSentenceText(String sent) {
        String text = stringColors(sent.replaceAll("</?[BNOM]>", ""));
        // NOTE(review): the replacement-character / "Â°" literals below look like encoding damage
        // of the micro and degree signs; they are kept byte-for-byte -- confirm against the original file.
        text = text.replaceAll("[_-]+\\s*shaped", "-shaped").replaceAll("(?<=\\s)�\\s+m\\b", "um");
        text = text.replaceAll("°", "�");
        text = text.replaceAll("\\bca\\s*\\.", "ca");
        text = text.replaceAll("(?<=\\d)\\s*(?=("+ChunkedSentence.percentage+")\\b)", " ").replaceAll("\\s+", " "); //80percent =>80 percent
        text = text.replaceAll("(?<=\\d)\\s*(?=("+ChunkedSentence.degree+")\\b)", " ").replaceAll("\\s+", " "); //80degree =>80 degree
        text = text.replaceAll("height width ratio", "h/w");
        text = Utilities.reformAuxiliaryVerbs (text);
        text = text.replaceAll("(?<=\\d\\s)x(?=\\s\\w)", "times"); // 2 x longer => 2 times longer, won't match 2x=24
        return stringCompoundPP(text);
    }

    /**
     * The normalize prefix method is used to expand the prefix of a sentence.
     * Example : {basi}- and <hypobranchial> <ossifications> will be expanded to
     * <basibranchial> and <hypobranchial> <ossifications>
     *
     * Only the first prefix token that is followed by "and"/"or" is processed.
     *
     * @param text marked sentence
     * @return the sentence with the dangling prefix expanded, tokens re-joined by single spaces
     * @throws ClassNotFoundException if the JDBC driver cannot be loaded while recording the new term
     * @throws SQLException if recording the new term fails
     */
    private String normalizePrefix(String text) throws ClassNotFoundException, SQLException {
        String[] splittext = text.split("\\s");
        for(int i=0;i<splittext.length;i++) {
            if(!splittext[i].matches("\\{?("+termprefix+")\\}?-.*")) {
                continue;
            }
            //need both the conjunction (i+1) and the following term (i+2) to exist
            if((i+2)<splittext.length && (splittext[i+1].equals("and")||splittext[i+1].equals("or"))) {
                String[] prefixes = termprefix.split("\\|");
                String next = splittext[i+2];
                for(int z=0;z<prefixes.length;z++) {
                    int at = next.indexOf(prefixes[z]);
                    if(at>=0) {
                        //"{basi}-" -> "basi"; "<hypobranchial>" -> "branchial>"
                        String stem = splittext[i].substring(0, splittext[i].indexOf("-")).replaceAll("[{}]", "");
                        splittext[i] = "<"+stem+next.substring(at+prefixes[z].length());
                        inserttotable(splittext[i]);
                        break;
                    }
                }
                break;
            }
        }
        // combine splittext to form a single text.
        StringBuilder joined = new StringBuilder();
        for(int i=0;i<splittext.length;i++) {
            joined.append(splittext[i]);
            if(i<splittext.length-1) joined.append(' ');
        }
        return joined.toString();
    }

    /**
     * Inserts the normalized prefix term into the term category table as a structure term.
     * A dedicated connection is opened from the "database.url" property (as before) and is
     * now closed when the insert is done.
     *
     * @param term the term to record; surrounding whitespace is trimmed
     * @throws ClassNotFoundException if the MySQL driver class is not available
     * @throws SQLException if the connection or the insert fails
     */
    void inserttotable(String term) throws ClassNotFoundException, SQLException {
        Class.forName("com.mysql.jdbc.Driver");
        Connection local = null;
        PreparedStatement insert = null;
        try{
            local = DriverManager.getConnection(ApplicationUtilities.getProperty("database.url"));
            System.out.println("Insert into "+this.tableprefix+"_term_category(term,category)"+" values(\""+term.trim()+"\",\"structure\")");
            //the table name cannot be a bind parameter; the values are bound to avoid quoting problems
            insert = local.prepareStatement("Insert into "+this.tableprefix+"_term_category(term,category) values(?, ?)");
            insert.setString(1, term.trim());
            insert.setString(2, "structure");
            insert.executeUpdate();
        }finally{
            closeQuietly(insert);
            closeQuietly(local);
        }
    }

    /**
     * turn reddish purple to reddish_c_purple: whitespace between two color terms is
     * replaced by "_c_".
     *
     * @param text sentence text
     * @return text with adjacent color terms joined; unchanged when no color terms are known
     */
    private String stringColors(String text) {
        //with no colors the lookarounds are empty and every space between two words would match
        if(this.colors == null || this.colors.length()==0){
            return text;
        }
        if(this.colorp == null){
            this.colorp = Pattern.compile("\\b(?<="+this.colors+")\\s+(?="+this.colors+")\\b");
        }
        Matcher m = this.colorp.matcher(text);
        while(m.find()){
            text = m.replaceFirst("_c_");
            m = this.colorp.matcher(text);
        }
        return text;
    }

    /**
     * Marks all loaded sentences (or reloads them if they were marked already) and writes
     * each marked sentence to the {@code _markedsentence} table.
     *
     * @return source -> marked sentence
     * @throws Exception if marking a sentence fails; database write errors are only printed
     */
    public Hashtable<String, String> markSentences() throws Exception{
        if(this.marked){
            loadMarked();
            return sentences;
        }
        this.showOutputMessage("System is preparing the sentences...");
        Enumeration<String> en = sentences.keys();
        while(en.hasMoreElements()){
            String source = en.nextElement();
            //limit -1 keeps a trailing empty sentence part instead of dropping it
            String[] splits = sentences.get(source).split("##", -1);
            if(splits.length<3){
                continue;
            }
            String modifier = splits[0];
            String tag = splits[1];
            String sent = splits[2].trim().replaceAll("\\b("+this.ignoredstrings+") ", "");//must use space at the end for "i . e ." to match
            String taggedsent = markASentence(source, modifier, tag.trim(), sent);
            sentences.put(source, taggedsent); //replaces the value of an existing key only
            storeMarkedSentence(source, taggedsent);
        }
        return sentences;
    }

    /**
     * Copies sentid and type of {@code source} from the sentence table and inserts the marked
     * sentence into the marked-sentence table. Errors are printed and otherwise ignored so that
     * one failing row does not stop the run.
     */
    private void storeMarkedSentence(String source, String taggedsent) {
        PreparedStatement select = null;
        PreparedStatement insert = null;
        ResultSet rs = null;
        try{
            select = conn.prepareStatement("select sentid, type from "+this.tableprefix+"_sentence where source=?");
            select.setString(1, source);
            rs = select.executeQuery();
            if(rs.next()){
                int id = rs.getInt("sentid");
                String type = rs.getString("type");
                insert = conn.prepareStatement("insert into "+this.tableprefix+"_markedsentence (sentid, source, markedsent, type) values(?, ?, ?, ?)");
                insert.setInt(1, id);
                insert.setString(2, source);
                insert.setString(3, taggedsent); //bound, so quotes in the sentence no longer break the insert
                insert.setString(4, String.valueOf(type)); //keeps storing the text "null" for a NULL type, as the concatenated SQL did
                insert.executeUpdate();
            }
        }catch(Exception e){
            e.printStackTrace();
        }finally{
            closeQuietly(rs);
            closeQuietly(select);
            closeQuietly(insert);
        }
    }

    /**
     * Loads previously marked sentences from the {@code _markedsentence} table into
     * {@link #sentences}. Errors are printed and ignored.
     */
    protected void loadMarked() {
        Statement stmt = null;
        ResultSet rs = null;
        try{
            stmt = conn.createStatement();
            rs = stmt.executeQuery("select source, markedsent from "+this.tableprefix+"_markedsentence");
            while(rs.next()){
                sentences.put(rs.getString("source"), rs.getString("markedsent")); //do this in addClause
            }
        }catch(Exception e){
            e.printStackTrace();
        }finally{
            closeQuietly(rs);
            closeQuietly(stmt);
        }
    }

    /**
     * Marks organs and states in one sentence and applies the follow-up repairs
     * (untagging of "or"/"to", hyphenated organ-state compounds, adjective-noun modifiers,
     * shared organs in "A and B organ", dangling term prefixes).
     *
     * @param source   sentence source id
     * @param modifier modifier of the sentence (currently not used for marking)
     * @param tag      tag of the sentence, e.g. [phyllary]
     * @param sent     normalized sentence text
     * @return the marked sentence
     * @throws ClassNotFoundException see {@link #normalizePrefix(String)}
     * @throws SQLException see {@link #normalizePrefix(String)}
     */
    public String markASentence(String source, String modifier, String tag, String sent) throws ClassNotFoundException, SQLException {
        String taggedsent = markthis(source, sent, organp, "<", ">");
        taggedsent = markthis(source, taggedsent, statep, "{", "}");
        taggedsent = taggedsent.replaceAll("[<{]or[}>]", "or"); //make sure to/or are left untagged
        taggedsent = taggedsent.replaceAll("[<{]to[}>]", "to");

        //remove "<>" for <{spine}>-{tipped} =>spine-{tipped} or {spine}-{tipped}
        if(taggedsent.indexOf(">-")>=0){
            taggedsent = taggedsent.replaceAll(">-", "#-").replaceAll("<(?=\\S+#)", "").replaceAll("#", "");
        }

        if(this.fixadjnn && this.adjnounslist!=null){
            boolean taggedAdjNoun = adjnounsent.containsKey(tag)
                && taggedsent.matches(".*?[<{]*\\b(?:"+adjnounslist+")[^ly ]*\\b[}>]*.*");
            if(taggedAdjNoun || taggedsent.matches(".*? of [<{]*\\b(?:"+adjnounslist+")[^ly ]*\\b[}>]*.*")){
                taggedsent = fixInner(source, taggedsent, tag.replaceAll("\\W",""));//need to put tag in after the modifier inner
            }
            //including modifiers results in nouns are added to state adjs, so modifiers are not checked here.
        }

        //fix cases such as {dorsal} and <{anal}> <fins> => <dorsal> <fins> and <anal> <fins>: "dorsal and anal fins"
        if(taggedsent.matches(".*?\\}>? and .*")){
            String sentcopy = taggedsent;
            boolean changed = false;
            Matcher m = SHARED_ORGAN_PATTERN.matcher(taggedsent);
            while(m.matches()){
                String lead = m.group(1).trim()+ " ";
                String m1 = m.group(2).trim();
                String m2 = m.group(3).trim();
                String organ = taggedsent.substring(m.end(3), m.start(5)).trim();
                String rest = m.group(5);
                if(Utilities.isPosition(m1, conn, this.glosstable) && m2.matches("(<?\\{\\w+\\}>? *)+")){ //m2 can not have numbers, puncts, or stopword/prep
                    taggedsent = lead +"{"+ m1 +"} " + organ + " and " +m2+" "+organ +" "+ rest;
                    changed = true;
                }else{
                    //the ### marker stops this occurrence from matching again (avoids an infinite loop)
                    taggedsent = lead +"{"+ m1 +"}### and " +m2+" "+organ +" "+ rest;
                }
                m = SHARED_ORGAN_PATTERN.matcher(taggedsent);
            }
            taggedsent = taggedsent.replaceAll("\\}###", "}");
            if(changed){
                System.out.println("before inserting organ: "+sentcopy);
                System.out.println("after inserting organ: "+taggedsent);
            }
        }

        //fix cases such as basi_ and hypobranchial => basibranchial and hypobranchial
        //NOTE(review): matches() anchors at the sentence start, so only a leading prefix triggers this -- confirm intent.
        if(taggedsent.matches("\\{?("+termprefix+")\\}?-.*")){
            taggedsent = normalizePrefix(taggedsent);
        }
        return taggedsent;
    }

    /**
     * mark Inner as organ for sent such as inner red: adjective-noun modifiers that neither
     * follow nor precede an organ are re-marked as states and, where the context calls for it,
     * followed by the (parent) organ tag.
     *
     * @param source     sentence source id, expected to look like "prefix-number"
     * @param taggedsent marked sentence
     * @param tag        tag of the sentence with non-word characters removed
     * @return the rewritten sentence with whitespace collapsed and "&lt;null&gt;" removed
     */
    private String fixInner(String source, String taggedsent, String tag) {
        this.showOutputMessage("System is rewriting some sentences...");
        String fixed = "";
        String copysent = taggedsent;
        boolean needfix = false;
        boolean changed = true;
        Pattern p = Pattern.compile("(.*?)((?:^| )(?:(?:\\{|<\\{)*\\b(?:"+adjnounslist+")[^ly ]*\\b(?:\\}>|\\})*)\\s+)(((?!to\\s+\\D).*).*)");
        Matcher m = p.matcher(taggedsent);
        int matchcount = 0;
        while(m.matches() && changed){
            changed = false;
            matchcount++;
            String before = m.group(1);
            String inner = m.group(2);
            String after = m.group(3); //TODO: may be after should not start with "to" : proximal to heads tocheck: 3/30/11
            if(!before.trim().endsWith(">") && !after.trim().startsWith("<")){//mark inner as organ
                if(before.trim().endsWith("of") && before.lastIndexOf("<")>=0){
                    //"apices of inner" may appear at the main structure is mentioned, in these cases, matchcount>1
                    String organ = before.substring(before.lastIndexOf("<"));
                    if(copysent.startsWith(organ)){
                        tag = getParentTag(source);//tag may be null, removed before return
                    }
                    organ = organ.replaceFirst("\\s*of\\s*$", "").replaceAll("\\W", "");
                    //tag is null when no parent was found; comparing against it used to throw
                    boolean sameAsTag = tag != null
                        && (TermOutputerUtilities.toSingular(organ).compareTo(tag)==0
                            || (organ.matches("(apex|apices)") && tag.compareTo("base")==0));
                    if(sameAsTag){
                        String b = source.substring(0, source.indexOf("-")+1);
                        String nsource = b +(Integer.parseInt(source.substring(source.indexOf("-")+1))-1);
                        tag = getParentTag(nsource);
                    }
                }
                String copyinner = inner.trim();
                inner = copyinner.replaceAll("[<{}>]", "").replaceAll("\\s+", "} {").replaceAll("\\{and\\}", "and").replaceAll("\\{or\\}", "or");
                fixed += before+" "+"{"+inner+"} ";
                if(after.matches("^\\d\\s*/\\s*\\d.*")){//proximal 1 / 2
                    taggedsent = " "+after;
                }else if(inner.endsWith("er") && after.startsWith("than")){
                    taggedsent = " "+after;
                }else if(before.trim().endsWith("of")){
                    taggedsent = "<"+tag+">"+" "+after;
                }else if(matchcount==1 && copysent.startsWith(copyinner)){
                    taggedsent = " "+after;
                }else{
                    int start = fixed.lastIndexOf(">")>=0? fixed.lastIndexOf(">") : 0;
                    String segment = fixed.substring(start).trim();
                    if(segment.indexOf(",")<0 && !segment.startsWith("and")){
                        taggedsent = " "+after;
                    }else{
                        taggedsent = "<"+tag+">"+" "+after;
                    }
                }
                needfix = true;
                changed = true;
            }
            m = p.matcher(taggedsent);
        }
        fixed += taggedsent;
        if(needfix){
            fixedcount++;
        }
        if(fixed.trim().length()<1){
            fixed = taggedsent;
        }
        return fixed.replaceAll("\\s+", " ").replaceAll("<null>", "");
    }

    /**
     * Finds the tag of the nearest sentence before {@code source} (by sentid) whose tag,
     * with non-word characters removed, is not "ditto".
     *
     * @param source sentence source id
     * @return the tag, or null if the source is unknown, no such sentence exists,
     *         or a database error occurs (the error is printed)
     */
    private String getParentTag(String source) {
        PreparedStatement findId = null;
        PreparedStatement findPrevious = null;
        ResultSet rs = null;
        try{
            findId = conn.prepareStatement("select sentid from "+this.tableprefix+"_sentence where source=?");
            findId.setString(1, source);
            rs = findId.executeQuery();
            if(!rs.next()){
                return null;
            }
            int sentid = rs.getInt("sentid");
            closeQuietly(rs);
            rs = null;

            findPrevious = conn.prepareStatement("select sentid, tag from "+this.tableprefix+"_sentence where sentid < ? order by sentid desc limit 1");
            while(true){
                findPrevious.setInt(1, sentid);
                rs = findPrevious.executeQuery();
                if(!rs.next()){
                    return null; //ran out of sentences: stop instead of looping forever
                }
                sentid = rs.getInt("sentid"); //continue the search below the row just seen
                String rawtag = rs.getString("tag");
                closeQuietly(rs);
                rs = null;
                if(rawtag == null){
                    return null;
                }
                String tag = rawtag.replaceAll("\\W", "");
                if(tag.compareTo("ditto")!=0){
                    return tag;
                }
            }
        }catch(Exception e){
            e.printStackTrace();
            return null;
        }finally{
            closeQuietly(rs);
            closeQuietly(findId);
            closeQuietly(findPrevious);
        }
    }

    /**
     * retag {caline} 10 to <caline> 10 when an adjnoun does not follow an organ or proceeds an organ.
     *
     * @param adjnoun    adjective-noun phrase; whitespace inside it matches any run of non-word characters
     * @param taggedsent marked sentence
     * @return the sentence with the first occurrence of the phrase re-marked as organ(s)
     */
    private String fixAdjNouns(String adjnoun, String taggedsent) {
        adjnoun = adjnoun.replaceAll("\\s+", "\\\\W+");
        taggedsent = Pattern.compile("[<{]*\\b"+adjnoun+"\\b[}>]*", Pattern.CASE_INSENSITIVE).matcher(taggedsent).replaceFirst("<"+adjnoun+">").replaceAll("W\\+", "> <").replaceAll("<and>", "and").replaceAll("<or>", "or");
        return taggedsent;
    }

    /**
     * Wraps every match of group 2 of {@code tagsp} in {@code leftmark}/{@code rightmark},
     * merges hyphenated states ({oblong}-{ovate} becomes {oblong-ovate}) and puts single
     * spaces around , . ; : [ ].
     *
     * @param source    sentence source id (not used)
     * @param sent      sentence text
     * @param tagsp     pattern of the form (before)(term)(after), applied repeatedly to the remainder
     * @param leftmark  opening mark, e.g. "<" or "{"
     * @param rightmark closing mark, e.g. ">" or "}"
     * @return the marked sentence
     */
    public static String markthis(String source, String sent, Pattern tagsp, String leftmark, String rightmark) {
        sent = sent.replaceAll("(?<=\\w)\\s+(?=[,\\.;:])", "");
        //keep _ so phrases are treated as one word
        StringBuilder marked = new StringBuilder();
        Matcher m = tagsp.matcher(sent);
        while(m.matches()){
            if(m.group(1).length()==0 && m.group(2).length()==0){
                break; //an empty alternative in the term list consumes nothing: stop rather than loop forever
            }
            marked.append(m.group(1)).append(leftmark).append(m.group(2)).append(rightmark);
            sent = m.group(3);
            m = tagsp.matcher(sent);
        }
        marked.append(sent);

        String taggedsent = marked.toString();
        StringBuilder out = new StringBuilder();
        m = HYPHENATED_STATE_PATTERN.matcher(taggedsent);
        while(m.matches()){
            out.append(m.group(1)).append("{").append(m.group(2)).append("}");
            taggedsent = m.group(3);
            m = HYPHENATED_STATE_PATTERN.matcher(taggedsent);
        }
        out.append(taggedsent);

        String tsent = out.toString().replaceAll("\\}-\\{", "-"); // => {oblong}-{ovate} : {oblong-ovate}
        tsent = tsent.replaceAll("\\s*,\\s*", " , ");
        tsent = tsent.replaceAll("\\s*\\.\\s*", " . ");
        tsent = tsent.replaceAll("\\s*;\\s*", " ; ");
        tsent = tsent.replaceAll("\\s*:\\s*", " : ");
        tsent = tsent.replaceAll("\\s*\\]\\s*", " ] ");
        tsent = tsent.replaceAll("\\s*\\[\\s*", " [ ");
        tsent = tsent.replaceAll("\\s+", " ").trim();
        return tsent;
    }

    /**
     * Builds the |-separated alternation of state terms from the word roles table
     * (semantic role 'c') and the non-structure glossary categories.
     *
     * @return the alternation without leading, trailing or doubled "|"; empty on total failure
     */
    protected String collectStateNames(){
        String statestring = "";
        Statement stmt = null;
        ResultSet rs = null;
        try{
            stmt = conn.createStatement();
            rs = stmt.executeQuery("select word from "+this.tableprefix+"_wordroles where semanticrole ='c' ");
            while(rs.next()){
                String w = rs.getString("word");
                if(w != null && !w.matches("\\W+") && !w.matches("("+ChunkedSentence.stop+")") && !w.matches("("+ChunkedSentence.prepositions+")")){
                    statestring += "|"+ w;
                }
            }
            closeQuietly(rs);
            /*wordroles only holds word not in glossary, so need to use glossary to mark a sentence as well.*/
            rs = stmt.executeQuery("select distinct term from "+this.glosstable+" where category not in ('STRUCTURE', 'SUBSTANCE', 'PLANT', 'nominative', 'life_style')");
            while(rs.next()){
                String term = rs.getString("term");
                if(term == null){continue;} //check before trim(): a NULL term used to abort the collection
                term = term.trim();
                term = term.indexOf(" ")> 0? term.substring(term.lastIndexOf(' ')+1) : term;
                if(!statestring.matches(".*\\b"+term+"\\b.*") && !term.matches("("+ChunkedSentence.stop+")") && !term.matches("("+ChunkedSentence.prepositions+")")){
                    statestring += "|"+ term;
                }
            }
        }catch (Exception e){
            e.printStackTrace();
        }finally{
            closeQuietly(rs);
            closeQuietly(stmt);
        }
        //collapse doubled pipes BEFORE stripping the ends, otherwise "||x" keeps a leading "|"
        //and the resulting empty alternative matches everywhere
        return statestring.replaceAll("\\b(and|or|to)\\b", "").replaceAll("\\\\d\\+", "").trim()
            .replaceAll("\\|+", "|").replaceFirst("^\\|", "").replaceFirst("\\|$", "");
    }

    /**
     * Builds the |-separated alternation of organ names from the glossary, the adjective-noun
     * modifiers of the sentences, and the plural/singular organ words of the word roles table.
     *
     * @return the alternation without leading, trailing or doubled "|"; whatever was collected
     *         before a database error otherwise
     */
    protected String collectOrganNames(){
        StringBuffer tags = new StringBuffer();
        Statement stmt = null;
        try{
            stmt = conn.createStatement();
            organNameFromGloss(tags, stmt);
            organNameFromSentences(tags, stmt);
            organNameFromPlNouns(tags, stmt);
        }catch(Exception e){
            e.printStackTrace();
        }finally{
            closeQuietly(stmt);
        }
        //strip the separator at both ends after collapsing, also when collection stopped half-way
        return tags.toString().replaceAll("\\b\\d+\\b", "").replaceAll("\\|+", "|")
            .replaceFirst("^\\|", "").replaceFirst("\\|$", "");
    }

    /**
     * Appends the organ words (semantic roles 'op' and 'os') of the word roles table.
     *
     * @param tags buffer receiving "word|" entries
     * @param stmt statement to run the query on
     * @throws SQLException if the query fails
     */
    protected void organNameFromPlNouns(StringBuffer tags, Statement stmt) throws SQLException {
        String wordroletable = this.tableprefix + "_"+ApplicationUtilities.getProperty("WORDROLESTABLE");
        ResultSet rs = stmt.executeQuery("select word from "+wordroletable+" where semanticrole in ('op', 'os')");
        try{
            while(rs.next()){
                String w = rs.getString("word");
                if(w == null){continue;}
                w = w.trim();
                if(!w.matches("("+ChunkedSentence.stop+")") && !w.matches("("+ChunkedSentence.prepositions+")")){
                    w = Utilities.cleanup(w);
                    tags.append(w+"|");
                }
            }
        }finally{
            closeQuietly(rs);
        }
    }

    /**
     * collect adj-noun structures such as "inner" as structure name
     *
     * @param tags buffer receiving "name|" entries
     * @param stmt statement to run the query on
     * @throws SQLException if the query fails
     */
    protected void organNameFromSentences(StringBuffer tags, Statement stmt) throws SQLException {
        /*tag terms are already in WORDROLES, so only the modifiers of bracketed tags are collected here*/
        ResultSet rs = stmt.executeQuery("select modifier, tag from "+this.tableprefix+"_sentence where tag like '[%]'"); //inner [tepal]
        try{
            while(rs.next()){
                String m = rs.getString("modifier");
                if(m == null){continue;}
                //the former expression "\\[^\\[*\\]" could never match; strip bracketed parts the same
                //way the constructor does
                m = m.replaceAll("\\[.*?\\]", "").trim();
                if(m.compareTo("")!= 0){
                    String tag = m.lastIndexOf(" ")<0 ? m : m.substring(m.lastIndexOf(" ")+1); //last word from modifier
                    if(tag.indexOf("[")>=0 || tags.indexOf("|"+tag+"|") >= 0
                            || tag.matches(".*?(\\d|"+ChunkedSentence.stop+"|"+ChunkedSentence.prepositions+").*")){
                        continue;
                    }
                    tag = Utilities.cleanup(tag);
                    tags.append(tag+"|");
                }
            }
        }finally{
            closeQuietly(rs);
        }
    }

    /**
     * Appends the last word of every structure-like glossary term.
     *
     * @param tags buffer receiving "term|" entries
     * @param stmt statement to run the query on
     * @throws SQLException if the query fails
     */
    protected void organNameFromGloss(StringBuffer tags, Statement stmt) throws SQLException {
        ResultSet rs = stmt.executeQuery("select distinct term from "+this.glosstable+" where category in ('STRUCTURE', 'SUBSTANCE', 'PLANT', 'nominative', 'structure')");
        try{
            while(rs.next()){
                String term = rs.getString("term");
                if(term == null){continue;} //check before trim()
                term = term.trim();
                term = term.indexOf(" ")> 0? term.substring(term.lastIndexOf(' ')+1) : term;
                if(!term.matches("("+ChunkedSentence.stop+")") && !term.matches("("+ChunkedSentence.prepositions+")")){
                    term = Utilities.cleanup(term);
                    tags.append(term+"|");
                }
            }
        }finally{
            closeQuietly(rs);
        }
    }

    /**
     * Builds the |-separated alternation of color terms (last word of each glossary term in
     * the 'coloration' and 'color' categories).
     *
     * @return the alternation, empty if the glossary has no color terms
     * @throws SQLException if the query fails
     */
    protected String colorsFromGloss() throws SQLException {
        StringBuffer colors = new StringBuffer();
        Statement stmt = null;
        ResultSet rs = null;
        try{
            stmt = conn.createStatement();
            rs = stmt.executeQuery("select distinct term from "+this.glosstable+" where category in ('coloration', 'color')");
            while(rs.next()){
                String term = rs.getString("term");
                if(term == null){continue;} //check before trim()
                term = term.trim();
                term = term.indexOf(" ")> 0? term.substring(term.lastIndexOf(' ')+1) : term;
                term = Utilities.cleanup(term);
                if(term == null || term.length()==0){continue;} //an empty alternative would match between any two words
                colors.append(term+"|");
            }
        }finally{
            closeQuietly(rs);
            closeQuietly(stmt);
        }
        return colors.toString().replaceFirst("\\|$", "");
    }

    /** Clears the log widget on the SWT thread; does nothing without a display. */
    private void resetOutputMessage() {
        if(display==null) return;
        display.syncExec(new Runnable() {
            public void run() {
                if(charLog!=null) charLog.setText("");
            }
        });
    }

    /** Appends a line to the log widget on the SWT thread; does nothing without a display. */
    private void showOutputMessage(final String message) {
        if(display==null) return;
        display.syncExec(new Runnable() {
            public void run() {
                if(charLog!=null) charLog.append(message+"\n");
            }
        });
    }

    /*
     * Handles the compound prepositions: the words of every compound preposition found in
     * the text are joined by "-" so that it is treated as one token.
     */
    private String stringCompoundPP(String text) {
        boolean did = false;
        StringBuilder result = new StringBuilder();
        Matcher m = compreppattern.matcher(text);
        while(m.matches()){
            result.append(m.group(1)).append(m.group(2).replaceAll("\\s+", "-"));
            text = m.group(3);
            m = compreppattern.matcher(text);
            did = true;
        }
        result.append(text);
        if(did && printCompoundPP) System.out.println("[result]:"+result);
        return result.toString();
    }

    /** Closes a result set, ignoring errors: closing is best-effort cleanup. */
    private static void closeQuietly(ResultSet rs) {
        if(rs != null){
            try{ rs.close(); }catch(SQLException ignored){ /* nothing useful to do */ }
        }
    }

    /** Closes a statement, ignoring errors: closing is best-effort cleanup. */
    private static void closeQuietly(Statement stmt) {
        if(stmt != null){
            try{ stmt.close(); }catch(SQLException ignored){ /* nothing useful to do */ }
        }
    }

    /** Closes a connection, ignoring errors: closing is best-effort cleanup. */
    private static void closeQuietly(Connection c) {
        if(c != null){
            try{ c.close(); }catch(SQLException ignored){ /* nothing useful to do */ }
        }
    }

    /**
     * Runs the marker against the database named by the "database.url" property, using the
     * "table.prefix" property and the "orig_fishglossaryfixed" glossary.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        Connection conn = null;
        try{
            Class.forName("com.mysql.jdbc.Driver");
            conn = DriverManager.getConnection(ApplicationUtilities.getProperty("database.url"));
            SentenceOrganStateMarker sosm = new SentenceOrganStateMarker(conn, ApplicationUtilities.getProperty("table.prefix"), "orig_fishglossaryfixed", true, null, null);
            sosm.markSentences();
        }catch(Exception e){
            e.printStackTrace();
        }finally{
            closeQuietly(conn);
        }
    }
}