/* $Id: POSTagger4StanfordParser.java 988 2011-09-23 16:44:53Z hong1.cui $ */
/**
*
*/
package fna.charactermarkup;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import outputter.knowledge.TermOutputerUtilities;
import fna.parsing.ApplicationUtilities;
import fna.parsing.state.SentenceOrganStateMarker;
import conceptmapping.*;
/**
* @author hongcui
*
*/
@SuppressWarnings({ "unused","static-access" })
public class POSTagger4StanfordParser {
static protected Connection conn = null;
static protected String username = "root";
static protected String password = "root";
private ArrayList<String> chunkedtokens = null;
private ArrayList<String> charactertokensReversed = null;
public static Hashtable<String, String> characterhash = new Hashtable<String, String>();
private boolean printList = false;
private String tableprefix = null;
private String glosstable = null;
public static String comprepstring = SentenceOrganStateMarker.compoundprep.replaceAll(" ", "-");
private static Pattern compreppattern = Pattern.compile("\\{?("+comprepstring+")\\}?");
//private Pattern viewptn = Pattern.compile( "(.*?\\b)(in\\s+[a-z_<>{} -]+\\s+[<{]*view[}>]*)(\\s.*)"); to match in dorsal view
//private Pattern viewptn = Pattern.compile( "(.*?\\b)(in\\s+[a-z_<>{} -]*\\s*[<{]*(?:view|profile)[}>]*)(\\s.*)"); //to match in dorsal view and in profile
private Pattern viewptn = Pattern.compile( "(.*?\\b)(in\\s+[a-z_<>{} -]*\\s*[<{]*(?:view|profile)[}>]*)(.*)"); //to match in dorsal view and in profile
private String countp = "more|fewer|less|many|\\d+";
private Pattern countptn = Pattern.compile("((?:^| |\\{)(?:"+countp+")\\}? (?:or|to) \\{?(?:"+countp+")(?:\\}| |$))");
private String romandigits = "i|v|x";
private Pattern positionptn = Pattern.compile("(<(\\S+?)> [<{]*(?:\\d|"+romandigits+")+\\b[}>]*(?![-\\d]*%)(?:\\s*(and|-)\\s*[<{]*(?:\\d|"+romandigits+")+\\b[}>]*(?!%))?)");
private ArrayList<String> prepphrases = new ArrayList<String>();
private String positions = ""; //initialized with two values that are not positions for convenience
private Pattern positionptn2;
private String characterptn;
Pattern pof1 = Pattern.compile("(.*?)\\{?("+this.characterptn+")\\}? of (.*?<\\w+>.*)");
Pattern p1 = Pattern.compile("(.*<\\w+> )\\{?("+this.characterptn+")\\}?");
Pattern pof2 = Pattern.compile("\\{?("+this.characterptn+")\\}? of ((?:<?\\{?("+this.positions+")\\}?>? |<\\w+> |of )+)(.*)");
Pattern p2 = Pattern.compile("((?:<?\\{?("+this.positions+")\\}?>? |<\\w+> |of )+)\\{?("+this.characterptn+")\\}?(.*)");
Pattern pof3=Pattern.compile("((?:<?\\{?("+this.positions+")\\}?>? |<\\w+> |of )+)");
Pattern p3=Pattern.compile("\\{?("+this.characterptn+")\\}?");
String structs = "((?:<?\\{?("+this.positions+")\\}?>? |<\\w+> |of )+)";
private boolean printRelative=true;
private boolean printfromto = true;
/**
*
*/
public POSTagger4StanfordParser(Connection conn, String tableprefix, String glosstable, String characterptn) {
this.conn = conn;
this.tableprefix = tableprefix;
this.glosstable = glosstable;
this.characterptn = characterptn;
try{
Statement stmt = this.conn.createStatement();
ResultSet rs = stmt.executeQuery("select distinct phrase from "+tableprefix+"_prepphrases");
while(rs.next()){
prepphrases.add(rs.getString("phrase"));
}
rs= stmt.executeQuery("select distinct term from "+tableprefix+"_term_category where category='position' union select distinct term from "+this.glosstable+" where category='position'");
while(rs.next()){
positions += rs.getString(1).replaceAll("\\(.*?\\)", "")+"|";
}
positions = positions.replaceFirst("\\|$", "");
positionptn2 = Pattern.compile("(.*?)([<{]?\\b(?:"+this.positions+")\\b[}>]?\\s+to)(\\b.*)");
}catch(Exception e){
e.printStackTrace();
}
}
/**
* //insert our POS tags to segments (simple or complex: new segmentation)
//output POSed segments to a database table and to the posed file
* str is markedsent
*/
protected String POSTag(String str, String src, String type) throws Exception{
boolean containsArea = false;
String strcp = str;
str = StanfordParser.normalizeSpacesRoundNumbers(str);
/*str = str.replaceAll("\\b(?<=\\d+) \\. (?=\\d+)\\b", "."); //2 . 5 =>2.5
str = str.replaceAll("(?<=\\d)\\s+/\\s+(?=\\d)", "/"); // 1 / 2 => 1/2
str = str.replaceAll("(?<=\\d)\\s+[�-�]\\s+(?=\\d)", "-"); // 1 - 2 => 1-2*/
/*if(str.indexOf(" -{")>=0){//1�2-{pinnately} or -{palmately} {lobed} => {1�2-pinnately-or-palmately} {lobed}
str = str.replaceAll("\\s+or\\s+-\\{", "-or-").replaceAll("\\s+to\\s+-\\{", "-to-").replaceAll("\\s+-\\{", "-{");
}*/
//if(str.matches(".*?-(or|to)\\b.*") || str.matches(".*?\\b(or|to)-.*") ){//1�2-{pinnately} or-{palmately} {lobed} => {1�2-pinnately-or-palmately} {lobed}
//to avoid turning 'relative-to its {length}' to 'relative-to-its {length}'
if(str.matches(".*?-(or|to)\\s+[^a-z<].*") || str.matches(".*?\\b(or|to)-.*") ){//1�2-{pinnately} or-{palmately} {lobed} => {1�2-pinnately-or-palmately} {lobed}
str = str.replaceAll("\\}?-or\\s+\\{?", "-or-").replaceAll("\\}?\\s+or-\\{?", "-or-").replaceAll("\\}?-to\\s+\\{?", "-to-").replaceAll("\\}?\\s+to-\\{?", "-to-").replaceAll("-or\\} \\{", "-or-").replaceAll("-to\\} \\{", "-to-");
}
//{often} 2-, 3-, or 5-{ribbed} ; =>{often} {2-,3-,or5-ribbed} ; 635.txt-16
Pattern pp = Pattern.compile("(.*?)((\\d-,\\s*)+ (to|or) \\d-\\{)(.*)");
Matcher m = pp.matcher(str);
while(m.matches()){
str = m.group(1)+"{"+m.group(2).replaceAll("[, ]","").replaceAll("\\{$", "")+m.group(5);
m = pp.matcher(str);
}
String scp = str;
str = str.replaceAll("(?<![\\d(\\[��-]\\s?)[��-]+\\s*(?="+NumericalHandler.numberpattern+"\\s+\\W?("+ChunkedSentence.units+")\\W?)", " to "); //fna: tips>-2.5 {mm}
if(!scp.equals(str)){
System.out.println();
}
//make a position to [anterior to] a single token
m = this.positionptn2.matcher(str);
while(m.matches()){
str = m.group(1)+m.group(2).replaceAll("[<{}>]", "").replaceAll("\\s+", "-")+"-PPP"+m.group(3);
m = this.positionptn2.matcher(str);
}
str = str.replaceAll("(?<=> [\\d"+this.romandigits+"]{1,3})-(?=<)", " - "); //<metacarpal> 2-<metacarpal> 1 {length} {ratio}
this.chunkedtokens = new ArrayList<String>(Arrays.asList(str.split("\\s+")));
str = normalizePositionList(str);
str = normalizeCountList(str);
lookupCharacters(str);//populate charactertokens
if(this.charactertokensReversed.contains("color") || this.charactertokensReversed.contains("coloration")){
str = normalizeColorPatterns();
lookupCharacters(str);
}
if(str.indexOf(" to ")>=0 ||str.indexOf(" or ")>=0){
if(this.printList){
System.out.println(str);
}
str = normalizeCharacterLists(); //a set of states of the same character connected by ,/to/or => {color-blue-to-red}
}
if(str.matches(".*? as\\s+[\\w{}<>]+\\s+as .*")){
str = normalizeAsAs(str);
}
if(str.matches(".*\\bsame\\b.*?\\bas\\b.*")){
str = normalizeSameAs(str);
}
//if(str.matches(".*?\\bin\\b.*?\\bassociation\\W+(with|to)\\b.*")){
// str = normalizeAssociationWith(str);
//}
if(str.matches(".*?(?<=[a-z])/(?=[a-z]).*")){
str = str.replaceAll("(?<=[a-z])/(?=[a-z])", "-");
}
//10-20(-38) {cm}�6-10 {mm}
try{
Statement stmt = conn.createStatement();
Statement stmt1 = conn.createStatement();
String strcp2 = str;
String strnum = null;
/*
//if(str.indexOf("}�")>0){//{cm}�
if(str.indexOf("�")>0){
containsArea = true;
String[] area = normalizeArea(str);
str = area[0]; //with complete info
strnum = area[1]; //contain only numbers
}
*/
//deal with (3) as bullet
Pattern pattern1 = Pattern.compile("^(and )?([(\\[]\\s*\\d+\\s*[)\\]]|\\d+.)\\s+(.*)"); //( 1 ), [ 2 ], 12.
m = pattern1.matcher(str.trim());
if(m.matches()){
str = m.group(3);
}
if(str.indexOf("�")>=0){
str = str.replaceAll("�(?!~[a-z])","{moreorless}").replaceAll("�(?!\\s+\\d)","moreorless");
}
/*to match {more} or {less}*/
if(str.matches(".*?\\b[{<]*more[}>]*\\s+or\\s+[{<]*less[}>]*\\b?.*")){
str = str.replaceAll("[{<]*more[}>]*\\s+or\\s+[{<]*less[}>]*", "{moreorless}");
}
//if(str.matches(".*?\\bin\\s+[a-z_<>{} -]+\\s+[<{]?view[}>]?\\b.*")){//ants: "in full-face view"
if(str.matches(".*?\\bin\\s+[a-z_<>{} -]*\\s*[<{]?(view|profile)[}>]?\\b.*")){
m = viewptn.matcher(str);
while(m.matches()){
str = m.group(1)+" {"+m.group(2).replaceAll("[<>{}]", "").replaceAll("\\s+", "-")+"} "+m.group(3);
m = viewptn.matcher(str);
}
}
//make a prepphrase (e.g. in relation to) a single token
Iterator<String> it = prepphrases.iterator();
while(it.hasNext()){
String phrase = "\\{?\\<?"+it.next().trim().replaceAll(" ", "\\\\>?\\\\}? \\\\{?\\\\<?")+"\\>?\\}?";
Pattern p = Pattern.compile("(.*?)(\\b"+phrase+"\\b)(.*)");
m = p.matcher(str);
while(m.matches()){
str = m.group(1)+m.group(2).replaceAll("[<{}>]", "").replaceAll("\\s+", "-")+"-PPP"+m.group(3);
m = p.matcher(str);
}
}
//make a position to [anterior to] a single token
//m = this.positionptn2.matcher(str);
//while(m.matches()){
// str = m.group(1)+m.group(2).replaceAll("[<{}>]", "").replaceAll("\\s+", "-")+"-PPP"+m.group(3);
// m = this.positionptn2.matcher(str);
//}
if(str.indexOf("�")>0){
containsArea = true;
String[] area = normalizeArea(str);
str = area[0]; //with complete info
strnum = area[1]; //like str but with numerical expression normalized
}
str = handleBrackets(str);
str = stringCharacterComparison(str);
str = normalizefromto(str);
if(type.compareTo("character")==0){//{postorbital} , {form} of {dorsal} <surface>
String temp = str;
str = str.replaceFirst("(?<=^|,\\s)\\{?\\w+\\}? of ", "").trim(); //shape of
String ch = temp.replace(str, "").replace("\\s+of\\s+", "").replaceAll("[{}]", "").trim();
StanfordParser.characterRstates.put(ch, "1"); //to keep only the unique characters
}
stmt.execute("update "+this.tableprefix+"_markedsentence set rmarkedsent ='"+str.replaceAll("-PPP", "")+"' where source='"+src+"'");
if(containsArea){
str = strnum;
str = handleBrackets(str);
}
str = Utilities.threeingSentence(str);
if(strcp.compareTo(str)!=0){
System.out.println("orig sent==>"+ strcp);
System.out.println("rmarked==>"+ strcp2);
System.out.println("threed-sent==>"+ str);
}
//str = str.replaceAll("}>", "/NN").replaceAll(">}", "/NN").replaceAll(">", "/NN").replaceAll("}", "/JJ").replaceAll("[<{]", "");
StringBuffer sb = new StringBuffer();
/*Pattern pattern7 = Pattern.compile("(.*?)([<{]*)([0-9a-zA-Z-]+)[}>]*(.*)");
Matcher m = pattern7.matcher(str);
while ( m.matches()){
sb.append(m.group(1));
String pos = m.group(2);
String word = m.group(3);
str = m.group(4);*/
//m = pattern7.matcher(str);
//continue;
String[] tokens = str.split("\\s+");
for(int i = 0; i<tokens.length; i++){
String word = tokens[i];
String pos = "";
if(word.endsWith("}")){
pos = "{";
}else if(word.endsWith(">")){
pos = "<";
}
word = word.replaceAll("[<>{}]", "").trim();
String p = "";
if(word.length()>0 && !word.matches("\\W") && !word.matches("("+ChunkedSentence.prepositions+")") &&!word.matches("("+ChunkedSentence.stop+")")){
ResultSet rs1 = stmt1.executeQuery("select semanticrole from "+this.tableprefix+"_"+ApplicationUtilities.getProperty("WORDROLESTABLE")+" where word='"+word+"'");
if(rs1.next()){
p = rs1.getString("semanticrole");
}
}
Matcher mc = compreppattern.matcher(word);
if(mc.matches()){
sb.append(word+"/IN ");
}else if(word.contains("relative~")){
sb.append(word+"/JJ ");
}else if(word.matches("in-.*?(-view|profile)")){
sb.append(word+"/RB ");
}else if(word.matches("from~.*?~to~.*")){
sb.append(word+"/RB ");
}else if(word.endsWith("-PPP")){//prepphrase in_association_with
sb.append(word.replaceFirst("-PPP", "")+"/IN ");
}else if(word.endsWith("ly") && word.indexOf("~") <0){ //character list is not RB
sb.append(word+"/RB ");
}else if(word.compareTo("becoming")==0 || word.compareTo("about")==0){
sb.append(word+"/RB ");
}else if(word.compareTo("throughout")==0 && i+1 < tokens.length && tokens[i+1].matches("(\\.|;|,|or)")){
sb.append(word+"/RB ");
}else if(word.compareTo("throughout")==0 && i+1 >= tokens.length){
sb.append(word+"/RB ");
}else if(word.compareTo("at-least")==0){
sb.append(word+"/RB ");
}else if(word.compareTo("one_another")==0){
sb.append(word+"/NN ");
}else if(word.compareTo("plus")==0){
sb.append(word+"/CC ");
}else if(word.matches("\\d+[cmd]?m\\d+[cmd]?m")){ //area turned into 32cm35mm
//sb.append(word+"/CC ");
sb.append(word+"/CD ");
}else if(word.matches("("+ChunkedSentence.units+")")){
sb.append(word+"/NN ");
}else if(word.matches("as-\\S+")){ //as-wide-as
sb.append(word+"/IN "); //changed from RB to IN 2/22/02 by Hong
}else if(word.matches("same-\\S+")){ //same-as
sb.append(word+"/IN "); //added 2/22/02 by Hong
}else if(word.matches("in-\\S+")){ //in-association-with/to
sb.append(word+"/IN "); //added 2/22/02 by Hong
}else if(p.contains("op")){ //<inner> larger.
//System.out.println(rs1.getString(2));
sb.append(word+"/NN ");
}else if(p.contains("os") || pos.indexOf('<') >=0){
sb.append(word+"/NN ");
}else if(p.contains("c")|| pos.indexOf('{') >=0){
//ResultSet rs3 = stmt1.executeQuery("select word from wordpos4parser where word='"+word+"' and certaintyl>5");
ResultSet rs2 = stmt1.executeQuery("select word from brown_wordfreq where word='"+word+"' and freq>79");//1/largest freq in wordpos = 79/largest in brown
if(rs2.next()){
sb.append(word+" ");
//}else if(word.indexOf("3-")>=0){
// sb.append(word+"/CD");
}else{
sb.append(word+"/JJ ");
}
}else{
sb.append(word+" ");
}
//m = pattern7.matcher(str);
}
//sb.append(str);
str = sb.toString().trim();
str = str.replaceAll("(?<=[a-z])\\s+[_�-]\\s+(?=[a-z])", "-").replaceAll("/[A-Z]+\\s*[-�]\\s*", "-").replaceAll("\\d-\\s+(?=[a-z])", "3-"); //non -septate/JJ or linear/JJ _ovoid/JJ
str = str.replaceAll("[\\[\\(]", " -LRB-/-LRB- ").replaceAll("[\\)\\]]", " -RRB-/-RRB- ").replaceAll("\\s+", " ").trim();
str = str.replaceAll("moreorless/JJ","moreorless/RB");
return str;
}catch(Exception e){
e.printStackTrace();
throw e;
}
//return "";
}
private String normalizefromto(String str) {
String cp = str;
boolean changed = false;
if(str.matches(".*\\bfrom .*? to\\b.*")){
Pattern struct = Pattern.compile("(.*?)(\\bfrom (?:<?\\{?(?:"+this.positions+")\\}?>? |<\\w+> |of |the )+to (?:<?\\{?(?:"+this.positions+")\\}?>? |<\\w+> |of |the )+)(.*)");
Matcher m = struct.matcher(str+" "); //need the trailing space
while(m.matches()){
str = m.group(1)+m.group(2).trim().replaceAll(" ", "~")+" "+m.group(3);
m = struct.matcher(str);
changed = true;
}
if(this.printfromto && changed){
System.out.println("normalized from-to from:"+cp);
System.out.println("normalized from-to to: "+str);
}
}
return str;
}
/**
* {width} of <ethmoid> relative-to its {length} from <snout> <tip> to the {posterior} <{margin}> of the <parietals>
* @param str [char of A|A char] [relative-to|<=|>=|=|x times] [char of B|B char]
* @return {relative~{A~char}~{relation}~{B~char}}, assign JJ as its post
*/
private String stringCharacterComparison(String str) {
//width of A relative-to length of B
//width of A relative to B
//A width relative to length of B
//A width relative to B
//width of A relative to length
String cp = str;
Pattern relations = Pattern.compile("(.*?)\\b(relative-to|[\\w-]+equal-to|[\\w]+er\\}? than|times)\\b(.*)");
Matcher m = relations.matcher(str);
if(m.matches() && str.indexOf("<")>=0 && str.matches(".*?\\b("+this.characterptn+")\\b.*")){ //mostly like a comparison of characters
if(m.group(1).trim().length()>0 && m.group(3).trim().length()>0){
String[] part1 = pullCharacterInfo(m.group(1).trim(), "part1");
String[] part2 = pullCharacterInfo(m.group(3).trim(), "part2");
String relation = m.group(2);
if(part1[0]!=null && part2[0]!=null && part1[0].length()>0 && part2[0].length()>0){
str = part1[1]+" {relative~{"+part1[0]+"}~{"+relation+"}~{"+part2[0]+"}} " +part2[1];
if(this.printRelative){
System.out.println(cp);
System.out.println("after relative reformation:" + str);
}
return str;
}
}
}
return str;
}
/**
* part1:
* input string: {width} of <ethmoid>
* output string[0]: ethmoid~width string[1]: ""
* part2:
* input string: its {length} from <snout> <tip> to the {posterior} <{margin}> of the <parietals>
* output string[0]: length; [1] from <snout> <tip> to the {posterior} <{margin}> of the <parietals>
* @param str a string containing an organ or a character or both
* @return two text segments: the first is the organ~character pair (if not found, the first element = ""), the text that to the left or right of the organ~character pair makes the second segment
*/
private String[] pullCharacterInfo(String str, String part){
String[] result = new String[2];
if(part.compareTo("part1")==0){
//find the last structure and character
Matcher m = pof1.matcher(str);
if(m.matches()){
result[1] = m.group(1).trim();
result[0] = m.group(3).replaceAll("[{<>}]", "").trim().replaceAll(" ", "-")+"~"+m.group(2);
return result;
}
m = p1.matcher(str);
if(m.matches()){
String temp = m.group(1);//ends with a space
String ch = m.group(2);
result[1] = temp.replaceFirst("(<?\\{?(this.positions)\\}?>? |<\\w+> |of )+$", "").trim();
result[0] = temp.replace(result[1], "").trim().replaceAll("[{<>}]", "").replaceAll(" ", "-")+"~"+ch;
return result;
}
}else{
//find the first structure/character (may just have one of the two)
//contain both elements
Matcher m = pof2.matcher(str);
if(m.matches()){
result[1] = m.group(4).trim();
result[0] = m.group(2).replaceAll("[{<>}]", "").trim().replaceAll(" ", "-")+"~"+m.group(1);
return result;
}
m = p2.matcher(str);
if(m.matches()){
result[1] = m.group(4).trim();
result[0] = m.group(1).replaceAll("[{<>}]", "").trim().replaceAll(" ", "-")+"~"+m.group(3);
return result;
}
str = str+" "; //need the trailing space
//contain one of the two
m = pof3.matcher(str);
int starto=1000, endo=1000, startc=1000, endc=1000, start=0, end=0;
if(m.find()){
starto = m.start();
endo = m.end();
}
m = p3.matcher(str);
if(m.find()){
startc = m.start();
endc = m.end();
}
start = starto<startc? starto:startc;
if(start == starto) end = endo;
else end = endc;
result[0] = str.substring(start, end).replaceAll("[<{}>]", "").trim();
result[1] = str.substring(end).trim();
return result;
}
return result;
}
/**
* @param str: {upper} {pharyngeal} <tooth> <plates> 4 and 5
* @return: {upper} {pharyngeal} <tooth> <plates_4_and_5>
*/
private String normalizePositionList(String str) {
Matcher m = positionptn.matcher(str);
while(m.find()){
int start = m.start(1);
int end = m.end(1);
String position = m.group(1);
String organ = m.group(2);
if(!isPosition(organ, position)) continue;
String rposition = "<"+position.replaceAll("[<>]", "").replaceAll("\\s+", "_")+">";
//synchronize this.chunkedtokens
//split by single space to get an accurate count to elements that would be in chunkedtokens
int index = (str.substring(0, start).trim()+" a").trim().split("\\s").length-1; //number of tokens before the count pattern
this.chunkedtokens.set(index, rposition);
int num = position.split("\\s+").length;
for(int i = index+1; i < index+num; i++){
this.chunkedtokens.set(i, "");
}
//resemble the str from chunkedtokens, counting all empty elements, so the str and chunkedtokens are in synch.
str = "";
for(String t: this.chunkedtokens){
str +=t+" ";
}
m = positionptn.matcher(str);
}
return str.replaceAll("\\s+", " ").trim();
}
/**
* tooth 5 means the fifth tooth, 5 is position (true)
* teeth 5 means 5 teeth, 5 is count(false)
* teeth 2 and 3 means the second and third teeth, 2 and 3 are position(true)
* tooth 1 ??? treated as position (true) for the time being
* @param organ: teeth
* @param position: <teeth> 4 and 5
* @return
*/
private boolean isPosition(String organ, String position) {
boolean multiplepositions = false;
boolean pluralorgan = false;
position = position.replace("<"+organ+">", "").trim();
if(position.contains(" ") || position.contains("-")){
multiplepositions = true;
}
if(TermOutputerUtilities.isPlural(organ)){
pluralorgan = true;
}
if(pluralorgan && !multiplepositions) return false;
return true;
}
/**
* replace "one or two" with {count~list~one~or~two} in the string
* update this.chunkedTokens
* @param str
*/
private String normalizeCountList(String str) {
Matcher m = this.countptn.matcher(str);
while(m.find()){
int start = m.start(1);
int end = m.end(1);
String count = m.group(1).trim();
String rcount = "";
if(count.compareTo("more or less")==0){
rcount = count.replaceAll(" ","-").replaceAll("[{}]", "");
}else{
rcount = "{count~list~"+count.replaceAll(" ","~").replaceAll("[{}]", "")+"}";
}
//synchronise this.chunkedtokens
//split by single space to get an accurate count to elements that would be in chunkedtokens
int index = (str.substring(0, start).trim()+" a").trim().split("\\s").length-1; //number of tokens before the count pattern
this.chunkedtokens.set(index, rcount);
int num = count.split("\\s+").length;
for(int i = index+1; i < index+num; i++){
this.chunkedtokens.set(i, "");
}
//resemble the str from chunkedtokens, counting all empty elements, so the str and chunkedtokens are in synch.
str = "";
for(String t: this.chunkedtokens){
str +=t+" ";
}
m = this.countptn.matcher(str);
}
return str.replaceAll("\\s+", " ").trim();
}
/**remove all bracketed text such as "leaves large (or small as in abc)"
* do not remove brackets that are part of numerical expression : 2-6 (-10)
* @param str: "leaves large (or small as in abc)"
* @return: "leaves large"
*/
private String handleBrackets(String str) {
//remove nested brackets left by pl such as (petioles (2-)4-8 cm)
//String p1 ="\\([^()]*?[a-zA-Z][^()]*?\\)";
//String p2 = "\\[[^\\]\\[]*?[a-zA-Z][^\\]\\[]*?\\]";
//String p3 = "\\{[^{}]*?[a-zA-Z][^{}]*?\\}";
if(str.matches(".*?\\(.*?[a-zA-Z].*?\\).*") || str.matches(".*?\\[.*?[a-zA-Z].*?\\].*")){
String[] pretokens = str.split("\\s+");
str = Utilities.threeingSentence(str);
String[] tokens = str.split("\\s+");
StringBuffer bracketfree = new StringBuffer();
boolean inbracket = false;
for(int i=0; i<tokens.length; i++){
if(tokens[i].matches("[(\\[].*")){
inbracket = true;
}
if(!inbracket){
if(tokens[i].compareTo("3")==0){
bracketfree.append(pretokens[i]+" ");
}else{
bracketfree.append(tokens[i]+" ");
}
}
if(tokens[i].matches(".*[)\\]]")){
inbracket = false;
}
}
str = bracketfree.toString().trim();
if(str.matches(".*?\\(\\s+?\\s+\\).*")){//2n=20( ? ), 30 => 2n=20?, 30
str = str.replaceAll("\\(\\s+?\\s+\\)", "?");
}
//str = str.replaceAll(p1, "").replaceAll(p2, "").replaceAll("\\s+", " ").trim();
}
return str;
}
/**
* make "suffused with dark blue and purple or green" one token
* ch-ptn"color % color color % color @ color"
* @return {color~list~color1~color2}
*/
private String normalizeColorPatterns() {
String list = "";
String result = "";
String header = "ttt";
for(int i = this.charactertokensReversed.size() -1; i>=0; i--){
list+=this.charactertokensReversed.get(i)+" ";
}
list = list.trim()+" "; //need to have a trailing space
Pattern p = Pattern.compile("(.*?)((color|coloration)\\s+%\\s+(?:(?:color|coloration|@|%) )+)(.*)");
Matcher m = p.matcher(list);
int base = 0;
while(m.matches()){
int start = (m.group(1).trim()+" a").trim().split("\\s+").length+base-1;
int end = start+(m.group(2).trim()+" b").trim().split("\\s+").length-1;
String ch = m.group(3)+header;
list = m.group(4);
m = p.matcher(list);
//form result string, adjust chunkedtokens
for(int i = base; i<start; i++){
result += this.chunkedtokens.get(i)+" ";
}
if(end>start){ //if it is a list
String t= "{"+ch+"~list~";
for(int i = start; i<end; i++){
t += this.chunkedtokens.get(i).trim().replaceAll("[{}]", "").replaceAll("[,;\\.]", "punct")+"~";
this.chunkedtokens.set(i, "");
}
t = t.replaceFirst("~$", "}");
t = distributePrep(t)+" ";
this.chunkedtokens.set(end-1, t.trim());//"suffused with ..." will not form a list with other previously mentioned colors, but may with following colors, so put this list close to the next token.
result +=t;
}
//prepare for the next step
base = end;
}
//dealing with the last segment of the list or the entire list if no match
for(int i = base; i<(list.trim()+" b").trim().split("\\s+").length+base-1; i++){
result += this.chunkedtokens.get(i)+" ";
}
return result;
}
/**
*
* @param t: {color~list~suffused~with~red~or~purple}
* @return {color~list~suffused~with~red~or~purple}
*/
private String distributePrep(String t) {
Pattern p = Pattern.compile("(^.*~list~)(.*?~with~)(.*?~or~)(.*)");
Matcher m = p.matcher(t);
if(m.matches()){
t = m.group(1)+m.group(2)+m.group(3)+m.group(2)+m.group(4);
}
return t;
}
/**
*
* @param text
* @return two strings: one contains all text from text with rearranged spaces, the other contains numbers as the place holder of the area expressions
*/
private String[] normalizeArea(String text){
String[] result = new String[2];
String text2= text;
Pattern p = Pattern.compile("(.*?)([\\d\\.()+-]+ \\{[cmd]?m\\}�\\S*\\s*[\\d\\.()+-]+ \\{[cmd]?m\\}�?(\\S*\\s*[\\d\\.()+-]+ \\{[cmd]?m\\})?)(.*)");
Matcher m = p.matcher(text);
while(m.matches()){
text = m.group(1)+m.group(2).replaceAll("[ \\{\\}]", "")+ m.group(4);
m = p.matcher(text2);
m.matches();
text2 = m.group(1)+m.group(2).replaceAll("[cmd]?m", "").replaceAll("[ \\{\\}]", "")+ m.group(4);
m = p.matcher(text);
}
result[0] = text;
result[1] = text2;
return result;
}
private void lookupCharacters(String str) {
if(str.trim().length() ==0){
return;
}
this.charactertokensReversed = new ArrayList<String>();
boolean save = false;
boolean ambiguous = false;
ArrayList<String> saved = new ArrayList<String>();
ArrayList<String> amb = new ArrayList<String>();
for(int i = this.chunkedtokens.size()-1; i>=0; i--){
String word = this.chunkedtokens.get(i);
if(word.indexOf("~list~")>0){
String ch = word.substring(0, word.indexOf("~list~")).replaceAll("\\W", "").replaceFirst("ttt$", "");
this.charactertokensReversed.add(ch);
}else if(word.indexOf('{')>=0 && word.indexOf('<')<0){
String ch = Utilities.lookupCharacter(word, conn, this.characterhash, glosstable, tableprefix); //remember the char for this word (this word is a word before (to|or|\\W)
if(ch==null){
this.charactertokensReversed.add(word.replaceAll("[{}]", "")); //
}else{
this.charactertokensReversed.add(ch); //color
if(save){
save(saved, this.chunkedtokens.size()-1-i, ch);
if(ch.indexOf(Utilities.or)>0){
ambiguous = true;
amb.add(this.chunkedtokens.size()-1-i+"");
}
}
save = false;
}
}else if (word.indexOf('<')>=0){
this.charactertokensReversed.add("#");
save = true;
}else if(word.matches("(to|or)")){
this.charactertokensReversed.add("@"); //to|or
save = true;
}else if(word.matches("\\W")){
this.charactertokensReversed.add(word); //,;.
save = true;
}else if(word.compareTo("�")==0){
this.charactertokensReversed.add("moreorless"); //,;.
save = true;
}else{
this.charactertokensReversed.add("%");
save = true;
}
}
//deal with a/b characters
if(ambiguous){
Iterator<String> it = amb.iterator();
while(it.hasNext()){
int i = Integer.parseInt(it.next());
Pattern p = Pattern.compile("("+this.charactertokensReversed.get(i)+"|"+this.charactertokensReversed.get(i).replaceAll(Utilities.or, "|")+")");
String tl = lastSaved(saved, i);
Matcher m = p.matcher(tl);
//if(m.matches()){
if(m.find()){
this.charactertokensReversed.set(i, m.group(1));
}else{
String tn = nextSaved(saved, i);
m = p.matcher(tn);
//if(m.matches()){
if(m.find()){
this.charactertokensReversed.set(i, m.group(1));
}
}
}
}
}
private String lastSaved(ArrayList<String> saved, int index){
for(int i = index-1; i >=0 && i<saved.size(); i--){
if(saved.get(i).trim().length()>0){
return saved.get(i);
}
}
return "";
}
private String nextSaved(ArrayList<String> saved, int index){
for(int i = index+1; i <saved.size(); i++){
if(saved.get(i).trim().length()>0){
return saved.get(i);
}
}
return "";
}
private void save(ArrayList<String> saved, int index, String ch){
while(saved.size()<=index){
saved.add("");
}
saved.set(index, ch);
}
/**
* put a list of states of the same character connected by to/or in a chunk
* color, color, or color
* color or color to color
*
* {color~list~blue~to~red}
* @return updated string
*/
private String normalizeCharacterLists(){
//charactertokens.toString
String list = "";
String result = "";
for(int i = this.charactertokensReversed.size() -1; i>=0; i--){
list+=this.charactertokensReversed.get(i)+" ";
}
list = list.trim()+" "; //need to have a trailing space
//pattern match: collect state one by one
int base = 0;
//Pattern pt = Pattern.compile("(.*?(?:^| ))(([0-9a-z�\\[\\]\\+-]+ly )*([a-z-]+ )+([@,;\\.] )+\\s*)(([a-z-]+ )*(\\4)+[@,;\\.%\\[\\]\\(\\)#].*)");//
Pattern pt = Pattern.compile("(.*?(?:^| ))(([0-9a-z�\\[\\]\\+-]+ly )*([_a-z-]+ )+([@,;\\.] )+\\s*)(([_a-z-]+ )*(\\4)+([0-9a-z�\\[\\]\\+-]+ly )*[@,;\\.%\\[\\]\\(\\)#].*)");//
Matcher mt = pt.matcher(list);
while(mt.matches()){
int start = (mt.group(1).trim()+" a").trim().split("\\s+").length+base-1; //"".split(" ") == 1
String l = mt.group(2);
String ch = mt.group(4).trim();
list = mt.group(6);
//Pattern p = Pattern.compile("(([a-z-]+ )*([a-z-]+ )+([@,;\\.] )+\\s*)(([a-z-]+ )*(\\3)+[@,;\\.%\\[\\]\\(\\)#].*)");//merely shape, @ shape
Pattern p = Pattern.compile("(([a-z-]+ )*([a-z-]+ )+([0-9a-z�\\[\\]\\+-]+ly )*([@,;\\.] )+\\s*)(([a-z-]+ )*(\\3)+([0-9a-z�\\[\\]\\+-]+ly )*[@,;\\.%\\[\\]\\(\\)#].*)");//merely shape, @ shape
Matcher m = p.matcher(list);
while(m.matches()){
l += m.group(1);
//list = m.group(5);
list = m.group(6);
m = p.matcher(list);
}
l += list.replaceFirst("[@,;\\.%\\[\\]\\(\\)#].*$", "");//take the last seg from the list
int end = start+(l.trim()+" b").trim().split("\\s+").length-1;
if(! l.matches(".*?@[^,;\\.]*") && l.matches(".*?,.*")){ //the last state is not connected by or/to, then it is not a list
start = end;
}
list = list.replaceFirst("^.*?(?=[@,;\\.%\\[\\]\\(\\)#])", "");
mt = pt.matcher(list);
for(int i = base; i<start; i++){
result += this.chunkedtokens.get(i)+" ";
}
if(end>start){ //if it is a list
String t= "{"+ch+"~list~";
for(int i = start; i<end; i++){
if(this.chunkedtokens.get(i).length()>0){
t += this.chunkedtokens.get(i).trim().replaceAll("[{}]", "").replaceAll("[,;\\.]", "punct")+"~";
}else if(i == end-1){
while(this.chunkedtokens.get(i).length()==0){
i++;
}
t+=this.chunkedtokens.get(i).trim().replaceAll("[{}]", "").replaceAll("[,;\\.]", "punct")+"~";
}
this.chunkedtokens.set(i, "");
}
t = t.replaceFirst("~$", "}")+" ";
if(t.indexOf("ttt~list")>=0) t = t.replaceAll("~color.*?ttt~list", "");
this.chunkedtokens.set(start, t);
result +=t;
if(this.printList){
System.out.println(">>>"+t);
}
}
base = end;
}
for(int i = base; i<(list.trim()+" b").trim().split("\\s+").length+base-1; i++){
result += this.chunkedtokens.get(i)+" ";
}
return result.trim();
}
/**
* the same as => same-as/IN
* as wide as or/to wider than inner
* as wide as inner
* as wide as long
* @return
*/
/*private String normalizeAssociationWith(String str) {
String result = "";
Pattern p = Pattern.compile("(.*?\\b)(in\\b.*?\\bassociation\\W+(?:with|to))(\\b.*)");
Matcher m = p.matcher(str);
while(m.matches()){
result+=m.group(1);
result+="{"+m.group(2).replaceAll("\\s+", "-").replaceAll("[{}<>]", "")+"}";
str = m.group(3);
m = p.matcher(str);
}
result+=str;
return result.replaceAll("\\{+", "{").replaceAll("\\}+", "}").trim();
}*/
/**
* the same as => same-as/IN
* as wide as or/to wider than inner
* as wide as inner
* as wide as long
* @return
*/
private String normalizeSameAs(String str) {
String result = "";
Pattern p = Pattern.compile("(.*?\\b)(same\\b[ \\w{}<>]+\\s+as)(\\b.*)");
Matcher m = p.matcher(str);
while(m.matches()){
result+=m.group(1);
result+="{"+m.group(2).replaceAll("\\s+", "-").replaceAll("[{}<>]", "")+"}";
str = m.group(3);
m = p.matcher(str);
}
result+=str;
return result.replaceAll("\\{+", "{").replaceAll("\\}+", "}").trim();
}
/**
* as wide as => as-wide-as/IN
* as wide as or/to wider than inner
* as wide as inner
* as wide as long
* @return
*/
private String normalizeAsAs(String str) {
String result = "";
Pattern p = Pattern.compile("(.*?\\b)(as\\s+[\\w{}<>]+\\s+as)(\\b.*)");
Matcher m = p.matcher(str);
while(m.matches()){
result+=m.group(1);
result+="{"+m.group(2).replaceAll("\\s+", "-").replaceAll("[{}<>]", "")+"}";
str = m.group(3);
m = p.matcher(str);
}
result+=str;
return result.trim();
}
/**
* @param args
*/
public static void main(String[] args) {
/*
//File posedfile = new File(posedfile);
//File parsedfile = new File("");
String database = "fnav19_benchmark";
String tableprefix = "fnav19";
String POSTaggedSentence="POSedSentence";
try{
if(conn == null){
Class.forName("com.mysql.jdbc.Driver");
String URL = "jdbc:mysql://localhost/"+database+"?user="+username+"&password="+password;
conn = DriverManager.getConnection(URL);
//Statement stmt = conn.createStatement();
//stmt.execute("create table if not exists "+tableprefix+"_"+POSTaggedSentence+"(source varchar(100) NOT NULL, posedsent TEXT, PRIMARY KEY(source))");
//stmt.execute("delete from "+tableprefix+"_"+POSTaggedSentence);
//stmt.close();
}
}catch(Exception e){
e.printStackTrace();
}
POSTagger4StanfordParser tagger = new POSTagger4StanfordParser(conn, tableprefix, "fnaglossaryfixed");
//String str="<Cypselae> {tan} , {subcylindric} , {subterete} to 5-{angled} , 8�10 {mm} , {indistinctly} 8�10-{ribbed}";
//String src="364.txt-15";
//String str="{often} 2- , 3- , or 5-{ribbed}";
//String src="625.txt-16";
//String str = "<heads> in {paniculiform} arrays .";
//String src = "10.txt-4";
//String str = "<{middle}> <phyllaries> {acuminate} at <apex> with <point> 22 � 38 {mm} and <{spine}> <tip> 6 � 9 {mm} , or in some {cultivated} {forms} {broadly} {obtuse} to {truncate} and {mucronate} with or without <{spine}> <tip> 1 � 2 {mm} , {distal} <margins> with or without {indistinct} {yellowish} <margins> .";
//String src = "41.txt-1";
//String str = " <outer> 5 � 6 {lance-ovate} to {lanceolate} , 4 � 7 {mm} , {basally} {cartilaginous} , {distally} {herbaceous} , <inner> 8 + {lance-linear} to {linear} , 6 � 12 {mm} , {herbaceous} , all {usually} with some <{gland}>-{tipped} <hairs> 0 . 5 � 0 . 8 {mm} on <margins> near <bases> or on {abaxial} <faces> toward <tips> .";
//String src = "273.txt-6";
//String str = "<stems> {usually} 1 , {branched} {distally} or {openly} so throughout , {leafy} , {glabrous} or {thinly} {arachnoid-tomentose} .";
String src = "157.txt-1";
String str = "laminae 6 17 cm . long , 2 - 7 cm . broad , lanceolate to narrowly oblong or elliptic_oblong , abruptly and narrowly acuminate , obtuse to acute at the base , margin entire , the lamina drying stiffly chartaceous to subcoriaceous , smooth on both surfaces , essentially glabrous and the midvein prominent above , glabrous to sparsely puberulent beneath , the 8 to 18 pairs of major secondary veins prominent beneath and usually loop_connected near the margin , microscopic globose_capitate or oblongoid_capitate hairs usually present on the lower surface , clear or orange distally .";
try{
System.out.println(tagger.POSTag(str, src, "description")); //type is one of "character" and "description"
}catch(Exception e){
e.printStackTrace();
}*/
}
}