/* $Id: CharStateHandler.java 827 2011-06-05 03:36:57Z hong1.cui $ */ package fna.charactermarkup; import java.sql.Connection; import java.sql.DriverManager; import java.sql.ResultSet; import java.sql.Statement; import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; @SuppressWarnings("unused") public class CharStateHandler { static protected Connection conn = null; static protected String database = null; static protected String username = "root"; static protected String password = "root"; static protected ArrayList<String> adverbs = new ArrayList<String>(); static protected ArrayList<String> notadverbs = new ArrayList<String>(); static protected String glosstable = null; public CharStateHandler(String database) { CharStateHandler.database = database; if(database.endsWith("fna")){ CharStateHandler.glosstable = "fnaglossaryfixed"; }else if(database.endsWith("treatise")){ CharStateHandler.glosstable = "treatisehglossaryfixed"; } try{ if(conn == null){ Class.forName("com.mysql.jdbc.Driver"); String URL = "jdbc:mysql://localhost/"+database+"?user="+username+"&password="+password; conn = DriverManager.getConnection(URL); } } catch(Exception e){ e.printStackTrace(); } } /** * * @param plaincharset : styles 2[10] mm diam. * @param state: <styles> 2[10] mm {diam}. * @return: characters marked up in XML format <character name="" value=""> */ public String characterstate(String plaincharset, String state){ String innertagstate = ""; try{ Statement stmt2 = conn.createStatement(); int i,j; plaincharset = plaincharset.replaceAll("\\([\\s]?|\\[[\\s]?", "["); plaincharset = plaincharset.replaceAll("[\\s]?\\)|[\\s]?\\]", "]"); //System.out.println("plain:"+plaincharset); //System.out.println("state:"+state); Pattern pattern19 = Pattern.compile("[�]?[\\[]?[\\d\\s\\.]+[\\]]?[\\s]?[\\[]?[\\�\\-]+[\\]]?[\\s]?[\\[]?[\\d\\s\\.]+[+]?[\\]]?[\\s]?[dcm�]?[m]?[\\s]?[xX\\�]+[\\s]?[\\[]?[\\d\\s\\.]+[\\]]?[\\s]?[\\[]?[\\�\\-]+[\\]]?[\\s]?[\\[]?[\\d\\s\\.]+[+]?[\\]]?[\\s]?[dcm�]?m"); Matcher matcher2 = pattern19.matcher(plaincharset); while ( matcher2.find()){ if(plaincharset.charAt(matcher2.start())==' '){ i=matcher2.start()+1; } else{ i=matcher2.start(); } j=matcher2.end(); String match = plaincharset.substring(i, j); Pattern pattern18 = Pattern.compile("[\\s]?[dcm�]?m"); Matcher matcher3 = pattern18.matcher(match); String[] unit = new String[2]; int num = 0; while ( matcher3.find()){ unit[num] = match.substring(matcher3.start(), matcher3.end()); num++; } match = matcher3.replaceAll("#"); matcher3.reset(); int en = match.indexOf('-'); int lasten = match.lastIndexOf('-'); if (match.substring(en+1, match.indexOf('�',en+1)).contains("+")) innertagstate=innertagstate.concat("<character char_type=\"range_value\" name=\"length\" from=\""+match.substring(0,en).trim()+"\" to=\""+match.substring(en+1, match.indexOf('+',en+1)).trim()+"\" upper_restricted=\"false\" unit=\""+unit[0].trim()+"\"/>"); else innertagstate=innertagstate.concat("<character char_type=\"range_value\" name=\"length\" from=\""+match.substring(0,en).trim()+"\" to=\""+match.substring(en+1, match.indexOf('�',en+1)).trim()+"\" unit=\""+unit[0].trim()+"\"/>"); if (num>1){ if (match.substring(lasten+1, match.indexOf('#',lasten+1)).contains("+")) innertagstate=innertagstate.concat("<character char_type=\"range_value\" name=\"width\" from=\""+match.substring(match.indexOf('�')+2,lasten).trim()+"\" to=\""+match.substring(lasten+1, match.indexOf('+',lasten+1)).trim()+"\" upper_restricted=\"false\" unit=\""+unit[1].trim()+"\"/>"); else innertagstate=innertagstate.concat("<character char_type=\"range_value\" name=\"width\" from=\""+match.substring(match.indexOf('�')+2,lasten).trim()+"\" to=\""+match.substring(lasten+1, match.indexOf('#',lasten+1)).trim()+"\" unit=\""+unit[1].trim()+"\"/>"); } else{ if (match.substring(lasten+1, match.indexOf('#',lasten+1)).contains("+")) innertagstate=innertagstate.concat("<character char_type=\"range_value\" name=\"width\" from=\""+match.substring(match.indexOf('�')+2,lasten).trim()+"\" to=\""+match.substring(lasten+1, match.indexOf('+',lasten+1)).trim()+"\" upper_restricted=\"false\" unit=\""+unit[0].trim()+"\"/>"); else innertagstate=innertagstate.concat("<character char_type=\"range_value\" name=\"width\" from=\""+match.substring(match.indexOf('�')+2,lasten).trim()+"\" to=\""+match.substring(lasten+1, match.indexOf('#',lasten+1)).trim()+"\" unit=\""+unit[0].trim()+"\"/>"); } } plaincharset = matcher2.replaceAll("#"); matcher2.reset(); //System.out.println("plaincharset1:"+plaincharset); Pattern pattern24 = Pattern.compile("l/w[\\s]?=[\\d\\.\\s\\+\\�\\-]+"); matcher2 = pattern24.matcher(plaincharset); while ( matcher2.find()){ if(plaincharset.charAt(matcher2.start())==' '){ i=matcher2.start()+1; } else{ i=matcher2.start(); } j=matcher2.end(); String match = plaincharset.substring(i, j); int en = match.indexOf('-'); if (match.contains("+")) innertagstate=innertagstate.concat("<character char_type=\"range_value\" name=\"l_w_ratio\" from=\""+match.substring(match.indexOf('=')+2,en).trim()+"\" to=\""+match.substring(en+1, match.indexOf('+',en+1)).trim()+"\" upper_restricted=\"false\"/>"); else innertagstate=innertagstate.concat("<character char_type=\"range_value\" name=\"l_w_ratio\" from=\""+match.substring(match.indexOf('=')+2,en).trim()+"\" to=\""+match.substring(en+1, match.indexOf(' ',en+1)).trim()+"\"/>"); } plaincharset = matcher2.replaceAll("#"); matcher2.reset(); int sizect = 0; Pattern pattern13 = Pattern.compile("[xX\\ױ\\d\\[\\]\\�\\-\\.\\s\\+]+[\\s]?[dcm�]?m(?![\\w])(([\\s]diam)?([\\s]wide)?)"); matcher2 = pattern13.matcher(plaincharset); String toval=""; String fromval=""; while ( matcher2.find()){ if(plaincharset.charAt(matcher2.start())==' '){ i=matcher2.start()+1; } else{ i=matcher2.start(); } j=matcher2.end(); String extreme = plaincharset.substring(i,j); i = 0; j = extreme.length(); Pattern pattern20 = Pattern.compile("\\[[�\\d\\.\\s\\+]+[\\�\\-]{1}[�\\d\\.\\s\\+\\�\\-]*\\]"); Matcher matcher1 = pattern20.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); if(extreme.charAt(q-2)=='�' | extreme.charAt(q-2)=='-') innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,q-2).trim()+"\" to=\"\"/>"); else innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim()+"\"/>"); } extreme = matcher1.replaceAll("#"); matcher1.reset(); if(extreme.contains("#")) i = extreme.indexOf("#")+1; Pattern pattern21 = Pattern.compile("\\[[�\\d\\.\\s\\+\\�\\-]*[\\�\\-]{1}[�\\d\\.\\s\\+]+\\]"); matcher1 = pattern21.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); if (extreme.charAt(p+1)=='�' | extreme.charAt(p+1)=='-'){ if (extreme.charAt(q-2)=='+') innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_size\" from=\"\" to=\""+extreme.substring(p+2,q-2).trim()+"\" upper_restricted=\"false\"/>"); else innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_size\" from=\"\" to=\""+extreme.substring(p+2,q-1).trim()+"\"/>"); } else{ if (extreme.charAt(q-2)=='+') innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-2).trim()+"\" upper_restricted=\"false\"/>"); else innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim()+"\"/>"); } } extreme = matcher1.replaceAll("#"); matcher1.reset(); j = extreme.length(); Pattern pattern23 = Pattern.compile("\\[[�\\d\\.\\s\\+]+\\]"); matcher1 = pattern23.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); if (extreme.charAt(q-2)=='+') innertagstate = innertagstate.concat("<character name=\"atypical_size\" from=\""+extreme.substring(p+1,q-2).trim()+"\" upper_restricted=\"false\"/>"); else innertagstate = innertagstate.concat("<character name=\"atypical_size\" value=\""+extreme.substring(p+1,q-1).trim()+"\"/>"); } extreme = matcher1.replaceAll("#"); matcher1.reset(); j = extreme.length(); if(extreme.substring(i,j).contains("�")|extreme.substring(i,j).contains("-") && !extreme.substring(i,j).contains("�") && !extreme.substring(i,j).contains("x") && !extreme.substring(i,j).contains("X")){ String extract = extreme.substring(i,j); Pattern pattern18 = Pattern.compile("[\\s]?[dcm�]?m(([\\s]diam)?([\\s]wide)?)"); Matcher matcher3 = pattern18.matcher(extract); String unit=""; /*if ( matcher3.find()){ unit = extract.substring(matcher3.start(), matcher3.end()); } if(unit.length()>0) extract = matcher3.replaceAll("#"); if(extract.indexOf("#")<0) extract +="#";*/ if ( matcher3.find()){ unit = extract.substring(matcher3.start(), matcher3.end()); } extract = matcher3.replaceAll("#"); matcher3.reset(); String from = extract.substring(0, extract.indexOf('-')).trim(); String to = extract.substring(extract.indexOf('-')+1,extract.indexOf('#')).trim(); boolean upperrestricted = ! to.endsWith("+"); to = to.replaceFirst("\\+$", "").trim(); innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"size\" from=\""+from+"\" from_unit=\""+unit.trim()+"\" to=\""+to+"\" to_unit=\""+unit.trim()+"\" upper_restricted=\""+upperrestricted+"\"/>"); toval = extract.substring(0, extract.indexOf('-')); fromval = extract.substring(extract.indexOf('-')+1,extract.indexOf('#')); sizect+=1; } else{ String extract = extreme.substring(i,j); Pattern pattern18 = Pattern.compile("[\\s]?[dcm�]?m(([\\s]diam)?([\\s]wide)?)"); Matcher matcher3 = pattern18.matcher(extract); String unit=""; if ( matcher3.find()){ unit = extract.substring(matcher3.start(), matcher3.end()); } extract = matcher3.replaceAll("#"); matcher3.reset(); innertagstate = innertagstate.concat("<character name=\"size\" value=\""+extract.substring(0,extract.indexOf('#')).trim()+"\" unit=\""+unit.trim()+"\"/>"); toval = extract.substring(0,extract.indexOf('#')); fromval = extract.substring(0,extract.indexOf('#')); } StringBuffer sb = new StringBuffer(); Pattern pattern25 = Pattern.compile("to=\"\""); matcher1 = pattern25.matcher(innertagstate); while ( matcher1.find()){ matcher1.appendReplacement(sb, "to=\""+toval.trim()+"\""); } matcher1.appendTail(sb); innertagstate=sb.toString(); matcher1.reset(); StringBuffer sb1 = new StringBuffer(); Pattern pattern26 = Pattern.compile("from=\"\""); matcher1 = pattern26.matcher(innertagstate); while ( matcher1.find()){ matcher1.appendReplacement(sb1, "from=\""+fromval.trim()+"\""); } matcher1.appendTail(sb1); innertagstate=sb1.toString(); matcher1.reset(); } plaincharset = matcher2.replaceAll("#"); matcher2.reset(); //System.out.println("plaincharset2:"+plaincharset); Pattern pattern14 = Pattern.compile("[�\\d\\[\\]\\�\\-\\./\\s]+[\\s]?[\\�\\-]?(% of [\\w]+ length|height of [\\w]+|times as [\\w]+ as [\\w]+|total length|their length|(times)?[\\s]?length of [\\w]+)"); matcher2 = pattern14.matcher(plaincharset); toval=""; fromval=""; while ( matcher2.find()){ if(plaincharset.charAt(matcher2.start())==' '){ i=matcher2.start()+1; } else{ i=matcher2.start(); } j=matcher2.end(); String extreme = plaincharset.substring(i,j); i = 0; j = extreme.length(); Pattern pattern20 = Pattern.compile("\\[[�\\d\\.\\s\\+]+[\\�\\-]{1}[�\\d\\.\\s\\+\\�\\-]*\\]"); Matcher matcher1 = pattern20.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); if(extreme.charAt(q-2)=='�' | extreme.charAt(q-2)=='-') innertagstate = innertagstate.concat("<character char_type=\"relative_range_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,q-2).trim()+"\" to=\"\"/>"); else innertagstate = innertagstate.concat("<character char_type=\"relative_range_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim()+"\"/>"); } extreme = matcher1.replaceAll("#"); matcher1.reset(); if(extreme.contains("#")) i = extreme.indexOf("#")+1; Pattern pattern21 = Pattern.compile("\\[[�\\d\\.\\s\\+\\�\\-]*[\\�\\-]{1}[�\\d\\.\\s\\+]+\\]"); matcher1 = pattern21.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); if (extreme.charAt(p+1)=='�' | extreme.charAt(p+1)=='-'){ if (extreme.charAt(q-2)=='+') innertagstate = innertagstate.concat("<character char_type=\"relative_range_value\" name=\"atypical_size\" from=\"\" to=\""+extreme.substring(p+2,q-2).trim()+"\" upper_restricted=\"false\"/>"); else innertagstate = innertagstate.concat("<character char_type=\"relative_range_value\" name=\"atypical_size\" from=\"\" to=\""+extreme.substring(p+2,q-1).trim()+"\"/>"); } else{ if (extreme.charAt(q-2)=='+') innertagstate = innertagstate.concat("<character char_type=\"relative_range_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-2).trim()+"\" upper_restricted=\"false\"/>"); else innertagstate = innertagstate.concat("<character char_type=\"relative_range_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim()+"\"/>"); } } extreme = matcher1.replaceAll("#"); matcher1.reset(); j = extreme.length(); Pattern pattern23 = Pattern.compile("\\[[�\\d\\.\\s\\+]+\\]"); matcher1 = pattern23.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); if (extreme.charAt(q-2)=='+') innertagstate = innertagstate.concat("<character char_type=\"relative_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,q-2).trim()+"\" upper_restricted=\"false\"/>"); else innertagstate = innertagstate.concat("<character char_type=\"relative_value\" name=\"atypical_size\" value=\""+extreme.substring(p+1,q-1).trim()+"\"/>"); } extreme = matcher1.replaceAll("#"); matcher1.reset(); j = extreme.length(); if(extreme.substring(i,j).contains("�")|extreme.substring(i,j).contains("-") && !extreme.substring(i,j).contains("�") && !extreme.substring(i,j).contains("x") && !extreme.substring(i,j).contains("X")){ String extract = extreme.substring(i,j); Pattern pattern18 = Pattern.compile("[\\s]?[\\�\\-]?(% of [\\w]+ length|height of [\\w]+|times as [\\w]+ as [\\w]+|total length|their length|(times)?[\\s]?length of [\\w]+)"); Matcher matcher3 = pattern18.matcher(extract); String relative=""; if ( matcher3.find()){ relative = extract.substring(matcher3.start(), matcher3.end()); } extract = matcher3.replaceAll("#"); matcher3.reset(); innertagstate = innertagstate.concat("<character char_type=\"relative_range_value\" name=\"size\" from=\""+extract.substring(0, extract.indexOf('-')).trim()+"\" to=\""+extract.substring(extract.indexOf('-')+1,extract.indexOf('#')).trim()+"\" relative_constraint=\""+relative.trim()+"\"/>"); toval = extract.substring(0, extract.indexOf('-')); fromval = extract.substring(extract.indexOf('-')+1,extract.indexOf('#')); sizect+=1; } else{ String extract = extreme.substring(i,j); Pattern pattern18 = Pattern.compile("[\\s]?[\\�\\-]?(% of [\\w]+ length|height of [\\w]+|times as [\\w]+ as [\\w]+|total length|their length|(times)?[\\s]?length of [\\w]+)"); Matcher matcher3 = pattern18.matcher(extract); String relative=""; if ( matcher3.find()){ relative = extract.substring(matcher3.start(), matcher3.end()); } extract = matcher3.replaceAll("#"); matcher3.reset(); innertagstate = innertagstate.concat("<character char_type=\"relative_value\" name=\"size\" value=\""+extract.substring(0,extract.indexOf('#')).trim()+"\" relative_constraint=\""+relative.trim()+"\"/>"); toval = extract.substring(0,extract.indexOf('#')); fromval = extract.substring(0,extract.indexOf('#')); } StringBuffer sb = new StringBuffer(); Pattern pattern25 = Pattern.compile("to=\"\""); matcher1 = pattern25.matcher(innertagstate); while ( matcher1.find()){ matcher1.appendReplacement(sb, "to=\""+toval.trim()+"\""); } matcher1.appendTail(sb); innertagstate=sb.toString(); matcher1.reset(); StringBuffer sb1 = new StringBuffer(); Pattern pattern26 = Pattern.compile("from=\"\""); matcher1 = pattern26.matcher(innertagstate); while ( matcher1.find()){ matcher1.appendReplacement(sb1, "from=\""+fromval.trim()+"\""); } matcher1.appendTail(sb1); innertagstate=sb1.toString(); matcher1.reset(); } plaincharset = matcher2.replaceAll("#"); matcher2.reset(); int countct = 0; //Pattern pattern15 = Pattern.compile("([\\[]?[�]?[\\d]+[\\]]?[\\s]?[\\[]?[\\�\\-][\\]]?[\\s]?[\\[]?[\\d]+[+]?[\\]]?|[\\[]?[�]?[\\d]+[+]?[\\]]?[\\s]?)[\\�\\�\\-]+[a-zA-Z]+"); Pattern pattern15 = Pattern.compile("([\\[]?[�]?[\\d]+[\\]]?[\\s]?[\\[]?[\\�\\-][\\]]?[\\s]?[\\[]?[\\d]+[+]?[\\]]?|[\\[]?[�]?[\\d]+[+]?[\\]]?[\\s]?)[\\�\\�\\-]+[a-zA-Z]+"); matcher2 = pattern15.matcher(plaincharset); plaincharset = matcher2.replaceAll("#"); matcher2.reset(); //Pattern pattern16 = Pattern.compile("(?<!([/][\\s]?))([\\[]?[�]?[\\d]+[\\]]?[\\s]?[\\[]?[\\�\\-][\\]]?[\\s]?[\\[]?[\\d]+[+]?[\\]]?[\\s]?([\\[]?[\\�\\-]?[\\]]?[\\s]?[\\[]?[\\d]+[+]?[\\]]?)*|[�]?[\\d]+[+]?)(?!([\\s]?[n/]|[\\s]?[\\�\\-]?% of [\\w]+ length|[\\s]?[\\�\\-]?height of [\\w]+|[\\s]?[\\�\\-]?times|[\\s]?[\\�\\-]?total length|[\\s]?[\\�\\-]?their length|[\\s]?[\\�\\-]?(times)?[\\s]?length of|[\\s]?[dcm�]?m))"); //add \\. to allow 0.5-0.6+ //TODO: match also just a period . Pattern pattern16 = Pattern.compile("(?<!([/][\\s]?))([\\[]?[�]?[\\d\\./]+[\\]]?[\\s]?[\\[]?[\\�\\-][\\]]?[\\s]?[\\[]?[\\d\\./]+[+]?[\\]]?[\\s]?([\\[]?[\\�\\-]?[\\]]?[\\s]?[\\[]?[\\d\\./]+[+]?[\\]]?)*|[�]?[\\d\\./]+[+]?)(?!([\\s]?[n/]|[\\s]?[\\�\\-]?% of [\\w]+ length|[\\s]?[\\�\\-]?height of [\\w]+|[\\s]?[\\�\\-]?times|[\\s]?[\\�\\-]?total length|[\\s]?[\\�\\-]?their length|[\\s]?[\\�\\-]?(times)?[\\s]?length of|[\\s]?[dcm�]?m))"); matcher2 = pattern16.matcher(plaincharset); while ( matcher2.find()){ i=matcher2.start(); j=matcher2.end(); String extreme = plaincharset.substring(i,j); if(!extreme.matches(".*\\d.*")) continue; i = 0; j = extreme.length(); Pattern pattern20 = Pattern.compile("\\[[�\\d\\.\\s\\+]+[\\�\\-]{1}[�\\d\\.\\s\\+\\�\\-]*\\]"); Matcher matcher1 = pattern20.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); if(extreme.charAt(q-2)=='�' | extreme.charAt(q-2)=='-') innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_count\" from=\""+extreme.substring(p+1,q-2).trim()+"\" to=\"\"/>"); else innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_count\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim()+"\"/>"); } extreme = matcher1.replaceAll("#"); matcher1.reset(); if(extreme.contains("#")) i = extreme.indexOf("#")+1; j = extreme.length(); Pattern pattern21 = Pattern.compile("\\[[�\\d\\.\\s\\+\\�\\-]*[\\�\\-]{1}[�\\d\\.\\s\\+]+\\]"); matcher1 = pattern21.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); j = p; if (extreme.charAt(p+1)=='�' | extreme.charAt(p+1)=='-'){ if (extreme.charAt(q-2)=='+') innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_count\" from=\"\" to=\""+extreme.substring(p+2,q-2).trim()+"\" upper_restricted=\"false\"/>"); else innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_count\" from=\"\" to=\""+extreme.substring(p+2,q-1).trim()+"\"/>"); } else{ if (extreme.charAt(q-2)=='+') innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_count\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-2).trim()+"\" upper_restricted=\"false\"/>"); else innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_count\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim()+"\"/>"); } } matcher1.reset(); Pattern pattern23 = Pattern.compile("\\[[�\\d\\.\\s\\+]+\\]"); matcher1 = pattern23.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); j = p; if (extreme.charAt(q-2)=='+') innertagstate = innertagstate.concat("<character name=\"atypical_count\" from=\""+extreme.substring(p+1,q-2).trim()+"\" upper_restricted=\"false\"/>"); else innertagstate = innertagstate.concat("<character name=\"atypical_count\" value=\""+extreme.substring(p+1,q-1).trim()+"\"/>"); } matcher1.reset(); if(extreme.substring(i,j).contains("�")|extreme.substring(i,j).contains("-") && !extreme.substring(i,j).contains("�") && !extreme.substring(i,j).contains("x") && !extreme.substring(i,j).contains("X")){ String extract = extreme.substring(i,j); Pattern pattern22 = Pattern.compile("[\\[\\]]+"); matcher1 = pattern22.matcher(extract); extract = matcher1.replaceAll(""); matcher1.reset(); innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"count\" from=\""+extract.substring(0, extract.indexOf('-')).trim()+"\" to=\""+extract.substring(extract.indexOf('-')+1,extract.length()).trim()+"\"/>"); toval = extract.substring(0, extract.indexOf('-')); fromval = extract.substring(extract.indexOf('-')+1,extract.length()); countct+=1; } else{ String extract = extreme.substring(i,j); innertagstate = innertagstate.concat("<character name=\"count\" value=\""+extract.trim()+"\"/>"); toval = extract; fromval = extract; } StringBuffer sb = new StringBuffer(); Pattern pattern25 = Pattern.compile("to=\"\""); matcher1 = pattern25.matcher(innertagstate); while ( matcher1.find()){ matcher1.appendReplacement(sb, "to=\""+toval.trim()+"\""); } matcher1.appendTail(sb); innertagstate=sb.toString(); matcher1.reset(); StringBuffer sb1 = new StringBuffer(); Pattern pattern26 = Pattern.compile("from=\"\""); matcher1 = pattern26.matcher(innertagstate); while ( matcher1.find()){ matcher1.appendReplacement(sb1, "from=\""+fromval.trim()+"\""); } matcher1.appendTail(sb1); innertagstate=sb1.toString(); matcher1.reset(); } matcher2.reset(); Pattern pattern27 = Pattern.compile("([\\[]?[{])([\\w�\\+\\�\\-\\.:=/\\_]+)([}][\\]]?)[\\s]to[\\s]([\\[]?[{])([\\w�\\+\\�\\-\\.:=/\\_]+)([}][\\]]?)"); matcher2 = pattern27.matcher(state); String state1 = ""; String state2 = ""; String resstate1 = ""; String resstate2 = ""; while (matcher2.find()){ String chstate1 = "", chstate2 = ""; state1=matcher2.group(2); resstate1 = state1; state2=matcher2.group(5); resstate2 = state2; resstate1 = resstate1.replaceAll("\\_", " "); resstate2 = resstate2.replaceAll("\\_", " "); if(state1.contains("-")|state1.contains("�")){ state1=state1.substring(state1.indexOf("-")+1|state1.indexOf("�")+1, state1.length()); } if(state1.contains("_")){ state1=state1.substring(state1.indexOf("_")+1); } if(state2.contains("-")|state2.contains("�")){ state2=state2.substring(state2.indexOf("-")+1|state2.indexOf("�")+1, state2.length()); } if(state2.contains("_")){ state2=state2.substring(state2.indexOf("_")+1); } ResultSet rs1 = stmt2.executeQuery("select category from "+CharStateHandler.glosstable+" where term='"+state1+"'"); if(rs1.next()){ chstate1=rs1.getString("category"); if(chstate1.contains("/")){ String [] terms = chstate1.split("/"); chstate1=terms[0]; for(int t=1;t<terms.length;t++) chstate1=chstate1.concat("_or_"+terms[t]); } } ResultSet rs2 = stmt2.executeQuery("select category from "+CharStateHandler.glosstable+" where term='"+state2+"'"); if(rs2.next()){ chstate2=rs2.getString("category"); if(chstate2.contains("/")){ String [] terms = chstate2.split("/"); chstate2=terms[0]; for(int t=1;t<terms.length;t++) chstate2=chstate2.concat("_or_"+terms[t]); } } if((chstate1.contains(chstate2)|chstate2.contains(chstate1)) && chstate1.compareTo("")!=0 && chstate2.compareTo("")!=0){ innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\""+chstate1+"\" from=\""+resstate1.trim()+"\" to=\""+resstate2.trim()+"\"/>"); } } matcher2.reset(); Pattern pattern7 = Pattern.compile("([\\[]?[{])([\\w�\\+\\�\\-\\.:=/\\_]+)([}][\\]]?)"); matcher2 = pattern7.matcher(state); String str3 = ""; while (matcher2.find()){ int flag5=0; int flag6=0; String first = ""; String chstate = ""; String resstate = ""; i=matcher2.start(); j=matcher2.end(); str3=matcher2.group(2);//state.subSequence(i,j).toString(); resstate=str3; resstate=resstate.replaceAll("\\_", " "); if(str3.contains("-")|str3.contains("�")){ first = str3.substring(0, str3.indexOf("-")); str3=str3.substring(str3.indexOf("-")+1|str3.indexOf("�")+1, str3.length()); flag5=1; } if(str3.contains("_")){ str3=str3.substring(str3.indexOf("_")+1); flag6=1; } ResultSet rs1 = stmt2.executeQuery("select category from "+CharStateHandler.glosstable+" where term='"+str3+"'"); if(rs1.next()){ chstate=rs1.getString("category"); if(chstate.contains("/")){ String [] terms = chstate.split("/"); chstate=terms[0]; for(int t=1;t<terms.length;t++) chstate=chstate.concat("_or_"+terms[t]); } if(state.indexOf(i)=='[' && state.indexOf(j-1)==']'){ if(flag5==1) innertagstate = innertagstate.concat("<character name=\"atypical_"+chstate+"\" value=\""+first.trim()+"-"+str3.trim()+"\"/>"); else if(flag6==1) innertagstate = innertagstate.concat("<character name=\"atypical_"+chstate+"\" value=\""+resstate.trim()+"\"/>"); else innertagstate = innertagstate.concat("<character name=\"atypical_"+chstate+"\" value=\""+str3.trim()+"\"/>"); } else{ if(flag5==1) innertagstate = innertagstate.concat("<character name=\""+chstate+"\" value=\""+first.trim()+"-"+str3.trim()+"\"/>"); else if(flag6==1) innertagstate = innertagstate.concat("<character name=\""+chstate+"\" value=\""+resstate.trim()+"\"/>"); else innertagstate = innertagstate.concat("<character name=\""+chstate+"\" value=\""+str3.trim()+"\"/>"); } } } matcher2.reset(); Pattern pattern28 = Pattern.compile("([\\[]?[{])([\\w�\\+\\�\\-\\.:=/\\_]+)([}][\\]]?)[\\s](with[\\s])?([\\[]?[{])([\\w�\\+\\�\\-\\.:=/\\_]+)([}][\\]]?)([\\s]to[\\s]([\\[]?[{])([\\w�\\+\\�\\-\\.:=/\\_]+)([}][\\]]?)|[\\s]or[\\s]([\\[]?[{])([\\w�\\+\\�\\-\\.:=/\\_]+)([}][\\]]?))?"); matcher2 = pattern28.matcher(state); while (matcher2.find()){ String mod = matcher2.group(2); //System.out.println(mod); //System.out.println(matcher2.group(6)); //System.out.println(matcher2.group(10)); //System.out.println(matcher2.group(13)); if(Utilities.isAdv(mod, adverbs, notadverbs)){ String chstate = matcher2.group(6).replaceAll("\\_", " "); StringBuffer sb = new StringBuffer(); Pattern pattern29 = Pattern.compile("value=\""+chstate+"\""); Matcher matcher1 = pattern29.matcher(innertagstate); String value=""; while ( matcher1.find()){ matcher1.appendReplacement(sb, "modifier=\""+mod+"\" "+matcher1.group()); value = matcher1.group(); } matcher1.appendTail(sb); innertagstate=sb.toString(); if(value.length()> 0) innertagstate= combineModifiers(innertagstate, value); matcher1.reset(); if(matcher2.group(10)!=null){ StringBuffer sb1 = new StringBuffer(); Pattern pattern30 = Pattern.compile("from=\""+chstate+"\""); matcher1 = pattern30.matcher(innertagstate); value=""; while ( matcher1.find()){ matcher1.appendReplacement(sb1, "modifier=\""+mod+"\" "+matcher1.group()); value = matcher1.group(); } matcher1.appendTail(sb1); innertagstate=sb1.toString(); if(value.length()> 0) innertagstate= combineModifiers(innertagstate, value); matcher1.reset(); chstate = matcher2.group(10).replaceAll("\\_", " "); StringBuffer sb2 = new StringBuffer(); Pattern pattern31 = Pattern.compile("value=\""+chstate+"\""); matcher1 = pattern31.matcher(innertagstate); value=""; while ( matcher1.find()){ matcher1.appendReplacement(sb2, "modifier=\""+mod+"\" "+matcher1.group()); value = matcher1.group(); } matcher1.appendTail(sb2); innertagstate=sb2.toString(); if(value.length()> 0) innertagstate= combineModifiers(innertagstate, value); matcher1.reset(); } if(matcher2.group(13)!=null){ chstate = matcher2.group(13).replaceAll("\\_", " "); StringBuffer sb3 = new StringBuffer(); Pattern pattern32 = Pattern.compile("value=\""+chstate+"\""); matcher1 = pattern32.matcher(innertagstate); value = ""; while ( matcher1.find()){ matcher1.appendReplacement(sb3, "modifier=\""+mod+"\" "+matcher1.group()); value = matcher1.group(); } matcher1.appendTail(sb3); innertagstate=sb3.toString(); if(value.length()> 0) innertagstate = combineModifiers(innertagstate, value); matcher1.reset(); } } } matcher2.reset(); } catch (Exception e) { System.err.println(e); e.printStackTrace(); } return(innertagstate.replaceAll("\\s+\\.\\s+", ".")); //turn 4 . 5 to 4.5 } /** * modifier="a" modifier="b" value="c" * @param element: <character name="n" modifier="a" modifier="b" value="c"/> * @param value: value="c" * @return: <character name="n" modifier="a;b" value="c"/> */ private static String combineModifiers(String text, String value){ String rtext = ""; Pattern p = Pattern.compile("(.*?)(<character [^>]*? "+value+".*?/>)(.*)"); Matcher m0 = p.matcher(text); while(m0.matches()){ rtext +=m0.group(1); rtext +=combineModifiers4Element(m0.group(2)); text = m0.group(3); m0 = p.matcher(text); } return rtext+text; } private static String combineModifiers4Element(String element) { Pattern ptn = Pattern.compile("(.*? )(modifier=\\S+)(['\"].*)"); Matcher m = ptn.matcher(element); String result = ""; String modifiers = ""; while(m.matches()){ result +=m.group(1).replaceFirst("^['\"]", ""); modifiers += m.group(2).replaceAll("modifier=", "")+";"; element = m.group(3); m = ptn.matcher(element); } result += element.replaceFirst("^['\"]", ""); modifiers = "modifier=\""+modifiers.replaceAll("['\"]", "").replaceAll("\\W+$", "").trim()+"\""; result = result.replaceFirst(" value", modifiers+" value").replaceAll("\\s+", " "); return result; } /** * @param args */ public static void main(String[] args) { CharStateHandler ch = new CharStateHandler("annotationevaluation_heuristics_fna"); // TODO Auto-generated method stub String str1 = "0.5-0.6+"; String str2 = "0.5-0.6+"; System.out.println(ch.characterstate(str1, str2)); str1 = "0.5-0.6+ cm"; str2 = "0.5-0.6+ cm"; System.out.println(ch.characterstate(str1, str2)); str1 = "1/3-1/2"; str2 = "1/3-1/2"; System.out.println(ch.characterstate(str1, str2)); str1 = "[5-]8+"; str2 = "[5-]8+"; System.out.println(ch.characterstate(str1, str2)); str1 = "outer and mid phyllaries acute"; str2 = "{outer} and <{mid}> <phyllaries> {acute}"; System.out.println(ch.characterstate(str1, str2)); } }