/* $Id: NumericalHandler.java 907 2011-08-12 22:07:11Z hong1.cui $ */ /** * */ package fna.charactermarkup; //import java.sql.ResultSet; //import java.sql.Statement; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.*; import org.jdom.*; //import org.jdom.input.*; //import org.jdom.xpath.*; //import org.jdom.output.*; import org.jdom.output.Format; import org.jdom.output.XMLOutputter; /** * @author hongcui * chara rewrite of CharStateHandler in JDOM terms * */ public class NumericalHandler { //static public String numberpattern = "[ ()\\[\\]\\-\\�\\d\\.�\\+���/�\\*/%]*?[�/�\\d][ ()\\[\\]\\-\\�\\d\\.�\\+���/�\\*/%]{2,}(?!~[a-z])"; static public String numberpattern = "[()\\[\\]\\-\\�\\d\\.�x\\+���/�\\*/%\\?]*?[�/�\\d][()\\[\\]\\-\\�\\d\\.,?�x\\+���/�\\*/%\\?]{2,}(?![a-z{}])"; //added , and ? for chromosome counts static private boolean debug = false; public NumericalHandler() { } /** * * @param tobechunkedmarkedsent: e.g. <Florets> 4�25 [ �60 ] , {bisexual} , {fertile} ; * @return <Florets> 4�25[�60] , {bisexual} , {fertile} ; */ public static String normalizeNumberExp(String sentence) { sentence = sentence.replaceAll("-\\s*LRB-/-LRB\\s*-", "[").replaceAll("-\\s*RRB-/-RRB\\s*-", "]"); String norm = ""; /*Pattern p = Pattern.compile("(.*?)("+NumericalHandler.numberpattern+")(.*)"); Matcher m = p.matcher(sentence); while(m.matches()){ sentence = m.group(3); norm += m.group(1); norm += " "+m.group(2).replaceAll("\\s+", "")+" "; m = p.matcher(sentence); } norm += sentence;*/ norm = sentence; norm = norm.trim().replaceFirst("(?<=[0-9])\\.$", " .").replaceAll("\\[","-LRB-/-LRB-").replaceAll("\\]","-RRB-/-RRB-"); return norm; } public static String originalNumForm(String token){ if(token.matches(".*[a-z].*?")){ return token.replaceAll("-\\s*LRB-/-LRB\\s*-?", "(").replaceAll("-\\s*RRB-/-RRB\\s*-?", ")"); }else{ return token.replaceAll("-\\s*LRB-/-LRB\\s*-?", "[").replaceAll("-\\s*RRB-/-RRB\\s*-?", "]"); } } /** * * @param token * @return true if token represents an expression of a discrete numerical value, not a range which is represented by this.numberpattern */ public static boolean isNumerical(String token){ String t = token.replaceAll("([({\\[]|-L[RS]B-)", "("); t = t.replaceAll("([)}\\]]|-R[RS]B-)", ")"); if(t.matches("\\(?\\d.*?\\d+\\+?%?\\??\\)?$")){ return true; } //if(token.matches(".*?\\d+.*-RRB-/-RRB-$")){ //if(token.matches(".*?\\d+.*-R[RS]B-/-R[RS]B-$")){ // return true; //} return false; } /** * * @param numberexp : styles 2[10] mm diam. * @param cname: * @return: characters marked up in XML format <character name="" value=""> */ //public static ArrayList<Element> characterstate(String plaincharset, String state){ public static ArrayList<Element> parseNumericals(String numberexp, String cname){ //new CharStateHandler(); if(debug) { System.out.println(); System.out.println(">>>>>>>>>>>>>"+numberexp); } ArrayList<Element> innertagstate = new ArrayList<Element>(); try{ int i,j; numberexp = numberexp.replaceAll("\\([\\s]?|\\[[\\s]?", "["); numberexp = numberexp.replaceAll("[\\s]?\\)|[\\s]?\\]", "]").trim(); //4-5[+] => 4-5[-5+] Pattern p1 = Pattern.compile("(.*?\\b(\\d+))\\s*\\[\\+\\](.*)"); Matcher m = p1.matcher(numberexp); if(m.matches()){ numberexp = m.group(1)+"[-"+m.group(2)+"+]"+m.group(3); m = p1.matcher(numberexp); } //1-[2-5] => 1-1[2-5] => 1[2-5] //1-[4-5] => 1-3[4-5] p1 = Pattern.compile("(.*?)(\\d+)-(\\[(\\d)-.*)"); m = p1.matcher(numberexp); if(m.matches()){ int n = Integer.parseInt(m.group(4))-1; if(n==Integer.parseInt(m.group(2))){ numberexp = m.group(1)+n+m.group(3); }else{ numberexp = m.group(1)+m.group(2)+"-"+n+m.group(3); } } /////////////////////////////////////////////////////////////////// // area //////// Pattern pattern19 = Pattern.compile("([ \\d\\.\\[\\]+-]+\\s*([cmd�u]?m?))\\s*[�x]?(\\s*[ \\d\\.\\[\\]+-]+\\s*([cmd�u]?m?))?\\s*[�x]\\s*([ \\d\\.\\[\\]+-]+\\s*([cmd�u]?m))"); Matcher matcher2 = pattern19.matcher(numberexp); if(matcher2.matches()){ //get l, w, and h String width = ""; String height = ""; String lunit = ""; String wunit = ""; String hunit = ""; String length = matcher2.group(1).trim(); String g5 = matcher2.group(5).trim(); if(matcher2.group(3)==null){ width = g5; }else{ width = matcher2.group(3); height = g5; } //make sure each has a unit if(height.length()==0){//2 dimensions wunit = matcher2.group(6); if(matcher2.group(2)==null || matcher2.group(2).trim().length()==0){ lunit = wunit; }else{ lunit = matcher2.group(2); } }else{//3 dimensions hunit = matcher2.group(6); if(matcher2.group(4)==null || matcher2.group(4).trim().length()==0){ wunit = hunit; }else{ wunit = matcher2.group(4); } if(matcher2.group(2)==null || matcher2.group(2).trim().length()==0){ lunit = wunit; }else{ lunit = matcher2.group(2); } } //format expression value+unit length = length.matches(".*[cmd�]?m$")? length : length + " "+lunit; width = width.matches(".*[cmd�]?m$")? width : width + " "+wunit; if(height.length()>0) height = height.matches(".*[cmd�]?m$")? height : height + " "+hunit; //annotation annotateSize(length, innertagstate, "length"); annotateSize(width, innertagstate, "width"); if(height.length()>0) annotateSize(height, innertagstate, "height"); numberexp = matcher2.replaceAll("#"); matcher2.reset(); } /* * can't handle atypical values in area Pattern pattern19 = Pattern.compile("[�]?[\\[]?[\\d\\s\\.]+[\\]]?[\\s]?[\\[]?[\\�\\-]+[\\]]?[\\s]?[\\[]?[\\d\\s\\.]+[+]?[\\]]?[\\s]?[dcm�]?[m]?[\\s]?[xX\\�]+[\\s]?[\\[]?[\\d\\s\\.]+[\\]]?[\\s]?[\\[]?[\\�\\-]+[\\]]?[\\s]?[\\[]?[\\d\\s\\.]+[+]?[\\]]?[\\s]?[dcm�]?m"); Matcher matcher2 = pattern19.matcher(plaincharset); while ( matcher2.find()){ if(plaincharset.charAt(matcher2.start())==' '){ i=matcher2.start()+1; } else{ i=matcher2.start(); } j=matcher2.end(); String match = plaincharset.substring(i, j); Pattern pattern18 = Pattern.compile("[\\s]?[dcm�]?m"); Matcher matcher3 = pattern18.matcher(match); String[] unit = new String[2]; int num = 0; while ( matcher3.find()){ unit[num] = match.substring(matcher3.start(), matcher3.end()); num++; } match = matcher3.replaceAll("#"); matcher3.reset(); int en = match.indexOf('-'); int lasten = match.lastIndexOf('-'); if (match.substring(en+1, match.indexOf('�',en+1)).contains("+")){ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "length"); character.setAttribute("from", match.substring(0,en).trim()); character.setAttribute("from_unit",unit[0].trim()); character.setAttribute("to", match.substring(en+1, match.indexOf('+',en+1)).trim()); character.setAttribute("to_unit", unit[0].trim()); character.setAttribute("upper_restricted", "false"); innertagstate.add(character); //innertagstate=innertagstate.concat("<character char_type=\"range_value\" name=\"length\" from=\""+match.substring(0,en).trim()+"\" to=\""+match.substring(en+1, match.indexOf('+',en+1)).trim()+"\" upper_restricted=\"false\" unit=\""+unit[0].trim()+"\"/>"); }else{ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "length"); character.setAttribute("from", match.substring(0,en).trim()); character.setAttribute("from_unit",unit[0].trim()); //character.setAttribute("to", match.substring(en+1, match.indexOf('#',en+1)).trim()); character.setAttribute("to", match.substring(en+1, match.indexOf('�',en+1)).trim()); character.setAttribute("to_unit", unit[0].trim()); innertagstate.add(character); //innertagstate=innertagstate.concat("<character char_type=\"range_value\" name=\"length\" from=\""+match.substring(0,en).trim()+"\" to=\""+match.substring(en+1, match.indexOf('�',en+1)).trim()+"\" unit=\""+unit[0].trim()+"\"/>"); } if (num>1){ if (match.substring(lasten+1, match.indexOf('#',lasten+1)).contains("+")){ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "width"); //character.setAttribute("from", match.substring(match.indexOf('�')+2,lasten).trim()); character.setAttribute("from", match.substring(match.indexOf('�')+1,lasten).trim()); character.setAttribute("from_unit",unit[1].trim()); character.setAttribute("to", match.substring(lasten+1, match.indexOf('+',lasten+1)).trim()); character.setAttribute("to_unit", unit[1].trim()); character.setAttribute("upper_restricted", "false"); innertagstate.add(character); //innertagstate=innertagstate.concat("<character char_type=\"range_value\" name=\"width\" from=\""+match.substring(match.indexOf('�')+2,lasten).trim()+"\" to=\""+match.substring(lasten+1, match.indexOf('+',lasten+1)).trim()+"\" upper_restricted=\"false\" unit=\""+unit[1].trim()+"\"/>"); }else{ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "width"); //character.setAttribute("from", match.substring(match.indexOf('�')+2,lasten).trim()); character.setAttribute("from", match.substring(match.indexOf('�')+1,lasten).trim()); character.setAttribute("from_unit",unit[1].trim()); character.setAttribute("to", match.substring(lasten+1, match.indexOf('#',lasten+1)).trim()); character.setAttribute("to_unit", unit[1].trim()); innertagstate.add(character); //innertagstate=innertagstate.concat("<character char_type=\"range_value\" name=\"width\" from=\""+match.substring(match.indexOf('�')+2,lasten).trim()+"\" to=\""+match.substring(lasten+1, match.indexOf('#',lasten+1)).trim()+"\" unit=\""+unit[1].trim()+"\"/>"); } }else{ if (match.substring(lasten+1, match.indexOf('#',lasten+1)).contains("+")){ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "width"); //character.setAttribute("from", match.substring(match.indexOf('�')+2,lasten).trim()); character.setAttribute("from", match.substring(match.indexOf('�')+1,lasten).trim()); character.setAttribute("from_unit",unit[0].trim()); character.setAttribute("to", match.substring(lasten+1, match.indexOf('+',lasten+1)).trim()); character.setAttribute("to_unit", unit[0].trim()); character.setAttribute("upper_restricted", "false"); innertagstate.add(character); //innertagstate=innertagstate.concat("<character char_type=\"range_value\" name=\"width\" from=\""+match.substring(match.indexOf('�')+2,lasten).trim()+"\" to=\""+match.substring(lasten+1, match.indexOf('+',lasten+1)).trim()+"\" upper_restricted=\"false\" unit=\""+unit[0].trim()+"\"/>"); }else{ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "width"); //character.setAttribute("from", match.substring(match.indexOf('�')+2,lasten).trim());//3-5x1.5-2 character.setAttribute("from", match.substring(match.indexOf('�')+1,lasten).trim()); character.setAttribute("from_unit",unit[0].trim()); character.setAttribute("to", match.substring(lasten+1, match.indexOf('#',lasten+1)).trim()); character.setAttribute("to_unit", unit[0].trim()); innertagstate.add(character); //innertagstate=innertagstate.concat("<character char_type=\"range_value\" name=\"width\" from=\""+match.substring(match.indexOf('�')+2,lasten).trim()+"\" to=\""+match.substring(lasten+1, match.indexOf('#',lasten+1)).trim()+"\" unit=\""+unit[0].trim()+"\"/>"); } } } plaincharset = matcher2.replaceAll("#"); matcher2.reset(); */ //////////////////////////////////////////////////////////////////////////////////// // ratio //////////// Pattern pattern24 = Pattern.compile("l/w[\\s]?=[\\d\\.\\s\\+\\�\\-]+"); matcher2 = pattern24.matcher(numberexp); while ( matcher2.find()){ if(numberexp.charAt(matcher2.start())==' '){ i=matcher2.start()+1; } else{ i=matcher2.start(); } j=matcher2.end(); String match = numberexp.substring(i, j); int en = match.indexOf('-'); if (match.contains("+")){ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "l_w_ratio"); //character.setAttribute("from", match.substring(match.indexOf('=')+2,en).trim()); character.setAttribute("from", match.substring(match.indexOf('=')+1,en).trim()); character.setAttribute("to", match.substring(en+1, match.indexOf('+',en+1)).trim()); character.setAttribute("upper_restricted", "false"); innertagstate.add(character); //innertagstate=innertagstate.concat("<character char_type=\"range_value\" name=\"l_w_ratio\" from=\""+match.substring(match.indexOf('=')+2,en).trim()+"\" to=\""+match.substring(en+1, match.indexOf('+',en+1)).trim()+"\" upper_restricted=\"false\"/>"); }else{ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "l_w_ratio"); //character.setAttribute("from", match.substring(match.indexOf('=')+2,en).trim()); character.setAttribute("from", match.substring(match.indexOf('=')+1,en).trim()); character.setAttribute("to", match.substring(en+1, match.indexOf(' ',en+1)).trim()); innertagstate.add(character); //innertagstate=innertagstate.concat("<character char_type=\"range_value\" name=\"l_w_ratio\" from=\""+match.substring(match.indexOf('=')+2,en).trim()+"\" to=\""+match.substring(en+1, match.indexOf(' ',en+1)).trim()+"\"/>"); } } numberexp = matcher2.replaceAll("#"); matcher2.reset(); ///////////////////////////////////////////////////////////////////////////////////////////////////////// // size: deal with "[5-]10-15[-20] cm", not deal with "5 cm - 10 cm" //////////// //int sizect = 0; String toval; String fromval; numberexp = annotateSize(numberexp, innertagstate, "size"); //////////////////////////////////////////////////////////////////////////////////////////// // size ///// Pattern pattern14 = Pattern.compile("[�\\d\\[\\]\\�\\-\\./\\s]+[\\s]?[\\�\\-]?(% of [\\w]+ length|height of [\\w]+|times as [\\w]+ as [\\w]+|total length|their length|(times)?[\\s]?length of [\\w]+)"); matcher2 = pattern14.matcher(numberexp); toval=""; fromval=""; while ( matcher2.find()){ if(numberexp.charAt(matcher2.start())==' '){ i=matcher2.start()+1; } else{ i=matcher2.start(); } j=matcher2.end(); String extreme = numberexp.substring(i,j); i = 0; j = extreme.length(); Pattern pattern20 = Pattern.compile("\\[[�\\d\\.\\s\\+]+[\\�\\-]{1}[�\\d\\.\\s\\+\\�\\-]*\\]"); Matcher matcher1 = pattern20.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); if(extreme.charAt(q-2)=='�' | extreme.charAt(q-2)=='-'){ Element character = new Element("character"); character.setAttribute("char_type", "relative_range_value"); character.setAttribute("name", "atypical_size"); character.setAttribute("from", extreme.substring(p+1,q-2).trim()); character.setAttribute("to", ""); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"relative_range_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,q-2).trim()+"\" to=\"\"/>"); }else{ Element character = new Element("character"); character.setAttribute("char_type", "relative_range_value"); character.setAttribute("name", "atypical_size"); character.setAttribute("from", extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()); character.setAttribute("to", extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim()); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"relative_range_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim()+"\"/>"); } } extreme = matcher1.replaceAll("#"); matcher1.reset(); if(extreme.contains("#")) i = extreme.indexOf("#")+1; Pattern pattern21 = Pattern.compile("\\[[�\\d\\.\\s\\+\\�\\-]*[\\�\\-]{1}[�\\d\\.\\s\\+]+\\]"); matcher1 = pattern21.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); if (extreme.charAt(p+1)=='�' | extreme.charAt(p+1)=='-'){ if (extreme.charAt(q-2)=='+'){ Element character = new Element("character"); character.setAttribute("char_type", "relative_range_value"); character.setAttribute("name", "atypical_size"); character.setAttribute("from", ""); character.setAttribute("to", extreme.substring(p+2,q-2).trim()); character.setAttribute("upper_restricted", "false"); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"relative_range_value\" name=\"atypical_size\" from=\"\" to=\""+extreme.substring(p+2,q-2).trim()+"\" upper_restricted=\"false\"/>"); }else{ Element character = new Element("character"); character.setAttribute("char_type", "relative_range_value"); character.setAttribute("name", "atypical_size"); character.setAttribute("from",""); character.setAttribute("to", extreme.substring(p+2,q-1).trim()); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"relative_range_value\" name=\"atypical_size\" from=\"\" to=\""+extreme.substring(p+2,q-1).trim()+"\"/>"); } } else{ if (extreme.charAt(q-2)=='+'){ Element character = new Element("character"); character.setAttribute("char_type", "relative_range_value"); character.setAttribute("name", "atypical_size"); character.setAttribute("from", extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()); character.setAttribute("to", extreme.substring(extreme.indexOf("-",p+1)+1,q-2).trim()); character.setAttribute("upper_restricted", "false"); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"relative_range_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-2).trim()+"\" upper_restricted=\"false\"/>"); }else{ Element character = new Element("character"); character.setAttribute("char_type", "relative_range_value"); character.setAttribute("name", "atypical_size"); character.setAttribute("from", extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()); character.setAttribute("to", extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim() ); //character.setAttribute("upper_restricted", "true"); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"relative_range_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim()+"\"/>"); } } } extreme = matcher1.replaceAll("#"); matcher1.reset(); j = extreme.length(); Pattern pattern23 = Pattern.compile("\\[[�\\d\\.\\s\\+]+\\]"); matcher1 = pattern23.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); if (extreme.charAt(q-2)=='+'){ Element character = new Element("character"); character.setAttribute("char_type", "relative_value"); character.setAttribute("name", "atypical_size"); character.setAttribute("from", extreme.substring(p+1,q-2).trim()); character.setAttribute("upper_restricted", "false"); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"relative_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,q-2).trim()+"\" upper_restricted=\"false\"/>"); }else{ Element character = new Element("character"); character.setAttribute("char_type", "relative_value"); character.setAttribute("name", "atypical_size"); character.setAttribute("value", extreme.substring(p+1,q-1).trim()); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"relative_value\" name=\"atypical_size\" value=\""+extreme.substring(p+1,q-1).trim()+"\"/>"); } } extreme = matcher1.replaceAll("#"); matcher1.reset(); j = extreme.length(); if(extreme.substring(i,j).contains("�")|extreme.substring(i,j).contains("-") && !extreme.substring(i,j).contains("�") && !extreme.substring(i,j).contains("x") && !extreme.substring(i,j).contains("X")){ String extract = extreme.substring(i,j); Pattern pattern18 = Pattern.compile("[\\s]?[\\�\\-]?(% of [\\w]+ length|height of [\\w]+|times as [\\w]+ as [\\w]+|total length|their length|(times)?[\\s]?length of [\\w]+)"); Matcher matcher3 = pattern18.matcher(extract); String relative=""; if ( matcher3.find()){ relative = extract.substring(matcher3.start(), matcher3.end()); } extract = matcher3.replaceAll("#"); matcher3.reset(); Element character = new Element("character"); character.setAttribute("char_type", "relative_range_value"); character.setAttribute("name", "size"); character.setAttribute("from", extract.substring(0, extract.indexOf('-')).trim()); character.setAttribute("to", extract.substring(extract.indexOf('-')+1,extract.indexOf('#')).trim()); character.setAttribute("relative_constraint",relative.trim()); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"relative_range_value\" name=\"size\" from=\""+extract.substring(0, extract.indexOf('-')).trim()+"\" to=\""+extract.substring(extract.indexOf('-')+1,extract.indexOf('#')).trim()+"\" relative_constraint=\""+relative.trim()+"\"/>"); toval = extract.substring(0, extract.indexOf('-')); fromval = extract.substring(extract.indexOf('-')+1,extract.indexOf('#')); //sizect+=1; } else{ String extract = extreme.substring(i,j); Pattern pattern18 = Pattern.compile("[\\s]?[\\�\\-]?(% of [\\w]+ length|height of [\\w]+|times as [\\w]+ as [\\w]+|total length|their length|(times)?[\\s]?length of [\\w]+)"); Matcher matcher3 = pattern18.matcher(extract); String relative=""; if ( matcher3.find()){ relative = extract.substring(matcher3.start(), matcher3.end()); } extract = matcher3.replaceAll("#"); matcher3.reset(); Element character = new Element("character"); character.setAttribute("char_type", "relative_value"); character.setAttribute("name", "size"); character.setAttribute("value", extract.substring(0,extract.indexOf('#')).trim()); character.setAttribute("relative_constraint", relative.trim()); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"relative_value\" name=\"size\" value=\""+extract.substring(0,extract.indexOf('#')).trim()+"\" relative_constraint=\""+relative.trim()+"\"/>"); toval = extract.substring(0,extract.indexOf('#')); fromval = extract.substring(0,extract.indexOf('#')); } Iterator<Element> it = innertagstate.iterator(); while(it.hasNext()){ Element e = it.next(); if(e.getAttribute("to") != null && e.getAttributeValue("to").compareTo("")==0){ if(toval.endsWith("+")){ toval = toval.replaceFirst("\\+$", ""); e.setAttribute("upper_restricted", "false"); } e.setAttribute("to", toval.trim()); e.setAttribute("to_inclusive", "false"); } if(e.getAttribute("from") != null && e.getAttributeValue("from").compareTo("")==0){ e.setAttribute("from", fromval.trim()); e.setAttribute("from_inclusive", "false"); } } /*StringBuffer sb = new StringBuffer(); Pattern pattern25 = Pattern.compile("to=\"\""); matcher1 = pattern25.matcher(innertagstate); while ( matcher1.find()){ matcher1.appendReplacement(sb, "to=\""+toval.trim()+"\""); } matcher1.appendTail(sb); innertagstate=sb.toString(); matcher1.reset(); StringBuffer sb1 = new StringBuffer(); Pattern pattern26 = Pattern.compile("from=\"\""); matcher1 = pattern26.matcher(innertagstate); while ( matcher1.find()){ matcher1.appendReplacement(sb1, "from=\""+fromval.trim()+"\""); } matcher1.appendTail(sb1); innertagstate=sb1.toString(); matcher1.reset();*/ } numberexp = matcher2.replaceAll("#"); matcher2.reset(); ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // count /////////////// /*p1 = Pattern.compile("^\\[(\\d+)\\](.*)"); m = p1.matcher(numberexp); if(m.matches()){ Element character = new Element("character"); character.setAttribute("name", "atypical_"+(cname==null?"count": cname)); character.setAttribute("value", m.group(1)); innertagstate.add(character); numberexp = m.group(2).trim(); } p1 = Pattern.compile("^\\[(\\d+)\\+\\](.*)"); m = p1.matcher(numberexp); if(m.matches()){ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "atypical_"+(cname==null?"count": cname)); character.setAttribute("from", m.group(1)); character.setAttribute("upper_restricted", "false"); innertagstate.add(character); numberexp = m.group(2); }*/ //int countct = 0; Pattern pattern15 = Pattern.compile("([\\[]?[�]?[\\d]+[\\]]?[\\s]?[\\[]?[\\�\\-][\\]]?[\\s]?[\\[]?[\\d]+[+]?[\\]]?|[\\[]?[�]?[\\d]+[+]?[\\]]?[\\s]?)[\\�\\�\\-]+[a-zA-Z]+"); matcher2 = pattern15.matcher(numberexp); numberexp = matcher2.replaceAll("#"); matcher2.reset(); //Pattern pattern16 = Pattern.compile("(?<!([/][\\s]?))([\\[]?[�]?[\\d]+[\\]]?[\\s]?[\\[]?[\\�\\-][\\]]?[\\s]?[\\[]?[\\d]+[+]?[\\]]?[\\s]?([\\[]?[\\�\\-]?[\\]]?[\\s]?[\\[]?[\\d]+[+]?[\\]]?)*|[�]?[\\d]+[+]?)(?!([\\s]?[n/]|[\\s]?[\\�\\-]?% of [\\w]+ length|[\\s]?[\\�\\-]?height of [\\w]+|[\\s]?[\\�\\-]?times|[\\s]?[\\�\\-]?total length|[\\s]?[\\�\\-]?their length|[\\s]?[\\�\\-]?(times)?[\\s]?length of|[\\s]?[dcm�]?m))"); //Pattern pattern16 = Pattern.compile("(?<!([/][\\s]?))([\\[]?[�]?[\\d\\./%]+[\\]]?[\\s]?[\\[]?[\\�\\-][\\]]?[\\s]?[\\[]?[\\d\\./%]+[+]?[\\]]?[\\s]?([\\[]?[\\�\\-]?[\\]]?[\\s]?[\\[]?[\\d\\./%]+[+]?[\\]]?)*|[�]?[\\d\\./%]+[+]?)(?!([\\s]?[n/]|[\\s]?[\\�\\-]?% of [\\w]+ length|[\\s]?[\\�\\-]?height of [\\w]+|[\\s]?[\\�\\-]?times|[\\s]?[\\�\\-]?total length|[\\s]?[\\�\\-]?their length|[\\s]?[\\�\\-]?(times)?[\\s]?length of|[\\s]?[dcm�]?m))"); Pattern pattern16 = Pattern.compile("(?<!([/][\\s]?))([\\[]?[�]?[\\d\\./%]+[\\]]?[\\s]?[\\[]?[\\�\\-][\\]]?[\\s]?[\\[]?[\\d\\./%]+[+]?[\\]]?[\\s]?([\\[]?[\\�\\-]?[\\]]?[\\s]?[\\[]?[\\d\\./%]+[+]?[\\]]?)*|\\[?[�]?[\\d\\./%]+[+]?\\]?)(?!([\\s]?[n/]|[\\s]?[\\�\\-]?% of [\\w]+ length|[\\s]?[\\�\\-]?height of [\\w]+|[\\s]?[\\�\\-]?times|[\\s]?[\\�\\-]?total length|[\\s]?[\\�\\-]?their length|[\\s]?[\\�\\-]?(times)?[\\s]?length of|[\\s]?[dcm�]?m))"); matcher2 = pattern16.matcher(numberexp); while ( matcher2.find()){ i=matcher2.start(); j=matcher2.end(); String extreme = numberexp.substring(i,j); i = 0; j = extreme.length(); Pattern pattern20 = Pattern.compile("\\[[�\\d\\.\\s\\+]+[\\�\\-]{1}[�\\d\\.\\s\\+\\�\\-]*\\]"); Matcher matcher1 = pattern20.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); if(extreme.charAt(q-2)=='�' | extreme.charAt(q-2)=='-'){ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "atypical_"+(cname==null?"count": cname)); character.setAttribute("from", extreme.substring(p+1,q-2).trim()); character.setAttribute("to", ""); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_count\" from=\""+extreme.substring(p+1,q-2).trim()+"\" to=\"\"/>"); }else{ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "atypical_"+(cname==null?"count": cname)); character.setAttribute("from", extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()); String tmp = extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim(); character.setAttribute("to", tmp.replaceFirst("[^0-9]+$", "")); if(tmp.endsWith("+")){ character.setAttribute("upper_restricted", "false"); } innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_count\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim()+"\"/>"); } } extreme = matcher1.replaceAll("#"); matcher1.reset(); if(extreme.contains("#")) i = extreme.indexOf("#")+1; j = extreme.length(); //process from # to the end of extreme. but in 1-[2-5] (1-#), the value is before # Pattern pattern21 = Pattern.compile("\\[[�\\d\\.\\s\\+\\�\\-]*[\\�\\-]{1}[�\\d\\.\\s\\+]+\\]"); matcher1 = pattern21.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); j = p; if (extreme.charAt(p+1)=='�' | extreme.charAt(p+1)=='-'){ if (extreme.charAt(q-2)=='+'){ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "atypical_"+(cname==null?"count": cname)); character.setAttribute("from", ""); character.setAttribute("to", extreme.substring(p+2,q-2).trim()); character.setAttribute("upper_restricted", "false"); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_count\" from=\"\" to=\""+extreme.substring(p+2,q-2).trim()+"\" upper_restricted=\"false\"/>"); }else{ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "atypical_"+(cname==null?"count": cname)); character.setAttribute("from", ""); character.setAttribute("to", extreme.substring(p+2,q-1).trim()); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_count\" from=\"\" to=\""+extreme.substring(p+2,q-1).trim()+"\"/>"); } } else{ if (extreme.charAt(q-2)=='+'){ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "atypical_"+(cname==null?"count": cname)); character.setAttribute("from", extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()); character.setAttribute("to", extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()); character.setAttribute("upper_restricted", "false"); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_count\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-2).trim()+"\" upper_restricted=\"false\"/>"); }else{ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "atypical_"+(cname==null?"count": cname)); character.setAttribute("from", extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()); character.setAttribute("to", extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim()); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_count\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim()+"\"/>"); } } } matcher1.reset(); Pattern pattern23 = Pattern.compile("\\[[�\\d\\.\\s\\+]+\\]"); matcher1 = pattern23.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); j = p; if (extreme.charAt(q-2)=='+'){ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "atypical_"+(cname==null?"count": cname)); character.setAttribute("from", extreme.substring(p+1,q-2).trim()); character.setAttribute("upper_restricted", "false"); innertagstate.add(character); //innertagstate = innertagstate.concat("<character name=\"atypical_count\" from=\""+extreme.substring(p+1,q-2).trim()+"\" upper_restricted=\"false\"/>"); }else{ Element character = new Element("character"); character.setAttribute("name", "atypical_"+(cname==null?"count": cname)); character.setAttribute("value", extreme.substring(p+1,q-1).trim()); innertagstate.add(character); //innertagstate = innertagstate.concat("<character name=\"atypical_count\" value=\""+extreme.substring(p+1,q-1).trim()+"\"/>"); } } matcher1.reset(); //# to the end String extract = extreme.substring(i,j); if(extract.contains("�")|extract.contains("-") && !extract.contains("�") && !extract.contains("x") && !extract.contains("X")){ //String extract = extreme.substring(i,j); Pattern pattern22 = Pattern.compile("[\\[\\]]+"); matcher1 = pattern22.matcher(extract); extract = matcher1.replaceAll(""); matcher1.reset(); String to = extract.substring(extract.indexOf('-')+1,extract.length()).trim(); boolean upperrestricted = true; if(to.endsWith("+")){ upperrestricted = false; to = to.replaceFirst("\\+$", ""); } Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", cname==null?"count": cname); character.setAttribute("from", extract.substring(0, extract.indexOf('-')).trim()); character.setAttribute("to", to); if(!upperrestricted) character.setAttribute("upper_restricted", upperrestricted+""); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"count\" from=\""+extract.substring(0, extract.indexOf('-')).trim()+"\" to=\""+extract.substring(extract.indexOf('-')+1,extract.length()).trim()+"\"/>"); toval = extract.substring(0, extract.indexOf('-')); fromval = extract.substring(extract.indexOf('-')+1,extract.length()); //countct+=1; }else{ //String extract = extreme.substring(i,j).trim(); if(extract.length()>0){ Element character = new Element("character"); character.setAttribute("name", cname==null?"count": cname); if(extract.endsWith("+")){ extract = extract.replaceFirst("\\+$", "").trim(); character.setAttribute("char_type", "range_value"); character.setAttribute("from", extract); character.setAttribute("upper_restricted", "false"); }else{ character.setAttribute("value", extract); } innertagstate.add(character); //innertagstate = innertagstate.concat("<character name=\"count\" value=\""+extract.trim()+"\"/>"); toval = extract; fromval = extract; } } //start to #, dupllicated above if(i-1>0){ extract = extreme.substring(0, i-1); if(extract.contains("�")|extract.contains("-") && !extract.contains("�") && !extract.contains("x") && !extract.contains("X")){ //String extract = extreme.substring(i,j); Pattern pattern22 = Pattern.compile("[\\[\\]]+"); matcher1 = pattern22.matcher(extract); extract = matcher1.replaceAll(""); matcher1.reset(); String to = extract.substring(extract.indexOf('-')+1,extract.length()).trim(); boolean upperrestricted = true; if(to.endsWith("+")){ upperrestricted = false; to = to.replaceFirst("\\+$", ""); } Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", cname==null?"count": cname); character.setAttribute("from", extract.substring(0, extract.indexOf('-')).trim()); character.setAttribute("to", to); if(!upperrestricted) character.setAttribute("upper_restricted", upperrestricted+""); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"count\" from=\""+extract.substring(0, extract.indexOf('-')).trim()+"\" to=\""+extract.substring(extract.indexOf('-')+1,extract.length()).trim()+"\"/>"); toval = extract.substring(0, extract.indexOf('-')); fromval = extract.substring(extract.indexOf('-')+1,extract.length()); //countct+=1; }else{ //String extract = extreme.substring(i,j).trim(); if(extract.length()>0){ Element character = new Element("character"); character.setAttribute("name", cname==null?"count": cname); if(extract.endsWith("+")){ extract = extract.replaceFirst("\\+$", "").trim(); character.setAttribute("char_type", "range_value"); character.setAttribute("from", extract); character.setAttribute("upper_restricted", "false"); }else{ character.setAttribute("value", extract); } innertagstate.add(character); //innertagstate = innertagstate.concat("<character name=\"count\" value=\""+extract.trim()+"\"/>"); toval = extract; fromval = extract; } } } Iterator<Element> it = innertagstate.iterator(); while(it.hasNext()){ Element e = it.next(); if(e.getAttribute("to") != null && e.getAttributeValue("to").compareTo("")==0){ if(toval.endsWith("+")){ toval = toval.replaceFirst("\\+$", ""); e.setAttribute("upper_restricted", "false"); } e.setAttribute("to", toval.trim()); e.setAttribute("to_inclusive", "false"); } if(e.getAttribute("from") != null && e.getAttributeValue("from").compareTo("")==0){ e.setAttribute("from", fromval.trim()); e.setAttribute("from_inclusive", "false"); } } /* StringBuffer sb = new StringBuffer(); Pattern pattern25 = Pattern.compile("to=\"\""); matcher1 = pattern25.matcher(innertagstate); while ( matcher1.find()){ matcher1.appendReplacement(sb, "to=\""+toval.trim()+"\""); } matcher1.appendTail(sb); innertagstate=sb.toString(); matcher1.reset(); StringBuffer sb1 = new StringBuffer(); Pattern pattern26 = Pattern.compile("from=\"\""); matcher1 = pattern26.matcher(innertagstate); while ( matcher1.find()){ matcher1.appendReplacement(sb1, "from=\""+fromval.trim()+"\""); } matcher1.appendTail(sb1); innertagstate=sb1.toString(); matcher1.reset();*/ } matcher2.reset(); } catch (Exception e) { e.printStackTrace(); System.err.println(e); } if(debug){ try{ XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat()); Iterator<Element> it = innertagstate.iterator(); while(it.hasNext()){ Element e = it.next(); System.out.println(outputter.outputString(e)); } }catch(Exception e){ e.printStackTrace(); } } return innertagstate; } private static String annotateSize(String plaincharset, ArrayList<Element> innertagstate, String chara) { int i; int j; Matcher matcher2; Pattern pattern13 = Pattern.compile("[xX\\ױ\\d\\[\\]\\�\\-\\.\\s\\+]+[\\s]?([dcm�]?m)(?![\\w])(([\\s]diam)?([\\s]wide)?)"); matcher2 = pattern13.matcher(plaincharset); String toval=""; String fromval=""; while ( matcher2.find()){ String unit = matcher2.group(1); if(plaincharset.charAt(matcher2.start())==' '){ i=matcher2.start()+1; } else{ i=matcher2.start(); } j=matcher2.end(); String extreme = plaincharset.substring(i,j); i = 0; j = extreme.length(); Pattern pattern20 = Pattern.compile("\\[[�\\d\\.\\s\\+]+[\\�\\-]{1}[�\\d\\.\\s\\+\\�\\-]*\\]"); Matcher matcher1 = pattern20.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); if(extreme.charAt(q-2)=='�' | extreme.charAt(q-2)=='-'){ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "atypical_"+chara); character.setAttribute("from", extreme.substring(p+1,q-2).trim()); character.setAttribute("to", ""); character.setAttribute("from_unit", unit); character.setAttribute("to_unit", unit); //character.setAttribute("upper_restricted", "false"); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,q-2).trim()+"\" to=\"\"/>"); }else{ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "atypical_"+chara); character.setAttribute("from", extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()); character.setAttribute("to", extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim()); character.setAttribute("from_unit", unit); character.setAttribute("to_unit", unit); //character.setAttribute("upper_restricted", "??"); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim()+"\"/>"); } } extreme = matcher1.replaceAll("#"); matcher1.reset(); if(extreme.contains("#")) i = extreme.indexOf("#")+1; Pattern pattern21 = Pattern.compile("\\[[�\\d\\.\\s\\+\\�\\-]*[\\�\\-]{1}[�\\d\\.\\s\\+]+\\]"); matcher1 = pattern21.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); if (extreme.charAt(p+1)=='�' | extreme.charAt(p+1)=='-'){ if (extreme.charAt(q-2)=='+'){ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "atypical_"+chara); character.setAttribute("from", ""); character.setAttribute("to", extreme.substring(p+2,q-2).trim()); character.setAttribute("from_unit", unit); character.setAttribute("to_unit", unit); character.setAttribute("upper_restricted", "false"); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_size\" from=\"\" to=\""+extreme.substring(p+2,q-2).trim()+"\" upper_restricted=\"false\"/>"); }else{ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "atypical_"+chara); character.setAttribute("from", ""); character.setAttribute("to", extreme.substring(p+2,q-1).trim()); character.setAttribute("from_unit", unit); character.setAttribute("to_unit", unit); //character.setAttribute("upper_restricted", "true"); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_size\" from=\"\" to=\""+extreme.substring(p+2,q-1).trim()+"\"/>"); } } else{ if (extreme.charAt(q-2)=='+'){ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "atypical_"+chara); character.setAttribute("from", extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()); character.setAttribute("to", extreme.substring(extreme.indexOf("-",p+1)+1,q-2).trim()); character.setAttribute("from_unit", unit); character.setAttribute("to_unit", unit); character.setAttribute("upper_restricted", "false"); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-2).trim()+"\" upper_restricted=\"false\"/>"); }else{ Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", "atypical_"+chara); character.setAttribute("from", extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()); character.setAttribute("to", extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim()); character.setAttribute("from_unit", unit); character.setAttribute("to_unit", unit); //character.setAttribute("upper_restricted", "true"); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"atypical_size\" from=\""+extreme.substring(p+1,extreme.indexOf("-",p+1)).trim()+"\" to=\""+extreme.substring(extreme.indexOf("-",p+1)+1,q-1).trim()+"\"/>"); } } } extreme = matcher1.replaceAll("#"); matcher1.reset(); j = extreme.length(); Pattern pattern23 = Pattern.compile("\\[[�\\d\\.\\s\\+]+\\]"); matcher1 = pattern23.matcher(extreme); if ( matcher1.find()){ int p = matcher1.start(); int q = matcher1.end(); if (extreme.charAt(q-2)=='+'){ Element character = new Element("character"); character.setAttribute("name", "atypical_"+chara); character.setAttribute("from", extreme.substring(p+1,q-2).trim()); character.setAttribute("to", ""); character.setAttribute("from_unit", unit); character.setAttribute("to_unit", unit); character.setAttribute("upper_restricted", "false"); innertagstate.add(character); //innertagstate = innertagstate.concat("<character name=\"atypical_size\" from=\""+extreme.substring(p+1,q-2).trim()+"\" upper_restricted=\"false\"/>"); }else{ Element character = new Element("character"); character.setAttribute("name", "atypical_"+chara); character.setAttribute("value", extreme.substring(p+1,q-1).trim()); character.setAttribute("unit", unit); //character.setAttribute("unit", extreme.substring(q-1).trim()); innertagstate.add(character); //innertagstate = innertagstate.concat("<character name=\"atypical_size\" value=\""+extreme.substring(p+1,q-1).trim()+"\"/>"); } } extreme = matcher1.replaceAll("#"); matcher1.reset(); j = extreme.length(); if(extreme.substring(i,j).contains("�")|extreme.substring(i,j).contains("-") && !extreme.substring(i,j).contains("�") && !extreme.substring(i,j).contains("x") && !extreme.substring(i,j).contains("X")){ String extract = extreme.substring(i,j); Pattern pattern18 = Pattern.compile("[\\s]?[dcm�]?m(([\\s]diam)?([\\s]wide)?)"); Matcher matcher3 = pattern18.matcher(extract); unit=""; if ( matcher3.find()){ unit = extract.substring(matcher3.start(), matcher3.end()); } extract = matcher3.replaceAll("#"); matcher3.reset(); String from = extract.substring(0, extract.indexOf('-')).trim(); String to = extract.substring(extract.indexOf('-')+1,extract.indexOf('#')).trim(); boolean upperrestricted = ! to.endsWith("+"); to = to.replaceFirst("\\+$", "").trim(); Element character = new Element("character"); character.setAttribute("char_type", "range_value"); character.setAttribute("name", chara); character.setAttribute("from", from); character.setAttribute("from_unit", unit.trim()); character.setAttribute("to", to); character.setAttribute("to_unit", unit.trim()); if(!upperrestricted) character.setAttribute("upper_restricted", upperrestricted+""); innertagstate.add(character); //innertagstate = innertagstate.concat("<character char_type=\"range_value\" name=\"size\" from=\""+from+"\" from_unit=\""+unit.trim()+"\" to=\""+to+"\" to_unit=\""+unit.trim()+"\" upper_restricted=\""+upperrestricted+"\"/>"); toval = extract.substring(0, extract.indexOf('-')); fromval = extract.substring(extract.indexOf('-')+1,extract.indexOf('#')); //sizect+=1; } else{ String extract = extreme.substring(i,j); Pattern pattern18 = Pattern.compile("[\\s]?[dcm�]?m(([\\s]diam)?([\\s]wide)?)"); Matcher matcher3 = pattern18.matcher(extract); unit=""; if ( matcher3.find()){ unit = extract.substring(matcher3.start(), matcher3.end()); } extract = matcher3.replaceAll("#"); matcher3.reset(); Element character = new Element("character"); character.setAttribute("name", chara); character.setAttribute("value", extract.substring(0,extract.indexOf('#')).trim()); character.setAttribute("unit", unit.trim()); innertagstate.add(character); //innertagstate = innertagstate.concat("<character name=\"size\" value=\""+extract.substring(0,extract.indexOf('#')).trim()+"\" unit=\""+unit.trim()+"\"/>"); toval = extract.substring(0,extract.indexOf('#')); fromval = extract.substring(0,extract.indexOf('#')); } Iterator<Element> it = innertagstate.iterator(); while(it.hasNext()){ Element e = it.next(); if(e.getAttribute("to") != null && e.getAttributeValue("to").compareTo("")==0){ if(toval.endsWith("+")){ toval = toval.replaceFirst("\\+$", ""); e.setAttribute("upper_restricted", "false"); } e.setAttribute("to", toval.trim()); e.setAttribute("to_inclusive", "false"); } if(e.getAttribute("from") != null && e.getAttributeValue("from").compareTo("")==0){ e.setAttribute("from", fromval.trim()); e.setAttribute("from_inclusive", "false"); } } /* StringBuffer sb = new StringBuffer(); Pattern pattern25 = Pattern.compile("to=\"\""); matcher1 = pattern25.matcher(innertagstate); while ( matcher1.find()){ matcher1.appendReplacement(sb, "to=\""+toval.trim()+"\""); } matcher1.appendTail(sb); innertagstate=sb.toString(); matcher1.reset(); StringBuffer sb1 = new StringBuffer(); Pattern pattern26 = Pattern.compile("from=\"\""); matcher1 = pattern26.matcher(innertagstate); while ( matcher1.find()){ matcher1.appendReplacement(sb1, "from=\""+fromval.trim()+"\""); } matcher1.appendTail(sb1); innertagstate=sb1.toString(); matcher1.reset(); */ } plaincharset = matcher2.replaceAll("#"); matcher2.reset(); //System.out.println("plaincharset2:"+plaincharset); return plaincharset; } /** * @param args */ public static void main(String[] args) { // TODO Auto-generated method stub //String str1 = "stems 2�3"; //String str2 = "stems 2�3"; //String str1 = "10-20 mm � 40-50 mm"; //String str1 = "4-5[+]"; //should output atypical count, but output count //String str1 = "4-5[-5+]"; //String str1 = "[5-]10-15[-20]"; //String str1 = "[30-]80-250[-450+];"; //String str1 = "[5+]6"; //String str1 ="3-5 �(0.6-)1.5-2 cm"; // //String str1 = "5+"; //String str1 ="3 � 2 cm"; // //String str1 = "[30-70+]"; // //String str1 = "1-[2-10]";//todo //String str1 = "3-5 [7-8]"; //String str1 = "[0]3-5[-12+]"; //String str1 = "80[72]"; //String str1 = "(2-)2.5-3.5(-4) � (1.5-)2-3(-4) cm"; String str1 = "(4�)5�6 � 1.5�2"; String str2 = "area"; System.out.println(NumericalHandler.parseNumericals(str1, str2)); } }