/* $Id: Test.java 827 2011-06-05 03:36:57Z hong1.cui $ */
/**
 *
 */
package fna.charactermarkup;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import fna.parsing.ApplicationUtilities;

/**
 * Collection of experimental text-normalization helpers operating on
 * chunk-annotated token strings (tokens wrapped in {}, (), [] markers).
 * The {@link #main(String[])} method currently only constructs an instance;
 * the individual experiments are left commented out there.
 *
 * @author hongcui
 */
@SuppressWarnings({ "unused" })
public class Test {
    static Connection conn = null;

    /** Token list that {@link #normalizeAsAsThan()} rewrites in place. */
    ArrayList<String> chunkedtokens = new ArrayList<String>();

    /**
     * Creates an instance with an empty token list. The commented-out
     * snippets are sample inputs for {@link #normalizeAsAsThan()}.
     */
    public Test() {
        //as long as its bony portion
        /*this.chunkedtokens.add("{as-long-as}");
        this.chunkedtokens.add("its");
        this.chunkedtokens.add("{bony}");
        this.chunkedtokens.add("(portion)");*/

        //as deep as broad
        /*this.chunkedtokens.add("{as-deep-as}");
        this.chunkedtokens.add("{broad}");*/

        //as short as width of the tip
        /*this.chunkedtokens.add("{as-short-as}");
        this.chunkedtokens.add("{width}");
        this.chunkedtokens.add("r[p[of] the (tip)]");*/
    }

    /**
     * Scans {@code chunkedtokens} for "as-X-as" tokens and merges each one with
     * the tokens that complete the comparison into a single {@code n[...]}
     * chunk stored at the position of the "as-X-as" token; the merged-in
     * positions are overwritten with empty strings so list indexes stay stable.
     *
     * Three shapes of the second part are recognised:
     * case 1 - a token matching the same word list as X (e.g. {as-deep-as} {broad});
     * case 3 - a dimension word followed (skipping empty tokens) by an "r[p[of" chunk;
     * case 2 - anything else: tokens up to an organ token "(" or a comma, plus
     *          the following run of "(" / empty tokens.
     */
    private void normalizeAsAsThan() {
        for (int i = 0; i < this.chunkedtokens.size(); i++) {
            String token = this.chunkedtokens.get(i);
            String chunk = token + " ";
            boolean success = false;
            if (token.matches("\\{?as-(" + ChunkedSentence.asasthan + ")-as\\}?")) { //{as-long-as}: treat these as ChunkTHAN
                //looking for the 2nd part: first non-empty token after i
                int j = 0;
                String t = "";
                for (j = i + 1; j < this.chunkedtokens.size(); j++) {
                    t = this.chunkedtokens.get(j);
                    if (t.length() != 0) break;
                }
                if (t.matches("\\{?(" + ChunkedSentence.asasthan + ")\\}?")) { //case 1
                    chunk += t + " ";
                    success = true;
                } else if (t.matches("\\{?(height|width|length|depth|thickness)\\}?")) { //case 3
                    chunk += t + " ";
                    for (int k = j + 1; k < this.chunkedtokens.size(); k++) {
                        if (this.chunkedtokens.get(k).length() == 0) continue;
                        if (this.chunkedtokens.get(k).startsWith("r[p[of")) {
                            chunk += this.chunkedtokens.get(k) + " ";
                            j = k;
                            success = true;
                            break;
                        }
                    }
                }
                if (!success) { //case 2
                    while (!t.startsWith("(") && !t.equals(",")) { //found bony in {bony} (portion)
                        chunk += t + " ";
                        if (j < this.chunkedtokens.size() - 1) t = this.chunkedtokens.get(++j);
                        else break;
                        // only reached when another token was available after the one just added
                        success = true;
                    }
                    while ((t.length() == 0 || t.startsWith("("))) { //found (portion)
                        chunk += t + " ";
                        if (j < this.chunkedtokens.size() - 1) t = this.chunkedtokens.get(++j);
                        else break;
                    }
                    // NOTE(review): when the loop above stops on a following token
                    // (rather than at the end of the list), j indexes that token, which
                    // was not appended to chunk but is still blanked below - confirm intended.
                }
                //form n[chunk]
                if (success) {
                    this.chunkedtokens.set(i, "n[" + chunk.trim() + "]");
                    for (int k = i + 1; k <= j; k++) {
                        this.chunkedtokens.set(k, "");
                    }
                }
            }
        }
    }

    /**
     * Walks a hard-coded organ phrase from its last word backwards, classifying
     * each word via a hard-coded word-to-constraint-type table and printing the
     * constraints found to standard output, followed by the index at which the
     * scan stopped.
     *
     * A word of type "type" is printed on its own; a "parent_organ" word starts
     * a group that absorbs every preceding non-"null" word and is printed when a
     * "null" word is reached. Words beginning with "(" count as "parent_organ".
     */
    public void constraint() {
        String[] organ = new String[]{"long", "cauline", "leaf", "abaxial", "surface", "trichomode"};
        Hashtable<String, String> mapping = new Hashtable<String, String>();
        mapping.put("cauline", "type");
        mapping.put("leaf", "parent_organ");
        mapping.put("long", "null");
        mapping.put("surface", "parent_organ");
        mapping.put("abaxial", "type");
        mapping.put("trichomode", "type");
        int j = 5;
        boolean terminate = false;
        for (; j >= 0; j--) {
            if (terminate) break;
            // strip chunk markers such as "x[", "]", "{" and "}"
            String w = organ[j].replaceAll("(\\w+\\[|\\]|\\{|\\})", "");
            String type = "null";
            if (w.startsWith("(")) type = "parent_organ";
            else type = mapping.get(w);
            if (!type.equals("null")) {
                organ[j] = "";
                if (type.equals("type")) {
                    System.out.println("constraint_" + type + ": " + w.replaceAll("(\\(|\\))", "").trim()); //may not have.
                } else { //"parent_organ": collect all until a null constraint is found
                    String constraint = w;
                    j--;
                    for (; j >= 0; j--) {
                        w = organ[j].replaceAll("(\\w+\\[|\\]|\\{|\\})", "");
                        if (w.startsWith("(")) type = "parent_organ";
                        else type = mapping.get(w);
                        if (!type.equals("null")) {
                            constraint = w + " " + constraint;
                            organ[j] = "";
                        } else {
                            System.out.println("constraint_parent_organ: " + constraint.replaceAll("(\\(|\\))", "").trim()); //may not have.
                            terminate = true;
                            break;
                        }
                    }
                }
            } else {
                break;
            }
        }
        j++;
        System.out.println(j);
    }

    /**
     * Demonstrates hyphenating the spaces inside angle-bracketed spans of a
     * fixed sample sentence and prints the result to standard output.
     */
    public void test1() {
        String tsent = "<a b> a b <a b c> {a b} <a> <b>";
        Pattern p = Pattern.compile("(.*?<[^>]*) ([^<]*>.*)"); //<floral cup> => <floral-cup>
        Matcher m = p.matcher(tsent);
        // each pass replaces one space that lies between a '<' and its '>'
        while (m.matches()) {
            tsent = m.group(1) + "-" + m.group(2);
            m = p.matcher(tsent);
        }
        System.out.println(tsent);
    }

    /**
     * Splits text on whitespace while keeping square-bracketed spans together.
     *
     * Words outside any bracket are added individually, except the words
     * this/have/that/may/be, which are dropped. Once a word contains '[' or ']',
     * words are accumulated until the counts of '[' and ']' balance, and the
     * accumulated span is added as one token. An unbalanced trailing span is
     * not added.
     *
     * @param text text to tokenize
     * @return the resulting tokens, in order
     */
    private ArrayList<String> breakText(String text) {
        ArrayList<String> tokens = new ArrayList<String>();
        String[] words = text.split("\\s+");
        StringBuilder pending = new StringBuilder();
        int left = 0; // number of currently unclosed '['
        for (String w : words) {
            if (w.indexOf("[") < 0 && w.indexOf("]") < 0 && left == 0) {
                if (!w.matches("\\b(this|have|that|may|be)\\b")) {
                    tokens.add(w);
                }
            } else {
                left += w.replaceAll("[^\\[]", "").length();
                left -= w.replaceAll("[^\\]]", "").length();
                pending.append(w).append(' ');
                if (left == 0) {
                    tokens.add(pending.toString().trim());
                    pending.setLength(0);
                }
            }
        }
        return tokens;
    }

    /**
     * Prefixes a subject with a sentence-level modifier.
     *
     * Every whitespace-separated word of {@code sentmod} (square brackets
     * removed) is wrapped in braces, except that wrapped "and"/"or" are
     * unwrapped again. Subject tokens whose text (ignoring {, }, (, ) markers)
     * already occurs as a whole word in {@code sentmod} are omitted from the
     * subject part.
     *
     * @param subject whitespace-separated subject tokens
     * @param sentmod modifier text
     * @return the braced modifier words, a space, then the remaining subject tokens
     */
    public String addSentmod(String subject, String sentmod) {
        String[] tokens = subject.split("\\s+");
        StringBuilder kept = new StringBuilder();
        for (String token : tokens) {
            String bare = token.replaceAll("[{()}]", "");
            // quote the token so regex metacharacters in it (e.g. '[', '+') are
            // matched literally instead of breaking or altering the pattern
            if (!sentmod.matches(".*?\\b" + Pattern.quote(bare) + "\\b.*")) {
                kept.append(token).append(' ');
            }
        }
        String remaining = kept.toString().trim();
        String modifiers = sentmod.replaceAll("[\\[\\]]", "")
                .replaceAll(" ", "} {")
                .replaceAll("[{(]and[)}]", "and")
                .replaceAll("[{(]or[)}]", "or")
                .replaceAll("\\{\\}", "")
                .replaceAll("\\s+", " ");
        return "{" + modifiers + "} " + remaining;
    }

    /**
     * Merges all {@code modifier=...} attributes of an element string into a
     * single {@code modifier="a;b;..."} attribute placed before the first
     * occurrence of "value", and collapses whitespace runs to single spaces.
     *
     * @param element element text containing zero or more modifier attributes
     * @return the element text with the modifiers combined
     */
    private static String combineModifiers(String element) {
        Pattern ptn = Pattern.compile("(.*? )(modifier=\\S+)(['\"].*)");
        Matcher m = ptn.matcher(element);
        StringBuilder result = new StringBuilder();
        StringBuilder modifiers = new StringBuilder();
        // peel off one modifier attribute per pass; group 3 is the unprocessed rest
        while (m.matches()) {
            result.append(m.group(1).replaceFirst("^['\"]", ""));
            modifiers.append(m.group(2).replaceAll("modifier=", "")).append(';');
            element = m.group(3);
            m = ptn.matcher(element);
        }
        result.append(element.replaceFirst("^['\"]", ""));
        String merged = "modifier=\""
                + modifiers.toString().replaceAll("['\"]", "").replaceAll("\\W+$", "").trim()
                + "\"";
        // quoteReplacement: modifier text may contain '$' or '\', which would
        // otherwise be interpreted as group references by replaceFirst
        return result.toString()
                .replaceFirst("value", Matcher.quoteReplacement(merged + " value"))
                .replaceAll("\\s+", " ");
    }

    /**
     * Rewrites count alternatives/ranges such as "two or more" or "one to three"
     * into a single token of the form {@code {count~list~two~or~more}}.
     *
     * Whitespace in the input is normalized to single spaces first so that the
     * token positions derived from the string always agree with the token list.
     *
     * @param str text to normalize
     * @return the text with each count list collapsed to one token, single-spaced and trimmed
     */
    private String normalizeCountList(String str) {
        // one space between tokens, none leading: a token's index then equals
        // the number of spaces that precede it in the string
        str = str.trim().replaceAll("\\s+", " ");
        ArrayList<String> tokens = new ArrayList<String>(Arrays.asList(str.split("\\s+")));
        String countp = "one|two|three|four|five|six|seven|eight|nine|ten|more|fewer";
        Pattern p = Pattern.compile("(\\b(?:" + countp + ") (?:or|to) (?:" + countp + ")\\b)");
        Matcher m = p.matcher(str);
        while (m.find()) {
            int start = m.start(1);
            String count = m.group(1);
            String rcount = "{count~list~" + count.replaceAll(" ", "~") + "}";
            // number of tokens before the count pattern; blanked tokens still
            // contribute their separator space, so they are counted too
            int index = 0;
            for (int c = 0; c < start; c++) {
                if (str.charAt(c) == ' ') index++;
            }
            tokens.set(index, rcount);
            int num = count.split("\\s+").length;
            for (int i = index + 1; i < index + num; i++) {
                tokens.set(i, "");
            }
            // reassemble str from the tokens, keeping a separator for every
            // (possibly empty) element, so str and the token list stay in sync
            StringBuilder rebuilt = new StringBuilder();
            for (String t : tokens) {
                rebuilt.append(t).append(' ');
            }
            str = rebuilt.toString();
            m = p.matcher(str);
        }
        return str.replaceAll("\\s+", " ").trim();
    }

    /**
     * Expands a coordinated object phrase so that list members lacking an organ
     * token receive the organ of a later member.
     *
     * The phrase is scanned right to left. At each separator (",", "and", "or"):
     * if the segment before it contains "(", that segment becomes the organ to
     * share; otherwise the current shared organ is inserted before the
     * separator. The initial shared organ is the last segment with trailing "]"
     * removed. Phrases without "and"/"or" are returned unchanged.
     *
     * @param object whitespace-separated object phrase
     * @return the expanded phrase (with a trailing space) or {@code object} itself
     */
    private String normalizeSharedOrganObject(String object) {
        if (!object.matches(".*?\\b(and|or)\\b.*")) {
            return object;
        }
        String norm = "";
        String[] segs = object.split("\\s+");
        String lastN = segs[segs.length - 1].replaceAll("\\]+$", "").trim();
        for (int i = segs.length - 1; i >= 0; i--) {
            norm = segs[i] + " " + norm;
            // a separator in first position has no preceding segment to inspect
            if (i > 0 && segs[i].matches("(,|and|or)")) {
                if (segs[i - 1].contains("(")) {
                    lastN = segs[i - 1].trim();
                } else {
                    norm = lastN + " " + norm;
                }
            }
        }
        return norm;
    }

    /**
     * Entry point; constructs a {@code Test}. Sample invocations of the helper
     * methods are kept below as comments.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Test t = new Test();
        //t.normalizeAsAsThan();

        /*String object = "o[the {frontal} , the (sphenotic) ({spine}) and the (flower)]";
        object = t.normalizeSharedOrganObject(object);
        System.out.println(object);*/

        //String str = "epural bones two or more present";
        //str = t.normalizeCountList(str);
        //System.out.println(str);

        //System.out.println(
        //t.addSentmod("{distal} (face)", "distal [basal leaf]")
        //t.combineModifiers("<character name=\"n\" modifier=\"a\" value=\"c\"/>")
        //);

        //String text = "that often do not overtop the heads";
        //t.breakText(text);
    }
}