/* $Id: TermOutputerUtilities.java 971 2011-09-13 18:32:55Z hong1.cui $ */ /** * */ package fna.charactermarkup; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.sql.Connection; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Hashtable; import fna.parsing.ApplicationUtilities; import fna.parsing.MainForm; import fna.parsing.state.StateCollector; import fna.parsing.state.WordNetWrapper; import java.util.ArrayList; import java.util.regex.*; /** * @author hongcui * */ @SuppressWarnings({ "unused" }) public class Utilities { public static String or = "_or_"; private static String selectivepreps = ChunkedSentence.prepositions.replaceFirst("\\|of\\|", "|"); private static Pattern prepphraseptn = Pattern.compile(".*?((?:^| )in \\w+ (?:"+selectivepreps+")\\b)(.*)"); public static ArrayList<String> sureVerbs = new ArrayList<String>(); public static ArrayList<String> sureAdvs = new ArrayList<String>(); public static ArrayList<String> partOfPrepPhrase = new ArrayList<String>(); //public static ArrayList<String> prepPhrases = new ArrayList<String>(); public static ArrayList<String> notSureVerbs = new ArrayList<String>(); public static ArrayList<String> notSureAdvs = new ArrayList<String>(); public static ArrayList<String> notPartOfPrepPhrase = new ArrayList<String>(); public static boolean debug = false; public static boolean debugPOS = true; /** * word must be a verb if * 1. its pos is "verb" only, or * 2. "does not" word * 3. has "verb" pos and seen patterns (word "a/the", or word prep <organ>) and not seen pattern (word \w+ly$). * 4. -ed, -ing * @param word * @param conn * @return */ public static boolean mustBeVerb(String word, Connection conn, String prefix){ if(sureVerbs.contains(word)) return true; if(notSureVerbs.contains(word)) return false; WordNetWrapper wnw = new WordNetWrapper(word); boolean v = wnw.isV(); //wordnet contains verb sense only if(!wnw.isAdj() && !wnw.isAdv() && !wnw.isN() && v && !word.endsWith("ing")){ sureVerbs.add(word); if(debugPOS) System.out.println(word+" is sureVerb"); return true; } if(!v) return false; try{ Statement stmt = conn.createStatement(); String q = "select * from "+prefix+"_"+ApplicationUtilities.getProperty("SENTENCETABLE")+" " + "where originalsent rlike '(does|do) not "+word+"'"; ResultSet rs = stmt.executeQuery(q); if(rs.next()){ sureVerbs.add(word); if(debugPOS) System.out.println(word+" is sureVerb"); return true; } if(v){ q = "select * from "+prefix+"_"+ApplicationUtilities.getProperty("HEURISTICNOUNS")+" " + "where word = '"+word+"'"; rs = stmt.executeQuery(q); if(rs.next()){ notSureVerbs.add(word); return false; } q = "select * from "+prefix+"_"+ApplicationUtilities.getProperty("SENTENCETABLE")+" " + "where sentence rlike '(^| )"+word+" +[-a-z_]+ly$'"; rs = stmt.executeQuery(q); if(rs.next()){ notSureVerbs.add(word); return false; } q = "select * from "+prefix+"_"+ApplicationUtilities.getProperty("SENTENCETABLE")+" " + "where sentence rlike '(^| )(a|an|the) "+word+"( |$)'"; rs = stmt.executeQuery(q); if(rs.next()){ notSureVerbs.add(word); return false; } q = "select sentence from "+prefix+"_"+ApplicationUtilities.getProperty("SENTENCETABLE")+" " + "where sentence rlike '(^| )"+word+" (a|an|the) '"; rs = stmt.executeQuery(q); if(rs.next()){ sureVerbs.add(word); if(debugPOS) System.out.println(word+" is sureVerb"); return true; } if(word.endsWith("ed") || word.endsWith("ing")){ q = "select sentence from "+prefix+"_"+ApplicationUtilities.getProperty("SENTENCETABLE")+" " + "where sentence rlike '(^| )"+word+" '"; rs = stmt.executeQuery(q); while(rs.next()){ String sent = rs.getString("sentence"); Pattern p = Pattern.compile("\\b"+word+"\\b(?: (?:"+selectivepreps+")) +(\\S+)"); Matcher m = p.matcher(sent); while(m.find()){ String term = m.group(1); if(term.matches("(a|an|the|some|any|this|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth)")){ sureVerbs.add(word); if(debugPOS) System.out.println(word+" is sureVerb"); return true; }else if(isOrgan(term, conn, prefix)){ sureVerbs.add(word); if(debugPOS) System.out.println(word+" is sureVerb"); return true; } } } } } }catch(Exception e){ e.printStackTrace(); } notSureVerbs.add(word); return false; } private static boolean isOrgan(String term, Connection conn, String tablePrefix) { try{ Statement stmt = conn.createStatement(); String wordrolesable = tablePrefix+ "_"+ApplicationUtilities.getProperty("WORDROLESTABLE"); ResultSet rs = stmt.executeQuery("select word from "+wordrolesable+" where semanticrole in ('os', 'op') and word='"+term+"'"); if(rs.next()){ if(debugPOS) System.out.println(term+" is an organ"); return true; } }catch(Exception e){ e.printStackTrace(); } return false; } /** * position terms from BSPO are in the glossary as of 052513 relational adjectives from URBERON are considered structures (and not spatial term) and are in the glossary too * @param term * @param conn * @param glossary * @return */ public static boolean isPosition(String term, Connection conn, String glossary) { try{ //position term in gloss Statement stmt = conn.createStatement(); ResultSet rs = stmt.executeQuery("select term from "+glossary+" where category in ('position') and term='"+term+"'"); if(rs.next()){ if(debugPOS) System.out.println(term+" is a position"); return true; } }catch(Exception e){ e.printStackTrace(); } return false; } public static boolean mustBeAdv(String word){ if(sureAdvs.contains(word)) return true; if(notSureAdvs.contains(word)) return false; WordNetWrapper wnw = new WordNetWrapper(word); if(!wnw.isAdj() && wnw.isAdv() && !wnw.isN() && !wnw.isV()){ sureAdvs.add(word); if(debugPOS) System.out.println(word+" is sureAdv"); return true; } notSureAdvs.add(word); return false; } public static boolean partOfPrepPhrase(String word, Connection conn, String prefix){ if(partOfPrepPhrase.contains(word)) return true; if(notPartOfPrepPhrase.contains(word)) return true; if(isOrgan(word, conn, prefix)){ notPartOfPrepPhrase.add(word); return false; } try{ Statement stmt = conn.createStatement(); String sql = "select sentence from "+prefix+"_"+ApplicationUtilities.getProperty("SENTENCETABLE")+" " + "where originalsent rlike '(^| )in "+word+" ("+selectivepreps+")( |$)'"; ResultSet rs = stmt.executeQuery(sql); boolean select = true;//add other rules in the future boolean exist = false; while(rs.next()){ exist = true; partOfPrepPhrase.add(word); if(debugPOS) System.out.println(word+" is partOfPrepPhrase"); Matcher m = prepphraseptn.matcher(rs.getString("sentence")); while(m.matches()){ add2table(m.group(1).trim(), conn, prefix); m = prepphraseptn.matcher(m.group(2)); } return true; } /*if(exist && select){ return true; } */ }catch(Exception e){ e.printStackTrace(); } notPartOfPrepPhrase.add(word); return false; } private static void add2table(String phrase, Connection conn, String prefix) { try{ Statement stmt = conn.createStatement(); stmt.execute("insert into "+prefix+"_prepphrases values ('"+phrase+"')"); }catch(Exception e){ e.printStackTrace(); } } /////////////////////////////////////////////////////////////////////// public static boolean isNoun(String word, ArrayList<String> nouns, ArrayList<String> notnouns){ word = word.trim(); if(word.indexOf(' ')>0) return false; word = word.replaceAll("[<>{}\\]\\[]", ""); if(!word.matches(".*?[a-z]+.*")){ notnouns.add(word); return false; } if(word.matches("\\b("+ChunkedSentence.stop+")\\b")){ notnouns.add(word); return false; } if(nouns.contains(word)){ return true; } if(notnouns.contains(word)){ return false; } WordNetWrapper wnw = new WordNetWrapper(word); String pos = wnw.mostlikelyPOS(); if(pos != null){ if(pos.compareTo("noun") == 0){ nouns.add(word); return true; } } notnouns.add(word); return false; } public static boolean isVerb(String word, ArrayList<String> verbs, ArrayList<String> notverbs) { word = word.replaceAll("[<>{}\\]\\[]", "").trim(); if(!word.matches(".*?[a-z]+.*")){ return false; } if(word.matches("\\b("+ChunkedSentence.stop+")\\b")){ return false; } if(verbs.contains(word)){ return true; } if(notverbs.contains(word)){ return false; } WordNetWrapper wnw = new WordNetWrapper(word); String pos = wnw.mostlikelyPOS(); if(pos != null){ if(pos.compareTo("verb") == 0){ verbs.add(word); return true; }else{ if(wnw.isV() && word.endsWith("ed")){ verbs.add(word); return true; } } } notverbs.add(word); return false; } public static boolean isAdv(String word, ArrayList<String> adverbs, ArrayList<String> notadverbs) { word = word.replaceAll("[<>{}\\]\\[()\\d+-]", "").trim(); if(word.matches("(not|at-?least|throughout|much)")){ return true; } if(word.matches("in.*?(profile|view)")){//covers in-dorsal-view, in-profile return true; } if(word.compareTo("moreorless")==0){ return true; } if(word.compareTo("becoming")==0){ return true; } if(word.compareTo("�")==0){ return true; } if(!word.matches(".*?[a-z]+.*")){ notadverbs.add(word); return false; } if(word.matches("\\b("+ChunkedSentence.stop+")\\b")){ notadverbs.add(word); return false; } if(adverbs.contains(word)){ return true; } if(notadverbs.contains(word)){ return false; } WordNetWrapper wnw = new WordNetWrapper(word); String pos = wnw.mostlikelyPOS(); if(pos != null && pos.length()>0){ if(pos.compareTo("adv") == 0){ adverbs.add(word); return true; } }else{ if(word.endsWith("ly")){ adverbs.add(word); return true; } } notadverbs.add(word); return false; } /** * 5-{merous} * @param w * @return null if not found */ public static String lookupCharacter(String w, Connection conn, Hashtable<String, String> characterhash, String glosstable, String prefix) { if(w.trim().length()==0) return null; if(w.indexOf(" ")>0) w = w.substring(w.lastIndexOf(" ")+1).trim(); w = w.replaceAll("[{}<>()]", "").replaceAll("\\d+[�-]", "_").replaceAll("�", "-")./*replaceAll(" ", "").*/replaceAll("_+", "_");//"(3-)5-merous" =>_merous w = w.replaceFirst(".*?_(?=[a-z]+$)", ""); //_or_ribbed String wc = w; String ch = characterhash.get(w); if(ch != null){ return ch; }else{ ch = ""; if(w.endsWith("shaped")){ return "shape"; } if(w.indexOf('-')>0){ String[] ws = w.split("-+"); w = ws[ws.length-1]; } ch = lookup(w, conn, characterhash, glosstable, wc, prefix); if(ch == null && wc.indexOf('-')>0){//pani_culiform ch = lookup(wc.replaceAll("-", ""), conn, characterhash, glosstable, wc, prefix); } } return ch; } private static String lookup(String w, Connection conn, Hashtable<String, String> characterhash, String glosstable, String wc, String prefix) { String ch =""; HashSet<String> chs = new HashSet<String>(); try{ Statement stmt = conn.createStatement(); //check glossarytable ResultSet rs = stmt.executeQuery("select distinct category from "+glosstable+" where term = '"+w+"' or term ='_"+w+"' order by category"); while(rs.next()){ String cat = rs.getString("category"); chs.add(cat); //if(! ch.matches(".*?(^|_)"+cat+"(_|$).*")){ // ch += rs.getString("category").trim().replaceAll("\\s+", "_")+"_or_"; //} } //check _term_category table String q = "select distinct category from "+prefix+"_term_category where term='"+w+"' and category !='structure' order by category"; rs = stmt.executeQuery(q); while(rs.next()){ String cat = rs.getString("category"); chs.add(cat); //if(! ch.matches(".*?(^|_)"+cat+"(_|$).*")){ // ch += rs.getString("decision").trim().replaceAll("\\s+", "_")+"_or_"; //} } rs.close(); stmt.close(); String[] charas = chs.toArray(new String[]{}); Arrays.sort(charas); for(String character: charas){ ch += character.replaceAll("\\s+", "_")+"_or_"; } if(ch.length()>0){ ch = ch.replaceFirst(Utilities.or+"$", ""); characterhash.put(wc, ch); return ch; } }catch(Exception e){ e.printStackTrace(); } return null; } public static void insert2TermCategoryTable(String term, String cat, Connection conn, String prefix) { try{ String sql = "insert into " + prefix +"_term_category values (?,?)"; PreparedStatement pstmt = conn.prepareStatement(sql); pstmt.setString(1, term); pstmt.setString(2, cat); pstmt.execute(); }catch(Exception e){ e.printStackTrace(); } } /** * * @param term * @param conn * @param glosstable * @return */ public static boolean inGlossary(String term, Connection conn, String glosstable, String prefix) { term = term.replaceAll(".*[_-]", ""); String termcopy = term; term = term.replaceFirst("(semi|sub|un)", ""); boolean in = false; try{ Statement stmt = conn.createStatement(); ResultSet rs = stmt.executeQuery("select term, category from "+glosstable+" where term ='"+term+"'"); if(rs.next()){ String cat = rs.getString("category"); in = true; Statement stmt1 = conn.createStatement(); stmt1.execute("insert into "+prefix+"_term_category (term, category) values ('"+termcopy+"', '"+cat+"')"); } }catch(Exception e){ e.printStackTrace(); } return in; } /** * break text into correct tokens: * @param text: that is {often} {concealed} r[p[by] o[(trichomes)]]; * @return */ public static ArrayList<String> breakText(String text) { ArrayList<String> tokens = new ArrayList<String>(); String[] words = text.split("\\s+"); String t = ""; int left = 0; for(int i = 0; i<words.length; i++){ String w = words[i]; if(w.indexOf("[")<0 && w.indexOf("]")<0 && left==0){ if(!w.matches("\\b(this|have|that|may|be|which|where|when)\\b")){tokens.add(w);}; }else{ left += w.replaceAll("[^\\[]", "").length(); left -= w.replaceAll("[^\\]]", "").length(); t += w+" "; if(left==0){ tokens.add(t.trim()); t = ""; } } } return tokens; } public static String threeingSentence(String str) { //hide the numbers in count list: {count~list~9~or~less~} <fin> <rays> ArrayList<String> lists = new ArrayList<String>(); str = hideLists(str, lists); //threeing str = str.replaceAll("(?<=\\d)-(?=\\{)", " - "); //this is need to keep "-" in 5-{merous} after 3ed (3-{merous} and not 3 {merous}) //Pattern pattern3 = Pattern.compile("[\\d]+[\\-\\�]+[\\d]+"); Pattern pattern3 = Pattern.compile(NumericalHandler.numberpattern); //Pattern pattern4 = Pattern.compile("(?<!(ca[\\s]?|diam[\\s]?))([\\d]?[\\s]?\\.[\\s]?[\\d]+[\\s]?[\\�\\-]+[\\s]?[\\d]?[\\s]?\\.[\\s]?[\\d]+)|([\\d]+[\\s]?[\\�\\-]+[\\s]?[\\d]?[\\s]?\\.[\\s]?[\\d]+)|([\\d]/[\\d][\\s]?[\\�\\-][\\s]?[\\d]/[\\d])|(?<!(ca[\\s]?|diam[\\s]?))([\\d]?[\\s]?\\.[\\s]?[\\d]+)|([\\d]/[\\d])"); //Pattern pattern5 = Pattern.compile("[\\d�\\+\\�\\-\\���:�/�\"��\\_�\\׵%\\*\\{\\}\\[\\]=]+"); //Pattern pattern5 = Pattern.compile("[\\d\\+���/�\"���\\׵%\\*]+(?!~[a-z])"); Pattern pattern5 = Pattern.compile("[\\d\\+���/�\"���\\׵%\\*]+(?![a-z])"); //not including individual "-", would turn 3-branched to 3 branched //Pattern pattern6 = Pattern.compile("([\\s]*0[\\s]*)+(?!~[a-z])"); //condense multiple 0s. Pattern pattern6 = Pattern.compile("(?<=\\s)[0\\s]+(?=\\s)"); //Pattern pattern5 = Pattern.compile("((?<!(/|(\\.[\\s]?)))[\\d]+[\\-\\�]+[\\d]+(?!([\\�\\-]+/|([\\s]?\\.))))|((?<!(\\{|/))[\\d]+(?!(\\}|/)))"); //[\\d�\\+\\�\\-\\��.�:�/�\"��\\_;x�\\�\\s,�%\\*\\{\\}\\[\\]=(<\\{)(\\}>)]+ Pattern pattern7 = Pattern.compile("[(\\[]\\s*\\d+\\s*[)\\]]"); // deal with ( 2 ), (23) is dealt with by NumericalHandler.numberpattern Matcher matcher1 = pattern3.matcher(str); //str = matcher1.replaceAll(" 0 "); str = matcher1.replaceAll("0"); matcher1.reset(); /*matcher1 = pattern4.matcher(str); str = matcher1.replaceAll("0"); matcher1.reset();*/ matcher1 = pattern5.matcher(str);//single numbers str = matcher1.replaceAll("0"); matcher1.reset(); /* should not remove space around 0, because: pollen 70-80% 3-porate should keep 2 separate numbers: 70-80% and 3-porate * String scptemp = str; matcher1 = pattern6.matcher(str);//remove space around 0 str = matcher1.replaceAll("0"); if(!scptemp.equals(str)){ System.out.println(); } matcher1.reset();*/ matcher1 = pattern7.matcher(str);//added for (2) str = matcher1.replaceAll("0"); matcher1.reset(); //further normalization //3 -{many} or 3- {many}=> {3-many} str = str.replaceAll("0\\s*-\\s*", "0-").replaceAll("0(?!~[a-z])", "3").replaceAll("3\\s*[�-]\\{", "{3-").replaceAll("�(?!~[a-z])","{moreorless}").replaceAll("�","moreorless"); //stanford parser gives different results on 0 and other numbers. //2-or-{3-lobed} => {2-or-3-lobed} str = str.replaceAll("(?<=-(to|or)-)\\{", "").replaceAll("[^\\{]\\b(?=3-(to|or)-3\\S+\\})", " {"); //unhide count list str = unCountLists(str, lists); return str; } /** * hide lists such as * {upper} {pharyngeal} <tooth> <plates_4_and_5> * count~list~2~to~4 * so the numbers will not be turned into 3. * @param str * @param countlists * @return */ private static String hideLists(String str, ArrayList<String> lists) { if(str.contains("count~list~") || str.matches(".*?<\\S+_\\d.*")){ String newstr = ""; String[] tokens = str.split("\\s+"); int count = 0; for(String t: tokens){ if(t.indexOf("count~list~")>=0 || t.matches("<\\S+_\\d.*")){ newstr +="# "; lists.add(t); count++; }else{ newstr +=t+" "; } } return newstr.trim(); }else{ return str; } } private static String unCountLists(String str, ArrayList<String> lists) { if(str.contains("#")){ String newstr = ""; String[] tokens = str.split("\\s+"); int count = 0; for(String t: tokens){ if(t.contains("#")){ newstr += lists.get(count)+" "; count++; }else{ newstr +=t+" "; } } return newstr.trim(); }else{ return str; } } /** * * @param text does [not] overlap * @return '[not] overlapping' --remove does and add -ing to signal overlap is a word */ public static String reformAuxiliaryVerbs(String text) { String[] segments = text.split("\\b(does|do)\\b"); String newtext = segments[0]; for(int i = 1; i<segments.length; i++){ String segment = segments[i].trim(); boolean found = false; while(segment.indexOf(" ")>0){ String word = segment.substring(0, segment.indexOf(" ")); segment = segment.substring(segment.indexOf(" ")).trim(); WordNetWrapper wnw = new WordNetWrapper(word); if(wnw.isV()){ newtext += " "+word+"ing"; newtext += " "+segment; found = true; break; } newtext+=" "+word; } if(!found){ //last word in text WordNetWrapper wnw = new WordNetWrapper(segment); if(wnw.isV()){ newtext += " "+segment+"ing"; } } } return newtext.trim(); } /** * clean off [;\\]\\[{}(),+\\.&%@<>=`:] from string * @param str * @return */ public static String cleanup(String str){ while(str.matches(".*?\\[.*?\\].*")){ str = str.replaceAll("\\[.*?\\]", "").trim(); } while(str.matches(".*?<.*?>.*")){ str = str.replaceAll("<.*?>", "").trim(); } while(str.matches(".*?\\(.*?\\).*")){ str = str.replaceAll("\\(.*?\\)", "").trim(); } while(str.matches(".*?\\{.*?\\}.*")){ str = str.replaceAll("\\{.*?\\}", "").trim(); } str = str.replaceAll("`", "'"); if(str.contains("%")) str = ""; //%2cxyz%67 str = str.replaceFirst("[@=+;,.&:<>\\]\\[\\(\\)\\{\\}].*", ""); //@fr, x=y, x+y+z str = str.replaceFirst("[_-]$", ""); //if(str.matches(".*?[;\\]\\[,+\\.&%@<>=`:].*")){ // System.out.print(str+ " "); // } return str; } public static void main(String[] argv){ System.out.println(reformAuxiliaryVerbs("does often overlap abc , does not usually overlap")); //TermOutputerUtilities.lookupCharacter(w, conn, characterhash) //System.out.println(TermOutputerUtilities.isNoun(",", new ArrayList<String>())); //System.out.println(TermOutputerUtilities.isAdv("much", new ArrayList<String>())); } }