/* $Id: CharacterAnnotatorChunked.java 997 2011-10-07 01:14:22Z hong1.cui $ */ /** * */ package fna.charactermarkup; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.ObjectInputStream; import java.sql.Connection; import java.sql.ResultSet; import java.sql.Statement; import java.util.ArrayList; import java.util.Enumeration; import java.util.Hashtable; import java.util.List; import java.util.Iterator; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.log4j.Logger; import org.jdom.Element; import org.jdom.JDOMException; import org.jdom.filter.ElementFilter; import org.jdom.output.Format; import org.jdom.output.XMLOutputter; import org.jdom.xpath.*; import outputter.ApplicationUtilities; import outputter.search.SpatialModifiedEntityStrategy; import outputter.knowledge.TermOutputerUtilities; import conceptmapping.*; import fna.parsing.PhraseMarker; /** * @author hongcui fnaglossaryfixed: move verbs such as comprising from the * glossary * */ @SuppressWarnings({ "unchecked", "unused", "static-access" }) public class CharacterAnnotatorChunked { private static final Logger LOGGER = Logger.getLogger(CharacterAnnotatorChunked.class); private Element statement = null; private ChunkedSentence cs = null; private static ArrayList<Element> subjects = new ArrayList<Element>(); // static so a ditto sent can see the last subject private ArrayList<Element> latestelements = new ArrayList<Element>(); // save the last set of elements added. 
independent from adding elements to <Statement> private ArrayList<Element> elementlog = new ArrayList<Element>(); // log the sequence in which elements were created private String delims = "comma|or"; private static int structid = 1; private static int relationid = 1; private String unassignedcharacter = null; // private String unassignedmodifiers = null; //holds modifiers that may be // applied to the next chunk protected Connection conn = null; private String tableprefix = null; private String glosstable = null; private boolean inbrackets = false; private String text = null; private String notInModifier = "a|an|the"; private String negationpt = "not|never"; private String nonrelation = "through|by|to|into"; private String size ="size|length|width"; private String lifestyle = ""; private String characters; private boolean partofinference = false; private ArrayList<Element> pstructures = new ArrayList<Element>(); private ArrayList<Element> cstructures = new ArrayList<Element>(); private boolean attachToLast = false; // this switch controls where a character will be attached to. // "true": attach to last organ seen. 
"false":attach to the // subject of a clause private boolean printAnnotation = true; private boolean debugNum = false; private boolean printComma = false; private boolean printAttach = false; private boolean evaluation = false; private boolean printOR = true; private String sentsrc; private boolean nosubject; private boolean debugextraattributes=false; private ArrayList<String> phrases; private Hashtable<String, String> p2sphrases; private Pattern chptn ; /** * */ public CharacterAnnotatorChunked(Connection conn, String tableprefix, String glosstable, String characters, boolean evaluation) { this.conn = conn; this.tableprefix = tableprefix; this.glosstable = glosstable; this.evaluation = evaluation; this.nosubject = false; this.characters = characters; if (this.evaluation) this.partofinference = false; // partofinterference causes huge number of "relations" // collect life_style terms try { Statement stmt = conn.createStatement(); ResultSet rs = stmt.executeQuery("select distinct term from " + this.glosstable + " where category='life_style'"); while (rs.next()) { this.lifestyle += rs.getString(1) + "|"; } this.lifestyle = lifestyle.replaceFirst("\\|$", ""); } catch (Exception e) { e.printStackTrace(); } //construct chptn chptn = Pattern.compile("\\b("+this.characters+")\\b", Pattern.CASE_INSENSITIVE); //prematched structure names: File file = new File(ApplicationUtilities.getProperty("ontology.dir"), ApplicationUtilities.getProperty("ontology.uberon")+"_"+ApplicationUtilities.getProperty("uberonphrases.update.bin")); File p2sfile = new File(ApplicationUtilities.getProperty("ontology.dir"), ApplicationUtilities.getProperty("ontology.uberon")+"_"+ApplicationUtilities.getProperty("uberonphrases.p2s.bin")); ObjectInputStream in; try { in = new ObjectInputStream(new FileInputStream( file)); // Deserialize the object phrases = (ArrayList<String>) in.readObject(); in.close(); in = new ObjectInputStream(new FileInputStream( p2sfile)); // Deserialize the object p2sphrases = 
(Hashtable<String, String>) in.readObject(); in.close(); } catch (Exception e) { LOGGER.error("", e); } } /** * reset annotator to process next description paragraph. */ public void reset() { this.subjects = new ArrayList<Element>();// static so a ditto sent can // see the last subject this.latestelements = new ArrayList<Element>();// save the last set of // elements added. // independent from // adding elements to // <Statement> this.unassignedcharacter = null; this.inbrackets = false; this.pstructures = new ArrayList<Element>(); this.cstructures = new ArrayList<Element>(); this.nosubject = false; } public Element annotate(String sentindex, String sentsrc, ChunkedSentence cs) throws Exception { this.statement = new Element("statement"); // sentindex: Buckup_1998.xml_088683b8-4718-48de-ad0e-eb1de9c58eb6.txt-0 String segid = sentsrc.replaceFirst(".*-(?=[^-]+$)", ""); sentsrc = sentsrc.replaceFirst(".*?\\.xml_", "").replaceFirst("\\.txt.*", ""); String[] ids = sentsrc.split("_"); String charaid = ids[0]; String stateid = null; if (ids.length > 1) { stateid = ids[1]; } boolean isstate = false; this.statement.setAttribute("statement_type", "character"); this.statement.setAttribute("character_id", charaid); if (stateid != null) { this.statement.setAttribute("statement_type", "character_state"); this.statement.setAttribute("state_id", stateid); isstate = true; } this.statement.setAttribute("seg_id", segid); this.cs = cs; this.text = cs.getText(); this.sentsrc = sentsrc; Element text = new Element("text");// make <text> the first element in // statement text.addContent(this.text); if (!this.evaluation) this.statement = addContent(this.statement, text);// add Element, // record in // elementlog int i = 0; String token = cs.getTokenAt(i++); while (token.length() == 0 || token.matches(ChunkedSentence.stop)) { token = cs.getTokenAt(i++); } if (token.startsWith("z[") || token.startsWith("l[") || token.startsWith("u[")) { annotateByChunk(cs, false); }else { 
establishSubject("("+ApplicationUtilities.getProperty("unknown.structure.name")+")", false); cs.setInSegment(true); cs.setRightAfterSubject(true); annotateByChunk(cs, false); } /* * String subject= cs.getSubjectText(); if(subject==null && * cs.getPointer()==0){ Chunk ck = cs.nextChunk(); cs.resetPointer(); * establishSubject("(ApplicationUtilities.getProperty("unknown.structure.name"))"); annotateByChunk(cs, false); * }//end mohan code else if(subject.equals("measurements")){ * this.annotatedMeasurements(this.text); }else * if(!subject.equals("ignore")){ if(subject.equals("ditto")){ * reestablishSubject(); }else{ establishSubject(subject); * if(this.partofinference){ * this.pstructures.addAll(CharacterAnnotatorChunked.subjects); } } * cs.setInSegment(true); cs.setRightAfterSubject(true); * annotateByChunk(cs, false); } */ // postprocess functions // lifeStyle(); // if(!this.evaluation) mayBeSameRelation(); // if(this.partofinference){ // puncBasedPartOfRelation(); // } //XMLOutputter xo = new XMLOutputter(Format.getPrettyFormat()); //System.out.println(); //System.out.println(xo.outputString(this.statement)); /*Normalization*/ normalizeModifierCharacters(sentsrc); removeIsolatedCharacters(); removeIsolatedWholeOrganismPlaceholders(); annotateBareStatements(); //manus digits i-iii => manus digit i, manus digit ii, manus digit iii decomposeMultipleStructures();//Changed by Zilong standardization(); markCharacterInText(); if (printAnnotation) { XMLOutputter xo1 = new XMLOutputter(Format.getPrettyFormat()); System.out.println(); System.out.println(xo1.outputString(this.statement)); } return this.statement; } /** * if a character statement contains character terms, mark them as [character] in the text */ private void markCharacterInText() { try{ List<Element> texts = StanfordParser.path24.selectNodes(this.statement); for(Element text: texts){ Element statement = text.getParentElement(); if (statement.getAttribute("state_id") == null){ String string = 
statement.getChild("text").getTextTrim(); Matcher m = chptn.matcher(string); String marked = ""; while(m.find()){ marked += string.substring(0, m.start()) +"["+m.group(1)+"]"; string = string.substring(m.end()); m = chptn.matcher(string); } marked +=string; this.statement.getChild("text").setText(marked.trim()); } } }catch(Exception e){ e.printStackTrace(); } } /** * for character type: * move characters from 'ApplicationUtilities.getProperty("unknown.structure.name")' to the first 'non-ApplicationUtilities.getProperty("unknown.structure.name")' structure: * for example: * turn <statement statement_type="character" character_id="s15c3c868-aa20-4526-b1cb-d187d6bca66e" seg_id="0"> <text>L-shaped proximal tarsal element</text> <structure id="o48" name="ApplicationUtilities.getProperty("unknown.structure.name")" name_original=""> <character name="shape" value="l-shaped" /> </structure> <structure id="o49" name_original="element" name="element" constraint="proximal tarsal" /> </statement> * into <statement statement_type="character" character_id="s15c3c868-aa20-4526-b1cb-d187d6bca66e" seg_id="0"> <text>L-shaped proximal tarsal element</text> <structure id="o49" name_original="element" name="element" constraint="proximal tarsal" > <character name="shape" value="l-shaped" is_modifier="true" /> </structure </statement> *TODO: how about 'contact btw A and B'? 
*/ private void normalizeModifierCharacters(String sentsrc) { if(this.statement.getAttributeValue("statement_type").compareTo("character")==0){ //gather data List<Element> children = this.statement.getChildren("structure"); ArrayList<Element> characters = new ArrayList<Element>(); if(children.size()>0 && children.get(0).getAttributeValue("name").compareTo(ApplicationUtilities.getProperty("unknown.structure.name"))==0){ List<Element> charas = children.get(0).getChildren("character"); for(Element chara : charas){ if(chara.getAttributeValue("name").compareTo("character")!=0) //characters such as shape and number are dealt with later characters.add(chara); } } //make changes if(characters.size()>=1 && children.size()>1){ LOGGER.debug(sentsrc+" is normalized"); Element firststructure=children.get(1); for(Element character: characters){ character.setAttribute("is_modifier", "true"); character.detach(); firststructure.addContent(character); } children.get(0).detach(); //remove 'ApplicationUtilities.getProperty("unknown.structure.name")' } } } /** * count = "none" =>presence = "absent" */ private void standardization() { try { /* count = "none" =>count = 0 */ List<Element> es = StanfordParser.path1.selectNodes(this.statement); for (Element e : es) { e.setAttribute("value", "absent"); e.setAttribute("name", "presence"); } /* * <structure id="o437" name="tooth"> <character name="presence" * value="no" /> <character name="presence" value="present" * constraint="on fourth upper pharyngeal tooth plate" * constraintid="o438" /> </structure> * * ==> * * <structure id="o437" name="tooth"> <character name="presence" * value="absent" * constraint="on fourth upper pharyngeal tooth plate" * constraintid="o438" /> </structure> * * * <text>no circuli present on posterior surface of scales</text> * <structure id="o357" name="circulus"> * <character name="presence"value="no" /> * </structure> <structure id="o358" name="surface" * constraint="posterior" /> <relation id="r130" name="present 
on" * from="o357" to="o358" negation="false" /> * * ==> <text>no circuli present on posterior surface of * scales</text> <structure id="o357" name="circulus"> </structure> * <structure id="o358" name="surface" constraint="posterior" /> * <relation id="r130" name="present on" from="o357" to="o358" * negation="true" /> */ // XPath nopresencech = // XPath.newInstance(".//character[@name='presence'][@value='no']"); es = StanfordParser.path2.selectNodes(this.statement); ArrayList<Element> esstructures = new ArrayList<Element>(); for (Element e : es) { esstructures.add(e.getParentElement()); } List<Element> esc = StanfordParser.path3.selectNodes(this.statement); ArrayList<Element> escstructures = new ArrayList<Element>(); for (Element e : esc) { escstructures.add(e.getParentElement()); } List<Element> esr = StanfordParser.path4.selectNodes(this.statement); ArrayList<Element> esrstructures = new ArrayList<Element>(); for (Element e : esr) { String strid = e.getAttributeValue("from"); esrstructures.add((Element) XPath.selectSingleNode(this.statement, ".//structure[@id='" + strid + "']")); } esc = intersect(esstructures, escstructures); esr = intersect(esstructures, esrstructures); for (Element e : esc) { Element c = (Element) StanfordParser.path2.selectSingleNode(e); c.detach(); List<Element> cl = StanfordParser.path3.selectNodes(e); for (Element c1 : cl) c1.setAttribute("value", "absent"); } for (Element e : esr) { List<Element> rs = StanfordParser.path2.selectNodes(e); for (Element r : rs) r.detach(); rs = XPath.selectNodes(this.statement, ".//relation[@from='" + e.getAttributeValue("id") + "'][starts-with(@name, 'present']"); for (Element r : rs) r.setAttribute("negation", "true"); } /* the remaining presence = no cases */ es = StanfordParser.path2.selectNodes(this.statement); for (Element e : es) { e.setAttribute("value", "absent"); } //ApplicationUtilities.getProperty("unknown.structure.name") => ApplicationUtilities.getProperty("unknown.structure.name") es = 
StanfordParser.path6.selectNodes(this.statement); for (Element e : es) { e.setAttribute("name", ApplicationUtilities.getProperty("unknown.structure.name")); e.setAttribute("name_original", ""); } } catch (Exception e) { e.printStackTrace(); } } private List<Element> intersect(List<Element> es, List<Element> esc) { ArrayList<Element> common = new ArrayList<Element>(); for (Element e : es) { if (esc.contains(e)) common.add(e); } return common; } /** * characters such as "orientation", "length" may be used in a character * statement as <character name="character" value="orientation" /> these * elements are removed by this function * * @throws JDOMException */ private void removeIsolatedCharacters() throws JDOMException { //if (this.statement.getAttribute("state_id") == null) { List<Element> chars = StanfordParser.path5.selectNodes(this.statement); for (Element chara : chars) { if(chara.getAttributes().size()>2) continue; //isolated characters should only have name and value attributes. String v = chara.getAttributeValue("value"); List<Element> childreninorder = chara.getParentElement().getContent(new ElementFilter()); Element nextchara = null; int i = childreninorder.indexOf(chara); if(i<childreninorder.size()-1) nextchara = childreninorder.get(++i); while(nextchara!=null && nextchara.getAttributeValue("name").compareTo("size")==0){ //next element is a size character, replace size with this character nextchara.setAttribute("name", v); nextchara = null; if(i<childreninorder.size()-1) nextchara = childreninorder.get(++i);; } chara.detach(); chara = null; } //} } /** * some state statements may have 1 adverb, or 20-40% as its entire * statement, these will not get annotated in the annotation process * annotate them here */ private void annotateBareStatements() { if (this.statement.getAttribute("state_id") != null) { if (this.statement.getChildren().size() == 1) {// holding a <text> // element alone String text = this.statement.getChildText("text"); if (!text.matches("(" 
+ ChunkedSentence.binaryTvalues + "|" + ChunkedSentence.binaryFvalues + ")")) {// non // binary // states Element str = new Element("structure"); // ApplicationUtilities.getProperty("unknown.structure.name") as // a placeholder str.setAttribute("id", this.structid + ""); this.structid++; str.setAttribute("name", ApplicationUtilities.getProperty("unknown.structure.name")); str.setAttribute("name_original", ""); Element ch = new Element("character"); ch.setAttribute("name", "unknown"); // TODO: unknown as a // placeholder ch.setAttribute("value", text.trim()); str.addContent(ch); statement.addContent(str); } } } } // Copied from XML2EQ.java /** * abc_iv_and_v * * @param entity * @return */ private String reformatRomans(String entity) { String[] parts = entity.split("_"); String reformatted = ""; for (String part : parts) { if (part.matches("[ivx]+")) reformatted += this.turnRoman2Number(part) + "_"; else reformatted += part + "_"; } return reformatted.replaceFirst("_$", ""); } /** * * @param entity * @return */ private String turnRoman2Number(String word) { int total = 0; if (word.endsWith("iv")) { total += 4; word = word.replaceFirst("iv$", ""); } if (word.endsWith("ix")) { total += 9; word = word.replaceFirst("ix$", ""); } int length = word.length(); for (int i = 0; i < length; i++) { if (word.charAt(i) == 'i') total += 1; if (word.charAt(i) == 'v') total += 5; if (word.charAt(i) == 'x') total += 10; } return total + ""; } // conpied end private void decomposeAddToXML(Element e, String newname, String newid, String originalname) throws JDOMException { Element tempe = (Element) e.clone(); //all new elements would still have type="multi" attribute tempe.setAttribute("name", newname); tempe.setAttribute("name_original", originalname); tempe.setAttribute("id", newid);// id starts from 0 addContent(e.getParentElement(), tempe); // handle relations String id = e.getAttributeValue("id"); List<Element> rels = XPath.selectNodes(statement, ".//relation[@from='" + id + 
"'|@to='" + id + "']"); for (Element r : rels) { Element tempr = (Element) r.clone(); if (r.getAttribute("from").equals(id)) { tempr.setAttribute("from", newid); } if (r.getAttribute("to").equals(id)) { tempr.setAttribute("to", newid); } addContent(r.getParentElement(), tempr); } } /** * * by Zilong * Handle different relationships such as "connected to", "associate with".. * *Original statement: * <statement statement_type="character_state" character_id="975551bf-c0d8-4d97-9cbc-f7c5b7b38b99" state_id="14e6ee40-450d-4d5f-9573-3c04d8dc0954" seg_id="0"> * <text>loss of connection between pseudobranchial and suprabranchial arteries</text> * <structure id="o234" name="ApplicationUtilities.getProperty("unknown.structure.name")"> * <character name="presence" value="loss" constraint="of connection" constraintid="o235" /> * </structure> * <structure id="o235" name="connection" /> * <structure id="o236" name="artery" constraint="pseudobranchial" /> * <structure id="o237" name="artery" constraint="suprabranchial" /> * <relation id="r72" name="between" from="o235" to="o236 o237" negation="false" /> * </statement> * * @param reltype * @throws JDOMException */ private void relationHandler(String reltype) throws JDOMException { List<Element> rs = XPath.newInstance(".//structure[@name='"+reltype+"']").selectNodes(statement); if(rs.isEmpty()){ return;//no such relationship, return } String id = rs.get(0).getAttributeValue("id"); //detach the structure "connection" rs.get(0).detach(); List<Element> preps = XPath.selectNodes(statement, ".//relation[@from='" + id + "']"); for(Element prep:preps){ String to=prep.getAttributeValue("to"); String[] tos = to.trim().split("//s+"); List<Element> characters = XPath.selectNodes(statement, ".//chracter[@name='presence']"); //presence of the relationship if(characters.isEmpty()){ return; } Element character=characters.get(0); String constraintStr = ""; if(tos.length>1){ //get all the constraint entities (from the second "to" entities of the 
relation element) for(int i=1;i<tos.length;i++){ constraintStr+=tos[i]+" ";//concatenate the "to" entities from the second one } //replace the current constraintid which refers to structure "connection" with //the new contraintStr. character.setAttribute("constraintid", constraintStr); //first "to" structure in the relation List<Element> first = XPath.selectNodes(statement, ".//structure[@id='"+tos[0]+"']"); if(!first.isEmpty()){ first.get(0).addContent(character); } } } } /** * This method is called to decompose an entity such as "manus digits i-iii" * into several entities "manus digit i", "manus digit ii", and "manus digit iii" * * @throws JDOMException */ private void decomposeMultipleStructures() throws JDOMException { List<Element> mss = XPath.newInstance(".//structure[@type='multi']").selectNodes(statement); Pattern p2 = Pattern.compile("(.*?)(\\d+) to (\\d+)"); for (Element e : mss) { boolean isRoman=false; String originalname = e.getAttributeValue("name_original"); String entity = e.getAttributeValue("name"); //organ->singular String organ = entity.substring(0, entity.indexOf("_")).trim(); String before = ""; if(organ.indexOf(" ")>0){ before = organ.substring(0, organ.lastIndexOf(" ")); organ = organ.substring(organ.lastIndexOf(" ")).trim(); } organ = before +" "+TermOutputerUtilities.toSingular(organ); entity =entity.replaceFirst(entity.substring(0, entity.indexOf("_")), organ); if (entity.matches(".*?_[\\divx]+-[\\divx]+")) {// abc_1-3 if(entity.matches(".*?_[ivx]+-[ivx]+")){ isRoman=true; } entity = entity.replaceAll("-", "_to_");// before // reformatRomans,replace // "-" with "_to_" } entity = reformatRomans(entity); entity = entity.replaceAll("_(?=\\d+)", " ").replaceAll("(?<=\\d)_", " "); // abc_1_and_3 // => // abc // 1 // and // 3 if (entity.indexOf(" and ") < 0 && entity.indexOf(" to ") < 0) { // single // entity e.setAttribute("name", entity); } else {// abc 1 and 2 if (entity.indexOf(" and ") > 0) { entity = entity.replaceFirst(" and ", "," + 
organ + " "); String[] entities = entity.split(","); for (int i = 0; i < entities.length; i++) { this.decomposeAddToXML(e, entities[i], e.getAttributeValue("id") + "-" + (i + 1), originalname); // id starts from 0 } } // abc 1 , 2 to 5 ; abc 2 to 5 Matcher m = p2.matcher(entity); if (m.matches()) { String part1 = m.group(1); int from = Integer.parseInt(m.group(2)); int to = Integer.parseInt(m.group(3)); for (int i = from; i <= to; i++) { String temp1 = organ + " " + (isRoman?RomanConversion.binaryToRoman(i):i); this.decomposeAddToXML(e, temp1, e.getAttributeValue("id") + "-" + i, originalname);// id=i } // abc 1, 2 to 5; part1 = part1.replaceAll("\\D", "").trim(); if (part1.length() > 0) { String[] nums = part1.split("\\s+"); for (String n : nums) { String temp1 = organ + " " + (isRoman? RomanConversion.binaryToRoman(Integer.parseInt(n)):n); this.decomposeAddToXML(e, temp1, e.getAttributeValue("id") + "-" + n, originalname);// id=i } } // transformed = transformed+ temp + temp1; // transformed.replaceFirst(",$", "").trim(); // entityhash.put(entitylist, transformed); // return transformed; } // detach e and those relationships e.detach(); // detach relations String id = e.getAttributeValue("id"); List<Element> rels = XPath.selectNodes(statement, ".//relation[@from='" + id + "'|@to='" + id + "']"); for (Element r : rels) { r.detach(); } } } } private void removeIsolatedWholeOrganismPlaceholders() throws JDOMException { // remove ApplicationUtilities.getProperty("unknown.structure.name") structures that without any character children // and are not involved in a relation. 
// These structures were put in as placeholders in processing // characters/character states descriptions List<Element> wholeOrgans = StanfordParser.path6.selectNodes(statement); Iterator<Element> it = wholeOrgans.iterator(); while (it.hasNext()) { Element wo = it.next(); String id = wo.getAttributeValue("id"); List<Element> rels = XPath.selectNodes(statement, ".//relation[@from='" + id + "'|@to='" + id + "']"); if (wo.getChildren().size() == 0 && rels.size() == 0) { wo.detach(); } } } private Element addContent(Element element, Element ch) { element.addContent(ch); this.elementlog.add(ch); return element; } /** * assuming subject organs of subsentences in a sentence are parts of the * subject organ of the sentence this assumption seemed hold for FNA data. */ private void puncBasedPartOfRelation() { for (int p = 0; p < this.pstructures.size(); p++) { for (int c = 0; c < this.cstructures.size(); c++) { String pid = this.pstructures.get(p).getAttributeValue("id"); String cid = this.cstructures.get(c).getAttributeValue("id"); this.addRelation("part_of", "", false, cid, pid, false, "based_on_punctuation"); } } } /** * re-annotate "trees" from structure to character lifestyle */ private void lifeStyle() { try { // find life_style structures List<Element> structures = StanfordParser.path7.selectNodes(this.statement); Iterator<Element> it = structures.iterator(); // Element structure = null; while (it.hasNext()) { Element structure = it.next(); String name = structure.getAttributeValue("name").trim(); if (name.length() <= 0) continue; if (lifestyle.matches(".*\\b" + name + "\\b.*")) { if (structure.getAttribute("constraint_type") != null) name = structure.getAttributeValue("constraint_type") + " " + name; if (structure.getAttribute("constraint_parent_organ") != null) name = structure.getAttributeValue("constraint_parent_organ") + " " + name; if (structure.getAttribute("constraint") != null) name = structure.getAttributeValue("constraint") + " " + name; Element wo = 
(Element) StanfordParser.path6.selectSingleNode(this.statement); if (wo != null) { List<Element> content = structure.getContent(); structure.removeContent(); /* * for(int i = 0; i<content.size(); i++){ Element e = * content.get(i); e.detach(); content.set(i, e); } */ wo.addContent(content); structure.detach(); structure = wo; } structure.setAttribute("name", ApplicationUtilities.getProperty("unknown.structure.name")); Element ch = new Element("character"); ch.setAttribute("name", "life_style"); ch.setAttribute("value", name); structure = addContent(structure, ch); // structure.addContent(ch); } // keep each life_style structure /* * if(lifestyle.matches(".*\\b"+name+"\\b.*")){ * if(structure.getAttribute("constraint") !=null) name = * structure.getAttributeValue("constraint")+" "+name; * structure.setAttribute("name", "ApplicationUtilities.getProperty("unknown.structure.name")"); Element ch * = new Element("character"); ch.setAttribute("name", * "life_style"); ch.setAttribute("value", name); * structure.addContent(ch); } */ } } catch (Exception e) { e.printStackTrace(); } } /** * if there are structure with the same name and constraint but different * ids add a relation 'may_be_the_same' among them, set symmetric="true" */ private void mayBeSameRelation() { try { List<Element> structures = StanfordParser.path7.selectNodes(this.statement); Hashtable<String, ArrayList<String>> names = new Hashtable<String, ArrayList<String>>(); Iterator<Element> it = structures.iterator(); // structure => ids hash while (it.hasNext()) { Element structure = it.next(); String name = structure.getAttributeValue("name"); // one the two contraint types if (structure.getAttribute("constraint_type") != null) name = structure.getAttributeValue("constraint_type") + " " + name; if (structure.getAttribute("constraint_parent_organ") != null) name = structure.getAttributeValue("constraint_parent_organ") + " " + name; if (structure.getAttribute("constraint") != null) name = 
structure.getAttributeValue("constraint") + " " + name; String id = structure.getAttributeValue("id"); if (names.containsKey(name)) { names.get(name).add(id);// update the value for name // names.put(name, names.get(name)); } else { ArrayList<String> ids = new ArrayList<String>(); ids.add(id); names.put(name, ids); } } // use the hash to create relations Enumeration<String> en = names.keys(); while (en.hasMoreElements()) { String name = en.nextElement(); ArrayList<String> ids = names.get(name); if (ids.size() > 1) { for (int i = 0; i < ids.size(); i++) { for (int j = i + 1; j < ids.size(); j++) { this.addRelation("may_be_the_same", "", true, ids.get(i), ids.get(j), false, "based_on_text"); } } } } } catch (Exception e) { e.printStackTrace(); } } private void annotateByChunk(ChunkedSentence cs, boolean inbrackets) { if (cs == null) return; this.inbrackets = inbrackets; String t; while (cs.hasNext()) { Chunk ck = cs.nextChunk(); if (ck instanceof ChunkOR) { int afterorindex = cs.getPointer(); Element last = this.latestelements.get(this.latestelements.size() - 1); ck = cs.nextChunk(); if (ck != null && last.getName().compareTo("character") == 0) { String cname = last.getAttributeValue("name"); //or greater, less, fewer, more, etc String content = ck.toString(); if(content.indexOf(" ")<0 && content.matches(".*?\\b(\\w+er|more|less)\\b.*")){ //add content to the value of last character String newvalue = last.getAttributeValue("value")+" or "+content.replaceAll("(\\w+\\[|\\])", ""); last.setAttribute("value", newvalue); continue; } if (!(ck instanceof ChunkSimpleCharacterState) && !(ck instanceof ChunkNumericals)) { // these cases can be handled by the normal annotation procedure Element e = new Element("character"); if (this.inbrackets) { e.setAttribute("in_bracket", "true"); } e.setAttribute("name", cname); String v = ck.toString(); // may be a character list if (v.length() >= 1) {// chunk contains text if (v.indexOf("~list~") >= 0) { v = 
v.replaceFirst("\\w{2,}\\[.*?~list~", "").replaceAll("punct", ",").replaceAll("~", " "); } v = v.replaceAll("(\\w\\[|\\]|\\{|\\}|\\(|\\)|<|>)", ""); e.setAttribute("value", v); addClauseModifierConstraint(cs, e); // last.getParentElement().addContent(e); addContent(last.getParentElement(), e); } else {// chunk not contain text: or nearly so, or not, // or throughout e = traceBack4(e, last, afterorindex, cs.getPointer()); // last.getParentElement().addContent(e); addContent(last.getParentElement(), e); } } ArrayList<Element> e = new ArrayList<Element>(); e.add(new Element("or")); updateLatestElements(e); }else if (ck != null && last.getName().compareTo("structure") == 0) { //scattered <patches> or absent if(this.printOR) System.out.println("created "+ApplicationUtilities.getProperty("unknown.structure.name")+" for OR"); ArrayList<Element> structure = this.createStructureElements("("+ApplicationUtilities.getProperty("unknown.structure.name")+")", false); updateLatestElements(structure); } /*ArrayList<Element> e = new ArrayList<Element>(); e.add(new Element("or")); updateLatestElements(e);*/ } if (ck instanceof ChunkCharacterComparison){//{relative~{A~char}~{relation}~{B~char}} ArrayList<Element> structures = processChunkCharacterComparison(ck.toString()); updateLatestElements(structures); //may still be modifiers etc following this chunk } if (ck instanceof ChunkOrgan) {// this is the subject of a segment. 
// May contain multiple organs //if this chunk appears right after a comma boolean aftercomma = false; if(this.latestelements.size()>0 && this.latestelements.get(this.latestelements.size()-1).getName().compareTo("comma")==0){ aftercomma = true; } String content = ck.toString().replaceFirst("^z\\[", "").replaceFirst("\\]$", ""); establishSubject(content, aftercomma/* , false */); if (this.partofinference) { this.cstructures.addAll(this.subjects); } cs.setInSegment(true); cs.setRightAfterSubject(true); } else if (ck instanceof ChunkNonSubjectOrgan) { String content = ck.toString().replaceFirst("^u\\[", "").replaceFirst("\\]$", ""); String structure = ""; if (content.indexOf("o[") >= 0) { String m = content.substring(0, content.indexOf("o[")).replaceAll("m\\[", "{").replaceAll("\\]", "}"); String o = content.substring(content.indexOf("o[")).replaceAll("o\\[", "").replaceAll("\\]", ""); structure = m + o; } else { structure = content; } //if this chunk appears right after a comma boolean aftercomma = false; if(this.latestelements.size()>0 && this.latestelements.get(this.latestelements.size()-1).getName().compareTo("comma")==0){ aftercomma = true; } ArrayList<Element> structures = createStructureElements(structure, aftercomma/* * , * false */); updateLatestElements(structures); } else if (ck instanceof ChunkPrep) { /* * String content = ck.toString(); * if(content.matches(".*?\\bwith\\b.*")){ this.attachToLast = * true; } if(content.indexOf("c[")>=0){ content = * content.replaceFirst(".*?\\bc\\[", "").replaceAll("\\]", ""); * this.annotateNumericals(content, "count", "", * lastStructures()); }else{ */ processPrep((ChunkPrep) ck); // } } else if (ck instanceof ChunkCHPP) {// t[c/r[p/o]] this chunk is // converted internally and // not shown in the parsing // output String content = ck.toString().replaceFirst("^t\\[", "").replaceFirst("\\]$", ""); processCHPP(content); } else if (ck instanceof ChunkNPList) {// NPList as a seperate chunk String content = 
ck.toString().replaceFirst("^l\\[", "").replaceFirst("\\]$", ""); if (!content.endsWith(")")) {// format it content = content.replaceAll(" +(?=(,|and\\b|or\\b))", ") ") + ")"; content = content.replaceAll(" +(?=\\w+\\))", " ("); if(!content.startsWith("(")) content="("+content;//added by Hari } //if this chunk appears right after a comma boolean aftercomma = false; if(this.latestelements.size()>0 && this.latestelements.get(this.latestelements.size()-1).getName().compareTo("comma")==0){ aftercomma = true; } establishSubject(content, aftercomma/* , false */); } else if (ck instanceof ChunkSimpleCharacterState) { String content = ck.toString().replaceFirst("^a\\[", "").replaceFirst("\\]$", ""); // ArrayList<Element> chars = // processSimpleCharacterState(content, lastStructures());//with // teeth closely spaced // ArrayList<Element> parents = this.attachToLast? // lastStructures() : subjects; ArrayList<Element> chars = processSimpleCharacterState(content, lastStructures());// apices // of // basal // leaves // spread // if(printAttach && // subjects.get(0).getAttributeValue("name").compareTo(lastStructures().get(0).getAttributeValue("name")) // != 0){ // System.out.println(content + // " attached to "+parents.get(0).getAttributeValue("name")); // } updateLatestElements(chars); } else if (ck instanceof ChunkSL) {// coloration[coloration-list-red-to-black] // ArrayList<Element> parents = this.attachToLast? // lastStructures() : subjects; // if(printAttach && // subjects.get(0).getAttributeValue("name").compareTo(lastStructures().get(0).getAttributeValue("name")) // != 0){ // System.out.println(ck.toString() + // " attached to "+parents.get(0).getAttributeValue("name")); // } ArrayList<Element> chars = processCharacterList(ck.toString(), lastStructures(), false/* * this * . 
* subjects */); updateLatestElements(chars); } else if (ck instanceof ChunkComma) { this.latestelements.add(new Element("comma")); } else if (ck instanceof ChunkVP) { // ArrayList<Element> parents = this.attachToLast? // lastStructures() : subjects; /* * if(printAttach && * subjects.get(0).getAttributeValue("name").compareTo * (lastStructures().get(0).getAttributeValue("name")) != 0){ * System.out.println(ck.toString() + * " attached to "+parents.get(0).getAttributeValue("name")); } */ ArrayList<Element> es = processTVerb(ck.toString().replaceFirst("^b\\[", "").replaceFirst("\\]$", ""), subjects); // ArrayList<Element> es = // processTVerb(ck.toString().replaceFirst("^b\\[", // "").replaceFirst("\\]$", ""), // CharacterAnnotatorChunked.subjects); updateLatestElements(es); } else if (ck instanceof ChunkComparativeValue) {//m[at-least] 2 times n[length[{longer}] constraint[than {wide}]] // ArrayList<Element> chars = // processComparativeValue(ck.toString().replaceAll("�", "-"), // lastStructures()); String content = ck.toString(); // ArrayList<Element> parents = this.attachToLast? // lastStructures() : subjects; // if(printAttach && // subjects.get(0).getAttributeValue("name").compareTo(lastStructures().get(0).getAttributeValue("name")) // != 0){ // System.out.println(content + // " attached to "+parents.get(0).getAttributeValue("name")); // } ArrayList<Element> chars = processComparativeValue(content.replaceAll("�", "-"), lastStructures()); updateLatestElements(chars); } else if (ck instanceof ChunkRatio) { // ArrayList<Element> chars = annotateNumericals(ck.toString(), // "lwratio", "", lastStructures()); String content = ck.toString(); // ArrayList<Element> parents = this.attachToLast? 
// lastStructures() : subjects; // if(printAttach && // subjects.get(0).getAttributeValue("name").compareTo(lastStructures().get(0).getAttributeValue("name")) // != 0){ // System.out.println(content + // " attached to "+parents.get(0).getAttributeValue("name")); // } ArrayList<Element> chars = annotateNumericals(content, ((ChunkRatio) ck).getLabel(), "", lastStructures(), false, false); updateLatestElements(chars); } else if (ck instanceof ChunkArea) { // ArrayList<Element> chars = annotateNumericals(ck.toString(), // "area", "", lastStructures()); String content = ck.toString(); // ArrayList<Element> parents = this.attachToLast? // lastStructures() : subjects; // if(printAttach && // subjects.get(0).getAttributeValue("name").compareTo(lastStructures().get(0).getAttributeValue("name")) // != 0){ // System.out.println(content + " attached to "+ // parents.get(0).getAttributeValue("name")); // } ArrayList<Element> chars = annotateNumericals(content, "area", "", lastStructures(), false, false); updateLatestElements(chars); } else if (ck instanceof ChunkNumericals) { // ** find parents, modifiers // TODO: check the use of [ and ( in extreme values // ArrayList<Element> parents = lastStructures(); String text = ck.toString().replaceAll("�", "-"); boolean resetfrom = false; if (text.matches(".*\\bto \\d.*")) { // m[mostly] to 6 m ==> // m[mostly] 0-6 m text = text.replaceFirst("to\\s+", "0-"); resetfrom = true; } // ArrayList<Element> parents = this.attachToLast? 
// lastStructures() : subjects; // if(printAttach && // subjects.get(0).getAttributeValue("name").compareTo(lastStructures().get(0).getAttributeValue("name")) // != 0){ // System.out.println(text + // " attached to "+parents.get(0).getAttributeValue("name")); // } if (debugNum) { System.out.println(); System.out.println(">>>>>>>>>>>>>" + text); } String modifier1 = "";// m[mostly] [4-]8�12[-19] mm m[distally]; // m[usually] 1.5-2 times // n[size[{longer} than // {wide}]]:consider a constraint String modifier2 = ""; modifier1 = text.replaceFirst("\\[?\\d.*$", ""); String rest = text.replace(modifier1, ""); modifier1 = modifier1.replaceAll("(\\w\\[|\\]|\\{|\\})", "").trim(); modifier2 = rest.replaceFirst(".*?(\\d|\\[|\\+|\\-|\\]|%|\\s|" + ChunkedSentence.units + ")+\\s?(?=[a-z]|$)", "");// 4-5[+] String content = rest.replace(modifier2, "").replaceAll("(\\{|\\})", "").trim(); modifier2 = modifier2.replaceAll("(\\w+\\[|\\]|\\{|\\})", "").trim(); ArrayList<Element> chars = annotateNumericals(content, text.matches(".*?\\b("+this.size+")\\b.*") || content.indexOf('/') > 0 || content.indexOf('%') > 0 || content.indexOf('.') > 0 ? "size" : null, (modifier1 + ";" + modifier2).replaceAll("(^\\W|\\W$)", ""), lastStructures(), resetfrom, false); updateLatestElements(chars); } else if (ck instanceof ChunkTHAN) { ArrayList<Element> chars = processTHAN(ck.toString().replaceFirst("^n\\[", "").replaceFirst("\\]$", ""), this.subjects); updateLatestElements(chars); } else if (ck instanceof ChunkTHANC) {// n[(longer) than {wide}] . 
ArrayList<Element> chars = processTHAN(ck.toString().replaceFirst("^n\\[", "").replaceFirst("\\]$", ""), this.subjects); updateLatestElements(chars); } else if (ck instanceof ChunkBracketed) { annotateByChunk(new ChunkedSentence(ck.getChunkedTokens(), ck.toString(), conn, glosstable, this.tableprefix), true); // no // need // to // updateLatestElements this.inbrackets = false; } else if (ck instanceof ChunkSBAR) { ArrayList<Element> subjectscopy = this.subjects; if (this.latestelements.get(this.latestelements.size() - 1).getName().compareTo("structure") == 0) { this.subjects = latest("structure", this.latestelements); } else { int p = cs.getPointer() - 2; String last = ""; // the chunk before ck?? do { last = cs.getTokenAt(p--); } while (last.matches(",") || last.matches("")); String constraintId = null; if (last.matches(".*?\\)\\]+")) { constraintId = "o" + (this.structid - 1); try { Element laststruct = (Element) XPath.selectSingleNode(this.statement, ".//structure[@id='" + constraintId + "']"); ArrayList<Element> temp = new ArrayList<Element>(); temp.add(laststruct); this.subjects = temp; } catch (Exception e) { e.printStackTrace(); } } else { // do nothing System.err.println("no structure element found for the SBARChunk, use subjects instead "); // this only works for situations where states before // subjects got reintroduced after subjects in skiplead // this will not work for misidentified nouns before // "that/which" statements, in "of/among which", and // other cases } } ArrayList<String> chunkedTokens = ck.getChunkedTokens(); String connector = ck.toString().substring(0, ck.toString().indexOf(" ")); String text = ck.toString(); while (!connector.matches("that|when|where|which")) { text = text.replace(connector, "").trim(); for (int i = 0; i < chunkedTokens.size(); i++) { if (chunkedTokens.get(i).equals(connector)) { chunkedTokens.set(i, ""); break; } } connector = text.substring(0, text.indexOf(" ")); } String content = text.substring(text.indexOf(" ") + 
1); ChunkedSentence newcs = new ChunkedSentence(chunkedTokens, content, conn, glosstable, this.tableprefix); Chunk firstck = newcs.getNextChunk(); if (firstck instanceof ChunkNonSubjectOrgan || firstck instanceof ChunkOrgan) { //if this chunk appears right after a comma boolean aftercomma = false; if(this.latestelements.size()>0 && this.latestelements.get(this.latestelements.size()-1).getName().compareTo("comma")==0){ aftercomma = true; } establishSubject(firstck.toString().replaceAll("\\w\\[", "").replaceAll("\\]", "").trim(), aftercomma); } if (connector.compareTo("when") == 0) {// rewrite content and // its chunkedTokens Pattern p = Pattern.compile("[\\.,:;]"); Matcher m = p.matcher(ck.toString()); int end = 0; if (m.find()) { end = m.start(); } // int end = ck.toString().indexOf(",") > 0? // ck.toString().indexOf(",") : ck.toString().indexOf("."); String modifier = ck.toString().substring(0, end).trim();// when // mature, content = ck.toString().substring(end).replaceAll("^\\W+", "").trim(); if (content.length() > 0) { ck.setChunkedTokens(Utilities.breakText(content)); newcs = new ChunkedSentence(ck.getChunkedTokens(), content, conn, glosstable, this.tableprefix); } else { newcs = null; } // attach modifier to the last characters if (this.latestelements.get(this.latestelements.size() - 1).getName().compareTo("character") == 0) { Iterator<Element> it = this.latestelements.iterator(); while (it.hasNext()) { this.addAttribute(it.next(), "modifier", modifier); } } else { if (newcs != null) newcs.unassignedmodifier = "m[" + modifier + "]";// this // when // clause // is // a // modifier // for // the // subclause else { if (this.latestelements.get(this.latestelements.size() - 1).getName().compareTo("comma") == 0) { this.latestelements.remove(this.latestelements.size() - 1); // remove // comma, // so // what // follows // when-clause // may // refer // to // the // structure // mentioned // before // as // in // <apex> // r[p[of] // o[(scape)]] // , // s[when // laid 
// {straight} // {back} // r[p[from] // o[its // (insertion)]] // ,] // just // touches // the // {midpoint} // r[p[of] // o[the // {posterior} // (margin)]] // r[p[in] // o[(fullface)]] // {view} // ; } cs.unassignedmodifier = "m[" + modifier.replaceAll("(\\w+\\[|\\]|\\(|\\)|\\{|\\})", "") + "]"; } } } if (connector.compareTo("where") == 0) { // retrieve the last non-comma, non-empty chunk int p = cs.getPointer() - 2; String last = ""; do { last = cs.getTokenAt(p--); } while (!last.matches(".*?\\w.*")); String constraintId = null; if (last.matches(".*?\\)\\]+")) constraintId = "o" + (this.structid - 1); cs.setClauseModifierConstraint(last.replaceAll("(\\w+\\[|\\]|\\{|\\}|\\)|\\()", ""), constraintId); } if (newcs != null) newcs.setInSegment(true); annotateByChunk(newcs, false); // no need to // updateLatestElements this.subjects = subjectscopy;// return to original status cs.setClauseModifierConstraint(null, null); // return to // original status // this.unassignedmodifiers = null; } else if (ck instanceof ChunkChrom) { String content = ck.toString().replaceAll("[^\\d()\\[\\],+ -]", "").trim(); // Element structure = new Element("chromosome"); Element structure = new Element("structure"); this.addAttribute(structure, "name", "chromosome"); this.addAttribute(structure, "name_original", ""); //what value should it be? 
this.addAttribute(structure, "id", "o" + this.structid); this.structid++; ArrayList<Element> list = new ArrayList<Element>(); list.add(structure); this.annotateNumericals(content, "count", "", list, false, false); /* * for(int i = 0; i<counts.length; i++){ Element character = new * Element("character"); this.addAttribute(character, "count", * counts[i]); structure.addContent(character); } */ addClauseModifierConstraint(cs, structure); // this.statement.addContent(structure); addContent(this.statement, structure); } else if (ck instanceof ChunkValuePercentage || ck instanceof ChunkValueDegree) { String content = ck.toString(); Element lastelement = this.latestelements.get(this.latestelements.size() - 1); if (lastelement != null && lastelement.getName().compareTo("character") == 0) { this.addAttribute(lastelement, "modifier", content); } else { content = content.replaceAll("(m\\[|\\])", "").replaceAll("(?<=[^\\d])-(?=[^\\d])", " "); ArrayList<Element> chars = annotateNumericals(content, "size", "", lastStructures(), false, false); updateLatestElements(chars); //cs.unassignedmodifier = content; } } else if (ck instanceof ChunkEOS || ck instanceof ChunkEOL) { if (cs.unassignedmodifier != null && cs.unassignedmodifier.length() > 0 && this.latestelements.size()>=1) { Element latestelement = this.latestelements.get(this.latestelements.size() - 1); // if(latestelement == null){ // latestelement = // this.createStructureElements("(placeholder)").get(0); // } if (latestelement.getName().compareTo("structure") == 0) { Iterator<Element> it = this.latestelements.iterator(); while (it.hasNext()) { String sid = it.next().getAttributeValue("id"); try { List<Element> relations = XPath.selectNodes(this.statement, ".//relation[@to='" + sid + "']"); Iterator<Element> rit = relations.iterator(); int greatestid = 0; Element relation = null; while (rit.hasNext()) { Element r = rit.next(); int rid = Integer.parseInt(r.getAttributeValue("id").replaceFirst("r", "")); if (rid > greatestid) { 
greatestid = rid; relation = r; } } if (relation != null) this.addAttribute(relation, "modifier", cs.unassignedmodifier); // TODO: otherwise, categorize modifier and // create a character for the structure // e.g.{thin} {dorsal} {median} <septum> // {centrally} only ; } catch (Exception e) { e.printStackTrace(); } } } else if (latestelement.getName().compareTo("character") == 0) { Iterator<Element> it = this.latestelements.iterator(); while (it.hasNext()) { this.addAttribute(it.next(), "modifier", cs.unassignedmodifier); } } } this.attachToLast = false; cs.unassignedmodifier = null; this.unassignedcharacter = null; } } } /** * {relative~{A~charA}~{relation}~{B~charB'}} * * <structure id='1' name='A'/> * <structure id='2' name='B'/> * <relation name='charA relation charB'' from='1' to='2'> * @param string * @return */ private ArrayList<Element> processChunkCharacterComparison(String content) { ArrayList<Element>result = new ArrayList<Element>(); String [] parts = content.replaceFirst("^\\{relative~", "").split("\\{"); //three parts String[] part1 = parts[1].replaceAll("[{}]", "").split("~"); //parts[0] = "" String organA = part1[0]; String charA = part1[1]; String[] part2 = parts[3].replaceAll("[{}]", "").split("~"); String organB="", charB=""; //part2 may have one or two elements if(part2.length==1){ if(part2[0].matches("\\b("+this.characters+")\\b")){ charB = part2[0]; organB = organA; }else{ organB = part2[0]; charB = charA; } }else{ organB = part2[0]; charB = part2[1]; } String relation = charA+" "+parts[2].replaceAll("[{}~]", "")+" "+charB; ArrayList<Element> structureA = this.createStructureElements("("+organA+")", false); ArrayList<Element> structureB = null; if(organB.compareTo(organA)!=0){ structureB = this.createStructureElements(organB, false); }else{ structureB = structureA; } this.createRelationElements("#quality comparison# "+relation, structureA, structureB, "", false); return result; //empty as don't want to expose the elements } private void 
addClauseModifierConstraint(ChunkedSentence cs, Element e) { ArrayList<String> cm = cs.getClauseModifierConstraint(); if (cm != null) { if (cm.size() > 1) {// is a constraint this.addAttribute(e, "constraint", cm.get(0)); this.addAttribute(e, "constraintid", cm.get(1)); } else { this.addAttribute(e, "modifier", cm.get(0)); } } } /** * track back in this.chunkedTokens to populate the afteror element afteror * shares the same character name and value with beforeor, but have * different modifier--which is found from the missing text branched * distally or throughout constricted distally or not subequal or weakly to * strongly well distributed or not dioecious or nearly so spinulose or not * openly branched distally or throughout branched proximally or distally * usually 1 cm or less * * @param afteror * @param beforor */ private Element traceBack4(Element afteror, Element beforeor, int afterorindex, int endindex) { String text = cs.getText(afterorindex, endindex); // from afterorindex // (include) to // endindex (not // include) text = text.replaceAll("SG", "").replaceAll("\\W+", " ").replaceAll("\\s+", " ").trim(); text = text.replaceFirst("\\s+so$", ""); afteror = (Element) beforeor.clone(); this.addAttribute(afteror, "modifier", text); return afteror; } private ArrayList<Element> annotateNumericals(String chunktext, String character, String modifier, ArrayList<Element> parents, boolean resetfrom, boolean characterismodifier) { ArrayList<Element> chars = null; // if(character!=null && character.compareTo("size")==0 && // chunktext.contains("times")){ if (character == null){ character = "count"; // convenient for phenoscape parsing as it // doesn't care numerical values } chunktext = chunktext.replaceAll("("+ChunkedSentence.percentage+")\\b", " %"); chars = parseNumericals(chunktext, character); // annotate "2 times" // without changing // NumericalHandler.parseNumericals // }else{ // chars = NumericalHandler.parseNumericals(chunktext, character); // //full numerical 
parsing for FNA-like data // } if (chars.size() == 0) {// failed, simplify chunktext chunktext = chunktext.replaceAll("[()\\]\\[]", ""); if (character != null && character.compareTo("size") == 0 && chunktext.contains("times")) { chars = parseNumericals(chunktext, character); } else { chars = NumericalHandler.parseNumericals(chunktext, character); } } Iterator<Element> it = chars.iterator(); ArrayList<Element> results = new ArrayList<Element>(); while (it.hasNext()) { Element e = it.next(); if (resetfrom && e.getAttribute("from") != null && e.getAttributeValue("from").equals("0") && (e.getAttribute("from_inclusive") == null || e.getAttributeValue("from_inclusive").equals("true"))) {// to // 6[-9] // m. e.removeAttribute("from"); if (e.getAttribute("from_unit") != null) { e.removeAttribute("from_unit"); } } if (modifier != null && modifier.compareTo("") != 0) { this.addAttribute(e, "modifier", modifier); } if (this.inbrackets) { e.setAttribute("in_bracket", "true"); } if(characterismodifier){ e.setAttribute("is_modifier", "true"); if(debugextraattributes) System.out.println("is modifier:"+e.getAttributeValue("value")); } /* * if(this.unassignedmodifiers != null && * this.unassignedmodifiers.compareTo("") !=0){ this.addAttribute(e, * "modifier", this.unassignedmodifiers); this.unassignedmodifiers = * ""; } */ Iterator<Element> pit = parents.iterator(); while (pit.hasNext()) { Element ec = (Element) e.clone(); ec.detach(); Element p = pit.next(); // p.addContent(ec); addContent(p, ec); results.add(ec); } } return results; } /** * this is a vastly simplified version of NumericalHandler.parseNumericals() * which was developed for FNA numerical experssions * * @param chunktext * @param character * @return */ private ArrayList<Element> parseNumericals(String chunktext, String character) { ArrayList<Element> chars = new ArrayList<Element>(); Element chara = new Element("character"); chara.setAttribute("name", character);// "size" chara.setAttribute("value", 
chunktext.trim());// "2 times" [the length // of organ a] as // constraint annotated // elsewhere. chars.add(chara); return chars; } private ArrayList<Element> lastStructures() { ArrayList<Element> parents; if (this.latestelements.size() > 0 && this.latestelements.get(this.latestelements.size() - 1).getName().compareTo("structure") == 0) { parents = this.latestelements; }else { parents = this.subjects; } return parents; } /** * 3 times n[...than...] lengths 0.5�0.6+ times <bodies> ca .3.5 times * length of <throat> 1�3 times {pinnately} {lobed} 1�2 times * shape[{shape~list~pinnately~lobed~or~divided}] 4 times longer than wide * * * * @param content * : 0.5�0.6+ times a[type[bodies]] * @param subjects2 * @return */ private ArrayList<Element> processComparativeValue(String content, ArrayList<Element> parents) { if (content.startsWith("n[")) { content = content.replaceFirst("^n\\[", "").replaceFirst("\\]", "").trim(); } String v = content.replaceAll("(" + ChunkedSentence.times + ").*$", "").trim(); // v // holds // numbers String n = content.replace(v, "").trim(); if (n.indexOf("constraint") >= 0) { n = n.replaceFirst("constraint\\[", "").replaceFirst("\\]$", ""); // n // holds // "times.... 
} if (n.indexOf("n[") >= 0) {// 1.5�2.5 times n[{longer} than (throat)] // content = "n["+content.replace("n[", ""); // added, not tested if (n.matches("\\b(" + ChunkedSentence.times + ")\\b.*")) { n = n.replaceFirst("\\b(" + ChunkedSentence.times + ")\\b", "").trim(); v = v + " times"; } content = v.replaceFirst("(^| )(?=\\d)", " size[") + "] constraint[" + n.replaceFirst("n\\[", "").trim(); // m[usually] // 1.5-2 return this.processTHAN(content, parents); } else if (n.indexOf("type[") == 0 || n.indexOf(" type[") > 0) {// size[{longer}] // constraint[than // (object}] // this.processSimpleCharacterState("a[size["+v.replace(" times", // "")+"]]", parents); // ArrayList<Element> structures = this.processObject(n); // this.createRelationElements("times", parents, structures, // this.unassignedmodifiers); // this.unassignedmodifiers = null; // return structures; // added, not tested if (n.matches("\\b(" + ChunkedSentence.times + ")\\b.*")) { n = n.replaceFirst("\\b(" + ChunkedSentence.times + ")\\b", "").trim(); v = v + " times"; } n = "constraint[" + n.replaceFirst("type\\[", "(").replaceFirst("\\]", ")").replaceAll("a\\[", ""); // 1-1.6 // times // u[o[bodies]] // => // constraint[(bodies)] content = "size[" + v + "] " + n; return this.processTHAN(content, parents); } else if (n.indexOf("o[") >= 0 || n.indexOf("z[") >= 0) {// ca .3.5 // times // length // r[p[of] // o[(throat)]] if (n.matches("\\b(" + ChunkedSentence.times + ")\\b.*")) { n = n.replaceFirst("\\b(" + ChunkedSentence.times + ")\\b", "").trim(); v = v + " times"; } n = "constraint[" + n.replaceAll("[o|z]\\[", ""); // times // o[(bodies)] // => // constraint[times // (bodies)] content = "size[" + v + "] " + n; return this.processTHAN(content, parents); } else if (n.indexOf("a[") == 0 || n.indexOf(" a[") > 0) { // characters:1�3 // times // {pinnately} // {lobed} String times = n.substring(0, n.indexOf(' ')); n = n.substring(n.indexOf(' ') + 1); n = n.replaceFirst("a\\[", "").replaceFirst("\\]$", ""); n = 
"m[" + v + " " + times + "] " + n; return this.processSimpleCharacterState(n, parents); } else if (content.indexOf("[") < 0) { // {forked} {moreorless} unevenly // ca . 3-4 times , // content = 3-4 times; v = 3-4; n=times // marked as a constraint to the last character "forked". "ca." // should be removed from sentences in SentenceOrganStateMarker.java Element lastelement = this.latestelements.get(this.latestelements.size() - 1); if (lastelement.getName().compareTo("character") == 0) { Iterator<Element> it = this.latestelements.iterator(); while (it.hasNext()) { lastelement = it.next(); if (cs.unassignedmodifier != null && cs.unassignedmodifier.trim().length() != 0) { lastelement.setAttribute("modifier", cs.unassignedmodifier); cs.unassignedmodifier = null; } lastelement.setAttribute("constraint", content); } } else if (lastelement.getName().compareTo("structure") == 0) { return null; // parsing failure } return this.latestelements; } return null; } /** * size[{longer}] constraint[than (object)]"; shape[{lobed} constraint[than * (proximal)]] * * m[at-least] 2 times n[length[{longer}] constraint[than {wide}] * size[less than 2 times {longer} constraint[than {wide}]] * * @param replaceFirst * @param subjects2 * @return */ private ArrayList<Element> processTHAN(String content, ArrayList<Element> parents) { ArrayList<Element> charas = new ArrayList<Element>(); String modifier = ""; while(content.startsWith("m[")){ modifier += content.substring(0, content.indexOf("]")+1); content = content.substring(content.indexOf("]")+1).trim(); } String[] parts = content.split("constraint\\["); if (content.startsWith("constraint")) { charas = latest("character", this.latestelements); } else { String ch = ""; if(parts[0].contains(" n[") && parts.length>1 && parts[1].matches(".*?\\b("+ChunkedSentence.asasthan+")\\b.*")){//both parts contains a dimension String t = parts[0].substring(parts[0].indexOf(" n[")+3); ch = t.substring(0, t.indexOf("[")); } if (parts[0].matches(".*?\\d.*") && 
parts[0].matches(".*(size|orientation)\\[.*")) {// size[m[mostly] // [0.5-]1.5-4.5] // ;// // often // wider // than // 2 // cm. if(ch.length()==0 && parts[0].indexOf("size[")>=0){ ch = "size"; } if(parts[0].indexOf("orientation[")>=0) ch = "orientation"; parts[0] = parts[0].trim().replace("size[", "").replace("orientation[", "").replaceFirst("\\]$", ""); Pattern p = Pattern.compile(NumericalHandler.numberpattern + " ?[{<(]?(?:"+ChunkedSentence.units+"|"+ChunkedSentence.percentage+"|"+ChunkedSentence.degree+")?[)>}]?\\b?(" + ChunkedSentence.times + ")?\\b"); Matcher m = p.matcher(parts[0]); String numeric = ""; if (m.find()) { // a series of number numeric = parts[0].substring(m.start(), m.end()).trim().replaceAll("[{<(]$", ""); } else { p = Pattern.compile("\\d+ ?[{<(]?(?:"+ChunkedSentence.units+"|"+ChunkedSentence.percentage+"|"+ChunkedSentence.degree+")?[)>}]?\\b?(" + ChunkedSentence.times + ")?\\b"); // 1 // number m = p.matcher(parts[0]); m.find(); numeric = parts[0].substring(m.start(), m.end()).trim().replaceAll("[{<(]$", ""); } modifier = modifier.replaceAll("(m\\[|\\])", ""); modifier = modifier+";"+ parts[0].substring(0, parts[0].indexOf(numeric)).replaceAll("(\\w+\\[|\\[|\\]|\\{|\\})", "").trim(); modifier = modifier+";"+ parts[0].substring(parts[0].indexOf(numeric)+numeric.length()).replaceAll("(\\w+\\[|\\[|\\]|\\{|\\})", "").trim(); modifier = modifier.replaceAll(";+", ";").replaceAll("(^;|;$)", "").replaceAll("-", " "); if (parts.length < 2) {// parse out a constraint for further // process //String constraint = parts[0].substring(parts[0].indexOf(numeric) + numeric.length()).trim(); //treated as a modifier above String t = parts[0]; parts = new String[2];// parsed out a constraint for further // process parts[0] = t; //parts[1] = constraint; parts[1] = ""; } /* * String modifier = parts[0].replaceFirst("size\\[.*?\\]", * ";").trim().replaceAll("(^;|;$|\\w\\[|\\])", ""); String * numeric = parts[0].substring(parts[0].indexOf("size[")); * numeric = 
numeric.substring(0, * numeric.indexOf("]")+1).replaceAll("(\\w+\\[|\\])", ""); */ if(modifier.indexOf(" or ")>0){ String[] mods = modifier.split(" or "); for(String mod: mods){ charas.addAll(this.annotateNumericals(numeric.replaceAll("[{<()>}]", ""), ch, mod.replaceAll("[{<()>}]", ""), parents, false, false)); } }else{ charas = this.annotateNumericals(numeric.replaceAll("[{<()>}]", ""), ch, modifier.replaceAll("[{<()>}]", ""), parents, false, false); } } else {// size[{shorter} than {plumose} {inner}]; size[{equal-to} or {greater} than] String value = ""; String mod = ""; if(modifier.length()>0){ if(modifier.contains("m[not]")) mod = "not"; if(modifier.contains("m[no]")) mod = "no"; if(modifier.contains("m[much]")) value = "much"; } if(parts[0].indexOf(" or ")>0){ if(ch.length()==0) ch = parts[0].substring(0, parts[0].indexOf("[")); parts[0] = parts[0].replaceAll("(\\w+\\[|\\])", ""); String[] subparts = parts[0].split("( or | , )"); for(String subpart: subparts){ subpart = mod+";"+subpart.replaceAll("(\\{|\\})", "").trim(); subpart = subpart.replaceAll("-", " "); subpart = subpart.replaceFirst("\\s+than$", ""); this.createCharacterElement(parents, charas, subpart.replaceAll(";+", ";").replaceAll("(^;|;$)", "").replaceAll("-", " "), value, ch, "", false); //charas.addAll(this.processSimpleCharacterState(ch+"["+subpart.replaceAll("(\\{|\\})", "").trim()+"]", parents)); } }else{ if(ch.length()==0) ch = parts[0].substring(0, parts[0].indexOf("[")); this.createCharacterElement(parents, charas, (mod+";"+parts[0].replaceAll("(\\{|\\})", "").trim()).replaceAll(";+", ";").replaceAll("(^;|;$)","").replaceAll("-", " "), value, ch, "", false); //charas = this.processSimpleCharacterState(parts[0].replaceAll("(\\{|\\})", "").trim(), parents); // numeric part } } } String object = null; ArrayList<Element> structures = new ArrayList<Element>(); if (parts.length > 1 && parts[1].length() > 0) {// parts[1]: than // (other) {pistillate} // (paleae)] if (parts[1].indexOf("(") >= 0) { 
// (tail of processTHAN: constraint object and constraint attributes, statements unchanged)
                String ostr = parts[1];
                object = ostr.replaceFirst("^.*?(?=[({])", "").replaceFirst("\\]+$", ""); // (other) {pistillate} (paleae)
                object = "o[" + object + "]";
                if (object != null) {
                    structures.addAll(this.processObject(object));
                }
                /*
                 * while(ostr.indexOf('(')>=0){ object =
                 * ostr.substring(ostr.indexOf('('), ostr.indexOf(')')+1);
                 * object = "o["+object+"]"; ostr =
                 * ostr.substring(ostr.indexOf(')')+1); if(object != null){
                 * structures.addAll(this.processObject(object)); } }
                 */
            }
            // have constraints even without an organ 12/15/10
            Iterator<Element> it = charas.iterator();
            while (it.hasNext()) {
                Element e = it.next();
                // if(parts[1].indexOf("(")>=0){
                // this.addAttribute(e, "constraint",
                // this.listStructureNames(parts[1]));
                // }else{
                String constraint = parts[1].replaceAll("(\\(|\\)|\\{|\\}|\\w*\\[|\\])", "");
                constraint = map2character(constraint); //long => length
                this.addAttribute(e, "constraint", constraint);
                // }
                if (object != null) {
                    this.addAttribute(e, "constraintid", this.listStructureIds(structures));// TODO: check: some constraints are without constraintid
                }
            }
        }
        if (structures.size() > 0) {
            return structures;
        } else {
            return charas;
        }
    }

    /**
     * Rewrites dimension adjectives in a constraint to the name of the
     * dimension they describe, e.g. "long" and "longer" become "length".
     * The base and comparative forms of the words listed in
     * ChunkedSentence.asasthan
     * ("long|wide|broad|tall|high|deep|short|narrow|thick") are covered;
     * superlative forms are not mapped. Only whole words are replaced.
     *
     * @param constraint
     *            the constraint text
     * @return the text with every listed adjective replaced by "length",
     *         "width" or "height"
     */
    private String map2character(String constraint) {
        // {whole-word pattern, dimension}; applied in this order
        final String[][] rules = {
                { "\\blong\\b", "length" }, { "\\blonger\\b", "length" },
                { "\\bwide\\b", "width" }, { "\\bwider\\b", "width" },
                { "\\bbroad\\b", "width" }, { "\\bbroader\\b", "width" },
                { "\\btall\\b", "height" }, { "\\btaller\\b", "height" },
                { "\\bhigh\\b", "height" }, { "\\bhigher\\b", "height" },
                { "\\bdeep\\b", "height" }, { "\\bdeeper\\b", "height" },
                { "\\bshort\\b", "length" }, { "\\bshorter\\b", "length" },
                { "\\bnarrow\\b", "width" }, { "\\bnarrower\\b", "width" },
                { "\\bthick\\b", "height" }, { "\\bthicker\\b", "height" } };
        String mapped = constraint;
        for (String[] rule : rules) {
            mapped = mapped.replaceAll(rule[0], rule[1]);
        }
        return mapped;
    }

    /**
     * Collects the unbroken run of elements called name at the end of list.
     *
     * @param name
     *            the element name to look for
     * @param list
     *            the elements to scan, from the last entry backwards
     * @return the matching trailing elements, last element of list first; empty
     *         when the last element has a different name or list is empty
     */
    private ArrayList<Element> latest(String name, ArrayList<Element> list) {
        ArrayList<Element> selected = new ArrayList<Element>();
        int i = list.size() - 1;
        while (i >= 0 && list.get(i).getName().compareTo(name) == 0) {
            selected.add(list.get(i));
            i--;
        }
        return selected;
    }

    /**
     * Placeholder without behavior.
     *
     * @param content
     *            not used
     */
    private void processChunkBracketed(String content) {
        // TODO Auto-generated method stub
    }

    /**
     * Annotates a verb chunk such as
     * m[usually] v[comprising] o[a {surrounding} (involucre)]
     * as a relation from this.subjects to the structures named in the object.
     *
     * @param content
     *            the chunk text: optional m[...] modifiers, a v[...] verb part
     *            and an o[...] object; the object is taken from the last "o["
     * @param parents
     *            not read by this method: the relation is always created from
     *            this.subjects
     * @return the structures created for the object, or the current
     *         latestelements when the object contains no "(" after
     *         parenthesis() has been applied to it
     */
    private ArrayList<Element> processTVerb(String content, ArrayList<Element> parents) {
        // NOTE(review): lastIndexOf("o[") / indexOf("v[") are used unchecked, so a
        // chunk without these tags ends in StringIndexOutOfBoundsException - confirm
        // every chunk handed in carries both.
        String object = content.substring(content.lastIndexOf("o["));
        String beforeObject = content.replace(object, "").trim();
        String relation = beforeObject.substring(beforeObject.indexOf("v["));
        String modifier = beforeObject.replace(relation, "").trim().replaceAll("(m\\[|\\])", "");

        object = parenthesis(object); // o[(fibula) {size}]]]
        if (object.indexOf("(") < 0) {
            return latestelements;
        }

        // cut the object off after its last organ
        object = object.substring(0, object.lastIndexOf(")") + 1) + "]";
        // TODO: fix content is wrong. i8: o[a] architecture[surrounding (involucre)]
        ArrayList<Element> tostructures = this.processObject(object);
        ArrayList<Element> results = new ArrayList<Element>();
        results.addAll(tostructures);
        this.createRelationElements(relation.replaceAll("(\\w\\[|\\])", ""), this.subjects, tostructures, modifier, false);
        return results;
    }

    /**
     * @param content
     *            : m[usually] coloration[dark brown]: there is only one
     *            character states and several modifiers
     * @param parents
     *            : of the character states
     */
    private ArrayList<Element> processSimpleCharacterState(String content, ArrayList<Element> parents) {
        ArrayList<Element> results = new ArrayList<Element>();
        String modifier = "";
        String character = "";
        String state = "";
        String[] tokens = content.split("\\]\\s*");
        for (int i = 0; i < tokens.length; i++) {
            //Changed by Zilong: update: this change created numerous errors -- abandoned by Hong 7/1/13.
//more ventrally->more should be modifier of ventrally //however, parsed as adj[more] adv[ventrally] //if(tokens[i].matches("^comparison\\[.*")){ // if(i<tokens.length-1){//only if not the last token // if(tokens[i+1].matches("^m\\[.*")){//next token is a modifier. // modifier +=tokens[i].replaceAll("^comparison\\[", "").trim()+" "+tokens[i+1]+" "; // i++; // continue; // } // } //}else //changed by Zilong if (tokens[i].matches("^m\\[.*")) { modifier += tokens[i] + " "; } else if (tokens[i].matches("^\\w+\\[.*")) { String[] parts = tokens[i].split("\\["); character = parts[0]; if (this.unassignedcharacter != null) { character = this.unassignedcharacter; this.unassignedcharacter = null; } state = parts[1]; modifier += "; "; } } modifier = modifier.replaceAll("m\\[", "").trim().replaceAll("(^\\W|\\W$)", "").trim(); // make backups // String statecp = state; // String charactercp = character; Element lastelement = this.latestelements.get(this.latestelements.size() - 1); // Altered by Zilong // if the a simple character state is immediately preceded by a // conjunction, // the last element should not be changed to have the same attribute // "name" as // the current element. String conjunction = "(and|or)"; String previousToken = this.cs.getTokenAt((this.cs.getPointer() - 2)); if (previousToken != null) { if (previousToken.matches(conjunction)) { this.createCharacterElement(parents, results, modifier, state, character, "", false); } // deal with possible 3cm wide, or high relief cases: rewrite the // logic to make is simple and more robust else if (lastelement.getName().compareTo("character") == 0) { String eqcharacter = ChunkedSentence.eqcharacters.get(state); // find // the // equivalent // character // for // the // state, // e.g. 
// wide, // relief if (eqcharacter != null) {// yes, it is the case Iterator<Element> it = this.latestelements.iterator(); while (it.hasNext()) { lastelement = it.next(); lastelement.setAttribute("name", eqcharacter); } results = this.latestelements; } else {// no, it is not the case this.createCharacterElement(parents, results, modifier, state, character, "", false); } } else {// no, it is not the case this.createCharacterElement(parents, results, modifier, state, character, "", false); } } else { this.createCharacterElement(parents, results, modifier, state, character, "", false); } /* * seemed to be unnecessarily complexed and sensitive to the glossary * (the lookupcharacter step determines the logic) String eqcharacter = * ChunkedSentence.eqcharacters.get(state); if(eqcharacter != null){ * state = eqcharacter; character = * TermOutputerUtilities.lookupCharacter(eqcharacter, conn, * ChunkedSentence.characterhash, this.glosstable, this.tableprefix); * if(character ==null){ state = statecp; character = charactercp; } } * if(character.compareToIgnoreCase("character")==0 && modifier.length() * ==0){//high relief: character=relief, reset the character of "high" * to "relief" Iterator<Element> it = this.latestelements.iterator(); * while(it.hasNext()){ lastelement = it.next(); * lastelement.setAttribute("name", state); } }else * if(lastelement.getName().compareTo("structure")==0){ * this.unassignedcharacter = state; } results = this.latestelements; * }else if(state.length()>0){ //if(this.unassignedmodifiers!=null && * this.unassignedmodifiers.length()>0){ // modifier = * modifier+";"+this.unassignedmodifiers; // this.unassignedmodifiers = * ""; //} this.createCharacterElement(parents, results, modifier, * state, character, ""); } */ return results; } private void establishSubject(String content, boolean aftercomma/* , boolean makeconstraint */) { ArrayList<Element> structures = createStructureElements(content, aftercomma/* * , * makeconstraint */); this.subjects = new 
ArrayList<Element>(); this.latestelements = new ArrayList<Element>(); Iterator<Element> it = structures.iterator(); while (it.hasNext()) { Element e = it.next(); if (e.getName().compareTo("structure") == 0) { // ignore character // elements this.subjects.add(e); this.latestelements.add(e); } } } // fix: can not grab subject across treatments private void reestablishSubject() { Iterator<Element> it = this.subjects.iterator(); this.latestelements = new ArrayList<Element>(); while (it.hasNext()) { Element e = it.next(); e.detach(); // this.statement.addContent(e); addContent(this.statement, e); this.latestelements.add(e); } } /** * TODO: * {shape~list~usually-flat-to-convex-punct-sometimes-conic-or-columnar} * {pubescence-list-sometimes-bristly-or-hairy} * * @param content * : pubescence[m[not] * {pubescence-list-sometimes-bristly-or-hairy}] * @param parents * @param characterismodifier * @return */ private ArrayList<Element> processCharacterList(String content, ArrayList<Element> parents, boolean characterismodifier) { ArrayList<Element> results = new ArrayList<Element>(); String modifier = ""; if (content.indexOf("m[") >= 0) { modifier = content.substring(content.indexOf("m["), content.indexOf("{")); content = content.replace(modifier, ""); modifier = modifier.trim().replaceAll("(m\\[|\\])", ""); } content = content.replace(modifier, ""); String[] parts = content.split("\\["); String cname = ""; String list = ""; if (parts.length < 2) { // {count~list~2~or~fewer} int i = parts[0].indexOf("~list~"); if (i > 0) { cname = parts[0].substring(0, i).replace("{", ""); list = parts[0]; } else return results; // @TODO: parsing failure } else { cname = parts[0]; list = parts[1]; } if (this.unassignedcharacter != null) { cname = this.unassignedcharacter; this.unassignedcharacter = null; } String cvalue = list.replaceFirst("\\{" + cname + "~list~", "").replaceFirst("\\W+$", "").replaceAll("~", " ").trim(); if (cname.endsWith("ttt")) { this.createCharacterElement(parents, 
results, modifier, cvalue, cname.replaceFirst("ttt$", ""), "", characterismodifier); return results; } if (cvalue.indexOf(" to ") >= 0) { createRangeCharacterElement(parents, results, modifier, cvalue.replaceAll("punct", ",").replaceAll("(\\{|\\})", ""), cname, characterismodifier); // add // a // general // statement: // coloration="red to brown" } String mall = ""; boolean findm = false; // gather modifiers from the end of cvalues[i]. this modifier applies to // all states do { findm = false; String last = cvalue.substring(cvalue.lastIndexOf(' ') + 1); if (Utilities.lookupCharacter(last, conn, ChunkedSentence.characterhash, glosstable, tableprefix) == null && Utilities.isAdv(last, ChunkedSentence.adverbs, ChunkedSentence.notadverbs)) { mall += last + " "; cvalue = cvalue.replaceFirst(last + "$", "").trim(); findm = true; } } while (findm); String[] cvalues = cvalue.split("\\b(to|or|punct)\\b");// add individual // values for (int i = 0; i < cvalues.length; i++) { String state = cvalues[i].trim();// usually papillate to hirsute // distally // gather modifiers from the beginning of cvalues[i]. a modifier // takes effect for all state until a new modifier is found String m = ""; do { findm = false; if (state.length() == 0) { break; } int end = state.indexOf(' ') == -1 ? state.length() : state.indexOf(' '); String w = state.substring(0, end); if (Utilities.lookupCharacter(w, conn, ChunkedSentence.characterhash, glosstable, tableprefix) == null && Utilities.isAdv(w, ChunkedSentence.adverbs, ChunkedSentence.notadverbs)) { m += w + " "; w = w.replaceAll("\\{", "\\\\{").replaceAll("\\}", "\\\\}").replaceAll("\\(", "\\\\(").replaceAll("\\)", "\\\\)").replaceAll("\\+", "\\\\+"); state = state.replaceFirst(w, "").trim(); findm = true; } } while (findm); if (m.length() == 0) { state = (modifier + " " + mall + " " + state.replaceAll("\\s+", "#")).trim(); // prefix // the // previous // modifier } else { modifier = modifier.matches(".*?\\bnot\\b.*") ? 
modifier + " " + m : m; // update // modifier // cvalues[i] = (mall+" "+cvalues[i]).trim(); state = (modifier + " " + mall + " " + state.replaceAll("\\s+", "#")).trim(); // prefix // the // previous // modifier } String[] tokens = state.split("\\s+"); tokens[tokens.length - 1] = tokens[tokens.length - 1].replaceAll("#", " "); results.addAll(this.processCharacterText(tokens, parents, cname, characterismodifier)); // results.addAll(this.processCharacterText(new String[]{state}, // parents, cname)); } return results; } /** * crowded to open for categorical range-value * * @param parents * @param results * @param modifier * @param cvalue * @param cname * @param characterismodifier */ private String createRangeCharacterElement(ArrayList<Element> parents, ArrayList<Element> results, String modifiers, String cvalue, String cname, boolean characterismodifier) { Element character = new Element("character"); if (this.inbrackets) { character.setAttribute("in_bracket", "true"); } if(characterismodifier){ character.setAttribute("is_modifier", "true"); if(debugextraattributes) System.out.println("is modifier:"+cvalue); } character.setAttribute("char_type", "range_value"); character.setAttribute("name", cname); String[] range = cvalue.split("\\s+to\\s+");// a or b, c, to d, c, e String[] tokens = range[0].replaceFirst("\\W$", "").replaceFirst("^.*?\\s+or\\s+", "").split("\\s*,\\s*"); // a // or // b, // c, // => String from = getFirstCharacter(tokens[tokens.length - 1]); tokens = range[1].split("\\s*,\\s*"); String to = getFirstCharacter(tokens[0]); character.setAttribute("from", from.replaceAll("-c-", " ")); // a or b // to c // => b // to c character.setAttribute("to", to.replaceAll("-c-", " ")); boolean usedm = false; Iterator<Element> it = parents.iterator(); while (it.hasNext()) { Element e = it.next(); character = (Element) character.clone(); if (modifiers.trim().length() > 0) { addAttribute(character, "modifier", modifiers.trim()); // may // not // have usedm = true; } 
results.add(character); // add to results // e.addContent(character);//add to e addContent(e, character); } if (usedm) { modifiers = ""; } addClauseModifierConstraint(cs, character); return modifiers; } /** * * @param tokens * : usually large * @return: large */ private String getFirstCharacter(String character) { String[] tokens = character.trim().split("\\s+"); String result = ""; for (int i = 0; i < tokens.length; i++) { if (Utilities.lookupCharacter(tokens[i], conn, ChunkedSentence.characterhash, glosstable, tableprefix) != null) { result += tokens[i] + " "; } } return result.trim(); } /** * * @param elements */ private void updateLatestElements(ArrayList<Element> elements) { this.latestelements = new ArrayList<Element>(); if (elements != null) { latestelements.addAll(elements); } } /** * //t[c/r[p/o]] m[sometimes] v[subtended] r[p[by] o[(calyculi)]] m[loosely * loosely] architecture[arachnoid] r[p[at] o[m[distal] {end}]] * * t[c[{sometimes} with (bases) {decurrent}] r[p[onto] o[(stems)]]] * * nested:{often} {dispersed} r[p[with] o[aid r[p[from] o[(pappi)]]]] * * @param ck */ private void processCHPP(String content) { // having oval outline if (this.characterPrep(content)) { return; } String c = content.substring(0, content.indexOf("r[")); String r = content.replace(c, ""); if (r.lastIndexOf("o[") < 0) { // #{usually} {arising} r[p[in]]# // {distal} 1/2 // failed parse cs.setPointer2NextComma(); return; } String p = r.substring(0, r.lastIndexOf("o["));// {often} {dispersed} // r[p[with] o[aid // r[p[from] // o[(pappi)]]]] String o = r.replace(p, ""); String[] mc = c.split("(?<=\\])\\s*"); String m = ""; c = ""; for (int i = 0; i < mc.length; i++) { if (mc[i].startsWith("m[")) { m += mc[i] + " "; } else if (mc[i].startsWith("c[")/* mc[i].matches("^\\w+\\[.*") */) { c += mc[i] + " "; } } m = m.replaceAll("(m\\[|\\]|\\{|\\})", "").trim(); c = c.replaceAll("(c\\[|\\]|\\{|\\})", "").trim(); // TODO: will this // work for nested // chuncks? 
p = p.replaceAll("(\\w\\[|\\])", "").trim(); // c: {loosely} {arachnoid} String[] words = c.split("\\s+"); if (Utilities.isVerb(words[words.length - 1], ChunkedSentence.verbs, ChunkedSentence.notverbs) || p.compareTo("to") == 0) {// t[c[{connected}] // r[p[by] // o[{conspicuous} // {arachnoid} // <trichomes>]]] // TODO: // what // if // c // was // not // included // in // this // chunk? String relation = (c + " " + p).replaceAll("\\s+", " "); o = o.replaceAll("(o\\[|\\])", ""); /* * if(!o.endsWith(")") &&!o.endsWith("}")){ //1-5 series => 1-5 * (series) String t = o.substring(o.lastIndexOf(' ')+1); o = * o.replaceFirst(t+"$", "("+t)+")"; } */ if (!o.endsWith(")")) { // force () on the last word. Hong 3/4/11 String t = o.substring(o.lastIndexOf(' ') + 1); t = t.replace("{", "").replace("}", ""); o = o.substring(0, o.lastIndexOf(' ') + 1) + "(" + t + ")"; // System.out.println("forced organ in: "+o); } ArrayList<Element> structures = processObject("o[" + o + "]"); ArrayList<Element> entity1 = null; Element e = this.latestelements.get(this.latestelements.size() - 1); if (e.getName().matches("(" + this.delims + ")") || e.getName().compareTo("character") == 0) { entity1 = this.subjects; } else { entity1 = (ArrayList<Element>) this.latestelements.clone(); // entity1.remove(entity1.size()-1); } createRelationElements(relation, entity1, structures, m, false); updateLatestElements(structures); } else {// c: {loosely} {arachnoid} : should be m[loosly] // architecture[arachnoid] // String[] tokens = c.replaceAll("[{}]", "").split("\\s+"); // ArrayList<Element> charas = this.processCharacterText(tokens, // this.subjects); ArrayList<Element> charas = this.processSimpleCharacterState(c, this.subjects); updateLatestElements(charas); processPrep(new ChunkPrep(r)); // not as a relation } } /** * CK takes form of relation character/states [structures]? update * this.latestElements with structures only. 
* * nested1: r[p[of] o[5-40 , * fusion[{fusion~list~distinct~or~basally~connate}] r[p[in] o[groups]] , * coloration[{coloration~list~white~to~tan}] , {wholly} or {distally} * {plumose} (bristles)]] []] nested2: r[p[with] o[{central} {cluster} * r[p[of] o[(spines)]]]] * * @param ck * @param asrelation * : if this PP should be treated as a relation */ private void processPrep(ChunkPrep ck) { String ckstring = ck.toString(); // r[{} {} p[of] o[.....]] String modifier = ckstring.substring(0, ckstring.indexOf("p[")).replaceFirst("^r\\[", "").replaceAll("[{}]", "").trim(); // sometime o[] is not here as in ckstring=r[p[at or above]] {middle} // String pp = ckstring.substring(ckstring.indexOf("p["), // ckstring.lastIndexOf("] o[")).replaceAll("(\\w\\[|])", ""); // String object = // ckstring.substring(ckstring.lastIndexOf("o[")).replaceFirst("\\]+$", // "")+"]"; int objectindex = ckstring.indexOf("]", ckstring.indexOf("p[") + 1); String pp = ckstring.substring(ckstring.indexOf("p["), objectindex).replaceAll("(\\w\\[|])", ""); pp = pp.replace("-", " "); String object = "o[" + ckstring.substring(objectindex).trim().replaceAll("(\\b\\w\\[)|]", "").trim() + "]"; // String object = // "o["+ckstring.substring(objectindex).trim().replaceAll("(\\b\\w\\[|])", // "")+"]"; // String object = // "o["+ckstring.substring(objectindex).trim().replaceAll("(\\[|])", // "")+"]"; // TODO: r[p[in] o[outline]] or r[p[with] o[irregular ventral profile]] if (characterPrep(ckstring)) { return; } if (statePrep(ckstring)) { return; } /* * String pp = null; String object = null; * if(ckstring.matches(".*?\\]{4,}$")){//nested2 pp = * ckstring.substring(ckstring.indexOf("p["), * ckstring.lastIndexOf("] o[")).replaceAll("(\\w\\[|])", ""); object = * ckstring.substring(ckstring.lastIndexOf("o[")).replaceFirst("\\]+$", * "")+"]"; }else{//nested1 or not nested pp = * ckstring.substring(ckstring.indexOf("p["), * ckstring.indexOf("] o[")).replaceAll("(\\w\\[|])", ""); object = * 
ckstring.substring(ckstring.indexOf("o[")).replaceFirst("\\]+$", * "")+"]";//nested or not } */ object = NumericalHandler.originalNumForm(object); boolean lastIsStruct = false; boolean lastIsChara = false; boolean lastIsComma = false; // mohan code to get the original subject if the subject is empty Store // the chunk into the modifier // in dorsal view => in-dorsal-vew if (this.latestelements.size() == 0) { String content = ck.toString().replaceAll(" ", "-"); // String structure = "m[" +content+"]"; String structure = content.replaceAll("]-o\\[", "-").replaceAll("[{()}]", ""); if (cs.unassignedmodifier == null) { cs.unassignedmodifier = structure; } else { cs.unassignedmodifier += structure; } return; } // end mohan code Element lastelement = this.latestelements.get(this.latestelements.size() - 1); if (lastelement.getName().compareTo("structure") == 0) {// latest // element is a // structure lastIsStruct = true; } else if (lastelement.getName().compareTo("character") == 0) { lastIsChara = true; } else if (lastelement.getName().matches("(" + this.delims + ")")) { lastIsComma = true; if (this.printComma) { System.out.println("prep ahead of character: " + ckstring); } } // of o[3-7] if (lastIsStruct && object.matches("o\\[\\(?\\[?\\d.*?\\d\\+?\\]")) { this.annotateNumericals(object.replaceAll("(o\\[|\\])", ""), "count", null, this.latestelements, false, false); return; } ArrayList<Element> structures = new ArrayList<Element>(); // 3/30/2011: try to separate "in {} {} arrays" cases from // "at {flowering}", "in fruit", and "in size" cases // allow () be added around the last bare word if there is a {} before // the bare word, or if the word is not a character (size, profile, // lengths) object = parenthesis(object); // o[the {frontal} and the (sphenotic) ({spine})] ==> o[the {frontal} // ({spine}) and the (sphenotic) ({spine})] object = normalizeSharedOrganObject(object); /* * if(! 
object.matches(".*?\\}\\]+$")){ //contains organ: > or untagged: * arrays //add () around the last word if it is bare * if(object.matches(".*?[a-z]\\]+$")){ * System.out.println("!!!!!!Object: "+object); int l = * object.lastIndexOf(' '); if(l>0){ String last = * object.substring(l+1); object = object.replaceFirst(last+"$", * "("+last.replaceFirst("\\]", ")]")); }else{//object= o[tendrils] * object = object.replaceFirst("\\[", "[(").replaceFirst("\\]", ")]"); * } } */ if (object.matches(".*?\\)\\]+$")) { // structures = linkObjects(modifier, pp, object, lastIsStruct, // lastIsChara, lastelement); structures = linkObjects(modifier, pp, object, lastIsStruct, lastIsChara); // apply // to // all // latestelements updateLatestElements(structures); } else if (object.matches(".*?\\([-a-z]+\\).*") && !object.matches(".*?[-a-z]+\\]+$")) {// contains // organ // in // the // middle // of // object:r[p[from] // o[{thick} // {notothyrial} // (platform) // {excavated} // {laterally}]] String obj = object.substring(0, object.lastIndexOf(")") + 1).trim(); String modi = object.substring(object.lastIndexOf(")") + 1).trim(); // TODO: // left // out // right // end // modi // for // now. object = obj; // structures = linkObjects(modifier, pp, object, lastIsStruct, // lastIsChara, lastelement); structures = linkObjects(modifier, pp, object, lastIsStruct, lastIsChara); // apply // to // all // latestelements updateLatestElements(structures); } else {// "at {flowering}]" or "in size]" // contains no organ, e.g. 
"at flowering" // Element last = // this.latestelements.get(this.latestelements.size()-1); if (lastIsStruct) { for (Element lastE : this.latestelements) { lastE.setAttribute("name", lastE.getAttributeValue("name") + " " + ckstring.replaceAll("(\\w\\[|\\]|\\{|\\})", "").trim()); // addAttribute(lastE, "constraint", // ckstring.replaceAll("(\\w\\[|\\]|\\{|\\})", ""));//TODO // 5/16/2011 <corollas> r[p[of] o[{sterile} {much} // {expanded} and {exceeding} (corollas)]] This should not // be happening.z[{equaling} (phyllaries)] r[p[at] // o[{flowering}]] } } else if (lastIsChara) { // character element for (Element lastE : this.latestelements) { addAttribute(lastE, "modifier", ckstring.replaceAll("(\\w\\[|\\]|\\{|\\})", "")); } } // addPPAsAttributes(ckstring); } // bookkeeping: update this.latestElements: only structures are visible // updateLatestElements(structures); } private boolean statePrep(String ckstring) { ckstring = ckstring.replaceAll("(\\w+\\[|\\]|\\)|\\(|\\{|\\})", ""); if (ckstring.compareTo("in direct contact") == 0) { ArrayList<Element> structs = null; if (this.latestelements.size() != 0 && this.latestelements.get(this.latestelements.size() - 1).getName().compareTo("structure") == 0) { structs = this.latestelements; } else if (this.subjects.size() != 0) { structs = this.subjects; } else { // create placeholder structure "ApplicationUtilities.getProperty("unknown.structure.name")" this.establishSubject("("+ApplicationUtilities.getProperty("unknown.structure.name")+")", false); structs = this.subjects; } Element ch = new Element("character"); ch.setAttribute("name", "position"); ch.setAttribute("value", ckstring); ArrayList<Element> ech = new ArrayList<Element>(); for (Element s : structs) { ech.add(ch); this.addContent(s, ch); } this.updateLatestElements(ech); return true; } return false; } /** * * @param ckstring * :r[p[in] o[outline]] * @return */ private boolean characterPrep(String ckstring) { boolean done = false; String lastword = 
ckstring.substring(ckstring.lastIndexOf(" ")).replaceAll("\\W", ""); if (lastword.matches("(" + this.characters + ")")) { Element lastelement = this.latestelements.get(this.latestelements.size() - 1); if (lastelement.getName().compareTo("character") == 0) {// shell // oval in // outline Iterator<Element> it = this.latestelements.iterator(); while (it.hasNext()) { lastelement = it.next(); lastelement.setAttribute("name", lastword); } done = true; } else if (lastelement.getName().compareTo("structure") == 0) {// shell // in // oval // outline String cvalue = ckstring.replaceFirst(".*?\\]", "").replaceAll("\\w+\\[", "").replaceAll(lastword, "").replaceAll("[{}\\]\\[]", "").trim(); if(!cvalue.endsWith(")")){ Iterator<Element> it = this.latestelements.iterator(); while (it.hasNext()) { lastelement = it.next(); Element chara = new Element("character"); chara.setAttribute("name", lastword); chara.setAttribute("value", cvalue); this.addContent(lastelement, chara); } done = true; } } } return done; } private String parenthesis(String object) { if (!object.matches(".*?\\}\\]+$")) { // contains organ: > or untagged: // arrays if (object.matches(".*?\\bl\\[.*")) { // deal with list: o[l[season // or sex]]]] String beforelist = object.substring(0, object.indexOf("l[")); String list = object.substring(object.indexOf("l[")); // l[season // or // sex]]]] list = list.replaceFirst("\\]", ")]").replaceFirst("\\[", "[(").replaceAll(" ", ") ("); list = list.replaceAll("\\)+", ")").replaceAll("\\(+", "(").replaceAll("\\(or\\)", "or").replaceAll("\\(and\\)", "and").replaceAll("\\(,\\)", ",").trim(); return beforelist + list; } else if (object.matches(".*?[a-z]\\]+$")) {// there is a bare word int l = object.lastIndexOf(' '); l = l < 0 ? 
object.lastIndexOf('[') : l; String last = object.substring(l + 1).replaceAll("\\W+$", ""); if (object.indexOf('{') >= 0 || !isCharacter(last)) {// if there // are // other // modifiers/characters, // then // must // make // "last" // a // structure object = object.replaceFirst(last + "(?=\\]+$)", "(" + last + ")"); } } } return object; } private boolean isCharacter(String last) { try { Statement stmt = conn.createStatement(); ResultSet rs = stmt.executeQuery("select * from " + this.glosstable + " where term='" + last + "' and category='character'"); if (rs.next()) { return true; } } catch (Exception e) { e.printStackTrace(); } return false; } private ArrayList<Element> linkObjects(String modifier, String pp, String object, boolean lastIsStruct, boolean lastIsChara/* * , * Element * lastelement */) { ArrayList<Element> structures; structures = processObject(object); String base = ""; if (object.matches("o?\\[*\\{*(" + ChunkedSentence.basecounts + ")\\b.*")) { base = "each"; } if (lastIsChara) { for (Element lastelement : this.latestelements) { // if last character is size, change to location: <margins> // r[p[with] o[3�6 (spines)]] 1�3 {mm} r[p[{near}] o[(bases)]]. 
// 1-3 mm is not a size, but a location of spines if (lastelement.getAttributeValue("name").compareTo("size") == 0 && ((lastelement.getAttributeValue("value") != null && lastelement.getAttributeValue("value").matches(".*?\\d.*")) || (lastelement .getAttributeValue("from") != null && lastelement.getAttributeValue("from").matches(".*?\\d.*"))) && pp.matches("(" + ChunkedSentence.locationpp + ")")) { lastelement.setAttribute("name", "location"); } // addAttribute(lastelement, "constraint", // (pp+" "+base+" "+listStructureNames(structures)).replaceAll("\\s+", // " ").replaceAll("(\\{|\\})", "")); //a, b, c // addAttribute(lastelement, "constraint", // (pp+" "+listStructureNames(object)).replaceAll("\\s+", // " ").replaceAll("(\\w+\\[|\\(|\\)|\\{|\\}|\\])", "")); addAttribute(lastelement, "constraint", pp + " " + object); addAttribute(lastelement, "constraintid", listStructureIds(structures));// "1 2 3" if (modifier.length() > 0) { addAttribute(lastelement, "modifier", modifier); } } } else {// lastIsStructure // handle two cases: 1: the prep is a relation, e.g. of if (isRelation(pp)) { ArrayList<Element> entity1 = null; if (lastIsStruct) { entity1 = this.latestelements; } else { entity1 = this.subjects; } String relation = relationLabel(pp, entity1, structures);// determine // the // relation if (relation != null) { createRelationElements(relation, entity1, structures, modifier, false);// relation // elements // not // visible // to // outside } // reset "subject" structure for prositional preps, so all // subsequent characters should refer to organbeforeOf/entity1 boolean nextisposition = isNextChunkPosition(); if (relation != null && relation.matches("(" + ChunkedSentence.positionprep + ")") && !nextisposition) { structures = entity1; } } else { // 2: the prep can not be a relation, e.g. through, by. 
Make // these modifiers/contraints to the last relation/character // A reaching B through C : matching element holds "reading" // A communicating with B through C: "communicating [with]" // A seperated from B by C: "seperated from" // A connected by B to C: "connected by" // A extended from (B1 of B2) into C: "extended from" // find the last non-of chunk int p = cs.getPointer() - 1; String lasttoken = ""; do { p--; lasttoken = cs.getTokenAt(p); if(lasttoken == null){ lasttoken=""; break; } } while (lasttoken.compareTo("") == 0 || lasttoken.startsWith("r[p[of]")); if (lasttoken.matches("\\w+\\[.*")) {// is a chunk ArrayList<Element> targets = retrieveMatchingElement(lasttoken); for (Element target : targets) { String type = target.getName(); if (type.compareToIgnoreCase("relation") == 0) { this.addAttribute(target, "modifier", pp + " " + object); } else if (type.compareToIgnoreCase("character") == 0) { this.addAttribute(target, "constraint", pp + " " + object); this.addAttribute(target, "constraintid", listStructureIds(structures));// "1 2 3" } } } } } return structures; } /** * peek if the next chunk is a postion chunk * * @return */ private boolean isNextChunkPosition() { int pointer = cs.getPointer() + 1; int size = cs.getSize(); String token = ""; while (token.length() == 0 && pointer < size) { token = cs.getTokenAt(pointer++); } if (token.matches(".*?\\b(" + ChunkedSentence.positionprep + ")\\b.*")) return true; return false; } /** * In elementlog, find relation/character elements that matches the token * * @param lasttoken * @return */ // A reaching B through C : matching element holds "reading" // A communicating with B through C: "communicating [with]" // A seperated from B by C: "seperated from" // A connected by B to C: "connected by" // A extended from (B1 of B2) into C: "extended from" private ArrayList<Element> retrieveMatchingElement(String cstoken) { ArrayList<Element> result = new ArrayList<Element>(); cstoken = 
cstoken.replaceAll("(\\w+\\[|\\]|\\}|\\{|\\)|\\()", ""); cstoken = cstoken.indexOf(" ") > 0 ? cstoken.substring(0, cstoken.indexOf(" ")) : cstoken; // first // word for (int i = this.elementlog.size() - 1; i >= 0; i--) {// search // backwards Element e = this.elementlog.get(i); String type = e.getName(); if (type.contains("relation")) { String name = e.getAttributeValue("name"); if (name.matches(".*_of")) continue; if (name.matches(".*?\\b" + cstoken + "\\b.*")) { result.add(e); while (this.elementlog.get(i - 1).getName().contains("relation") && this.elementlog.get(i - 1).getAttributeValue("name").compareToIgnoreCase(name) == 0) { result.add(e); i = i - 1; } return result; } } if (type.contains("character")) { String value = e.getAttribute("value") != null ? e.getAttributeValue("value") : null; if (value != null && value.matches(".*?\\b" + cstoken + "\\b.*?")) { result.add(e); while (this.elementlog.get(i - 1).getName().contains("character") && this.elementlog.get(i - 1).getAttribute("value") != null && this.elementlog.get(i - 1).getAttributeValue("value").compareToIgnoreCase(value) == 0) { result.add(e); i = i - 1; } return result; } String constraint = e.getAttribute("constraint") != null ? 
e.getAttributeValue("constraint") : null; if (constraint != null && constraint.matches(".*?\\b" + cstoken + "\\b.*")) { result.add(e); while (this.elementlog.get(i - 1).getName().contains("character") && this.elementlog.get(i - 1).getAttribute("constraint") != null && this.elementlog.get(i - 1).getAttributeValue("constraint").compareToIgnoreCase(constraint) == 0) { result.add(e); i = i - 1; } return result; } } } return result; } /** * test and see whether pp plays the role of a relation here A reaching B * through C A communicating with B through C A seperated from B by C A * connected by B to C A extended from (B1 of B2) into C * * @param pp * @return */ private boolean isRelation(String pp) { int i = cs.getPointer(); if ((i - 2) < 0) return true; String lasttoken = cs.getTokenAt(i - 2); // comes right after a verb/prep chuck and part of non-relational prep if ((lasttoken.compareTo("") == 0 || lasttoken.matches(".*\\b(b\\[v\\[|r\\[p\\[).*")) && pp.matches("(" + this.nonrelation + ")")) { return false; } return true; } /** * o[.........{m} {m} (o1) and {m} (o2)] o[each {bisexual} , * architecture[{architecture * -list-functionally-staminate-punct-or-pistillate}] (floret)]] ; * * @param object * @return */ private ArrayList<Element> processObject(String object) { ArrayList<Element> structures; if (object.indexOf("l[") >= 0) { // a list of object object = object.replace("l[", "").replaceFirst("\\]", ""); } String[] twoparts = separate(object);// separate characters from the // organs in object // o[.........{m} {m} (o1) and // {m} (o2)] structures = createStructureElements(twoparts[1], false/* , false */);// to be // added // structures // found // in // 2nd // part, // not // rewrite // this.latestelements // yet if (twoparts[0].length() > 0) { /* * if(twoparts[0].matches(".*?\\b\\w\\[.*")){//nested chunks: e.g. 
* 5-40 , fusion[{fusion~list~distinct~or~basally~connate}] r[p[in] * o[groups]] , coloration[{coloration~list~white~to~tan}] , * {wholly} or {distally} {plumose} //get tokens for the new * chunkedsentence ArrayList<String>tokens = * TermOutputerUtilities.breakText(twoparts[0]); twoparts[0]=twoparts[0].trim(); * if(!twoparts[0].matches(".*?[,;\\.:]$")){ twoparts[0] +=" ."; * tokens.add("."); } ChunkedSentence newcs = new * ChunkedSentence(tokens, twoparts[0], conn, glosstable); * //annotate this new chunk ArrayList<Element> subjectscopy = * this.subjects; this.subjects = structures; * newcs.setInSegment(true); annotateByChunk(newcs, false); //no * need to updateLatestElements this.subjects = subjectscopy; }else{ */ ArrayList<Element> structurescp = (ArrayList<Element>) structures.clone(); String[] tokens = twoparts[0].replaceFirst("[_-]$", "").split("\\s+");// add // character // elements if (twoparts[1].indexOf(") plus") > 0) {// (teeth) plus 1-2 // (bristles), the structure // comes after "plus" should // be excluded String firstorgans = twoparts[1].substring(0, twoparts[1].indexOf(") plus")); // (teeth String lastorganincluded = firstorgans.substring(firstorgans.lastIndexOf("(") + 1); for (int i = structures.size() - 1; i >= 0; i--) { if (!structures.get(i).getAttributeValue("name_original").equals(lastorganincluded)) { //if (!structures.get(i).getAttributeValue("name").equals(TermOutputerUtilities.toSingular(lastorganincluded))) { structures.remove(i); } } } processCharacterText(tokens, structures, null, true); // process part 1, // which applies to // all // lateststructures, // invisible structures = structurescp; // } } return structures; } /** * * @param structures * @return */ private String listStructureIds(ArrayList<Element> structures) { StringBuffer list = new StringBuffer(); Iterator<Element> it = structures.iterator(); while (it.hasNext()) { Element e = it.next(); list.append(e.getAttributeValue("id") + ", "); } return 
list.toString().trim().replaceFirst(",$", ""); } // find all () in object private String listStructureNames(String object) { String os = ""; object = object.replaceAll("\\)\\s*\\(", " "); // (leaf) (blade) =>(leaf // blade) Pattern p = Pattern.compile(".*?\\(([^)]*?)\\)(.*)"); Matcher m = p.matcher(object); while (m.matches()) { os += m.group(1) + ", "; object = m.group(2); m = p.matcher(object); } return os.trim().replaceFirst(",$", ""); } /* * private String listStructureNames(ArrayList<Element> structures) { * StringBuffer list = new StringBuffer(); Iterator<Element> it = * structures.iterator(); while(it.hasNext()){ Element e = it.next(); * list.append(e.getAttributeValue("name")+", "); } return * list.toString().trim().replaceFirst(",$", ""); } */ private void createRelationElements(String relation, ArrayList<Element> fromstructs, ArrayList<Element> tostructs, String modifier, boolean symmetric) { // add relation elements relation = relation.replaceAll("(\\w+\\[|\\]|\\{|\\}|\\(|\\))", ""); for (int i = 0; i < fromstructs.size(); i++) { String o1id = fromstructs.get(i).getAttributeValue("id"); String o2id = ""; boolean negation = false; for (int j = 0; j < tostructs.size(); j++) { if (relation.compareTo("between") == 0) o2id += tostructs.get(j).getAttributeValue("id") + " "; else o2id = tostructs.get(j).getAttributeValue("id"); if (modifier.matches(".*?\\b(" + this.negationpt + ")\\b.*")) { negation = true; modifier = modifier.replaceFirst(".*?\\b(" + this.negationpt + ")\\b", "").trim(); } if (relation.matches(".*?\\b(" + this.negationpt + ")\\b.*")) { negation = true; relation = relation.replaceFirst(".*?\\b(" + this.negationpt + ")\\b", "").trim(); } if (relation.compareTo("between") != 0) { addRelation(relation, modifier, symmetric, o1id, o2id, negation, "based_on_text"); } } if (relation.compareTo("between") == 0) { addRelation(relation, modifier, symmetric, o1id, o2id.trim(), negation, "based_on_text"); } } // add other relations as a constraint to the 
structure: apex of leaves // {rounded}. // expect some character elements in the structure element. // if not, in post-processing, remove the constraint /* * if(relation.compareTo("consists of")!=0){ String constraint = * relation+" "; for(int j = 0; j<this.lateststructures.size(); j++){ * constraint += * this.lateststructures.get(j).getAttributeValue("name")+", "; //organ * name list } constraint.trim().replaceFirst("\\s*,$", ""); for(int i = * 0; i<latests.size(); i++){ addAttribute(latests.get(i), "constraint", * constraint); //base, of leaves, petals; apex, of leaves, petals } } */ } private void addRelation(String relation, String modifier, boolean symmetric, String o1id, String o2id, boolean negation, String inferencemethod) { Element rela = new Element("relation"); if (this.inbrackets) { rela.setAttribute("in_bracket", "true"); } rela.setAttribute("id", "r" + this.relationid); this.relationid++; rela.setAttribute("name", relation); rela.setAttribute("from", o1id); rela.setAttribute("to", o2id); rela.setAttribute("negation", negation + ""); // rela.setAttribute("symmetric", symmetric+""); // rela.setAttribute("inference_method", inferencemethod); // if(modifier.length()>0 && modifier.indexOf("m[")>=0){ if (modifier.length() > 0) { addAttribute(rela, "modifier", modifier.replaceAll("m\\[|\\]", "")); } addClauseModifierConstraint(cs, rela); // this.statement.addContent(rela); //add to statement addContent(this.statement, rela); } /** * * @param pp * @param latests * @param lateststructures2 * @return */ private String relationLabel(String pp, ArrayList<Element> organsbeforepp, ArrayList<Element> organsafterpp) { if (pp.compareTo("of") == 0) { return differentiateOf(organsbeforepp, organsafterpp); } return pp; } private void addAttribute(Element e, String attribute, String value) { if(attribute.compareTo("modifier")==0) value = value.replaceAll("-", " "); value = value.replaceAll("(\\w+\\[|\\]|\\{|\\}|\\(|\\))", "").replaceAll("\\s+;\\s+", ";").replaceAll("\\[", 
"").trim(); if (value.indexOf("LRB-") > 0) value = NumericalHandler.originalNumForm(value); value = value.replaceAll("\\b(" + this.notInModifier + ")\\b", "").trim(); if (this.evaluation && attribute.startsWith("constraint_")) attribute = "constraint"; if (value.length() > 0) { if (value.indexOf("moreorless") >= 0) { value = value.replaceAll("moreorless", "more or less"); } value = value.replaceAll(" , ", ", ").trim(); String v = e.getAttributeValue(attribute); if (v == null || !v.matches(".*?(^|; )" + value + "(;|$).*")) { if (v != null && v.trim().length() > 0) { v = v.trim() + ";" + value; } else { v = value; } if (attribute.equals("constraintid")) v = v.replaceAll("\\W", " "); // IDREFS are space-separated v = v.replaceAll("\\s+", " ").trim(); e.setAttribute(attribute, v); } } } /** * * @param organs * @param organs2 * @return part-of or consists-of * * involucre of => consists of */ private String differentiateOf(ArrayList<Element> organsbeforeOf, ArrayList<Element> organsafterOf) { String result = "part_of"; try { Statement stmt = conn.createStatement(); for (int i = 0; i < organsbeforeOf.size(); i++) { String b = organsbeforeOf.get(i).getAttributeValue("name"); String pb = organsbeforeOf.get(i).getAttributeValue("name_original"); if(pb.length()==0) pb = b; if (b.matches("(" + ChunkedSentence.pairs + "|" + ChunkedSentence.clusters + ")")) { // z[{2} (pairs)] r[p[of] o[(uroneural) (bones)]] // 2 was marked as the count from the organsbeforeOf List<Element> c = StanfordParser.path8.selectNodes(organsbeforeOf.get(i)); if (c.size() > 0) { // append "pair(s)" to count value, then move counts to // organsafterOf countPairs(c, organsafterOf, organsbeforeOf.get(i)); result = null; break; } else { result = "consist_of"; } break; } for (int j = 0; j < organsafterOf.size(); j++) { String a = organsafterOf.get(j).getAttributeValue("name"); String pa = organsafterOf.get(j).getAttributeValue("name_original"); if(pa.length()==0) pa = a; // String pattern = a+"[ ]+of[ 
]+[0-9]+.*"+b+"[,\\.]"; // //consists-of if (a.length() > 0 && b.length() > 0) { String pattern = "(" + b + "|" + pb + ")" + "[ ]+of[ ]+[0-9]+.*" + "(" + a + "|" + pa + ")" + "[ ]?(,|;|\\.|and|or|plus)"; // consists-of String query = "select * from " + this.tableprefix + "_sentence where sentence rlike '" + pattern + "'"; ResultSet rs = stmt.executeQuery(query); if (rs.next()) { result = "consist_of"; break; } rs.close(); } } } stmt.close(); } catch (Exception e) { e.printStackTrace(); } return result; } /** * append "pair(s)" to count value from organsbeforeof, then move counts to * organsafterOf * * @param countCharas * @param organsafterOf * @param elementbeforeOf */ private void countPairs(List<Element> countCharas, ArrayList<Element> organsafterOf, Element elementbeforeOf) { String pair = elementbeforeOf.getAttributeValue("name"); String pairid = elementbeforeOf.getAttributeValue("id"); String organsafterOfids = ""; if (elementbeforeOf.getAttribute("constraint") != null) pair = elementbeforeOf.getAttributeValue("constraint") + " " + pair; Element parentOfCount = countCharas.get(0).getParentElement(); int totalchildren = parentOfCount.getChildren().size(); Iterator<Element> it = countCharas.iterator(); int movedChildren = 0; while (it.hasNext()) { String thispair = pair; // use thispair to avoid multiple "s" be // added to counts in the loop Element count = it.next(); if (count.getAttribute("value") != null) { String ct = count.getAttributeValue("value"); thispair = ct.matches("a|an|one|1|single") ? 
thispair : thispair + "s"; count.setAttribute("value", ct + " " + thispair); } count.detach(); movedChildren++; Iterator<Element> et = organsafterOf.iterator(); while (et.hasNext()) { Element e = et.next(); addContent(e, count); organsafterOfids += e.getAttributeValue("id") + " "; } } if (totalchildren == movedChildren) parentOfCount.detach(); // because "pair" is treated as a count and not an organ, if its ids are // used in any constraintid and in relations // these ids need to be changed to the ids of organsafterOf organsafterOfids = organsafterOfids.trim(); try { List<Element> elements = XPath.selectNodes(this.statement, ".//character[@constraintid='" + pairid + "']"); for (Element c : elements) { c.setAttribute("constraintid", organsafterOfids); } elements = XPath.selectNodes(this.statement, ".//relation[@to='" + pairid + "']"); for (Element r : elements) { r.setAttribute("to", organsafterOfids); } // structure "pair" no longer there, so drop any relations from // pair. elements = XPath.selectNodes(this.statement, ".//relation[@from='" + pairid + "']"); for (Element r : elements) { r.detach(); } } catch (Exception e) { e.printStackTrace(); } } /** * separate o[......... 
{m} {m} (o1) and {m} (o2)] to two parts: the last * part include all organ names also handles cases such as o[the {frontal} * and the (sphenotic) ({spine})] * * @param object * @return */ private String[] separate(String object) { String[] twoparts = new String[2]; object = object.replaceFirst("^o\\[", "").replaceFirst("\\]$", "").replaceAll("<", "(").replaceAll(">", ")"); String part2 = ""; if (object.indexOf("(") >= 0) { part2 = object.substring(object.indexOf("(")).trim(); } else if (object.lastIndexOf(" ") >= 0) {// take the last word as an // organ and part2 part2 = object.substring(object.lastIndexOf(" ")).trim(); } else { part2 = object; } String part1 = object.replace(part2, "").trim(); if (part1.length() > 0) { // part 1 may still have modifiers of the first organ in part 2, fix // this. String[] ws1 = part1.split("\\s+"); String[] ws2 = part2.split("\\s+"); String o = ""; for (int i = 0; i < ws2.length; i++) { if (ws2[i].indexOf("(") >= 0) { o += ws2[i] + " "; } else { break; } } o = o.trim(); for (int i = ws1.length - 1; i >= 0; i--) { String escaped = ws1[i].replaceAll("\\{", "\\\\{").replaceAll("\\}", "\\\\}"); if (constraintType(ws1[i].replaceAll("\\W", ""), o) != null) { part1 = part1.replaceFirst("\\s*" + escaped + "$", ""); part2 = ws1[i] + " " + part2; } else { break; } } part1 = part1.replaceAll("\\s+", " ").trim(); part2 = part2.replaceAll("\\s+", " ").trim(); } twoparts[0] = part1; twoparts[1] = part2; return twoparts; } /** * @param object * : o[the {frontal} and the (sphenotic) ({spine})] * @return o[the {frontal} ({spine}) and the (sphenotic) ({spine})] */ private String normalizeSharedOrganObject(String object) { // TODO Auto-generated method stub // if(object.matches(".*?\\b(and|or)\\b.*")){//a conic or subcylindric // lateral extremity is not a case of shared object if (object.matches(".*?\\band\\b.*")) { String norm = ""; String[] segs = object.split("\\s+"); String lastN = segs[segs.length - 1].replaceAll("\\]+$", "").trim(); if 
(object.matches(".*?\\b(and|or)\\s+" + lastN.replaceFirst("\\(", "\\\\(").replaceFirst("\\)", "\\\\)").replaceFirst("\\{", "\\\\{").replaceFirst("\\}", "\\\\}") + ".*")) { String lastTokenBeforeAnd = segs[segs.length - 3].replaceFirst("\\{", "(").replaceFirst("\\}", ")"); segs[segs.length - 3] = lastTokenBeforeAnd; // o[the {quadrate} // and // (hyomandibula)] } for (int i = segs.length - 1; i >= 0; i--) { norm = segs[i] + " " + norm; // if(segs[i].matches("(,|and|or)") && // !segs[i-1].contains("(")){ if (segs[i].matches("(,|and)") && !segs[i - 1].contains("(")) { norm = lastN + " " + norm; } //if(segs[i].matches("(,|and|or)") && segs[i-1].contains("(")){ if (segs[i].matches("(,|and)") && segs[i - 1].contains("(")) { lastN = segs[i - 1].trim(); } } return norm.trim(); } return object; } /** * TODO: flower and leaf blades??? * * @param ck * : {} (), {} (), () and/or () * @return */ private ArrayList<Element> createStructureElements(String listofstructures, boolean aftercomma/* * , * boolean * makeconstraint */) { ArrayList<Element> results = new ArrayList<Element>(); if (listofstructures.startsWith("l[")) { listofstructures = listofstructures.replaceFirst("^l\\[", "").replaceFirst("\\]$", ""); } // special case: pronouns like them: resolve to the last structure if (listofstructures.matches(".*?\\b(" + ChunkedSentence.pronouns + ")\\b.*") && listofstructures.indexOf(" ") < 0) { for (int e = this.elementlog.size() - 1; e >= 0; e--) { Element el = elementlog.get(e); if (el.getName().compareTo("structure") == 0) { results.add(el); return results; } } } // String[] organs = listofstructures.replaceAll(" (and|or|plus) ", // " , ").split("\\)\\s*,\\s*"); //TODO: flower and leaf blades??? String[] organs = listofstructures.replaceAll(",", " , ").split("\\)\\s+(and|or|plus|,)\\s+"); //Added by Hari for(int i=0;i<organs.length-1;i++) organs[i]=organs[i].trim()+")"; // TODO: flower and leaf blades??? // mohan 28/10/2011. 
If the first organ is a preposition then join the // preposition with the following organ for (int i = 0; i < organs.length; i++) { if (organs[i].matches("\\{r\\[p\\[.*\\]\\]\\}\\s+\\{.*\\}\\s+.*")) { organs[i] = organs[i].replaceAll("\\]\\]\\}\\s\\{", "]]}-{"); } } String[] sharedcharacters = null; for (int i = 0; i < organs.length; i++) { String[] organ = organs[i].trim().split("\\s+"); // for each organ mentioned, find organ name String o = ""; int j = 0; for (j = organ.length - 1; j >= 0; j--) { // if(organ[j].startsWith("(")){ //(spine tip) /* * if(organ[j].endsWith(")") || organ[j].startsWith("(")){ * //(spine tip) o = organ[j]+" "+o; organ[j] = ""; }else{ * break; } */ if (organ[j].endsWith(")") || organ[j].startsWith("(")) { // (spine // tip) o = organ[j] + " " + o; organ[j] = ""; break; // take the last organ name } } o = o.replaceAll("(\\w\\[|\\]|\\(|\\)|\\}|\\{)", "").trim(); if (o.length() == 0) return results; // create element, Element e = new Element("structure"); if (this.inbrackets) { e.setAttribute("in_bracket", "true"); } String strid = "o" + this.structid; this.structid++; e.setAttribute("id", strid); // e.setAttribute("name", o.trim()); //must have. o = o.trim(); if(o.indexOf("_")>0) { //make sure "_" is used only before the indexes, not btw words of a phrase if(o.matches("(.*?_[\\divx]+)|(.*?_[\\divx]+-[\\divx]+)")){ //handle abc_i-iii, abc_2_to_5, abc_3_and_5, abc_3,4-5... 
e.setAttribute("type","multi"); e.setAttribute("name", adjustUnderscore(o));//make sure "_" is used only before the indexes, not btw words of a phrase e.setAttribute("name_original", o.replaceAll("_", " ")); }else{ if(isPrematched(o)){ e.setAttribute("name", getSingularPhrase(o.replaceAll("_", " ")).trim()); //prematched phrases from uberon e.setAttribute("name_original", o.replaceAll("_", " ").trim()); } else{ e.setAttribute("name", o); //originally hyphenated phrases such as pubis_ischium e.setAttribute("name_original", o); } } }else{ e.setAttribute("name_original", o); //add structure name as the original text e.setAttribute("name", TermOutputerUtilities.toSingular(o)); } //if e appears right after a comma if(aftercomma){ e.setAttribute("after_comma", "true"); if(debugextraattributes) System.out.println("after_comma:"+e.getAttributeValue("name")); } //Changed by Zilong //if(o.trim().matches("(.*?_[\\divx]+)|(.*?_[\\divx]+-[\\divx]+)")){ //handle abc_i-iii, abc_2_to_5, abc_3_and_5, abc_3,4-5... // e.setAttribute("type","multi"); //} //Changed by Zilong End // must have. 
//corolla lobes addContent(this.statement, e); results.add(e); // results only adds e // determine constraints while (j >= 0 && organ[j].trim().length() == 0) { j--; } // cauline leaf abaxial surface trichmode hair long boolean terminate = false; boolean distribute = false; String constraint = "";// plain for (; j >= 0; j--) { if (terminate) break; String w = organ[j].replaceAll("(\\w+\\[|\\]|\\{\\(|\\)\\}|\\(\\{|\\}\\))", ""); // mohan code to make w keep all the tags for a preposition // chunk if (organ[j].matches("\\{?r\\[p\\[.*")) { w = organ[j]; } // end mohan code// if (w.equals(",")) { distribute = true; continue; } String type = null; if (organ[j].startsWith("(") || w.endsWith(")")) type = "parent_organ"; else type = constraintType(w, o); if (type != null) { organ[j] = ""; constraint = w + " " + constraint; // plain } else { break; } } j++; if (constraint.trim().length() > 0) { addAttribute(e, "constraint", constraint.replaceAll("(\\(|\\))", "").trim()); // may // not // have. } // determine character/modifier ArrayList<Element> list = new ArrayList<Element>(); list.add(e); // process text reminding in organ if (organ[0].trim().length() > 0) {// has c/m remains, may be shared // by later organs sharedcharacters = organ; } else if (sharedcharacters != null) {// share c/m from a previous // organ organ = sharedcharacters; } processCharacterText(organ, list, null, true); // characters created here // are final and all the // structures will have, // therefore they shall // stay local and not // visible from outside } return results; } /** * * @param phrase: endochondral elements * @return endochondral element */ private String getSingularPhrase(String phrase) { String s = p2sphrases.get(phrase); if(s!=null) return s; return phrase; } /** * * @param o: endochondral_elements * @return */ private boolean isPrematched(String o) { Statement stmt =null; ResultSet rs =null; if(o.compareTo(ApplicationUtilities.getProperty("unknown.structure.name"))==0) return false; 
try { // collect life_style terms /*stmt = conn.createStatement(); rs = stmt.executeQuery("select distinct term from " + this.glosstable + " where term ='"+o+"'"); if (rs.next()) { return true; }*/ if(phrases.contains(o.replaceAll("_", " "))) return true; } catch (Exception e) { e.printStackTrace(); } finally{ try{ if(rs!=null) rs.close(); if(stmt!=null) stmt.close(); }catch (Exception e) { e.printStackTrace(); } } return false; } /** * thoracic_vertebra_8 * @param o a phrase with underscores connecting each token, for example 'thoracic_vertebra_8 and_9', 'thoracic_vertebra_i-iii' * @return a phrase with underscores connecting the word part and the index, for example 'thoracic vertebra_8', 'thoracic vertebra_i-iii' */ private String adjustUnderscore(String o) { return o.replaceAll("_(?![\\divx]+)", " ").trim(); } /** * cauline leaf abaxial surface thin trichomode hair constraint_type: * trichomode constraint_parent_organ: cauline leaf abaxial surface * * @param fromid * : from_id * @param relation * : "part_of" * @param toorganname * : use this to find the to_id */ private void driveRelationFromStructrueContraint(String fromid, String relation, String toorganname) { try { // try to link toorganname to an previously mentioned organ List<Element> structures = StanfordParser.path7.selectNodes(this.statement); Iterator<Element> it = structures.iterator(); boolean exist = false; while (it.hasNext()) { Element structure = it.next(); String name = structure.getAttributeValue("name"); if (structure.getAttribute("constraint_type") != null) { String tokens = structure.getAttributeValue("constraint_type"); // need // to // reverse // order tokens = reversed(tokens); name = tokens + " " + name; } if (structure.getAttribute("constraint_parent_organ") != null) { name = structure.getAttributeValue("constraint_parent_organ") + " " + name; } if (structure.getAttribute("constraint") != null) { name = structure.getAttributeValue("constraint") + " " + name; } if 
(name.equals(toorganname)) { exist = true; String toid = structure.getAttributeValue("id"); addRelation(relation, "", false, fromid, toid, false, "based_on_parent_organ_constraint"); break; } } if (!exist) { // create a new structure addRelation(relation, "", false, fromid, "o" + this.structid, false, "based_on_parent_organ_constraint"); toorganname = toorganname.replaceFirst(" (?=\\w+$)", " (") + ")"; // format // organname if (toorganname.indexOf('(') < 0) toorganname = "(" + toorganname; this.createStructureElements(toorganname, false); } } catch (Exception e) { e.printStackTrace(); } } /** * turn "b;a" to "a b" * * @param tokens * @return */ private String reversed(String tokens) { String[] ts = tokens.split("\\s*;\\s*"); String result = ""; for (int i = ts.length - 1; i >= 0; i--) { result += i + " "; } return result.trim(); } /** * bases and tips mostly rounded * * @param tokens * @param parents */ private ArrayList<Element> processCharacterText(String[] tokens, ArrayList<Element> parents, String character, boolean characterismodifier) { ArrayList<Element> results = new ArrayList<Element>(); // determine characters and modifiers String modifiers = ""; for (int j = 0; j < tokens.length; j++) { if (tokens[j].trim().length() > 0) { tokens[j] = NumericalHandler.originalNumForm(tokens[j]); if (tokens[j].indexOf("~list~") >= 0) { results = this.processCharacterList(tokens[j], parents, characterismodifier); } else { String w = tokens[j]; String chara = null; if (tokens[j].matches("\\w{2,}\\[.*")) { chara = tokens[j].substring(0, tokens[j].indexOf('[')); w = tokens[j].replaceAll("(\\w+\\[|\\]|\\{|\\})", ""); } else if (tokens[j].matches("\\w\\[.*")) { w = tokens[j].replaceAll("(\\w+\\[|\\]|\\{|\\})", ""); } w = w.replaceAll("(\\{|\\})", ""); chara = Utilities.lookupCharacter(w, conn, ChunkedSentence.characterhash, glosstable, tableprefix); if (chara == null && w.matches("no")) { chara = "presence"; } if (chara == null && Utilities.isAdv(w, ChunkedSentence.adverbs, 
ChunkedSentence.notadverbs)) {// TODO: // can // be // made // more // efficient, // since // sometimes // character // is // already // given modifiers += w + " "; } else if (w.matches(".*?\\d.*") && !w.matches(".*?[a-z].*")) {// TODO: // 2 // times // =>2-times? results = this.annotateNumericals(w, "count", modifiers, parents, false, characterismodifier); modifiers = ""; } else { // String chara = MyPOSTagger.characterhash.get(w); if (chara != null) { if (character != null) { chara = character; } if (chara.compareToIgnoreCase("character") == 0 && modifiers.length() == 0) {// high // relief: // character=relief, // reset // the // character // of // "high" // to // "relief" Element lastelement = null; if (results.size() >= 1) { lastelement = results.get(results.size() - 1); } else if (this.latestelements.size() >= 1) { lastelement = this.latestelements.get(this.latestelements.size() - 1); } if (lastelement != null && lastelement.getName().compareTo("character") == 0) { lastelement.setAttribute("name", w); /* * Iterator<Element> it = * this.latestelements.iterator(); * while(it.hasNext()){ lastelement = * it.next(); * lastelement.setAttribute("name", w); } */ } } else { createCharacterElement(parents, results, modifiers, w, chara, "", characterismodifier); // default // type // "" // = // individual // vaues modifiers = ""; } } } } } } return results; } private String createCharacterElement(ArrayList<Element> parents, ArrayList<Element> results, String modifiers, String cvalue, String cname, String char_type, boolean characterismodifier) { Element character = new Element("character"); if (this.inbrackets) { character.setAttribute("in_bracket", "true"); } if(characterismodifier){ character.setAttribute("is_modifier", "true"); if(debugextraattributes) System.out.println("is modifier:"+cvalue); } if (cname.compareTo("count") == 0 && cvalue.indexOf("-") >= 0 && cvalue.indexOf("-") == cvalue.lastIndexOf("-")) { String[] values = cvalue.split("-"); 
character.setAttribute("char_type", "range_value"); character.setAttribute("name", cname); character.setAttribute("from", values[0]); character.setAttribute("to", values[1]); } else { if (cname.compareTo("size") == 0) { String value = cvalue.replaceFirst("\\b(" + ChunkedSentence.units + ")\\b", "").trim(); // 5-10 // mm String unit = cvalue.replace(value, "").trim(); if (unit.length() > 0) { character.setAttribute("unit", unit); } cvalue = value; } else if (cvalue.indexOf("-c-") >= 0 && (cname.compareTo("color") == 0 || cname.compareTo("coloration") == 0)) {// -c- // set // in // SentenceOrganStateMarkup String color = cvalue.substring(cvalue.lastIndexOf("-c-") + 3); // pale-blue String m = cvalue.substring(0, cvalue.lastIndexOf("-c-")); // color // = // blue // m=pale modifiers = modifiers.length() > 0 ? modifiers + ";" + m : m; cvalue = color; } if (char_type.length() > 0) { character.setAttribute("char_type", char_type); } character.setAttribute("name", cname); character.setAttribute("value", cvalue); } boolean usedm = false; Iterator<Element> it = parents.iterator(); while (it.hasNext()) { Element e = it.next(); character = (Element) character.clone(); if (modifiers.trim().length() > 0) { addAttribute(character, "modifier", modifiers.trim()); // may // not // have usedm = true; } results.add(character); // add to results // e.addContent(character);//add to e addContent(e, character); } if (usedm) { modifiers = ""; } addClauseModifierConstraint(cs, character); return modifiers; } /** * * @param parents * @param w * : m[usually] 0 * @param modifiers * @return */ private ArrayList<Element> annotateCount(ArrayList<Element> parents, String w, String modifiers) { // TODO Auto-generated method stub String modifier = w.replaceFirst("\\d.*", "").trim(); String number = w.replace(modifier, "").trim(); ArrayList<Element> e = new ArrayList<Element>(); Element count = new Element("character"); if (this.inbrackets) { count.setAttribute("in_bracket", "true"); } 
count.setAttribute("name", "count"); count.setAttribute("value", number); if (modifiers.length() > 0) { this.addAttribute(count, "modifier", modifiers); } if (modifier.length() > 0) { this.addAttribute(count, "modifier", modifier.replaceAll("(m\\[|\\])", "")); } Iterator<Element> it = parents.iterator(); while (it.hasNext()) { count = (Element) count.clone(); e.add(count); // it.next().addContent(count); addContent(it.next(), count); } addClauseModifierConstraint(cs, count); return e; } // if w has been seen used as a modifier to organ o private String constraintType(String w, String o) { String result = null; // mohan code to make w keep all the tags for a preposition chunk if (w.matches("\\{?r\\[p\\[.*"))// for cases such as // "with the head in full face view, the midpoint blah blah....", // "r[p[with head] {in-fullface-view}]" // is treated as a "condition" // constraint { return "condition"; } // mohan code ends. // w = w.replaceAll("\\W", ""); //don't turn frontal-postorbital to // frontalpostorbital String ch = Utilities.lookupCharacter(w, conn, ChunkedSentence.characterhash, this.glosstable, tableprefix); if (ch != null && ch.matches(".*?(_|^)(position|insertion|structure_type|life_stage|functionality)(_|$).*") && w.compareTo("low") != 0) return "type"; String sw = TermOutputerUtilities.toSingular(w); try { Statement stmt = conn.createStatement(); // Nov 30th 2011. Considered to use glossary, term_category, // wordroles to replace sentence markup evidence. For some // collections (e.g. 
phenotype test) sentence markup is not reliable ResultSet rs = stmt.executeQuery("select * from " + this.tableprefix + "_sentence where tag = '" + w + "' or tag='" + sw + "'"); if (rs.next()) { return "parent_organ"; } // rs = // stmt.executeQuery("select * from "+this.tableprefix+"_sentence where modifier = '"+w+"' or modifier like '"+w+" %' or modifier like '% "+w+" %' or modifier like '% "+w+"'"); rs = stmt.executeQuery("select * from " + this.tableprefix + "_sentence where modifier = '" + w + "'"); if (rs.next()) { return "type"; } rs.close(); stmt.close(); } catch (Exception e) { e.printStackTrace(); } return result; } /** * output annotated sentence in XML format Chunk types: PrepChunk, * IVerbChunk (Intransitive verb chunk, followed by a preposition), * VerbChunk, ADJChunk, SBARChunk, etc. * * @return */ /* * public Element annotate() throws Exception{ //query the sentence database * for the tag/modifier for this sentence, using this.sentsrc //also use the * substructure table to resolve of-clauses ArrayList<Chunk> chunks = new * ArrayList(); ArrayList<String> structureIDs = new ArrayList(); * * this.currentsubject = "fetch from sentence table"; String modifier = * "fetch from sentence table"; this.currentmainstructure = * createStructureElement(this.currentsubject, modifier, this.structid++); * while(cs.hasNext()){ Chunk chunk = cs.nextChunk(); chunks.add(chunk); * if(chunk instanceof Organ){ String organ = chunk.getText(); * if(chunks.size() == 1){ continue; //this is current subject read from the * sentence table }else{ * this.statement.addContent(this.currentmainstructure); //add this * completed structure //create a new structure element } }else if(chunk * instanceof PrepChunk){ String pphrase = chunk.getText(); int chunkid = * cs.getPointer() - 1; Element thiselement = * (Element)XPath.selectSingleNode(root, ".\\*[id='"+chunkid+"']"); //IN * String relationname = thiselement.getAttributeValue("text"); //create * structure(s) from the NPs. 
e.g "3 florets", character/modifier before * organnames //NP may be a list of NPs String np = * pphrase.replaceFirst("^"+relationname, "").trim(); ArrayList oids = * annotateNP(np); //in which <structure> may be created and inserted into * the <statement> if(chunks.get(chunks.size()-2) instanceof Organ){ //apex * of leaves //create a relation Element relation = * createRelationElement(this.relationid++); }else{ //create a constraint * for the last character } }else if(chunk instanceof SBARChunk){ * //SBARChunk could follow any xyzChunk * * * }else if(chunk instanceof SimpleCharacterState){ //check for its * character //associate it with current subject if(this.currentsubject * ==null){ //save this as a constraint for the to-be-discovered subject } } * } * * return statement; * * } */ public void setInBrackets(boolean b) { this.inbrackets = b; } /** * * @param measurements * : CI 72 - 75 (74 ), SI 99 - 108 (102 ), PeNI 73 - 83 (73 ), * LPeI 46 - 53 (46 ), DPeI 135 - 155 (145 ). */ private void annotatedMeasurements(String measurements) { measurements = measurements.replaceAll("�", "-"); Element whole = new Element(ApplicationUtilities.getProperty("unknown.structure.name")); // this.statement.addContent(whole); addContent(this.statement, whole); ArrayList<Element> parent = new ArrayList<Element>(); parent.add(whole); // select delimitor int comma = measurements.replaceAll("[^,]", "").length(); int semi = measurements.replaceAll("[^;]", "").length(); String del = comma > semi ? 
"," : ";"; String[] values = measurements.split(del); for (int i = 0; i < values.length; i++) { String value = values[i].replaceFirst("[,;\\.]\\s*$", ""); // separate char from values String chara = value.replaceFirst("\\s+\\d.*", ""); String vstring = value.replaceFirst("^" + chara, "").trim(); // seperate modifiers from vlu in case there is any String vlu = vstring.replaceFirst("\\s+[a-zA-Z].*", "").trim(); String modifier = vstring.substring(vlu.length()).trim(); modifier = modifier.length() > 0 ? "m[" + modifier + "]" : null; vlu = vlu.replaceAll("(?<=\\d)\\s*\\.\\s*(?=\\d)", "."); this.annotateNumericals(vlu.trim(), chara.trim(), modifier, parent, false, false); } } /** * @param args */ public static void main(String[] args) { // TODO Auto-generated method stub } }