/** * */ package outputter.prep; import java.io.File; import java.io.PrintWriter; import java.io.StringWriter; import java.util.ArrayList; import java.util.List; import org.apache.log4j.Logger; import org.jdom.Document; import org.jdom.Element; import org.jdom.JDOMException; import org.jdom.input.SAXBuilder; import org.jdom.output.Format; import org.jdom.output.XMLOutputter; import org.jdom.xpath.XPath; import outputter.ApplicationUtilities; import outputter.Utilities; import outputter.knowledge.Dictionary; /** * @author updates * */ public class XMLNormalizer { private static final Logger LOGGER = Logger.getLogger(XMLNormalizer.class); public Element root; public static XPath pathWithHaveHasRelation; public static XPath pathRangeValueCharacter; public static XPath pathCountStructure; public static XPath pathCharacterStatement; public static XPath pathStateStatement; public static XPath pathWholeOrganismStructure; public static XPath pathNonWholeOrganismStructure; public static XPath pathCharacter; public static XPath pathText; public static XPath pathModifier; static{ try{ pathCharacterStatement = XPath.newInstance(".//statement[@statement_type='character']"); pathStateStatement = XPath.newInstance(".//statement[@statement_type='character_state']"); pathWithHaveHasRelation = XPath.newInstance("//relation[@name='with'] | //relation[@name='have'] | //relation[@name='has']"); pathRangeValueCharacter = XPath.newInstance("//character[@char_type='range_value']"); pathCharacter = XPath.newInstance(".//character"); pathCountStructure = XPath.newInstance("//structure[character[@name='count']]"); pathText = XPath.newInstance(".//text"); pathWholeOrganismStructure = XPath.newInstance(".//structure[@name='"+ApplicationUtilities.getProperty("unknown.structure.name")+"']"); pathNonWholeOrganismStructure = XPath.newInstance(".//structure[@name!='"+ApplicationUtilities.getProperty("unknown.structure.name")+"']"); pathModifier = XPath.newInstance("//character[@is_modifier='true']"); }catch(Exception e){ LOGGER.error("", e); } } /** * preprocessing xml so the data in it are suitable for subsequent EQ generation */ public XMLNormalizer(Element root) { this.root = root; } public void normalize(){ try{ //with2partof(root); removeCategoricalRanges(root); // expect 1 file to have 1 character statement and n statements, but for generality, use arrayList for characterstatements too. //characterstatements are character descriptions List<Element> characterstatements = pathCharacterStatement.selectNodes(root); integrateWholeOrganism4CharacterStatements(characterstatements, root); repairWholeOrganismOnlyCharacterStatements(characterstatements, root); //Fixing size to corresponding measure fixSizeForRespectiveMeasureOnlyCharacterStatements(root); collapsePreps(root); //A with a row of B => <structure name="B" constraint="a row of"><relation name="with" from="A" to="B"> //merge segments in the character statement mergeCharacterStatement(root); spatialModifier(root); //XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat()); //System.out.println(outputter.outputString(root)); }catch(Exception e){ LOGGER.error("", e); } } /** * to correct results from incorrect categorization of spatial terms: dorsal fin => dorsal should be constraint, not a feature * @param root */ private void spatialModifier(Element root) { try{ List<Element> modchars = pathModifier.selectNodes(root); for(int i =0; i< modchars.size(); i++){ Element modchar = modchars.get(i); if(modchar.getAttribute("value")!=null){ String value = modchar.getAttributeValue("value"); if(value.matches(Dictionary.spatialtermptn)){ Element struct = modchar.getParentElement(); String constraint = struct.getAttributeValue("constraint")==null? " " : struct.getAttributeValue("constraint")+" "; String phrase = value + constraint +struct.getAttributeValue("name_original"); if(struct.getParentElement().getChildText("text").toLowerCase().replaceAll("-", " ").contains(phrase)){ constraint += value + " "+constraint; struct.setAttribute("constraint", constraint.trim()); modchar.detach(); } } } } }catch(Exception e){ LOGGER.error("",e); } } /** * merge multiple character statements into one * replace those statements with the merged one * * @param characterstatements * @return merged */ private void mergeCharacterStatement(Element root) { try{ List<Element> characterstatements = pathCharacterStatement.selectNodes(root); if(characterstatements.size()==1) return; Element merged = characterstatements.get(0); //Element parent = characterstatements.get(0).getParentElement(); //merged.detach(); //parent.addContent(merged); String mergedtext = merged.getChildText("text").trim() + " "; for(int i = 1; i < characterstatements.size(); i++){ Element e = characterstatements.get(i); mergedtext += e.getChildText("text")+" "; e.getChild("text").detach(); List<Element> children = e.getChildren(); while(children.size()>0){ Element c = children.get(0); c.detach(); merged.addContent(c); } e.detach(); } merged.getChild("text").setText(mergedtext.trim()); }catch(Exception e){ LOGGER.error("", e); } } /** * A with a row of B => <structure name="B" constraint="a row of"><relation name="with" from="A" to="B"> * @param root */ private void collapsePreps(Element root) { List<Element> descriptions = root.getChildren("description"); //should have only 1 description for(Element description: descriptions){ List<Element> statements = description.getChildren("statement"); for(Element statement: statements){ collapsePrepsInStatement(statement); } } } /** * <statement statement_type="character" character_id="states356" seg_id="0"> <text>Coronoids with a row of very small teeth or denticles lateral to tooth-row</text> <structure id="o228" name="coronoid" /> <structure id="o229" name="row" /> <relation id="r37" name="with" from="o228" to="o229" negation="false" /> <structure id="o230" name="tooth"> <character name="size" value="small" modifier="very" /> </structure> <structure id="o231" name="denticle"> <character name="size" value="small" modifier="very" /> </structure> <relation id="r38" name="consist_of" from="o229" to="o230" negation="false" /> <relation id="r39" name="consist_of" from="o229" to="o231" negation="false" /> <structure id="o232" name="tooth row" /> <relation id="r40" name="lateral to" from="o230" to="o232" negation="false" /> <relation id="r41" name="lateral to" from="o231" to="o232" negation="false" /> </statement> */ /** * A with a row of B => <structure name="B" constraint="a row of"><relation name="with" from="A" to="B"> * @param statement */ @SuppressWarnings("unchecked") private void collapsePrepsInStatement(Element statement) { //find relations that are both to and from organs in different relation //when it is a from organ, the relation is "consist of" //expanding beyond "consist_of" can be risky try{ boolean fixed = false; ArrayList<Element> tobedetached = new ArrayList<Element>(); List<Element> relationofs = XPath.selectNodes(statement, ".//relation[@name='consist_of']"); for(Element relationof: relationofs){ String rowid = relationof.getAttributeValue("from"); String strid = relationof.getAttributeValue("to"); //tooth id List<Element> relationwiths = XPath.selectNodes(statement, ".//relation[@to='"+rowid+"']"); if(relationwiths!=null && relationwiths.size()>0 && !Utilities.hasCharacters(rowid, root) //'row' without characters && relationofs.equals(XPath.selectNodes(statement, ".//relation[@from='"+rowid+"']")))//no other relations refers to rowid, so row structure may be safely removed { //found the target, now transform //1. clone relationwiths, then replace rowid with strid in relationwiths clones, add clones to xml, schedule to detach originals for(Element relationwith: relationwiths){ Element relationwithcp = (Element) relationwith.clone(); relationwithcp.setAttribute("to", strid); statement.addContent(relationwithcp); tobedetached.add(relationwith); } //2. remove row <structure> Element row = (Element) XPath.selectSingleNode(statement, ".//structure[@id='"+rowid+"']"); tobedetached.add(row); //3. add constraint "row of" to strid Element struct = (Element) XPath.selectSingleNode(statement, ".//structure[@id='"+strid+"']"); String constraint = struct.getAttribute("constraint")==null? "" : struct.getAttributeValue("constraint") +";"; struct.setAttribute("constraint", constraint+ Utilities.getStructureName(root, rowid) +" of"); //4. remove relationofs tobedetached.add(relationof); fixed=true; } } //detach unneeded elements for(Element e : tobedetached){ e.detach(); } if(fixed){ XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat()); System.out.println(outputter.outputString(root)); } }catch(Exception e){ LOGGER.error("", e); } } /* * Currently the below text * <text>posteriormost teeth at least twice height of anteriormost teeth</text> * is intepreted as, * * <structure constraint="posteriormost" name="tooth" id="o324"> * <character constraint="height of anteriormost teeth" name="size" value="2 times" modifier="at-least" constraintid="o325"/> * </structure> <structure constraint="anteriormost" name="tooth" id="o325"/> * * */ @SuppressWarnings("unchecked") private void fixSizeForRespectiveMeasureOnlyCharacterStatements(Element root) throws Exception { List<Element> characters = pathCharacter.selectNodes(root); for(Element chara:characters) { if(chara.getAttributeValue("name").equals("size")) { if((chara.getAttributeValue("constraint")!=null)&&(chara.getAttributeValue("constraint").matches(".*(height|width|length|depth).*"))) { //System.out.println("-----------inside normalizer------------"); if(chara.getAttributeValue("constraint").contains("height")) { chara.setAttribute("name","height"); } else if(chara.getAttributeValue("constraint").contains("width")) { chara.setAttribute("name","width"); } else if(chara.getAttributeValue("constraint").contains("depth")) { chara.setAttribute("name","depth"); } else { chara.setAttribute("name","length"); } } } } } /** * A with B => B part_of A * only sometimes A is not a structure: for example: "body scale: rhomboid with internal ridge; rounded" * @param root * @return */ @SuppressWarnings("unchecked") private void with2partof(Element root) throws Exception { List<Element> withs = pathWithHaveHasRelation.selectNodes(root); for (Element with : withs) { String to = with.getAttributeValue("to"); String from = with.getAttributeValue("from"); with.setAttribute("name", "part_of"); with.setAttribute("to", from); with.setAttribute("from", to); } } /** * remove categorical char_type="range_value" * recompose numerical char_type="range_value" * recompose count "7 or more" * * @param root */ @SuppressWarnings("unchecked") private void removeCategoricalRanges(Element root) throws Exception { List<Element> charas = pathRangeValueCharacter.selectNodes(root); for (Element chara : charas) { if (!chara.getAttributeValue("from").matches("[\\d\\.]+") && !chara.getAttributeValue("to").matches("[\\d\\.]+")) { chara.detach(); // remove } else {// recompose String value = chara.getAttributeValue("from") + (chara.getAttribute("from_unit") == null ? "" : chara.getAttributeValue("from_unit")) + " to " + chara.getAttributeValue("to") + (chara.getAttribute("to_unit") == null ? "" : chara.getAttributeValue("to_unit")); chara.removeAttribute("from"); chara.removeAttribute("from_unit"); chara.removeAttribute("to"); chara.removeAttribute("to_unit"); chara.removeAttribute("char_type"); chara.setAttribute("value", value.trim()); } } List<Element> structs = pathCountStructure.selectNodes(root); for (Element struct : structs) { charas = struct.getChildren(); int i = 0; String name = ""; while (charas.size() > i && !name.equals("count")) { name = charas.get(i).getAttributeValue("name"); i++; } if (name.equals("count") && charas.size() > i) { // i is count, check i+1 if (charas.get(i).getAttributeValue("name").equals("count")) { String value = charas.get(i - 1).getAttributeValue("value") + " or " + charas.get(i).getAttributeValue("value"); charas.get(i - 1).setAttribute("value", value); charas.get(i).detach(); } } } } /** * in phylogenetic descriptions, whole-organisms are not semantically possible in a character statement * [although whole-organism is used as ditto in state statement] * if such element exists in a character statement, remove whole_organism and merge its character to the next structure * Turn: * <statement statement_type="character" character_id="c97d5c39-838d-4cbb-b159-bce93a7a7291" seg_id="0"> * <text>Hatchet-shaped opercle</text> * <structure id="o1507" name="whole_organism"> * <character name="shape" value="hatchet-shaped" /> * </structure> * <structure id="o1508" name="opercle" /> * </statement> * To: * <statement statement_type="character" character_id="c97d5c39-838d-4cbb-b159-bce93a7a7291" seg_id="0"> * <text>Hatchet-shaped opercle</text> * </structure> * <structure id="o1508" name="opercle" > * <character name="shape" value="hatchet-shaped" /> * </structure> * </statement> * * @param characterstatements * @return * */ @SuppressWarnings("unchecked") private void integrateWholeOrganism4CharacterStatements(List<Element> characterstatements, Element root) throws Exception { for (Element statement : characterstatements) { List<Element> wholeOrgans = pathWholeOrganismStructure.selectNodes(statement); if (wholeOrgans.size() > 0 && statement.getChildren("structure").size() > wholeOrgans.size()) { // collect ids and chars from wholeOrgans ArrayList<Element> chars = new ArrayList<Element>(); ArrayList<String> woids = new ArrayList<String>(); for (Element wo : wholeOrgans) { woids.add(wo.getAttributeValue("id")); chars.addAll((List<Element>) wo.getChildren("character")); wo.detach(); } // integration Element firststructure = (Element) pathNonWholeOrganismStructure.selectSingleNode(statement); String sid = firststructure.getAttributeValue("id"); // add characters for (Element chara : chars) { chara.detach(); firststructure.addContent(chara); } // replace ids in relations for (String woid : woids) { Utilities.changeIdsInRelations(woid, sid, root); } } } } /** * character statements that contain 1 structure "whole_organism" * these were caused by annotation errors * for example "IO4", "A-B contact", "bony stays" etc. * * @param characterstatements * : character statements that contain 1 structure "whole_organism". This should not be possible. Remark it as structure/entity * @param root */ @SuppressWarnings("unchecked") private void repairWholeOrganismOnlyCharacterStatements(List<Element> characterstatements, Element root) throws Exception { for (Element statement : characterstatements) { List<Element> structures = pathNonWholeOrganismStructure.selectNodes(statement); if (structures.size() == 0) { // repair Element etext = (Element) pathText.selectSingleNode(statement); String text = etext.getTextTrim().replaceAll("\\[.*?\\]", ""); String struct = text.replaceFirst(".* ", ""); String constraint = text.replace(struct, "").trim(); List<Element> wos = XPath.selectNodes(statement, ".//structure[@name='"+ApplicationUtilities.getProperty("unknown.structure.name")+"']"); if (wos.size() > 0) { for (int i = 1; i < wos.size(); i++) { wos.get(i).detach(); wos.remove(i); } Element wo = wos.get(0); wo.setAttribute("name", struct); wo.setAttribute("constraint", constraint); wo.removeContent(); } } } } /** * @param args */ public static void main(String[] args) { try{ File dir = new File(ApplicationUtilities.getProperty("source.dir")+"final"); File[] files = dir.listFiles(); for(File f: files){ //File f = new File(ApplicationUtilities.getProperty("source.dir")+"test", "Swartz 2012.xml_states356.xml"); SAXBuilder builder = new SAXBuilder(); Document xml = builder.build(f); Element root = xml.getRootElement(); new XMLNormalizer(root).normalize(); } }catch(Exception e){ LOGGER.error("", e); } } }