/* * $HeadURL$ * $Id$ * Copyright (c) 2006-2014 by Public Library of Science http://plos.org http://ambraproject.org * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ambraproject.rhino.shared; import org.ambraproject.util.TextUtils; import org.ambraproject.views.AuthorView; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.DocumentFragment; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import javax.xml.transform.TransformerException; import javax.xml.xpath.XPathException; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.regex.Pattern; /** * Contains logic for extracting author information from article XML. * <p/> * TODO: store all this data in the database instead at ingest time, and then * destroy this class with prejudice. */ public final class AuthorsXmlExtractor { private static final Logger log = LoggerFactory.getLogger(AuthorsXmlExtractor.class); /** * Patterns for <corresp></corresp> | <email></email> | <sec></sec> etc., tags and other content */ public static final Pattern[] PATTERNS = { Pattern.compile("<corresp(.*?)>"), Pattern.compile("</corresp>"), Pattern.compile("<email(?:" + "(?:\\s+xmlns:xlink\\s*=\\s*\"http://www.w3.org/1999/xlink\"\\s*)|" + "(?:\\s+xlink:type\\s*=\\s*\"simple\"\\s*)" + ")*>(.*?)</email>"), Pattern.compile("^E-mail:"), Pattern.compile("^\\* E-mail:"), Pattern.compile("\\*To whom"), Pattern.compile("\\* To whom"), Pattern.compile("<sec(?:.*)*>"), Pattern.compile("</sec>"), Pattern.compile("<list-item>"), Pattern.compile("</list-item>"), Pattern.compile("</list>"), Pattern.compile("<list(\\s+list-type=\"bullet\")?>"), Pattern.compile("<list\\s+list-type=\"(.*)\">"), Pattern.compile("<list(?:.*)*>"), Pattern.compile("<title(?:.*)*>"), Pattern.compile("<body[^>]*>"), Pattern.compile("</body>") }; /** * Pattern replacements for <corresp></corresp> | <email></email> | <sec></sec> etc., tags and other content */ public static final String[] REPLACEMENTS = { "", "", "<a href=\"mailto:$1\">$1</a>", "<span class=\"email\">* E-mail:</span>", "<span class=\"email\">* E-mail:</span>", "<span class=\"email\">*</span>To whom", "<span class=\"email\">*</span>To whom", "", "", "\n<li>", "</li>", "</ul>", "<ul class=\"bulletlist\">", "<ol class=\"$1\">", "<ul class=\"bulletlist\">", "", "", "" }; private AuthorsXmlExtractor() {} /** * Retrieves the authors as {@link AuthorView}s from article XML. * * @param doc parsed representation of the article XML * @param xpath XPathExtractor to use to process xpath expressions * @return list of AuthorView objects */ public static List<AuthorView> getAuthors(Document doc, XPathExtractor xpath) { ArrayList<AuthorView> list = new ArrayList<AuthorView>(); if (doc == null) { return list; } try { Map<String, String> affiliateMap = getAffiliateMap(doc, xpath); Map<String, String> addressMap = getAddressMap(doc, xpath); Map<String, String> otherFootnotesMap = getOtherFootnotesMap(doc, xpath); //Get all the authors NodeList authorList = xpath.selectNodes(doc, "//contrib-group/contrib[@contrib-type='author']"); for (int i = 0; i < authorList.getLength(); i++) { Node authorNode = authorList.item(i); //Create temp author document fragment to search out of DocumentFragment authorDoc = doc.createDocumentFragment(); //I thought this strange, appendChild actually moves the node in the case of document fragment //hence below I clone to keep the original DOM intact. //re: http://docs.oracle.com/javase/1.4.2/docs/api/org/w3c/dom/Node.html#appendChild%28org.w3c.dom.Node%29 authorDoc.appendChild(authorNode.cloneNode(true)); Node surNameNode = xpath.selectNode(authorDoc, "//name/surname"); Node givenNameNode = xpath.selectNode(authorDoc, "//name/given-names"); Node collabNameNode = xpath.selectNode(authorDoc, "//collab"); Node behalfOfNode = xpath.selectNode(authorDoc, "//on-behalf-of"); NodeList otherFootnotesNodeList = xpath.selectNodes(authorDoc, "//xref[@ref-type='fn']"); //Sometimes, an author is not a person, but a collab //Note:10.1371/journal.pone.0032315 if (surNameNode == null && givenNameNode == null) { if(collabNameNode != null) { //If current node is a collab author. Make sure previous author //Is not marked as "on behalf of" If so, we can ignore this collab //It is assumed this collab contains the same text as the value of the //Previous authors "on behalf of" node if(list.size() > 0) { if(list.get(list.size() - 1).getOnBehalfOf() != null) { //Craziness ensues here. Previous author has "on behalf of", lets append any //footnotes from this contrib to that author! for(int a = 0; a < otherFootnotesNodeList.getLength(); a++) { Node node = otherFootnotesNodeList.item(a); if(node.getAttributes().getNamedItem("rid") != null) { String id = node.getAttributes().getNamedItem("rid").getTextContent(); String value = otherFootnotesMap.get(id); if(value != null) { AuthorView av = list.get(list.size() - 1); //This may look a bit odd, but because the AuthorView is immutable //I have to create a new copy to change any values List<String> footnotes = new ArrayList<String>(); footnotes.addAll(av.getCustomFootnotes()); value = fixPilcrow(value, false); footnotes.add(value); list.set(list.size() - 1, AuthorView.builder(av) .setCustomFootnotes(footnotes) .build()); } } } break; } } } givenNameNode = collabNameNode; } // If both of these are null then don't bother to add if (surNameNode != null || givenNameNode != null) { Node suffixNode = xpath.selectNode(authorDoc, "//name/suffix"); Node equalContribNode = xpath.selectNode(authorDoc, "//@equal-contrib"); Node deceasedNode = xpath.selectNode(authorDoc, "//@deceased"); Node corresAuthorNode = xpath.selectNode(authorDoc, "//xref[@ref-type='corresp']"); NodeList addressList = xpath.selectNodes(authorDoc, "//xref[@ref-type='fn']/sup[contains(text(),'¤')]/.."); NodeList affList = xpath.selectNodes(authorDoc, "//xref[@ref-type='aff']"); // Either surname or givenName can be blank String surname = (surNameNode == null) ? null : surNameNode.getTextContent(); String givenName = (givenNameNode == null) ? null : givenNameNode.getTextContent(); String suffix = (suffixNode == null) ? null : suffixNode.getTextContent(); String onBehalfOf = (behalfOfNode == null) ? null : behalfOfNode.getTextContent(); boolean equalContrib = (equalContribNode != null); boolean deceased = (deceasedNode != null); boolean relatedFootnote = false; String corresponding = null; List<String> currentAddresses = new ArrayList<String>(); for(int a = 0; a < addressList.getLength(); a++) { Node addressNode = addressList.item(a); if(addressNode.getAttributes().getNamedItem("rid") != null) { String fnId = addressNode.getAttributes().getNamedItem("rid").getTextContent(); String curAddress = addressMap.get(fnId); //A fix for PBUG-153, sometimes addresses are null because of weird XML if(curAddress == null) { log.warn("No found current-aff footnote found for fnID: {}", fnId); } else { if(currentAddresses.size() > 0) { //If the current address is already defined, remove "current" text from subsequent //addresses currentAddresses.add(fixCurrentAddress(curAddress)); } else { currentAddresses.add(curAddress); } } } } //Footnotes //Note this web page for notes on author footnotes: //http://wiki.plos.org/pmwiki.php/Publications/FootnoteSymbolOrder List<String> otherFootnotes = new ArrayList<String>(); for(int a = 0; a < otherFootnotesNodeList.getLength(); a++) { Node node = otherFootnotesNodeList.item(a); if(node.getAttributes().getNamedItem("rid") != null) { String id = node.getAttributes().getNamedItem("rid").getTextContent(); String value = otherFootnotesMap.get(id); if(value != null) { //If the current footnote is also referenced by another contrib //We want to notify the end user of the relation if(hasRelatedFootnote(doc, xpath, id)) { value = fixPilcrow(value, true); relatedFootnote = true; } else { value = fixPilcrow(value, false); } otherFootnotes.add(value); } } } if(corresAuthorNode != null) { Node attr = corresAuthorNode.getAttributes().getNamedItem("rid"); if(attr == null) { log.warn("No rid attribute found for xref ref-type=\"corresp\" node."); } else { String rid = attr.getTextContent(); Node correspondAddrNode = xpath.selectNode(doc, "//author-notes/corresp[@id='" + rid + "']"); if(correspondAddrNode == null) { log.warn("No node found for corrsponding author: author-notes/corresp[@id='\" + rid + \"']"); } else { corresponding = TextUtils.getAsXMLString(correspondAddrNode); corresponding = transFormCorresponding(corresponding); } } } List<String> affiliations = new ArrayList<String>(); // Build a list of affiliations for this author for (int j = 0; j < affList.getLength(); j++) { Node anode = affList.item(j); if(anode.getAttributes().getNamedItem("rid") != null) { String affId = anode.getAttributes().getNamedItem("rid").getTextContent(); String affValue = affiliateMap.get(affId); //A fix for PBUG-149, sometimes we get wacky XML. This should handle it so at least the //List returned by this method is well structured if(affValue != null) { affiliations.add(affValue); } } } AuthorView author = AuthorView.builder() .setGivenNames(givenName) .setSurnames(surname) .setSuffix(suffix) .setOnBehalfOf(onBehalfOf) .setEqualContrib(equalContrib) .setDeceased(deceased) .setRelatedFootnote(relatedFootnote) .setCorresponding(corresponding) .setCurrentAddresses(currentAddresses) .setAffiliations(affiliations) .setCustomFootnotes(otherFootnotes) .build(); list.add(author); } } } catch (Exception e) { //TODO: Why does this die silently? log.error("Error occurred while gathering the author affiliations.", e); } return list; } /** * Grab all affiliations and put them into their own map * * @param doc the article XML document * @param xpath XPathExtractor to use to process xpath expressions * @return a Map of affiliate IDs and values */ public static Map<String, String> getAffiliateMap(Document doc, XPathExtractor xpath) throws XPathException { Map<String, String> affiliateMap = new LinkedHashMap<String, String>(); NodeList affiliationNodeList = xpath.selectNodes(doc, "//aff"); //Map all affiliation id's to their affiliation strings for (int a = 0; a < affiliationNodeList.getLength(); a++) { Node node = affiliationNodeList.item(a); // Not all <aff>'s have the 'id' attribute. String id = (node.getAttributes().getNamedItem("id") == null) ? "" : node.getAttributes().getNamedItem("id").getTextContent(); log.debug("Found affiliation node:" + id); // Not all <aff> id's are affiliations. if (id.startsWith("aff")) { DocumentFragment df = doc.createDocumentFragment(); //because of a org.w3c.Document.dom.Document peculiarity, simple appellation will strip it from the source and //cause bugs, so we need cloning technology df.appendChild(node.cloneNode(true)); StringBuilder res = new StringBuilder(); if(xpath.selectNode(df, "//institution") != null) { res.append(xpath.selectString(df, "//institution")); } if(xpath.selectNode(df, "//addr-line") != null) { if(res.length() > 0) { res.append(" "); } res.append(xpath.selectString(df, "//addr-line")); } affiliateMap.put(id, res.toString()); } } return affiliateMap; } /** * Grab all addresses and put them into their own map * * @param doc the article XML document * @param xpath XPathExtractor to use to process xpath expressions * @return a Map of address IDs and values */ private static Map<String, String> getAddressMap(Document doc, XPathExtractor xpath) throws XPathException { Map<String, String> addressMap = new HashMap<String, String>(); //Grab all the Current address information and place them into a map NodeList currentAddressNodeList = xpath.selectNodes(doc, "//fn[@fn-type='current-aff']"); for (int a = 0; a < currentAddressNodeList.getLength(); a++) { Node node = currentAddressNodeList.item(a); String id = (node.getAttributes().getNamedItem("id") == null) ? "" : node.getAttributes().getNamedItem("id").getTextContent(); log.debug("Current address node:" + id); DocumentFragment df = doc.createDocumentFragment(); df.appendChild(node); String address = xpath.selectString(df, "//p"); addressMap.put(id, address); } return addressMap; } /** * Grab all footnotes and put them into their own map * * @param doc the article XML document * @param xpath XPathExtractor to use to process xpath expressions * @return a Map of footnote IDs and values */ private static Map<String, String> getOtherFootnotesMap(Document doc, XPathExtractor xpath) throws XPathException, TransformerException { Map<String, String> otherFootnotesMap = new HashMap<String, String>(); //Grab all 'other' footnotes and put them into their own map NodeList footnoteNodeList = xpath.selectNodes(doc, "//fn[@fn-type='other']"); for (int a = 0; a < footnoteNodeList.getLength(); a++) { Node node = footnoteNodeList.item(a); // Not all <aff>'s have the 'id' attribute. String id = (node.getAttributes().getNamedItem("id") == null) ? "" : node.getAttributes().getNamedItem("id").getTextContent(); log.debug("Found footnote node:" + id); DocumentFragment df = doc.createDocumentFragment(); df.appendChild(node); String footnote = TextUtils.getAsXMLString(xpath.selectNode(df, "//p")); otherFootnotesMap.put(id, footnote); } return otherFootnotesMap; } /** * Reformat html embedded into the XML into something more easily styled on the front end * * @param source html fragment * @param prependHTML if true, append a html snippet for a 'pilcro' * * @return html fragment */ private static String fixPilcrow(String source, boolean prependHTML) { String destination; if(prependHTML) { destination = source.replace("<sup>¶</sup>", "<span class=\"rel-footnote\">¶</span>"); destination = destination.replaceAll("^<p>¶?\\s*", "<p><span class=\"rel-footnote\">¶</span>"); } else { destination = source.replace("<sup>¶</sup>", ""); destination = destination.replaceAll("^<p>¶?\\s*", "<p>"); } return destination; } /** * Remove "current" text from an address field * * @param source text fragment * * @return text fragment */ private static String fixCurrentAddress(String source) { String destination; destination = source.replaceAll("Current\\s[Aa]ddress:\\s*", ""); return destination; } /** * Check to see if the current footnote is referenced by multiple contribs * If the current footnote is also referenced by another contrib * We want to notify the end user of the relation * * @param doc the document * @param xpathExtractor XPathExtractor to use to process xpath expressions * @param rid the rid to search for, the RID is an attribute of a footnote that * attaches a footnote to one or many authors * * @return true if the rid is referenced by contribs more then once * * * @throws XPathExpressionException */ private static boolean hasRelatedFootnote(Node doc, XPathExtractor xpathExtractor, String rid) throws XPathException { String xpath = "//contrib/xref[@ref-type='fn' and @rid='" + rid + "']"; log.debug("xpath: {}", xpath); NodeList nl = xpathExtractor.selectNodes(doc, xpath); log.debug("nodecount: {}", nl.getLength()); if(nl.getLength() > 1) { return true; } else { return false; } } /** * Kludge for FEND-794, A better ways of doing this? * * Reformat html embedded into the XML into something more easily styled on the front end * * @param source html fragment * * @return html fragment */ public static String transFormCorresponding(String source) { for (int index = 0; index < PATTERNS.length; index++) { source = PATTERNS[index].matcher(source).replaceAll(REPLACEMENTS[index]); } return source; } }