package eu.dnetlib.iis.wf.ingest.pmc.plaintext; import com.google.common.base.Joiner; import com.google.common.collect.Lists; import org.jdom.Element; import org.jdom.Namespace; import org.jdom.Text; import org.jdom.output.XMLOutputter; import java.util.List; /** * @author Dominika Tkaczyk * @author Michal Oniszczuk (m.oniszczuk@icm.edu.pl) */ public final class NlmToDocumentTextConverter { /** * Private constructor. */ private NlmToDocumentTextConverter() { } public static String getDocumentText(Element source, Namespace namespace) { Element articleElement = getArticleElement(source, namespace); return Joiner.on("\n").skipNulls().join(getMetadataText(articleElement), getBodyText(articleElement), getReferencesText(articleElement)); } /** * Provides article element as root element or nested inside oai record. * * @param source * @param oaiNamespace * @return article root element or child of oai:metadata element. */ private static Element getArticleElement(Element source, Namespace oaiNamespace) { Element metadata = source.getChild("metadata", oaiNamespace); if (metadata != null) { Element article = metadata.getChild("article"); if (article != null) { return article; } else { throw new RuntimeException("unexpected NLM record contents: " + "article element was not found inside OAI metadata element! Record dump: " + new XMLOutputter().outputString(source)); } } else { // source element is not wrapped with oai:metadata element return source; } } private static String getMetadataText(Element source) { return source.getChild("front") == null ? null : getText(source.getChild("front"), Lists.newArrayList("journal-meta", "article-meta", "abstract")); } private static String getBodyText(Element source) { return source.getChild("body") == null ? null : getText(source.getChild("body"), Lists.newArrayList("sec", "p", "title")); } private static String getReferencesText(Element source) { return source.getChild("back") == null ? null : "References\n" + getText(source.getChild("back"), Lists.newArrayList("ref")); } /** * @param from * Extract text recursively from this element and its children. * @param insertNewlineBefore * Insert newlines before these children. * @return Concatenated text. */ private static String getText(Element from, List<String> insertNewlineBefore) { StringBuilder sb = new StringBuilder(); for (Object child : from.getContent()) { if (child instanceof Element) { String childAsText = getText((Element) child, insertNewlineBefore).trim(); if (!childAsText.isEmpty()) { if (insertNewlineBefore.contains(((Element) child).getName())) { sb.append('\n'); } else { sb.append(' '); } sb.append(childAsText); } } else if (child instanceof Text) { String cont = ((Text) child).getText().trim(); if (!cont.isEmpty()) { sb.append(' '); sb.append(cont); } } } return sb.toString().trim(); } }