package eu.dnetlib.iis.wf.ingest.pmc.metadata;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ATTR_AFFILIATION_ID;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ATTR_AFFILIATION_XREF;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ATTR_CONTRIBUTOR_TYPE;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ATTR_VALUE_AUTHOR;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ATTR_XREF_ID;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ATTR_XREF_TYPE;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ELEM_AFFILIATION;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ELEM_ARTICLE_ID;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ELEM_CONTRIBUTOR;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ELEM_CONTRIBUTOR_GROUP;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ELEM_FPAGE;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ELEM_GIVEN_NAMES;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ELEM_LABEL;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ELEM_LPAGE;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ELEM_NAME;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ELEM_SUP;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ELEM_SURNAME;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.ELEM_XREF;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.PUB_ID_TYPE;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.TagHierarchyUtils.hasAmongParents;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.TagHierarchyUtils.isElement;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.TagHierarchyUtils.isWithinElement;
import java.util.List;
import java.util.Stack;
import org.apache.commons.lang.StringUtils;
import org.jdom.Element;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import com.google.common.collect.Lists;
import eu.dnetlib.iis.common.importer.CermineAffiliation;
import eu.dnetlib.iis.common.importer.CermineAffiliationBuilder;
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.Affiliation;
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.Author;
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.ExtractedDocumentMetadata;
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.Range;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;
import pl.edu.icm.cermine.metadata.affiliation.CRFAffiliationParser;
/**
 * SAX XML handler of the {@code <article-meta>} tag in JATS XML.
 * <p>
 * While receiving SAX events it fills the supplied {@link ExtractedDocumentMetadata.Builder} with
 * external identifiers, the page range, parsed affiliations and authors (together with the positions
 * of the affiliations each author is linked to).
 *
 * @author madryk
 */
public class ArticleMetaXmlHandler extends DefaultHandler implements ProcessingFinishedAwareXmlHandler {

    /**
     * Maximum affiliation length required due to Mallet library limitation causing StackOverflowError
     * https://github.com/openaire/iis/issues/663
     */
    private static final int MAX_AFF_LENGTH = 3000;

    /** Names of the currently open elements, innermost on top. */
    private Stack<String> parents;

    /** Target of all extracted metadata. */
    private final ExtractedDocumentMetadata.Builder builder;

    /**
     * Character data received since the most recent start or end tag. A buffer is used because a SAX
     * parser is allowed to report a single text node in several {@link #characters} calls.
     */
    private final StringBuilder currentText = new StringBuilder();

    private final CermineAffiliationBuilder cermineAffiliationBuilder = new CermineAffiliationBuilder();
    private final CermineToIngestAffConverter cermineToIngestAffConverter = new CermineToIngestAffConverter();

    /** Value of the id-type attribute of the article-id element being processed, may be null. */
    private String currentArticleIdType;

    /** Raw text of the affiliation being processed (label and sup content excluded). */
    private final StringBuilder affiliationText = new StringBuilder();

    /** Value of the id attribute of the most recently opened affiliation element, may be null. */
    private String currentAffiliationId;

    /** Text collected for the name part (surname or given names) of the current author. */
    private final StringBuilder authorText = new StringBuilder();

    /** Author being processed; null when outside of a contributor which is an author. */
    private JatsAuthor currentAuthor;

    /** Authors of the contributor group which is currently open. */
    private final List<JatsAuthor> currentAuthorsGroup = Lists.newArrayList();

    /** Authors of all contributor groups closed so far. */
    private final List<JatsAuthor> currentAuthors = Lists.newArrayList();

    //------------------------ CONSTRUCTORS --------------------------

    /**
     * @param builder metadata builder to be filled with the extracted values
     */
    public ArticleMetaXmlHandler(ExtractedDocumentMetadata.Builder builder) {
        super();
        this.builder = builder;
        this.parents = new Stack<String>();
    }

    //------------------------ LOGIC --------------------------

    /**
     * Resets the stack of open elements and initializes affiliations and authors with empty lists.
     */
    @Override
    public void startDocument() throws SAXException {
        this.parents = new Stack<String>();
        builder.setAffiliations(Lists.newArrayList());
        builder.setAuthors(Lists.newArrayList());
    }

    /**
     * Records the attributes needed later on (affiliation id, article id type, affiliation
     * references of an author), opens a new author for author-typed contributors and pushes
     * the element name onto the stack of open elements.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        // every tag boundary starts a new text run
        currentText.setLength(0);

        if (isElement(qName, ELEM_AFFILIATION)) {
            currentAffiliationId = attributes.getValue(ATTR_AFFILIATION_ID);
        } else if (isElement(qName, ELEM_ARTICLE_ID)) {
            currentArticleIdType = attributes.getValue(PUB_ID_TYPE);
        } else if (isElement(qName, ELEM_CONTRIBUTOR)) {
            if (ATTR_VALUE_AUTHOR.equals(attributes.getValue(ATTR_CONTRIBUTOR_TYPE))) {
                currentAuthor = new JatsAuthor();
            }
        } else if (isWithinElement(qName, ELEM_XREF, parents, ELEM_CONTRIBUTOR)) {
            if (currentAuthor != null && ATTR_AFFILIATION_XREF.equals(attributes.getValue(ATTR_XREF_TYPE))) {
                String affId = attributes.getValue(ATTR_XREF_ID);
                if (affId != null) {
                    currentAuthor.getAffiliationRefId().add(affId);
                }
            }
        } else if (isWithinElement(qName, ELEM_SURNAME, parents, ELEM_NAME)
                || isWithinElement(qName, ELEM_GIVEN_NAMES, parents, ELEM_NAME)) {
            // drop whatever text of the contributor preceded the name part (e.g. identifiers or
            // reference labels), otherwise it would get glued in front of the surname / given names
            authorText.setLength(0);
        }
        this.parents.push(qName);
    }

    /**
     * Appends the received chunk to the buffers it belongs to. May be called many times for a single
     * text node, therefore nothing is overwritten here.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        currentText.append(ch, start, length);

        if (hasAmongParents(parents, ELEM_AFFILIATION)) {
            // skipping affiliation position element
            boolean withinPositionMarker = hasAmongParents(parents, ELEM_LABEL) || hasAmongParents(parents, ELEM_SUP);
            if (!withinPositionMarker) {
                affiliationText.append(ch, start, length);
            }
        } else if (currentAuthor != null) {
            authorText.append(ch, start, length);
        }
    }

    /**
     * Pops the closed element from the stack of open elements and stores the value it carried:
     * external identifier, first / last page, affiliation, author name parts, finished author
     * or finished group of authors.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        // popped up front so that the checks below see the parent of the closed element on top
        this.parents.pop();

        // complete, trimmed text run which directly precedes this end tag (empty when there was none)
        String elementText = currentText.toString().trim();
        currentText.setLength(0);

        if (isElement(qName, ELEM_ARTICLE_ID) && currentArticleIdType != null) {
            builder.getExternalIdentifiers().put(currentArticleIdType, elementText);
        } else if (isElement(qName, ELEM_FPAGE)) {
            ensurePages().setStart(elementText);
        } else if (isElement(qName, ELEM_LPAGE)) {
            ensurePages().setEnd(elementText);
        } else if (isElement(qName, ELEM_AFFILIATION)) {
            handleAffiliation();
        } else if (isElement(qName, ELEM_CONTRIBUTOR) || hasAmongParents(parents, ELEM_CONTRIBUTOR)) {
            if (currentAuthor != null) { // currently handling a contributor which is an author
                handleAuthorElementEnd(qName);
            }
        } else if (isElement(qName, ELEM_CONTRIBUTOR_GROUP)) {
            currentAuthors.addAll(currentAuthorsGroup);
            currentAuthorsGroup.clear();
        }
    }

    /**
     * Converts all collected authors into {@link Author} records and adds them to the builder.
     */
    @Override
    public void endDocument() throws SAXException {
        for (JatsAuthor pmcAuthor : currentAuthors) {
            // NOTE(review): an author without surname or given names (no name element seen) ends up
            // with the literal text "null" in the fullname - confirm whether this is intended
            Author author = Author.newBuilder()
                    .setFullname(pmcAuthor.getSurname() + ", " + pmcAuthor.getGivenNames())
                    .setAffiliationPositions(pmcAuthor.getAffiliationPos())
                    .build();
            builder.getAuthors().add(author);
        }
    }

    /**
     * @return true when every element opened so far has been closed
     */
    @Override
    public boolean hasFinished() {
        return parents.isEmpty();
    }

    //------------------------ PRIVATE --------------------------

    /**
     * Returns the page range held by the builder, creating an empty one first when it is missing.
     */
    private Range ensurePages() {
        if (builder.getPages() == null) {
            builder.setPages(Range.newBuilder().build());
        }
        return builder.getPages();
    }

    /**
     * Handles the end of an element located within (or being) a contributor which is an author:
     * stores surname / given names or finishes the author.
     */
    private void handleAuthorElementEnd(String qName) {
        if (isWithinElement(qName, ELEM_SURNAME, parents, ELEM_NAME)) {
            currentAuthor.setSurname(authorText.toString().trim());
            authorText.setLength(0);
        } else if (isWithinElement(qName, ELEM_GIVEN_NAMES, parents, ELEM_NAME)) {
            currentAuthor.setGivenNames(authorText.toString().trim());
            authorText.setLength(0);
        } else if (isElement(qName, ELEM_CONTRIBUTOR)) {
            currentAuthorsGroup.add(currentAuthor);
            authorText.setLength(0);
            currentAuthor = null;
        }
    }

    /**
     * Parses the collected affiliation text, registers the outcome in the builder and links it
     * with the proper authors. The affiliation text buffer is always cleared afterwards.
     */
    private void handleAffiliation() throws SAXException {
        Affiliation currentAffiliation = buildAffiliation();
        if (currentAffiliation != null) {
            // position of the affiliation on the list, authors refer to affiliations by it
            int currentAffiliationPosition = builder.getAffiliations().size();
            builder.getAffiliations().add(currentAffiliation);
            assignAuthorsForAffiliation(currentAffiliationPosition);
        }
        affiliationText.setLength(0);
    }

    /**
     * Builds an affiliation out of the collected affiliation text.
     *
     * @return parsed affiliation or null when the text is blank, longer than
     *         {@link #MAX_AFF_LENGTH} or the parser returned nothing
     * @throws SAXException wrapping the exception raised by the affiliation parser or builder
     */
    private Affiliation buildAffiliation() throws SAXException {
        String affStr = this.affiliationText.toString();
        if (!StringUtils.isNotBlank(affStr) || affStr.length() > MAX_AFF_LENGTH) {
            return null;
        }
        try {
            CRFAffiliationParser affiliationParser = new CRFAffiliationParser();
            Element parsedAffiliation = affiliationParser.parse(affStr);
            if (parsedAffiliation == null) {
                return null;
            }
            CermineAffiliation cAff = cermineAffiliationBuilder.build(parsedAffiliation);
            return cermineToIngestAffConverter.convert(cAff);
        } catch (TransformationException | AnalysisException e) {
            throw new SAXException("unexpected exception while parsing "
                    + "affiliations for document: " + builder.getId(), e);
        }
    }

    /**
     * Links the affiliation stored at the given position with authors. Which authors are linked
     * depends on where the affiliation element is located:
     * within a contributor - the current author only,
     * within a contributor group - all authors of that group collected so far,
     * elsewhere - all already collected authors which refer to the id of the affiliation.
     */
    private void assignAuthorsForAffiliation(int currentAffiliationPosition) throws SAXException {
        if (hasAmongParents(parents, ELEM_CONTRIBUTOR)) {
            if (currentAuthor != null) {
                currentAuthor.getAffiliationPos().add(currentAffiliationPosition);
            }
        } else if (hasAmongParents(parents, ELEM_CONTRIBUTOR_GROUP)) {
            for (JatsAuthor author : currentAuthorsGroup) {
                author.getAffiliationPos().add(currentAffiliationPosition);
            }
        } else if (currentAffiliationId != null) {
            for (JatsAuthor author : currentAuthors) {
                for (String affRefId : author.getAffiliationRefId()) {
                    if (StringUtils.equals(currentAffiliationId, affRefId)) {
                        author.getAffiliationPos().add(currentAffiliationPosition);
                    }
                }
            }
        }
    }
}