package eu.dnetlib.iis.wf.ingest.pmc.metadata;

import static eu.dnetlib.iis.wf.ingest.pmc.metadata.JatsXmlConstants.*;
import static eu.dnetlib.iis.wf.ingest.pmc.metadata.TagHierarchyUtils.*;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Stack;

import org.apache.commons.lang.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import eu.dnetlib.iis.ingest.pmc.metadata.schemas.ExtractedDocumentMetadata;
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.Range;
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.ReferenceBasicMetadata;
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.ReferenceMetadata;

/**
 * Sax xml handler of {@code <ref-list>} tag in pmc xml.
 * <p>
 * For every {@code ref} element a {@link ReferenceMetadata} record is built and appended
 * to the references held by the {@link ExtractedDocumentMetadata.Builder} given at construction time.
 *
 * @author mhorst
 * @author madryk
 *
 */
public class RefListXmlHandler extends DefaultHandler implements ProcessingFinishedAwareXmlHandler {

    /** Names of the currently open elements, innermost on top. */
    private final Stack<String> parents;

    private final ExtractedDocumentMetadata.Builder builder;

    /** Character data collected since the last reset performed in {@link #startElement}. */
    private final StringBuilder currentValue = new StringBuilder();

    private ReferenceMetadata.Builder currentRefMetaBuilder;

    private String currentSurname;

    private String currentGivenNames;

    private List<CharSequence> currentRefAuthorList;

    /** Character data collected from the descendants of the current {@code ref} element. */
    private final StringBuilder currentReferenceText = new StringBuilder();

    /** Set once a citation element directly under {@code ref} contributed non-whitespace text of its own. */
    private boolean currentReferenceTextExplicitlySet = false;

    private String currentReferenceIdType;


    //------------------------ CONSTRUCTORS --------------------------

    /**
     * @param builder document metadata builder receiving the extracted references
     */
    public RefListXmlHandler(ExtractedDocumentMetadata.Builder builder) {
        super();
        this.builder = builder;
        this.parents = new Stack<String>();
    }


    //------------------------ LOGIC --------------------------

    /**
     * Resets the value buffer for elements whose text is captured, remembers the identifier type
     * of a {@code pub-id} placed directly in a citation element and initializes per-reference state
     * when a {@code ref} element opens. The element name is pushed on the parents stack afterwards.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        if (isWithinElement(qName, ELEM_SURNAME, parents, ELEM_NAME)
                || isWithinElement(qName, ELEM_GIVEN_NAMES, parents, ELEM_NAME)
                || hasAmongParents(qName, ELEM_ARTICLE_TITLE, parents, ELEM_REF)
                || hasAmongParents(qName, ELEM_SOURCE, parents, ELEM_REF)
                || hasAmongParents(qName, ELEM_YEAR, parents, ELEM_REF)
                || hasAmongParents(qName, ELEM_VOLUME, parents, ELEM_REF)
                || hasAmongParents(qName, ELEM_ISSUE, parents, ELEM_REF)
                || hasAmongParents(qName, ELEM_FPAGE, parents, ELEM_REF)
                || hasAmongParents(qName, ELEM_LPAGE, parents, ELEM_REF)) {
            currentValue.setLength(0);
        } else if (isWithinElement(qName, ELEM_PUB_ID, parents, ELEM_CITATION)
                || isWithinElement(qName, ELEM_PUB_ID, parents, ELEM_ELEMENT_CITATION)
                || isWithinElement(qName, ELEM_PUB_ID, parents, ELEM_MIXED_CITATION)) {
            currentReferenceIdType = attributes.getValue(PUB_ID_TYPE);
            currentValue.setLength(0);
        } else if (isElement(qName, ELEM_REF)) {
            currentRefMetaBuilder = ReferenceMetadata.newBuilder();
            currentRefAuthorList = new ArrayList<CharSequence>();
            currentReferenceText.setLength(0);
            ReferenceBasicMetadata.Builder basicMetaBuilder = ReferenceBasicMetadata.newBuilder();
            basicMetaBuilder.setExternalIds(new HashMap<CharSequence, CharSequence>());
            currentRefMetaBuilder.setBasicMetadata(basicMetaBuilder.build());
        }
        parents.push(qName);
    }

    /**
     * Pops the closed element from the parents stack and stores the collected value in the field
     * matching that element. Closing a {@code ref} element finalizes the reference and appends it
     * to the document metadata builder.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        parents.pop();
        if (hasAmongParents(qName, ELEM_ARTICLE_TITLE, parents, ELEM_REF)) {
            currentRefMetaBuilder.getBasicMetadata().setTitle(currentValue.toString());
        } else if (hasAmongParents(qName, ELEM_SOURCE, parents, ELEM_REF)) {
            currentRefMetaBuilder.getBasicMetadata().setSource(currentValue.toString());
        } else if (hasAmongParents(qName, ELEM_YEAR, parents, ELEM_REF)) {
            currentRefMetaBuilder.getBasicMetadata().setYear(currentValue.toString());
        } else if (hasAmongParents(qName, ELEM_VOLUME, parents, ELEM_REF)) {
            currentRefMetaBuilder.getBasicMetadata().setVolume(currentValue.toString());
        } else if (hasAmongParents(qName, ELEM_ISSUE, parents, ELEM_REF)) {
            currentRefMetaBuilder.getBasicMetadata().setIssue(currentValue.toString());
        } else if (hasAmongParents(qName, ELEM_FPAGE, parents, ELEM_REF)) {
            getOrCreatePages(currentRefMetaBuilder.getBasicMetadata()).setStart(currentValue.toString());
        } else if (hasAmongParents(qName, ELEM_LPAGE, parents, ELEM_REF)) {
            getOrCreatePages(currentRefMetaBuilder.getBasicMetadata()).setEnd(currentValue.toString());
        } else if (hasAmongParents(qName, ELEM_PUB_ID, parents, ELEM_REF)) {
            if (currentReferenceIdType != null) {
                currentRefMetaBuilder.getBasicMetadata().getExternalIds().put(
                        currentReferenceIdType, currentValue.toString());
            }
            // the type belongs to this pub-id only; clearing it prevents a later pub-id, for which
            // startElement did not capture a type, from being stored under a stale one
            currentReferenceIdType = null;
        } else if (isWithinElement(qName, ELEM_SURNAME, parents, ELEM_NAME)) {
            currentSurname = currentValue.toString();
        } else if (isWithinElement(qName, ELEM_GIVEN_NAMES, parents, ELEM_NAME)) {
            currentGivenNames = currentValue.toString();
        } else if (hasAmongParents(qName, ELEM_NAME, parents, ELEM_REF)) {
            // in element-citation names are nested in person-group
            String authorName = buildAuthorName(currentSurname, currentGivenNames);
            if (authorName != null) {
                currentRefAuthorList.add(authorName);
            }
            currentSurname = null;
            currentGivenNames = null;
        } else if (isCitationWithinRef(qName)) {
            if (!currentRefMetaBuilder.hasText() && currentReferenceTextExplicitlySet
                    && currentReferenceText.length() > 0) {
                String trimmedRefText = currentReferenceText.toString().trim().replaceAll(" +", " ");
                if (!trimmedRefText.isEmpty()) {
                    currentRefMetaBuilder.setText(trimmedRefText);
                }
            }
        } else if (isElement(qName, ELEM_REF)) {
            if (builder.getReferences() == null) {
                builder.setReferences(new ArrayList<ReferenceMetadata>());
            }
            currentRefMetaBuilder.setPosition(builder.getReferences().size() + 1);
            if (currentRefAuthorList != null && currentRefAuthorList.size() > 0) {
                currentRefMetaBuilder.getBasicMetadata().setAuthors(currentRefAuthorList);
            }
            if (!currentRefMetaBuilder.hasText()) {
                // no text taken from the citation element: compose it from the structured fields
                currentRefMetaBuilder.setText(generateReferenceRawText(
                        currentRefMetaBuilder.getBasicMetadata()));
            }
            builder.getReferences().add(currentRefMetaBuilder.build());
            // reference fields cleanup
            currentRefMetaBuilder = null;
            currentRefAuthorList = null;
            currentReferenceText.setLength(0);
            currentReferenceTextExplicitlySet = false;
            currentReferenceIdType = null;
        }
    }

    /**
     * Appends the character chunk to the current value buffer and, when the enclosing element
     * has {@code ref} among its ancestors, to the reference text buffer as well.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (length <= 0) {
            // nothing to append; also keeps ch[start] below from being read for an empty chunk
            return;
        }
        // the current element is taken off the stack so that hierarchy checks see its parents only
        String currentElement = parents.pop();
        try {
            currentValue.append(ch, start, length);
            // handling reference text
            if (hasAmongParents(parents, ELEM_REF)) {
                if (isCitationWithinRef(currentElement) && containsNonWhiteCharacter(ch, start, length)) {
                    // citation element contents
                    currentReferenceTextExplicitlySet = true;
                }
                if (currentReferenceText.length() > 0
                        && isAlphanumeric(ch[start])
                        && isAlphanumeric(currentReferenceText.charAt(currentReferenceText.length() - 1))) {
                    // adding missing space separator between two alphanumeric characters
                    currentReferenceText.append(' ');
                }
                currentReferenceText.append(ch, start, length);
            }
        } finally {
            parents.push(currentElement);
        }
    }

    /**
     * @return true when no element is currently open from this handler's point of view
     */
    @Override
    public boolean hasFinished() {
        return parents.isEmpty();
    }


    //------------------------ PRIVATE --------------------------

    /**
     * Checks whether given element is one of the citation elements placed directly within {@code ref}.
     * Expects the element itself to be already removed from the parents stack.
     */
    private boolean isCitationWithinRef(String elementName) {
        return isWithinElement(elementName, ELEM_CITATION, parents, ELEM_REF)
                || isWithinElement(elementName, ELEM_ELEMENT_CITATION, parents, ELEM_REF)
                || isWithinElement(elementName, ELEM_MIXED_CITATION, parents, ELEM_REF);
    }

    /**
     * Returns pages range of given reference, creating and attaching an empty one when missing.
     */
    private static Range getOrCreatePages(ReferenceBasicMetadata refMeta) {
        if (refMeta.getPages() == null) {
            refMeta.setPages(Range.newBuilder().build());
        }
        return refMeta.getPages();
    }

    /**
     * Builds author name in "surname, given names" form out of the non-blank parts.
     *
     * @return single part when only one is non-blank, null when both are blank
     */
    private static String buildAuthorName(String surname, String givenNames) {
        boolean surnamePresent = StringUtils.isNotBlank(surname);
        boolean givenNamesPresent = StringUtils.isNotBlank(givenNames);
        if (surnamePresent && givenNamesPresent) {
            return surname + ", " + givenNames;
        } else if (surnamePresent) {
            return surname;
        } else if (givenNamesPresent) {
            return givenNames;
        } else {
            return null;
        }
    }

    /**
     * Checks whether character is an ASCII digit or an ASCII letter.
     */
    private static boolean isAlphanumeric(char c) {
        return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    }

    /**
     * Checks whether given range of the array holds at least one non-whitespace character.
     */
    private static boolean containsNonWhiteCharacter(char[] ch, int start, int length) {
        if (ch == null) {
            return false;
        }
        int end = start + length;
        for (int i = start; i < end; i++) {
            if (!Character.isWhitespace(ch[i])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Converts value to string, passing null through.
     */
    private static String toStringOrNull(CharSequence value) {
        return value != null ? value.toString() : null;
    }

    /**
     * Composes reference text out of structured metadata using
     * "authors. title. source. year; volume (issue): fpage-lpage" layout, omitting blank fields.
     */
    private static String generateReferenceRawText(ReferenceBasicMetadata refMeta) {
        String authors = refMeta.getAuthors() != null ? StringUtils.join(refMeta.getAuthors(), ", ") : "";
        String title = toStringOrNull(refMeta.getTitle());
        String source = toStringOrNull(refMeta.getSource());
        String year = toStringOrNull(refMeta.getYear());
        String volume = toStringOrNull(refMeta.getVolume());
        String issue = toStringOrNull(refMeta.getIssue());
        String fpage = refMeta.getPages() != null ? toStringOrNull(refMeta.getPages().getStart()) : null;
        String lpage = refMeta.getPages() != null ? toStringOrNull(refMeta.getPages().getEnd()) : null;

        StringBuilder rawText = new StringBuilder();
        if (StringUtils.isNotBlank(authors)) {
            rawText.append(authors).append(". ");
        }
        if (StringUtils.isNotBlank(title)) {
            rawText.append(title).append(". ");
        }
        if (StringUtils.isNotBlank(source)) {
            rawText.append(source).append(". ");
        }
        if (StringUtils.isNotBlank(year)) {
            rawText.append(year);
        }
        if (StringUtils.isNotBlank(volume)) {
            rawText.append("; ").append(volume);
        }
        if (StringUtils.isNotBlank(issue)) {
            rawText.append(" (").append(issue).append(')');
        }
        if (StringUtils.isNotBlank(fpage)) {
            rawText.append(": ").append(fpage);
        }
        if (StringUtils.isNotBlank(lpage)) {
            rawText.append('-').append(lpage);
        }
        return rawText.toString();
    }
}