package eu.dnetlib.iis.wf.importer.dataset;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Locale;
import java.util.Stack;
import org.apache.log4j.Logger;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import com.google.common.collect.Maps;
import eu.dnetlib.iis.common.InfoSpaceConstants;
import eu.dnetlib.iis.importer.schemas.DataSetReference;
import eu.dnetlib.iis.importer.schemas.DatasetToMDStore;
import eu.dnetlib.iis.wf.importer.RecordReceiver;
/**
 * Datacite XML dump SAX handler.
 * <p>
 * Collects selected fields of every <code>resource</code> element (identifier, creator names,
 * titles, formats, description, publisher, publication year, resource type and alternate
 * identifiers) and, when the resource element ends, emits one {@link DataSetReference} and one
 * {@link DatasetToMDStore} record to the receivers given at construction time.
 * <p>
 * Notice: the receivers are not closed by this handler. They are created outside, so they
 * have to be closed outside as well.
 * @author mhorst
 *
 */
public class DataciteDumpXmlHandler extends DefaultHandler {

    private static final String ELEM_HEADER = "header";
    private static final String ELEM_PAYLOAD = "payload";
    private static final String ELEM_METADATA = "metadata";
    private static final String ELEM_RESOURCE = "resource";
    public static final String ELEM_IDENTIFIER = "identifier";
    public static final String ELEM_OBJ_IDENTIFIER = "objIdentifier";
    private static final String ELEM_CREATOR = "creator";
    private static final String ELEM_CREATOR_NAME = "creatorName";
    private static final String ELEM_TITLES = "titles";
    private static final String ELEM_TITLE = "title";
    private static final String ELEM_DESCRIPTION = "description";
    private static final String ELEM_PUBLISHER = "publisher";
    private static final String ELEM_PUBLICATION_YEAR = "publicationYear";
    private static final String ELEM_FORMATS = "formats";
    private static final String ELEM_FORMAT = "format";
    private static final String ELEM_RESOURCE_TYPE = "resourceType";
    private static final String ELEM_ALTERNATE_IDENTIFIERS = "alternateIdentifiers";
    private static final String ELEM_ALTERNATE_IDENTIFIER = "alternateIdentifier";

    private static final String ATTRIBUTE_ID_TYPE = "identifierType";
    private static final String ATTRIBUTE_RESOURCE_TYPE_GENERAL = "resourceTypeGeneral";
    private static final String ATTRIBUTE_ALTERNATE_IDENTIFIER_TYPE = "alternateIdentifierType";

    // lowercased identifier types
    public static final String ID_TYPE_DOI = "doi";

    /** Marker stored in {@link #captureDepth} when no element text is being captured. */
    private static final int NO_CAPTURE = -1;

    private static final Logger log = Logger.getLogger(DataciteDumpXmlHandler.class);

    /** Names of the currently open ancestor elements, innermost on top. */
    private Stack<String> parents;

    /** Text buffer of the element being captured, null when nothing is captured. */
    private StringBuilder currentValue;

    /** Size of {@link #parents} at the moment the capture started, or {@link #NO_CAPTURE}. */
    private int captureDepth = NO_CAPTURE;

    /** Fields gathered for the resource currently being parsed. */
    private DatasetMetadata datasetMeta = new DatasetMetadata();

    private final RecordReceiver<DataSetReference> datasetReceiver;
    private final RecordReceiver<DatasetToMDStore> datasetToMDStoreReceived;
    private final String mainIdFieldName;
    private final String mdStoreId;

    // ------------------------ LOGIC --------------------------

    /**
     * @param datasetReceiver dataset object receiver
     * @param datasetToMDStoreReceived dataset to mdstore relation receiver
     * @param mainIdFieldName field name to be used as main identifier. Introduced because of differences between MDStore records and XML dump records.
     * @param mdStoreId mdStore identifier
     */
    public DataciteDumpXmlHandler(RecordReceiver<DataSetReference> datasetReceiver,
            RecordReceiver<DatasetToMDStore> datasetToMDStoreReceived,
            String mainIdFieldName, String mdStoreId) {
        super();
        this.datasetReceiver = datasetReceiver;
        this.datasetToMDStoreReceived = datasetToMDStoreReceived;
        this.mainIdFieldName = mainIdFieldName;
        this.mdStoreId = mdStoreId;
    }

    /**
     * Creates a handler using {@link #ELEM_OBJ_IDENTIFIER} as the main identifier field name.
     *
     * @param datasetReceiver dataset object receiver
     * @param datasetToMDStoreReceived dataset to mdstore relation receiver
     * @param mdStoreId mdStore identifier
     */
    public DataciteDumpXmlHandler(RecordReceiver<DataSetReference> datasetReceiver,
            RecordReceiver<DatasetToMDStore> datasetToMDStoreReceived, String mdStoreId) {
        this(datasetReceiver, datasetToMDStoreReceived, ELEM_OBJ_IDENTIFIER, mdStoreId);
    }

    /**
     * Resets the whole parsing state so the handler starts every document from scratch.
     */
    @Override
    public void startDocument() throws SAXException {
        parents = new Stack<String>();
        datasetMeta = new DatasetMetadata();
        stopCapture();
    }

    /**
     * Starts capturing text for the supported elements and reads the attributes
     * accompanying them. Every element, supported or not, is pushed on the parents stack.
     */
    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes attributes) throws SAXException {
        if (isWithinElement(localName, mainIdFieldName, ELEM_HEADER)) {
            startCapture();
        } else if (isWithinElement(localName, ELEM_IDENTIFIER, ELEM_RESOURCE)) {
            // identifierType should always be present, but a record lacking it must not
            // break the whole import: a null type is reported and skipped in endElement
            String idType = attributes.getValue(ATTRIBUTE_ID_TYPE);
            // Locale.ROOT keeps the result stable regardless of the JVM default locale
            datasetMeta.setIdType(idType != null ? idType.toLowerCase(Locale.ROOT) : null);
            startCapture();
        } else if (isWithinElement(localName, ELEM_CREATOR_NAME, ELEM_CREATOR)) {
            startCapture();
        } else if (isWithinElement(localName, ELEM_TITLE, ELEM_TITLES)) {
            startCapture();
        } else if (isWithinElement(localName, ELEM_FORMAT, ELEM_FORMATS)) {
            startCapture();
        } else if (isWithinElement(localName, ELEM_DESCRIPTION, ELEM_RESOURCE)) {
            startCapture();
        } else if (isWithinElement(localName, ELEM_PUBLISHER, ELEM_RESOURCE)) {
            startCapture();
        } else if (isWithinElement(localName, ELEM_PUBLICATION_YEAR, ELEM_RESOURCE)) {
            startCapture();
        } else if (isWithinElement(localName, ELEM_RESOURCE_TYPE, ELEM_RESOURCE)) {
            datasetMeta.setResourceTypeClass(attributes.getValue(ATTRIBUTE_RESOURCE_TYPE_GENERAL));
            startCapture();
        } else if (isWithinElement(localName, ELEM_ALTERNATE_IDENTIFIER, ELEM_ALTERNATE_IDENTIFIERS)) {
            datasetMeta.setCurrentAlternateIdentifierType(attributes.getValue(ATTRIBUTE_ALTERNATE_IDENTIFIER_TYPE));
            startCapture();
        }
        this.parents.push(localName);
    }

    /**
     * Stores the captured text in the matching metadata field and emits the records
     * when a resource element is closed.
     * <p>
     * The text buffer is released only when the element that opened it is closed, so
     * child elements nested in a captured element (mixed content) neither discard the
     * text collected so far nor cause a NullPointerException.
     */
    @Override
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        this.parents.pop();

        // true only for the end tag of the element that opened the current buffer
        final boolean closesCapture = this.currentValue != null
                && this.parents.size() == this.captureDepth;
        final String text = closesCapture ? this.currentValue.toString().trim() : null;
        // emptiness is checked on the raw, untrimmed buffer
        final boolean hasText = closesCapture && this.currentValue.length() > 0;

        if (isWithinElement(localName, mainIdFieldName, ELEM_HEADER)) {
            if (text != null) {
                datasetMeta.setHeaderId(text);
            }
        } else if (isWithinElement(localName, ELEM_IDENTIFIER, ELEM_RESOURCE)) {
            if (text != null) {
                datasetMeta.setIdValue(text);
            }
        } else if (isWithinElement(localName, ELEM_CREATOR_NAME, ELEM_CREATOR)
                && hasText) {
            if (datasetMeta.getCreatorNames()==null) {
                datasetMeta.setCreatorNames(new ArrayList<CharSequence>());
            }
            datasetMeta.getCreatorNames().add(text);
        } else if (isWithinElement(localName, ELEM_TITLE, ELEM_TITLES)
                && hasText) {
            if (datasetMeta.getTitles()==null) {
                datasetMeta.setTitles(new ArrayList<CharSequence>());
            }
            datasetMeta.getTitles().add(text);
        } else if (isWithinElement(localName, ELEM_FORMAT, ELEM_FORMATS)
                && hasText) {
            if (datasetMeta.getFormats()==null) {
                datasetMeta.setFormats(new ArrayList<CharSequence>());
            }
            datasetMeta.getFormats().add(text);
        } else if (isWithinElement(localName, ELEM_ALTERNATE_IDENTIFIER, ELEM_ALTERNATE_IDENTIFIERS)
                && hasText) {
            // alternate identifiers without a type cannot be keyed, so they are dropped
            if (datasetMeta.getCurrentAlternateIdentifierType()!=null) {
                if (datasetMeta.getAlternateIdentifiers()==null) {
                    datasetMeta.setAlternateIdentifiers(Maps.newHashMap());
                }
                datasetMeta.getAlternateIdentifiers().put(datasetMeta.getCurrentAlternateIdentifierType(), text);
            }
        } else if (isWithinElement(localName, ELEM_DESCRIPTION, ELEM_RESOURCE)
                && hasText) {
            datasetMeta.setDescription(text);
        } else if (isWithinElement(localName, ELEM_PUBLISHER, ELEM_RESOURCE)
                && hasText) {
            datasetMeta.setPublisher(text);
        } else if (isWithinElement(localName, ELEM_PUBLICATION_YEAR, ELEM_RESOURCE)
                && hasText) {
            datasetMeta.setPublicationYear(text);
        } else if (isWithinElement(localName, ELEM_RESOURCE_TYPE, ELEM_RESOURCE)
                && hasText) {
            datasetMeta.setResourceTypeValue(text);
        } else if (isWithinElement(localName, ELEM_RESOURCE, ELEM_PAYLOAD) ||
                // temporary hack: the case below is for the records originated from MDStore
                // where no 'payload' element is present, required until fixing MDStore contents
                isWithinElement(localName, ELEM_RESOURCE, ELEM_METADATA)) {
            if (datasetMeta.getIdType()!=null && datasetMeta.getIdValue()!=null) {
                storeRecords();
            } else {
                log.warn("either reference type " + datasetMeta.getIdType() + " or id value: " + datasetMeta.getIdValue() +
                        " was null for record id: " + datasetMeta.getHeaderId());
            }
            this.datasetMeta = new DatasetMetadata();
        }

        // release the buffer only when its own element ends; end tags of nested
        // children must leave the text collected so far untouched
        if (closesCapture) {
            stopCapture();
        }
    }

    /**
     * Releases the parents stack once the document is fully processed.
     */
    @Override
    public void endDocument() throws SAXException {
        parents.clear();
        parents = null;
    }

    /**
     * Appends the character chunk to the buffer when an element is being captured,
     * ignores it otherwise.
     */
    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        if (this.currentValue!=null) {
            this.currentValue.append(ch, start, length);
        }
    }

    // ------------------------ PRIVATE --------------------------

    /**
     * Opens a fresh text buffer for the element being started. Has to be called before
     * the element is pushed on the parents stack.
     */
    private void startCapture() {
        this.currentValue = new StringBuilder();
        this.captureDepth = this.parents.size();
    }

    /**
     * Drops the text buffer so subsequent character data is ignored.
     */
    private void stopCapture() {
        this.currentValue = null;
        this.captureDepth = NO_CAPTURE;
    }

    /**
     * Builds the dataset reference and the dataset to mdstore relation out of the
     * gathered metadata and passes both to the receivers.
     *
     * @throws SAXException when the header identifier is missing or a receiver fails with IOException
     */
    private void storeRecords() throws SAXException {
        try {
            if (datasetMeta.getHeaderId()==null) {
                throw new SAXException("header identifier was not found!");
            }
            String idValueStr = datasetMeta.getIdValue().trim();
            // only identifiers read from the objIdentifier field get the result row prefix
            String datasetId = ELEM_OBJ_IDENTIFIER.equals(mainIdFieldName)?
                    InfoSpaceConstants.ROW_PREFIX_RESULT + datasetMeta.getHeaderId() : datasetMeta.getHeaderId();

            DatasetToMDStore.Builder documentToMDStoreBuilder = DatasetToMDStore.newBuilder();
            documentToMDStoreBuilder.setMdStoreId(this.mdStoreId);
            documentToMDStoreBuilder.setDatasetId(datasetId);

            DataSetReference.Builder dataSetRefBuilder = DataSetReference.newBuilder();
            dataSetRefBuilder.setId(datasetId);
            dataSetRefBuilder.setReferenceType(datasetMeta.getIdType());
            dataSetRefBuilder.setIdForGivenType(idValueStr);
            // optional fields are set only when they were found in the record
            if (datasetMeta.getCreatorNames()!=null) {
                dataSetRefBuilder.setCreatorNames(datasetMeta.getCreatorNames());
            }
            if (datasetMeta.getTitles()!=null) {
                dataSetRefBuilder.setTitles(datasetMeta.getTitles());
            }
            if (datasetMeta.getFormats()!=null) {
                dataSetRefBuilder.setFormats(datasetMeta.getFormats());
            }
            if (datasetMeta.getDescription()!=null) {
                dataSetRefBuilder.setDescription(datasetMeta.getDescription());
            }
            if (datasetMeta.getPublisher()!=null) {
                dataSetRefBuilder.setPublisher(datasetMeta.getPublisher());
            }
            if (datasetMeta.getPublicationYear()!=null) {
                dataSetRefBuilder.setPublicationYear(datasetMeta.getPublicationYear());
            }
            if (datasetMeta.getResourceTypeClass()!=null) {
                dataSetRefBuilder.setResourceTypeClass(datasetMeta.getResourceTypeClass());
            }
            if (datasetMeta.getResourceTypeValue()!=null) {
                dataSetRefBuilder.setResourceTypeValue(datasetMeta.getResourceTypeValue());
            }
            if (datasetMeta.getAlternateIdentifiers()!=null) {
                dataSetRefBuilder.setAlternateIdentifiers(datasetMeta.getAlternateIdentifiers());
            }
            datasetReceiver.receive(dataSetRefBuilder.build());
            datasetToMDStoreReceived.receive(documentToMDStoreBuilder.build());
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Checks whether the given element is the expected one and whether its direct
     * parent, the top of the parents stack, is the expected parent.
     *
     * @param localName local name of the element being processed
     * @param expectedElement element name to match
     * @param expectedParent name of the direct parent to match
     * @return true when both the element and its direct parent match
     */
    boolean isWithinElement(String localName, String expectedElement, String expectedParent) {
        return localName.equals(expectedElement) && !this.parents.isEmpty() &&
                expectedParent.equals(this.parents.peek());
    }
}