package edu.harvard.iq.dataverse.api.imports;
import edu.harvard.iq.dataverse.DatasetFieldConstant;
import edu.harvard.iq.dataverse.DatasetFieldServiceBean;
import edu.harvard.iq.dataverse.DatasetFieldType;
import edu.harvard.iq.dataverse.DatasetVersion;
import edu.harvard.iq.dataverse.DatasetVersion.VersionState;
import edu.harvard.iq.dataverse.api.dto.*;
import edu.harvard.iq.dataverse.api.dto.FieldDTO;
import edu.harvard.iq.dataverse.api.dto.MetadataBlockDTO;
import edu.harvard.iq.dataverse.api.imports.ImportUtil.ImportType;
import static edu.harvard.iq.dataverse.export.ddi.DdiExportUtil.NOTE_TYPE_CONTENTTYPE;
import edu.harvard.iq.dataverse.util.StringUtil;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StringReader;
import java.util.*;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.ejb.EJB;
import javax.ejb.EJBException;
import javax.ejb.Stateless;
import javax.persistence.NoResultException;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.XMLInputFactory;
import org.apache.commons.lang.StringUtils;
/**
*
* @author ellenk
*/
// TODO:
// does this need to be a service bean/stateless? - could be transformed into
// a util with static methods.
// (it would need to be passed the fields service beans as arguments)
// -- L.A. 4.5
@Stateless
public class ImportDDIServiceBean {
// ---- String constants matched against DDI attribute values ----
// Compared against the "source" attribute of <relMat> in processOthrStdyMat.
public static final String SOURCE_DVN_3_0 = "DVN_3_0";
// NOTE(review): the naming-protocol constants are not referenced in the visible part of this file.
public static final String NAMING_PROTOCOL_HANDLE = "hdl";
public static final String NAMING_PROTOCOL_DOI = "doi";
// Compared against the "agency" attribute of <IDNo> in processDocDscr.
public static final String AGENCY_HANDLE = "handle";
public static final String AGENCY_DOI = "DOI";
public static final String AGENCY_DARA = "dara"; // da|ra - http://www.da-ra.de/en/home/
// Compared against the "type" attribute of <relMat> in processOthrStdyMat.
public static final String REPLICATION_FOR_TYPE = "replicationFor";
// NOTE(review): presumably variable-level DDI values; not referenced in the visible part of this file.
public static final String VAR_WEIGHTED = "wgtd";
public static final String VAR_INTERVAL_CONTIN = "contin";
public static final String VAR_INTERVAL_DISCRETE = "discrete";
public static final String CAT_STAT_TYPE_FREQUENCY = "freq";
public static final String VAR_FORMAT_TYPE_NUMERIC = "numeric";
public static final String VAR_FORMAT_SCHEMA_ISO = "ISO";
// Values of the "event" attribute on <timePrd> and <collDate> (see processSumDscr).
public static final String EVENT_START = "start";
public static final String EVENT_END = "end";
public static final String EVENT_SINGLE = "single";
// NOTE(review): the level constants are not referenced in the visible part of this file.
public static final String LEVEL_DVN = "dvn";
public static final String LEVEL_DV = "dv";
public static final String LEVEL_STUDY = "study";
public static final String LEVEL_FILE = "file";
public static final String LEVEL_VARIABLE = "variable";
public static final String LEVEL_CATEGORY = "category";
// ---- Values of the "type" / "subject" attributes of DDI <notes> elements ----
// NOTE_TYPE_UNF is looked up in processCitation; NOTE_TYPE_EXTENDED_METADATA in processMethod.
public static final String NOTE_TYPE_UNF = "VDC:UNF";
public static final String NOTE_SUBJECT_UNF = "Universal Numeric Fingerprint";
public static final String NOTE_TYPE_TERMS_OF_USE = "DVN:TOU";
public static final String NOTE_SUBJECT_TERMS_OF_USE = "Terms Of Use";
public static final String NOTE_TYPE_CITATION = "DVN:CITATION";
public static final String NOTE_SUBJECT_CITATION = "Citation";
public static final String NOTE_TYPE_VERSION_NOTE = "DVN:VERSION_NOTE";
public static final String NOTE_SUBJECT_VERSION_NOTE= "Version Note";
public static final String NOTE_TYPE_ARCHIVE_NOTE = "DVN:ARCHIVE_NOTE";
public static final String NOTE_SUBJECT_ARCHIVE_NOTE= "Archive Note";
public static final String NOTE_TYPE_ARCHIVE_DATE = "DVN:ARCHIVE_DATE";
public static final String NOTE_SUBJECT_ARCHIVE_DATE= "Archive Date";
public static final String NOTE_TYPE_EXTENDED_METADATA = "DVN:EXTENDED_METADATA";
public static final String NOTE_TYPE_LOCKSS_CRAWL = "LOCKSS:CRAWLING";
public static final String NOTE_SUBJECT_LOCKSS_PERM = "LOCKSS Permission";
public static final String NOTE_TYPE_REPLICATION_FOR = "DVN:REPLICATION_FOR";
// NOTE(review): not referenced in the visible part of this file.
private static final String HARVESTED_FILE_STORAGE_PREFIX = "http://";
// StAX factory; assigned in doImport() and read by mapDDI(ImportType, File, DatasetDTO).
// It stays null until doImport() has run on this bean instance.
private XMLInputFactory xmlInputFactory = null;
// Injected service beans; both are used by processCustomField to resolve custom field mappings.
@EJB CustomFieldServiceBean customFieldService;
@EJB DatasetFieldServiceBean datasetFieldService;
// TODO:
// stop passing the xml source as a string; (it could be huge!) -- L.A. 4.5
/**
 * Parses a DDI codebook supplied as a string into a freshly initialized dataset DTO.
 *
 * As a side effect the bean-level {@code xmlInputFactory} is replaced with a new,
 * coalescing factory on every call.
 *
 * @param importType the kind of import being performed
 * @param xmlToParse the complete DDI XML document
 * @return the DTO populated from the docDscr and stdyDscr sections
 * @throws XMLStreamException if the XML cannot be parsed
 * @throws ImportException propagated from the DDI mapping code
 */
public DatasetDTO doImport(ImportType importType, String xmlToParse) throws XMLStreamException, ImportException {
    xmlInputFactory = javax.xml.stream.XMLInputFactory.newInstance();
    xmlInputFactory.setProperty("javax.xml.stream.isCoalescing", java.lang.Boolean.TRUE);

    final DatasetDTO populated = this.initializeDataset();

    // Read docDscr and studyDesc into DTO objects. The file map returned by
    // mapDDI is not used here (TODO: it is likely not needed at all).
    mapDDI(importType, xmlToParse, populated);

    // No additional file-metadata step is performed for any import type;
    // for migration, file metadata is copied in a separate SQL step.
    return populated;
}
// Currently a no-op: the body is empty, so neither argument is read and nothing is imported.
public void importFileMetadata(DatasetVersion dv, String xmlToParse) {
}
/** Returns true for either of the two harvesting import types (with or without files). */
private boolean isHarvestImport(ImportType importType) {
    if (importType.equals(ImportType.HARVEST)) {
        return true;
    }
    return importType.equals(ImportType.HARVEST_WITH_FILES);
}
/** Returns true only for the harvesting import type that also brings in files. */
private boolean isHarvestWithFilesImport(ImportType importType) {
    final boolean harvestingFiles = importType.equals(ImportType.HARVEST_WITH_FILES);
    return harvestingFiles;
}
/** Returns true when the import type is {@code ImportType.NEW}. */
private boolean isNewImport(ImportType importType) {
    final boolean brandNew = importType.equals(ImportType.NEW);
    return brandNew;
}
/** Returns true when the import type is {@code ImportType.MIGRATION}. */
private boolean isMigrationImport(ImportType importType) {
    final boolean migrating = importType.equals(ImportType.MIGRATION);
    return migrating;
}
/**
 * Parses a DDI codebook held in memory and populates {@code datasetDTO} from it.
 *
 * @param importType the kind of import being performed
 * @param xmlToParse the complete DDI XML document
 * @param datasetDTO the DTO that receives the parsed metadata
 * @return the (raw) file map that was handed to the DDI processing code
 * @throws XMLStreamException if the XML cannot be parsed
 * @throws ImportException propagated from the DDI processing code
 */
public Map mapDDI(ImportType importType, String xmlToParse, DatasetDTO datasetDTO) throws XMLStreamException, ImportException {
    Map filesMap = new HashMap();
    XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance();
    // The DDI may originate from a remote (harvested) source, so external
    // entities must never be resolved (XXE protection).
    xmlFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
    XMLStreamReader xmlr = xmlFactory.createXMLStreamReader(new StringReader(xmlToParse));
    try {
        processDDI(importType, xmlr, datasetDTO, filesMap);
    } finally {
        // Free the parser's resources whether or not parsing succeeded; a failure
        // to close must not mask the outcome of the parse itself.
        try {
            xmlr.close();
        } catch (XMLStreamException ex) {
            Logger.getLogger("global").log(Level.FINE, "Failed to close the DDI stream reader", ex);
        }
    }
    return filesMap;
}
/**
 * Parses a DDI codebook stored in a file and populates {@code datasetDTO} from it.
 *
 * @param importType the kind of import being performed
 * @param ddiFile the file containing the DDI XML document
 * @param datasetDTO the DTO that receives the parsed metadata
 * @return the (raw) file map that was handed to the DDI processing code
 * @throws ImportException propagated from the DDI processing code
 * @throws EJBException if the file does not exist or the XML cannot be parsed
 */
public Map mapDDI(ImportType importType, File ddiFile, DatasetDTO datasetDTO ) throws ImportException {
    Map filesMap = new HashMap();

    // The shared factory is only created by doImport(); fall back to an equivalently
    // configured one so this method does not fail with a NullPointerException when it
    // is the first entry point used on this bean instance.
    XMLInputFactory factory = xmlInputFactory;
    if (factory == null) {
        factory = XMLInputFactory.newInstance();
        factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
    }
    // The DDI may originate from an untrusted source: never resolve external entities.
    factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);

    try (FileInputStream in = new FileInputStream(ddiFile)) {
        XMLStreamReader xmlr = factory.createXMLStreamReader(in);
        try {
            processDDI(importType, xmlr, datasetDTO, filesMap);
        } finally {
            try {
                xmlr.close();
            } catch (XMLStreamException ex) {
                // best effort: a close failure must not mask the outcome of the parse
                Logger.getLogger("global").log(Level.FINE, "Failed to close the DDI stream reader", ex);
            }
        }
    } catch (FileNotFoundException ex) {
        Logger.getLogger("global").log(Level.SEVERE, null, ex);
        throw new EJBException("ERROR occurred in mapDDI: File Not Found!", ex);
    } catch (XMLStreamException ex) {
        Logger.getLogger("global").log(Level.SEVERE, null, ex);
        throw new EJBException("ERROR occurred in mapDDI.", ex);
    } catch (IOException ex) {
        // Only reachable when closing the input stream fails after parsing has
        // finished; keep the parsed result and just record the problem.
        Logger.getLogger("global").log(Level.WARNING, "Failed to close the DDI input file", ex);
    }
    return filesMap;
}
/**
 * Entry point for parsing one DDI document: checks that the root element is
 * {@code codeBook}, delegates the body to processCodeBook, then applies two
 * post-processing steps (fallback identifier, version state for harvests).
 */
private void processDDI(ImportType importType, XMLStreamReader xmlr, DatasetDTO datasetDTO, Map filesMap) throws XMLStreamException, ImportException {
    // make sure we have a codeBook
    xmlr.nextTag();
    xmlr.require(XMLStreamConstants.START_ELEMENT, null, "codeBook");

    // Some DDIs provide an ID on the <codeBook> element itself (we've seen
    // harvested DDIs where this was the only ID present). It has to be read now,
    // while the reader is still positioned on the start tag, but it is only
    // applied after the whole document has been processed: identifiers found in
    // the study description section take precedence, and the codeBook-level ID is
    // used only when that section supplied none.
    final String rootLevelId = xmlr.getAttributeValue(null, "ID");

    processCodeBook(importType, xmlr, datasetDTO, filesMap);

    MetadataBlockDTO citation = datasetDTO.getDatasetVersion().getMetadataBlocks().get("citation");
    final boolean hasRootLevelId = rootLevelId != null && !rootLevelId.equals("");
    if (hasRootLevelId && citation.getField("otherId") == null) {
        // no ids were found while parsing the study description section,
        // so fall back to the one from the codeBook entry
        FieldDTO idValue = FieldDTO.createPrimitiveFieldDTO("otherIdValue", rootLevelId);
        citation.getFields().add(FieldDTO.createCompoundFieldDTO("otherId", idValue));
    }

    if (isHarvestImport(importType)) {
        datasetDTO.getDatasetVersion().setVersionState(VersionState.RELEASED);
    }
}
/**
 * Creates an empty dataset DTO whose single version already contains the
 * "citation", "socialscience" and "geospatial" metadata blocks, each with an
 * empty field list.
 *
 * @return the newly created DTO
 */
public DatasetDTO initializeDataset() {
    HashMap<String, MetadataBlockDTO> blocksByName = new HashMap<>();
    for (String blockName : new String[] {"citation", "socialscience", "geospatial"}) {
        MetadataBlockDTO emptyBlock = new MetadataBlockDTO();
        emptyBlock.setFields(new ArrayList<FieldDTO>());
        blocksByName.put(blockName, emptyBlock);
    }

    DatasetVersionDTO versionDTO = new DatasetVersionDTO();
    versionDTO.setMetadataBlocks(blocksByName);

    DatasetDTO datasetDTO = new DatasetDTO();
    datasetDTO.setDatasetVersion(versionDTO);
    return datasetDTO;
}
/**
 * Reads the XML stream up to the closing {@code codeBook} tag (or the end of the
 * document) and populates datasetDTO and filesMap from the sections it finds.
 * Which sections are processed depends on the import type.
 */
private void processCodeBook(ImportType importType, XMLStreamReader xmlr, DatasetDTO datasetDTO, Map filesMap) throws XMLStreamException, ImportException {
    int event = xmlr.next();
    while (event != XMLStreamConstants.END_DOCUMENT) {
        if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("codeBook")) {
                return;
            }
        } else if (event == XMLStreamConstants.START_ELEMENT) {
            final String section = xmlr.getLocalName();
            if (section.equals("docDscr")) {
                processDocDscr(xmlr, datasetDTO);
            } else if (section.equals("stdyDscr")) {
                processStdyDscr(importType, xmlr, datasetDTO);
            } else if (section.equals("otherMat")) {
                if (isNewImport(importType) || isHarvestWithFilesImport(importType)) {
                    processOtherMat(xmlr, datasetDTO, filesMap);
                }
            } else if (section.equals("fileDscr")) {
                if (isHarvestWithFilesImport(importType)) {
                    // If this is a harvesting import, we'll attempt to extract some minimal
                    // file-level metadata information from the fileDscr sections as well.
                    // TODO: add more info here... -- 4.6
                    processFileDscrMinimal(xmlr, datasetDTO, filesMap);
                }
                // For ImportType.NEW nothing is done here yet. A "full" fileDscr section
                // is what Dataverses use to encode *tabular* files only (variables,
                // observations, etc., complemented by <var> entries in the dataDscr
                // section). It is not used for harvesting exports, since tabular metadata
                // is not harvested, and all the "regular" file-level metadata is encoded
                // in otherMat sections. The goal is to one day be able to import such
                // tabular metadata using the direct (non-harvesting) import API.
                // EMK TODO: add processFileDscr(xmlr, datasetDTO, filesMap) back in for ImportType.NEW
            }
        }
        event = xmlr.next();
    }
}
/**
 * Scans the {@code docDscr} section for an {@code IDNo} element issued by the
 * handle agency and, if the dataset has no identifier yet, passes its text to
 * parseStudyIdHandle. Returns at the closing {@code docDscr} tag.
 */
private void processDocDscr(XMLStreamReader xmlr, DatasetDTO datasetDTO) throws XMLStreamException {
    int event = xmlr.next();
    while (event != XMLStreamConstants.END_DOCUMENT) {
        if (event == XMLStreamConstants.START_ELEMENT) {
            final boolean isIdNo = xmlr.getLocalName().equals("IDNo");
            if (isIdNo && StringUtil.isEmpty(datasetDTO.getIdentifier())) {
                // this will set a StudyId if it has not yet been set; it will get
                // overridden by a metadata id in the StudyDscr section, if one exists
                final String agency = xmlr.getAttributeValue(null, "agency");
                if (AGENCY_HANDLE.equals(agency)) {
                    parseStudyIdHandle(parseText(xmlr), datasetDTO);
                }
            }
            // EMK TODO: the "holdings" element (harvest holdings URI) needs to be saved
            // somewhere once harvesting infrastructure is added; it is ignored for now.
        } else if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("docDscr")) {
                return;
            }
        }
        event = xmlr.next();
    }
}
/**
 * Convenience overload: reads the text of the current element with scrubbing
 * enabled, i.e. delegates to parseText(xmlr, true).
 */
private String parseText(XMLStreamReader xmlr) throws XMLStreamException {
return parseText(xmlr,true);
}
/**
 * Reads the text content of the current element.
 *
 * @param xmlr reader positioned on the element's start tag
 * @param scrubText when true, the text is trimmed and every newline is replaced by a space
 * @return the element text, scrubbed if requested
 */
private String parseText(XMLStreamReader xmlr, boolean scrubText) throws XMLStreamException {
    final String rawText = getElementText(xmlr);
    return scrubText ? rawText.trim().replace('\n', ' ') : rawText;
}
/**
 * Returns the value of the current element's "date" attribute, or, when that
 * attribute is absent, the element's (scrubbed) text content.
 * The {@code endTag} argument is not used.
 */
private String parseDate (XMLStreamReader xmlr, String endTag) throws XMLStreamException {
    final String dateAttribute = xmlr.getAttributeValue(null, "date");
    return (dateAttribute != null) ? dateAttribute : parseText(xmlr);
}
/**
 * Replacement for the reference implementation of getElementText. That one has a
 * bug where it would prepend "null" to the text if there was an escaped
 * apostrophe; it appears that the code finds a null ENTITY_REFERENCE in this
 * case. The workaround, for the moment, is to not append ENTITY_REFERENCE
 * events and skip them instead.
 *
 * @param xmlr reader positioned on a START_ELEMENT
 * @return the concatenated character data up to the matching END_ELEMENT
 * @throws XMLStreamException if the reader is not on a start tag, if a nested
 *         element or the end of the document is encountered, or on any other
 *         unexpected event
 */
private String getElementText(XMLStreamReader xmlr) throws XMLStreamException {
    if (xmlr.getEventType() != XMLStreamConstants.START_ELEMENT) {
        throw new XMLStreamException("parser must be on START_ELEMENT to read next text", xmlr.getLocation());
    }
    StringBuilder text = new StringBuilder();
    for (int type = xmlr.next(); type != XMLStreamConstants.END_ELEMENT; type = xmlr.next()) {
        switch (type) {
            case XMLStreamConstants.CHARACTERS:
            case XMLStreamConstants.CDATA:
            case XMLStreamConstants.SPACE:
                text.append(xmlr.getText());
                break;
            case XMLStreamConstants.PROCESSING_INSTRUCTION:
            case XMLStreamConstants.COMMENT:
            case XMLStreamConstants.ENTITY_REFERENCE:
                // skipping
                break;
            case XMLStreamConstants.END_DOCUMENT:
                throw new XMLStreamException("unexpected end of document when reading element text content");
            case XMLStreamConstants.START_ELEMENT:
                throw new XMLStreamException("element text content may not contain START_ELEMENT", xmlr.getLocation());
            default:
                throw new XMLStreamException("Unexpected event type "+type, xmlr.getLocation());
        }
    }
    return text.toString();
}
/**
 * Dispatches each child section of {@code stdyDscr} to its dedicated handler
 * and returns at the closing {@code stdyDscr} tag. Unknown children are ignored.
 */
private void processStdyDscr(ImportType importType, XMLStreamReader xmlr, DatasetDTO datasetDTO) throws XMLStreamException, ImportException {
    int event = xmlr.next();
    while (event != XMLStreamConstants.END_DOCUMENT) {
        if (event == XMLStreamConstants.START_ELEMENT) {
            switch (xmlr.getLocalName()) {
                case "citation":
                    processCitation(importType, xmlr, datasetDTO);
                    break;
                case "stdyInfo":
                    processStdyInfo(xmlr, datasetDTO.getDatasetVersion());
                    break;
                case "method":
                    processMethod(xmlr, datasetDTO.getDatasetVersion());
                    break;
                case "dataAccs":
                    processDataAccs(xmlr, datasetDTO.getDatasetVersion());
                    break;
                case "othrStdyMat":
                    processOthrStdyMat(xmlr, datasetDTO.getDatasetVersion());
                    break;
                case "notes":
                    processNotes(xmlr, datasetDTO.getDatasetVersion());
                    break;
                default:
                    break;
            }
        } else if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("stdyDscr")) {
                return;
            }
        }
        event = xmlr.next();
    }
}
/**
 * Reads the {@code othrStdyMat} section and adds related material, related
 * datasets, related publications and other references to the citation block.
 * Returns at the closing {@code othrStdyMat} tag.
 *
 * @param xmlr reader positioned on the {@code othrStdyMat} start tag
 * @param dvDTO dataset version whose citation block receives the fields
 * @throws XMLStreamException if the XML cannot be read
 */
private void processOthrStdyMat(XMLStreamReader xmlr, DatasetVersionDTO dvDTO) throws XMLStreamException {
    List<HashSet<FieldDTO>> publications = new ArrayList<>();
    // Never set to true by the current code, so the "replicationFor" check below
    // applies to every relMat element.
    boolean replicationForFound = false;
    for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
        if (event == XMLStreamConstants.START_ELEMENT) {
            if (xmlr.getLocalName().equals("relMat")) {
                // this code is still here to handle imports from old DVN created ddis
                if (!replicationForFound && REPLICATION_FOR_TYPE.equals(xmlr.getAttributeValue(null, "type"))) {
                    if (!SOURCE_DVN_3_0.equals(xmlr.getAttributeValue(null, "source"))) {
                        // this is a ddi from pre 3.0, so we should add a publication
                        HashSet<FieldDTO> set = new HashSet<>();
                        addToSet(set, DatasetFieldConstant.publicationCitation, parseText(xmlr, "relMat"));
                        if (!set.isEmpty()) {
                            publications.add(set);
                        }
                        if (publications.size() > 0) {
                            getCitation(dvDTO).addField(FieldDTO.createMultipleCompoundFieldDTO(DatasetFieldConstant.publication, publications));
                        }
                    }
                } else {
                    List<String> relMaterial = new ArrayList<String>();
                    relMaterial.add(parseText(xmlr, "relMat"));
                    getCitation(dvDTO).addField(FieldDTO.createMultiplePrimitiveFieldDTO(DatasetFieldConstant.relatedMaterial, relMaterial));
                }
            } else if (xmlr.getLocalName().equals("relStdy")) {
                List<String> relStudy = new ArrayList<String>();
                relStudy.add(parseText(xmlr, "relStdy"));
                getCitation(dvDTO).addField(FieldDTO.createMultiplePrimitiveFieldDTO(DatasetFieldConstant.relatedDatasets, relStudy));
            } else if (xmlr.getLocalName().equals("relPubl")) {
                HashSet<FieldDTO> set = new HashSet<>();
                // call new parse text logic
                Object rpFromDDI = parseTextNew(xmlr, "relPubl");
                if (rpFromDDI instanceof Map) {
                    Map rpMap = (Map) rpFromDDI;
                    addToSet(set, DatasetFieldConstant.publicationCitation, (String) rpMap.get("text"));
                    addToSet(set, DatasetFieldConstant.publicationIDNumber, (String) rpMap.get("idNumber"));
                    addToSet(set, DatasetFieldConstant.publicationURL, (String) rpMap.get("url"));
                    if (rpMap.get("idType") != null) {
                        // Locale.ROOT keeps the vocabulary value stable regardless of the
                        // JVM default locale (e.g. "DOI" must not become "doı" under tr_TR).
                        set.add(FieldDTO.createVocabFieldDTO(DatasetFieldConstant.publicationIDType, ((String) rpMap.get("idType")).toLowerCase(Locale.ROOT)));
                    }
                    // TODO: ask about where/whether we want to save the
                    // "replicationData" flag that may be present in rpMap
                } else {
                    addToSet(set, DatasetFieldConstant.publicationCitation, (String) rpFromDDI);
                }
                publications.add(set);
                // NOTE(review): the publication field is added once per relPubl element,
                // each time with the cumulative list - confirm that addField merges
                // rather than duplicates.
                if (publications.size() > 0) {
                    getCitation(dvDTO).addField(FieldDTO.createMultipleCompoundFieldDTO(DatasetFieldConstant.publication, publications));
                }
            } else if (xmlr.getLocalName().equals("otherRefs")) {
                List<String> otherRefs = new ArrayList<String>();
                otherRefs.add(parseText(xmlr, "otherRefs"));
                getCitation(dvDTO).addField(FieldDTO.createMultiplePrimitiveFieldDTO(DatasetFieldConstant.otherReferences, otherRefs));
            }
        } else if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("othrStdyMat")) {
                return;
            }
        }
    }
}
/**
 * Dispatches the children of the study-level {@code citation} element to their
 * handlers and returns at the closing {@code citation} tag. Only the first
 * {@code distStmt} is processed; a {@code notes} element is either read as the
 * UNF note or handled as a regular note.
 */
private void processCitation(ImportType importType, XMLStreamReader xmlr, DatasetDTO datasetDTO) throws XMLStreamException, ImportException {
    DatasetVersionDTO dvDTO = datasetDTO.getDatasetVersion();
    MetadataBlockDTO citation=datasetDTO.getDatasetVersion().getMetadataBlocks().get("citation");
    boolean distStatementProcessed = false;

    int event = xmlr.next();
    while (event != XMLStreamConstants.END_DOCUMENT) {
        if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("citation")) {
                return;
            }
        } else if (event == XMLStreamConstants.START_ELEMENT) {
            switch (xmlr.getLocalName()) {
                case "titlStmt":
                    processTitlStmt(xmlr, datasetDTO);
                    break;
                case "rspStmt":
                    processRspStmt(xmlr, citation);
                    break;
                case "prodStmt":
                    processProdStmt(xmlr, citation);
                    break;
                case "distStmt":
                    // Defensive check against duplicate distStmt in some DDIs (notably,
                    // from ICPSR): once one Distribution Statement has been processed,
                    // any further ones in this citation are skipped.
                    if (!distStatementProcessed) {
                        processDistStmt(xmlr, citation);
                        distStatementProcessed = true;
                    }
                    break;
                case "serStmt":
                    processSerStmt(xmlr, citation);
                    break;
                case "verStmt":
                    processVerStmt(importType, xmlr, dvDTO);
                    break;
                case "notes": {
                    String unfNote = parseNoteByType(xmlr, NOTE_TYPE_UNF);
                    if (unfNote == null) {
                        processNotes(xmlr, dvDTO);
                    } else {
                        datasetDTO.getDatasetVersion().setUNF(parseUNF(unfNote));
                    }
                    break;
                }
                default:
                    break;
            }
        }
        event = xmlr.next();
    }
}
/**
 * Reads the {@code stdyInfo} section: subjects, abstracts, the summary
 * description and notes. Abstracts are collected and added to the citation
 * block as one "dsDescription" field when the closing tag is reached.
 *
 * @param xmlr reader positioned on the {@code stdyInfo} start tag
 * @param dvDTO dataset version that receives the parsed fields
 * @throws XMLStreamException if the XML cannot be read
 */
private void processStdyInfo(XMLStreamReader xmlr, DatasetVersionDTO dvDTO) throws XMLStreamException {
    List<HashSet<FieldDTO>> abstracts = new ArrayList<>();
    int event = xmlr.next();
    while (event != XMLStreamConstants.END_DOCUMENT) {
        if (event == XMLStreamConstants.START_ELEMENT) {
            final String child = xmlr.getLocalName();
            if (child.equals("subject")) {
                processSubject(xmlr, getCitation(dvDTO));
            } else if (child.equals("abstract")) {
                HashSet<FieldDTO> description = new HashSet<>();
                // the attribute has to be read before the element text is consumed
                addToSet(description,"dsDescriptionDate", xmlr.getAttributeValue(null, "date"));
                addToSet(description,"dsDescriptionValue", parseText(xmlr, "abstract"));
                if (!description.isEmpty()) {
                    abstracts.add(description);
                }
            } else if (child.equals("sumDscr")) {
                processSumDscr(xmlr, dvDTO);
            } else if (child.equals("notes")) {
                processNotes(xmlr, dvDTO);
            }
        } else if (event == XMLStreamConstants.END_ELEMENT && xmlr.getLocalName().equals("stdyInfo")) {
            if (!abstracts.isEmpty()) {
                getCitation(dvDTO).getFields().add(FieldDTO.createMultipleCompoundFieldDTO("dsDescription", abstracts));
            }
            return;
        }
        event = xmlr.next();
    }
}
/**
 * Reads the {@code subject} section, collecting {@code keyword} and
 * {@code topcClas} entries (value plus optional vocab / vocabURI attributes).
 * The collected entries are added to the citation block as "keyword" and
 * "topicClassification" fields when the closing tag is reached.
 */
private void processSubject(XMLStreamReader xmlr, MetadataBlockDTO citation) throws XMLStreamException {
    List<HashSet<FieldDTO>> keywordEntries = new ArrayList<>();
    List<HashSet<FieldDTO>> topicEntries = new ArrayList<>();
    int event = xmlr.next();
    while (event != XMLStreamConstants.END_DOCUMENT) {
        if (event == XMLStreamConstants.START_ELEMENT) {
            final String child = xmlr.getLocalName();
            if (child.equals("keyword")) {
                HashSet<FieldDTO> keyword = new HashSet<>();
                // attributes first: they are no longer reachable once the text is read
                addToSet(keyword,"keywordVocabulary", xmlr.getAttributeValue(null, "vocab"));
                addToSet(keyword, "keywordVocabularyURI", xmlr.getAttributeValue(null, "vocabURI") );
                addToSet(keyword,"keywordValue", parseText(xmlr));
                if (!keyword.isEmpty()) {
                    keywordEntries.add(keyword);
                }
            } else if (child.equals("topcClas")) {
                HashSet<FieldDTO> topic = new HashSet<>();
                addToSet(topic,"topicClassVocab", xmlr.getAttributeValue(null, "vocab"));
                addToSet(topic,"topicClassVocabURI", xmlr.getAttributeValue(null, "vocabURI") );
                addToSet(topic,"topicClassValue",parseText(xmlr));
                if (!topic.isEmpty()) {
                    topicEntries.add(topic);
                }
            }
        } else if (event == XMLStreamConstants.END_ELEMENT && xmlr.getLocalName().equals("subject")) {
            if (!keywordEntries.isEmpty()) {
                citation.getFields().add(FieldDTO.createMultipleCompoundFieldDTO("keyword", keywordEntries));
            }
            if (!topicEntries.isEmpty()) {
                citation.getFields().add(FieldDTO.createMultipleCompoundFieldDTO("topicClassification", topicEntries));
            }
            return;
        }
        // any other event (e.g. character data directly inside <subject>) is ignored
        event = xmlr.next();
    }
}
/**
 * Formats the {@code notes} element the reader is positioned on as a single string.
 *
 * Each available part is rendered with a label - "Subject: ", "Type: ", "Notes: " -
 * the parts are joined with a single space and terminated with ";". Absent
 * attributes and empty note text are left out, so that the string "null" never
 * shows up in the output.
 *
 * Examples of xml:
 * {@code <notes type="Statistics" subject="Babylon"> </notes>}
 * {@code <notes type="Note Type" subject="Note Subject">Note Text</notes>}
 * {@code <notes>Note Text 3</notes>}
 *
 * @param xmlr reader positioned on the notes start tag; must not be null
 * @return the formatted note, or null when there is nothing to report
 * @throws XMLStreamException if the element text cannot be read
 */
private String formatNotesfromXML(XMLStreamReader xmlr) throws XMLStreamException{
    if (xmlr == null) {
        throw new NullPointerException("XMLStreamReader xmlr cannot be null");
    }

    // Both attributes are read up front, before the element text is consumed.
    final String subject = xmlr.getAttributeValue(null, "subject");
    final String type = xmlr.getAttributeValue(null, "type");
    final String text = parseText(xmlr, "notes");

    List<String> parts = new ArrayList<String>();
    if (subject != null) {
        parts.add("Subject: " + subject);
    }
    if (type != null) {
        parts.add("Type: " + type);
    }
    if (text != null && !text.isEmpty()) {
        parts.add("Notes: " + text);
    }

    if (parts.isEmpty()) {
        return null;
    }
    return StringUtils.join(parts, " ") + ";";
}
/**
 * Formats the current {@code notes} element and, if that yields any text,
 * appends it to the dataset version's notes.
 */
private void processNotes (XMLStreamReader xmlr, DatasetVersionDTO dvDTO) throws XMLStreamException {
    final String note = this.formatNotesfromXML(xmlr);
    if (note == null) {
        return;
    }
    this.addNote(note, dvDTO);
}
/**
 * Appends {@code noteText} to the "notesText" field of the citation block,
 * creating that field (with an empty value) first if it does not exist yet.
 */
private void addNote(String noteText, DatasetVersionDTO dvDTO ) {
    MetadataBlockDTO citationBlock = getCitation(dvDTO);
    FieldDTO notes = citationBlock.getField("notesText");
    if (notes == null) {
        notes = FieldDTO.createPrimitiveFieldDTO("notesText", "");
        citationBlock.getFields().add(notes);
    }
    notes.setSinglePrimitive(notes.getSinglePrimitive() + noteText);
}
/**
 * Reads the {@code sumDscr} (summary description) section and, at its closing
 * tag, adds the collected values to the citation, geospatial and social science
 * blocks.
 *
 * For {@code timePrd} and {@code collDate} the "event" attribute decides whether
 * the date is a range start (attribute absent, "single" or "start") or a range
 * end ("end"). All three values are matched case-insensitively; any other value
 * is ignored.
 *
 * @param xmlr reader positioned on the {@code sumDscr} start tag
 * @param dvDTO dataset version that receives the parsed fields
 * @throws XMLStreamException if the XML cannot be read
 */
private void processSumDscr(XMLStreamReader xmlr, DatasetVersionDTO dvDTO) throws XMLStreamException {
    List<String> geoUnit = new ArrayList<>();
    List<String> unitOfAnalysis = new ArrayList<>();
    List<String> universe = new ArrayList<>();
    List<String> kindOfData = new ArrayList<>();
    List<HashSet<FieldDTO>> geoBoundBox = new ArrayList<>();
    List<HashSet<FieldDTO>> geoCoverages = new ArrayList<>();
    FieldDTO timePeriodStart = null;
    FieldDTO timePeriodEnd = null;
    FieldDTO dateOfCollectionStart = null;
    FieldDTO dateOfCollectionEnd = null;
    for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
        if (event == XMLStreamConstants.START_ELEMENT) {
            if (xmlr.getLocalName().equals("timePrd")) {
                String eventAttr = xmlr.getAttributeValue(null, "event");
                if (eventAttr == null || EVENT_SINGLE.equalsIgnoreCase(eventAttr) || EVENT_START.equalsIgnoreCase(eventAttr)) {
                    timePeriodStart = FieldDTO.createPrimitiveFieldDTO("timePeriodCoveredStart", parseDate(xmlr, "timePrd"));
                } else if (EVENT_END.equalsIgnoreCase(eventAttr)) {
                    // case-insensitive, consistent with the start/single check above
                    timePeriodEnd = FieldDTO.createPrimitiveFieldDTO("timePeriodCoveredEnd", parseDate(xmlr, "timePrd"));
                }
            } else if (xmlr.getLocalName().equals("collDate")) {
                String eventAttr = xmlr.getAttributeValue(null, "event");
                if (eventAttr == null || EVENT_SINGLE.equalsIgnoreCase(eventAttr) || EVENT_START.equalsIgnoreCase(eventAttr)) {
                    dateOfCollectionStart = FieldDTO.createPrimitiveFieldDTO("dateOfCollectionStart", parseDate(xmlr, "collDate"));
                } else if (EVENT_END.equalsIgnoreCase(eventAttr)) {
                    // case-insensitive, consistent with the start/single check above
                    dateOfCollectionEnd = FieldDTO.createPrimitiveFieldDTO("dateOfCollectionEnd", parseDate(xmlr, "collDate"));
                }
            } else if (xmlr.getLocalName().equals("nation")) {
                HashSet<FieldDTO> set = new HashSet<>();
                set.add(FieldDTO.createVocabFieldDTO("country", parseText(xmlr)));
                geoCoverages.add(set);
            } else if (xmlr.getLocalName().equals("geogCover")) {
                HashSet<FieldDTO> set = new HashSet<>();
                set.add(FieldDTO.createPrimitiveFieldDTO("otherGeographicCoverage", parseText(xmlr)));
                geoCoverages.add(set);
            } else if (xmlr.getLocalName().equals("geogUnit")) {
                geoUnit.add(parseText(xmlr));
            } else if (xmlr.getLocalName().equals("geoBndBox")) {
                geoBoundBox.add(processGeoBndBox(xmlr));
            } else if (xmlr.getLocalName().equals("anlyUnit")) {
                unitOfAnalysis.add(parseText(xmlr, "anlyUnit"));
            } else if (xmlr.getLocalName().equals("universe")) {
                universe.add(parseText(xmlr, "universe"));
            } else if (xmlr.getLocalName().equals("dataKind")) {
                kindOfData.add(parseText(xmlr));
            }
        } else if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("sumDscr")) {
                // End of the section: flush everything that was collected.
                if (timePeriodStart != null || timePeriodEnd != null) {
                    getCitation(dvDTO).addField(FieldDTO.createMultipleCompoundFieldDTO("timePeriodCovered", timePeriodStart, timePeriodEnd));
                }
                if (dateOfCollectionStart != null || dateOfCollectionEnd != null) {
                    getCitation(dvDTO).addField(FieldDTO.createMultipleCompoundFieldDTO("dateOfCollection", dateOfCollectionStart, dateOfCollectionEnd));
                }
                if (geoUnit.size() > 0) {
                    getGeospatial(dvDTO).addField(FieldDTO.createMultiplePrimitiveFieldDTO("geographicUnit", geoUnit));
                }
                if (unitOfAnalysis.size() > 0) {
                    getSocialScience(dvDTO).addField(FieldDTO.createMultiplePrimitiveFieldDTO("unitOfAnalysis", unitOfAnalysis));
                }
                if (universe.size() > 0) {
                    getSocialScience(dvDTO).addField(FieldDTO.createMultiplePrimitiveFieldDTO("universe", universe));
                }
                if (kindOfData.size() > 0) {
                    getCitation(dvDTO).addField(FieldDTO.createMultiplePrimitiveFieldDTO("kindOfData", kindOfData));
                }
                if (geoCoverages.size() > 0) {
                    getGeospatial(dvDTO).addField(FieldDTO.createMultipleCompoundFieldDTO("geographicCoverage", geoCoverages));
                }
                if (geoBoundBox.size() > 0) {
                    getGeospatial(dvDTO).addField(FieldDTO.createMultipleCompoundFieldDTO("geographicBoundingBox", geoBoundBox));
                }
                return;
            }
        }
    }
}
/**
 * Reads a {@code geoBndBox} element and returns its four bounds as a set of fields.
 * NOTE(review): southBL / northBL are stored under field names ending in
 * "Longitude"; presumably these are the names the geospatial block expects -
 * confirm before renaming.
 */
private HashSet<FieldDTO> processGeoBndBox(XMLStreamReader xmlr) throws XMLStreamException {
    HashSet<FieldDTO> bounds = new HashSet<>();
    int event = xmlr.next();
    while (event != XMLStreamConstants.END_DOCUMENT) {
        if (event == XMLStreamConstants.START_ELEMENT) {
            switch (xmlr.getLocalName()) {
                case "westBL":
                    addToSet(bounds,"westLongitude", parseText(xmlr));
                    break;
                case "eastBL":
                    addToSet(bounds,"eastLongitude", parseText(xmlr));
                    break;
                case "southBL":
                    addToSet(bounds,"southLongitude", parseText(xmlr));
                    break;
                case "northBL":
                    addToSet(bounds,"northLongitude", parseText(xmlr));
                    break;
                default:
                    break;
            }
        } else if (event == XMLStreamConstants.END_ELEMENT && xmlr.getLocalName().equals("geoBndBox")) {
            return bounds;
        }
        event = xmlr.next();
    }
    return bounds;
}
/**
 * Reads the {@code method} section: data collection info, analysis info and
 * notes. A note typed as extended metadata is treated as a custom field; any
 * other note is recorded as a study-level error note.
 */
private void processMethod(XMLStreamReader xmlr, DatasetVersionDTO dvDTO ) throws XMLStreamException, ImportException {
    int event = xmlr.next();
    while (event != XMLStreamConstants.END_DOCUMENT) {
        if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("method")) {
                return;
            }
        } else if (event == XMLStreamConstants.START_ELEMENT) {
            switch (xmlr.getLocalName()) {
                case "dataColl":
                    processDataColl(xmlr, dvDTO);
                    break;
                case "notes": {
                    final String noteType = xmlr.getAttributeValue(null, "type");
                    if (NOTE_TYPE_EXTENDED_METADATA.equalsIgnoreCase(noteType)) {
                        processCustomField(xmlr, dvDTO);
                    } else {
                        addNote("Subject: Study Level Error Note, Notes: "+ parseText( xmlr,"notes" ) +";", dvDTO);
                    }
                    break;
                }
                case "anlyInfo":
                    processAnlyInfo(xmlr, getSocialScience(dvDTO));
                    break;
                default:
                    break;
            }
        }
        event = xmlr.next();
    }
}
/**
 * Imports one custom field encoded in an extended-metadata {@code notes} element.
 *
 * The "subject" attribute is expected to look like
 * {@code TEMPLATE:Contains Custom Fields;FIELD:Customfield1}; the element text is
 * the field value. The template/field pair is mapped to a target dataset field,
 * and the value is added to that field's metadata block, which is created in the
 * DTO if it is not there yet.
 *
 * @param xmlr reader positioned on the notes start tag
 * @param dvDTO dataset version that receives the field
 * @throws XMLStreamException if the element text cannot be read
 * @throws ImportException if the subject is malformed, no mapping or target field
 *         exists, or the target field type is not supported
 */
private void processCustomField(XMLStreamReader xmlr, DatasetVersionDTO dvDTO) throws XMLStreamException, ImportException {
    String subject = xmlr.getAttributeValue(null, "subject");
    // A missing subject attribute is treated like an empty one: nothing to import.
    if (subject == null || subject.isEmpty()) {
        return;
    }

    // Syntax of subject attribute:
    // TEMPLATE:Contains Custom Fields;FIELD:Customfield1
    // The template name sits between the first ':' and the first ';'.
    int templateStart = subject.indexOf(":") + 1;
    int templateEnd = subject.indexOf(";");
    if (templateEnd < templateStart) {
        // no ';' at all, or it precedes the template name: report this as an import
        // problem instead of failing with a StringIndexOutOfBoundsException
        throw new ImportException("Malformed custom field subject (expected TEMPLATE:<name>;FIELD:<name>): " + subject);
    }
    String template = subject.substring(templateStart, templateEnd);
    String sourceField = subject.substring(subject.lastIndexOf(":") + 1);
    String fieldValue = parseText(xmlr);

    CustomFieldMap map = customFieldService.findByTemplateField(template.trim(), sourceField.trim());
    if (map == null) {
        throw new ImportException("Did not find mapping for template: "+template+", sourceField: "+sourceField);
    }
    if (map.getTargetDatasetField().endsWith("#IGNORE")) {
        // if the target field is #IGNORE, that means we don't want to
        // copy this field from 3.6 to 4.0
        return;
    }

    // 1. Get datasetFieldType for the targetField
    // 2. find the metadatablock for this field type
    // 3. If this metadatablock doesn't exist in DTO, create it
    // 4. add field to metadatablock
    DatasetFieldType dsfType = datasetFieldService.findByName(map.getTargetDatasetField());
    if (dsfType == null) {
        throw new ImportException("Did not find datasetField for target: " + map.getTargetDatasetField());
    }
    String metadataBlockName = dsfType.getMetadataBlock().getName();
    MetadataBlockDTO customBlock = dvDTO.getMetadataBlocks().get(metadataBlockName);
    if (customBlock == null) {
        customBlock = new MetadataBlockDTO();
        customBlock.setDisplayName(metadataBlockName);
        dvDTO.getMetadataBlocks().put(metadataBlockName, customBlock);
    }

    if (dsfType.isChild()) {
        handleChildField(customBlock, dsfType, fieldValue);
        return;
    }

    if (dsfType.isAllowMultiples()) {
        List<String> valList = new ArrayList<>();
        valList.add(fieldValue);
        if (dsfType.isAllowControlledVocabulary()) {
            customBlock.addField(FieldDTO.createMultipleVocabFieldDTO(dsfType.getName(), valList));
        } else if (dsfType.isPrimitive()) {
            customBlock.addField(FieldDTO.createMultiplePrimitiveFieldDTO(dsfType.getName(), valList));
        } else {
            throw new ImportException("Unsupported custom field type: " + dsfType);
        }
    } else {
        if (dsfType.isAllowControlledVocabulary()) {
            customBlock.addField(FieldDTO.createVocabFieldDTO(dsfType.getName(), fieldValue));
        } else if (dsfType.isPrimitive()) {
            customBlock.addField(FieldDTO.createPrimitiveFieldDTO(dsfType.getName(), fieldValue));
        } else {
            throw new ImportException("Unsupported custom field type: " + dsfType);
        }
    }
}
/**
 * Wraps a custom child field value in its parent compound field and adds the
 * compound to the given metadata block.
 * <p>
 * The child DTO is a controlled-vocabulary field when the type allows
 * controlled vocabulary, otherwise a primitive field. The compound DTO is the
 * "multiple" variant when the parent type allows multiples.
 *
 * @param customBlock block that receives the compound field
 * @param dsfType     type of the child field
 * @param fieldValue  value of the child field
 * @throws ImportException if the child type is neither controlled-vocabulary
 *                         nor primitive
 */
private void handleChildField(MetadataBlockDTO customBlock, DatasetFieldType dsfType, String fieldValue) throws ImportException {
    final DatasetFieldType parentType = dsfType.getParentDatasetFieldType();

    // Build the child field first
    final FieldDTO childField;
    if (dsfType.isAllowControlledVocabulary()) {
        childField = FieldDTO.createVocabFieldDTO(dsfType.getName(), fieldValue);
    } else if (dsfType.isPrimitive()) {
        childField = FieldDTO.createPrimitiveFieldDTO(dsfType.getName(), fieldValue);
    } else {
        throw new ImportException("Unsupported custom child field type: " + dsfType);
    }

    // The compound holds this child as its only element
    final FieldDTO compoundField = parentType.isAllowMultiples()
            ? FieldDTO.createMultipleCompoundFieldDTO(parentType.getName(), childField)
            : FieldDTO.createCompoundFieldDTO(parentType.getName(), childField);
    customBlock.addField(compoundField);
}
/**
 * Reads the children of the DDI {@code sources} element into the citation
 * block, returning at the closing {@code sources} tag (or end of document).
 * <p>
 * Mapping (empty values are dropped):
 * {@code dataSrc} to the multi-valued {@code dataSources},
 * {@code srcOrig} to {@code originOfSources},
 * {@code srcChar} to {@code characteristicOfSources},
 * {@code srcDocu} to {@code accessToSources}.
 *
 * @param xmlr     reader positioned on the opening {@code sources} tag
 * @param citation citation metadata block that receives the fields
 * @throws XMLStreamException if the underlying stream cannot be read
 */
private void processSources(XMLStreamReader xmlr, MetadataBlockDTO citation) throws XMLStreamException {
    for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
        if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("sources")) {
                return;
            }
            continue;
        }
        if (event != XMLStreamConstants.START_ELEMENT) {
            continue;
        }
        final String tag = xmlr.getLocalName();
        if (tag.equals("dataSrc")) {
            // citation dataSources (multi-valued)
            final String dataSource = parseText( xmlr, "dataSrc" );
            if (!dataSource.isEmpty()) {
                citation.addField(FieldDTO.createMultiplePrimitiveFieldDTO("dataSources", Arrays.asList(dataSource)));
            }
            continue;
        }
        // the remaining supported children are single-valued primitives
        final String targetField;
        switch (tag) {
            case "srcOrig":
                targetField = "originOfSources";
                break;
            case "srcChar":
                targetField = "characteristicOfSources";
                break;
            case "srcDocu":
                targetField = "accessToSources";
                break;
            default:
                targetField = null;
                break;
        }
        if (targetField != null) {
            final String value = parseText( xmlr, tag );
            if (!value.isEmpty()) {
                citation.getFields().add(FieldDTO.createPrimitiveFieldDTO(targetField, value));
            }
        }
    }
}
/**
 * Reads the children of the DDI {@code anlyInfo} element into the social
 * science block, returning at the closing {@code anlyInfo} tag (or end of
 * document).
 * <p>
 * Mapping: {@code respRate} to {@code responseRate}, {@code EstSmpErr} to
 * {@code samplingErrorEstimates}, {@code dataAppr} to
 * {@code otherDataAppraisal}. Values are added even when they are empty.
 *
 * @param xmlr          reader positioned on the opening {@code anlyInfo} tag
 * @param socialScience social science metadata block that receives the fields
 * @throws XMLStreamException if the underlying stream cannot be read
 */
private void processAnlyInfo(XMLStreamReader xmlr, MetadataBlockDTO socialScience) throws XMLStreamException {
    for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
        if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("anlyInfo")) {
                return;
            }
            continue;
        }
        if (event != XMLStreamConstants.START_ELEMENT) {
            continue;
        }
        final String tag = xmlr.getLocalName();
        final String targetField;
        switch (tag) {
            case "respRate":
                targetField = "responseRate";
                break;
            case "EstSmpErr":
                targetField = "samplingErrorEstimates";
                break;
            case "dataAppr":
                targetField = "otherDataAppraisal";
                break;
            default:
                targetField = null;
                break;
        }
        if (targetField != null) {
            socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO(targetField, parseText( xmlr, tag )));
        }
    }
}
/**
 * Reads the children of the DDI {@code dataColl} element, returning at the
 * closing {@code dataColl} tag (or end of document).
 * <p>
 * Most children map one-to-one onto a primitive field of the social science
 * block. {@code timeMeth}, {@code collMode} and {@code weight} may repeat:
 * their non-empty values are joined with ", " and added once, as
 * {@code timeMethod}, {@code collectionMode} and {@code weighting}, when the
 * closing tag is reached. {@code targetSampleSize} and {@code sources} are
 * delegated to their own processors.
 *
 * @param xmlr  reader positioned on the opening {@code dataColl} tag
 * @param dvDTO dataset version whose metadata blocks receive the fields
 * @throws XMLStreamException if the underlying stream cannot be read
 */
private void processDataColl(XMLStreamReader xmlr, DatasetVersionDTO dvDTO) throws XMLStreamException {
    final MetadataBlockDTO socialScience = getSocialScience(dvDTO);
    final StringBuilder timeMethods = new StringBuilder();
    final StringBuilder collectionModes = new StringBuilder();
    final StringBuilder weightings = new StringBuilder();

    for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
        if (event == XMLStreamConstants.END_ELEMENT) {
            if (!xmlr.getLocalName().equals("dataColl")) {
                continue;
            }
            // flush the accumulated repeatable values, in this fixed order
            final String timeMethod = timeMethods.toString();
            if (!StringUtil.isEmpty(timeMethod)) {
                socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("timeMethod", timeMethod));
            }
            final String collectionMode = collectionModes.toString();
            if (!StringUtil.isEmpty(collectionMode)) {
                socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("collectionMode", collectionMode));
            }
            final String weighting = weightings.toString();
            if (!StringUtil.isEmpty(weighting)) {
                socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("weighting", weighting));
            }
            return;
        }
        if (event != XMLStreamConstants.START_ELEMENT) {
            continue;
        }

        final String tag = xmlr.getLocalName();
        String simpleField = null;
        switch (tag) {
            case "timeMeth":
                appendCommaSeparated(timeMethods, parseText( xmlr, "timeMeth" ));
                break;
            case "collMode":
                appendCommaSeparated(collectionModes, parseText( xmlr, "collMode" ));
                break;
            case "weight":
                appendCommaSeparated(weightings, parseText( xmlr, "weight" ));
                break;
            case "targetSampleSize":
                processTargetSampleSize(xmlr, socialScience);
                break;
            case "sources":
                processSources(xmlr,getCitation(dvDTO));
                break;
            case "dataCollector":
                simpleField = "dataCollector";
                break;
            case "frequenc":
                simpleField = "frequencyOfDataCollection";
                break;
            case "sampProc":
                simpleField = "samplingProcedure";
                break;
            case "deviat":
                simpleField = "deviationsFromSampleDesign";
                break;
            case "resInstru":
                simpleField = "researchInstrument";
                break;
            case "collSitu":
                simpleField = "dataCollectionSituation";
                break;
            case "actMin":
                simpleField = "actionsToMinimizeLoss";
                break;
            case "ConOps":
                simpleField = "controlOperations";
                break;
            case "cleanOps":
                simpleField = "cleaningOperations";
                break;
            default:
                break;
        }
        if (simpleField != null) {
            socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO(simpleField, parseText( xmlr, tag )));
        }
    }
}

/**
 * Appends {@code value} to {@code joined}, separated from any earlier content
 * by ", ". Values that {@code StringUtil.isEmpty} reports as empty are ignored.
 */
private static void appendCommaSeparated(StringBuilder joined, String value) {
    if (StringUtil.isEmpty(value)) {
        return;
    }
    if (joined.length() > 0) {
        joined.append(", ");
    }
    joined.append(value);
}
/**
 * Reads the DDI {@code targetSampleSize} element and, when at least one of
 * its {@code sampleSize} / {@code sampleSizeFormula} children was present,
 * adds a {@code targetSampleSize} compound field to the social science block.
 * Returns at the closing {@code targetSampleSize} tag (or end of document).
 * <p>
 * NOTE(review): when only one child is present the other is passed to
 * {@code createCompoundFieldDTO} as {@code null}; presumably the factory
 * tolerates that - confirm.
 *
 * @param xmlr          reader positioned on the opening tag
 * @param socialScience social science metadata block that receives the field
 * @throws XMLStreamException if the underlying stream cannot be read
 */
private void processTargetSampleSize(XMLStreamReader xmlr, MetadataBlockDTO socialScience) throws XMLStreamException {
    FieldDTO actualSize = null;
    FieldDTO sizeFormula = null;
    for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
        if (event == XMLStreamConstants.END_ELEMENT) {
            if (!xmlr.getLocalName().equals("targetSampleSize")) {
                continue;
            }
            final boolean nothingParsed = (actualSize == null && sizeFormula == null);
            if (!nothingParsed) {
                socialScience.getFields().add(FieldDTO.createCompoundFieldDTO("targetSampleSize", actualSize,sizeFormula));
            }
            return;
        }
        if (event != XMLStreamConstants.START_ELEMENT) {
            continue;
        }
        switch (xmlr.getLocalName()) {
            case "sampleSize":
                actualSize = FieldDTO.createPrimitiveFieldDTO("targetSampleActualSize", parseText( xmlr, "sampleSize" ));
                break;
            case "sampleSizeFormula":
                sizeFormula = FieldDTO.createPrimitiveFieldDTO("targetSampleSizeFormula", parseText( xmlr, "sampleSizeFormula" ));
                break;
            default:
                break;
        }
    }
}
/*
EMK TODO: In DVN 3.6, users were allowed to enter their own version date, and in addition the app assigned a version date when
the version was released. So in the DDIs that we have to migrate, we can see this:
<verStmt>
<version date="2004-04-04">1</version>
</verStmt>
<verStmt source="DVN">
<version date="2014-05-21" type="RELEASED">1</version>
</verStmt>
Question: what should we do with these two different dates? Need to review with Eleni.
Note: we should use the verStmt with source="DVN" as the 'official' version statement.
DDIs that we are migrating should have one and only one DVN version statement.
*/
/**
 * Processes a DDI {@code verStmt} element.
 * <p>
 * For migration and harvest imports the statement is read up to its closing
 * tag and the method returns from there:
 * <ul>
 * <li>a statement whose {@code source} attribute is not "DVN" only yields
 *     notes (version date, version text, and any nested {@code notes});</li>
 * <li>the "DVN" statement supplies the release date, the version state
 *     ("ARCHIVED" is imported as RELEASED; "IN_REVIEW" as DRAFT with the
 *     in-review flag set) and the version number.</li>
 * </ul>
 * For new imports the element content is not read here; the version state is
 * simply set to DRAFT.
 *
 * @param importType kind of import being performed
 * @param xmlr       reader positioned on the opening {@code verStmt} tag
 * @param dvDTO      dataset version that receives the values
 * @throws XMLStreamException if the underlying stream cannot be read
 */
private void processVerStmt(ImportType importType, XMLStreamReader xmlr, DatasetVersionDTO dvDTO) throws XMLStreamException {
    if (isMigrationImport(importType) || isHarvestImport(importType)) {
        // the source attribute must be read while still on the opening tag
        final boolean dvnStatement = "DVN".equals(xmlr.getAttributeValue(null, "source"));
        for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
            if (event == XMLStreamConstants.END_ELEMENT) {
                if (xmlr.getLocalName().equals("verStmt")) {
                    return;
                }
                continue;
            }
            if (event != XMLStreamConstants.START_ELEMENT) {
                continue;
            }
            final String tag = xmlr.getLocalName();
            if (tag.equals("version")) {
                if (dvnStatement) {
                    // this is the DVN version info; get version number for StudyVersion object
                    dvDTO.setReleaseDate(xmlr.getAttributeValue(null, "date"));
                    String state = xmlr.getAttributeValue(null,"type");
                    if (state != null) {
                        if (state.equals("ARCHIVED")) {
                            state = "RELEASED";
                        } else if (state.equals("IN_REVIEW")) {
                            state = DatasetVersion.VersionState.DRAFT.toString();
                            dvDTO.setInReview(true);
                        }
                        dvDTO.setVersionState(Enum.valueOf(VersionState.class, state));
                    }
                    parseVersionNumber(dvDTO,parseText(xmlr));
                } else {
                    addNote("Version Date: "+ xmlr.getAttributeValue(null, "date"),dvDTO);
                    addNote("Version Text: "+ parseText(xmlr),dvDTO);
                }
            } else if (!dvnStatement && tag.equals("notes")) {
                processNotes(xmlr, dvDTO);
            }
        }
    }
    if (isNewImport(importType)) {
        // If this is a new, Draft version, versionNumber and minor versionNumber are null.
        dvDTO.setVersionState(VersionState.DRAFT);
    }
}
/**
 * Reads the children of the DDI {@code dataAccs} element, returning at its
 * closing tag (or end of document).
 * <p>
 * {@code setAvail} and {@code useStmt} are delegated to their processors. A
 * {@code notes} child whose type matches {@code NOTE_TYPE_TERMS_OF_USE}
 * becomes the terms of use, but only when its {@code level} attribute matches
 * {@code LEVEL_DV}; terms-of-use notes at other levels are not read here. All
 * other notes go through {@code processNotes}.
 *
 * @param xmlr  reader positioned on the opening {@code dataAccs} tag
 * @param dvDTO dataset version that receives the values
 * @throws XMLStreamException if the underlying stream cannot be read
 */
private void processDataAccs(XMLStreamReader xmlr, DatasetVersionDTO dvDTO) throws XMLStreamException {
    for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
        if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("dataAccs")) {
                return;
            }
            continue;
        }
        if (event != XMLStreamConstants.START_ELEMENT) {
            continue;
        }
        switch (xmlr.getLocalName()) {
            case "setAvail":
                processSetAvail(xmlr, dvDTO);
                break;
            case "useStmt":
                processUseStmt(xmlr, dvDTO);
                break;
            case "notes": {
                final String noteType = xmlr.getAttributeValue(null, "type");
                if (!NOTE_TYPE_TERMS_OF_USE.equalsIgnoreCase(noteType)) {
                    processNotes(xmlr, dvDTO);
                } else if (LEVEL_DV.equalsIgnoreCase(xmlr.getAttributeValue(null, "level"))) {
                    dvDTO.setTermsOfUse(parseText(xmlr, "notes"));
                }
                break;
            }
            default:
                break;
        }
    }
}
/**
 * Reads the children of the DDI {@code setAvail} element into the dataset
 * version, returning at its closing tag (or end of document).
 * <p>
 * Handles {@code accsPlac}, {@code origArch}, {@code avlStatus},
 * {@code collSize}, {@code complete} and nested {@code notes}.
 *
 * @param xmlr  reader positioned on the opening {@code setAvail} tag
 * @param dvDTO dataset version that receives the values
 * @throws XMLStreamException if the underlying stream cannot be read
 */
private void processSetAvail(XMLStreamReader xmlr, DatasetVersionDTO dvDTO) throws XMLStreamException {
    for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
        if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("setAvail")) {
                return;
            }
            continue;
        }
        if (event != XMLStreamConstants.START_ELEMENT) {
            continue;
        }
        switch (xmlr.getLocalName()) {
            case "accsPlac":
                dvDTO.setDataAccessPlace( parseText( xmlr, "accsPlac" ) );
                break;
            case "origArch":
                dvDTO.setOriginalArchive( parseText( xmlr, "origArch" ) );
                break;
            case "avlStatus":
                dvDTO.setAvailabilityStatus( parseText( xmlr, "avlStatus" ) );
                break;
            case "collSize":
                dvDTO.setSizeOfCollection(parseText( xmlr, "collSize" ) );
                break;
            case "complete":
                dvDTO.setStudyCompletion( parseText( xmlr, "complete" ) );
                break;
            case "notes":
                processNotes( xmlr, dvDTO );
                break;
            default:
                break;
        }
    }
}
/**
 * Reads the children of the DDI {@code useStmt} element into the dataset
 * version, returning at its closing tag (or end of document).
 * <p>
 * Handles {@code confDec}, {@code specPerm}, {@code restrctn},
 * {@code contact}, {@code citReq}, {@code deposReq}, {@code conditions} and
 * {@code disclaimer}.
 *
 * @param xmlr  reader positioned on the opening {@code useStmt} tag
 * @param dvDTO dataset version that receives the values
 * @throws XMLStreamException if the underlying stream cannot be read
 */
private void processUseStmt(XMLStreamReader xmlr, DatasetVersionDTO dvDTO) throws XMLStreamException {
    for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
        if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("useStmt")) {
                return;
            }
            continue;
        }
        if (event != XMLStreamConstants.START_ELEMENT) {
            continue;
        }
        switch (xmlr.getLocalName()) {
            case "confDec":
                dvDTO.setConfidentialityDeclaration( parseText( xmlr, "confDec" ) );
                break;
            case "specPerm":
                dvDTO.setSpecialPermissions( parseText( xmlr, "specPerm" ) );
                break;
            case "restrctn":
                dvDTO.setRestrictions( parseText( xmlr, "restrctn" ) );
                break;
            case "contact":
                dvDTO.setContactForAccess( parseText( xmlr, "contact" ) );
                break;
            case "citReq":
                dvDTO.setCitationRequirements( parseText( xmlr, "citReq" ) );
                break;
            case "deposReq":
                dvDTO.setDepositorRequirements( parseText( xmlr, "deposReq" ) );
                break;
            case "conditions":
                dvDTO.setConditions( parseText( xmlr, "conditions" ) );
                break;
            case "disclaimer":
                dvDTO.setDisclaimer( parseText( xmlr, "disclaimer" ) );
                break;
            default:
                break;
        }
    }
}
/**
 * Separates the version string into two parts: everything before the first
 * '.' is the (major) versionNumber, and everything after it is the
 * minorVersionNumber. If there is no '.', the whole string is the
 * versionNumber and the minorVersionNumber is set to "0".
 * <p>
 * For example "12.3" yields versionNumber 12 and minorVersionNumber "3".
 *
 * @param dvDTO         dataset version that receives both numbers
 * @param versionNumber version text, e.g. "2" or "2.1"
 * @throws NumberFormatException if the major part is not a valid long
 */
private void parseVersionNumber(DatasetVersionDTO dvDTO, String versionNumber) {
    int firstIndex = versionNumber.indexOf('.');
    if (firstIndex == -1) {
        dvDTO.setVersionNumber(Long.parseLong(versionNumber));
        dvDTO.setMinorVersionNumber("0");
    } else {
        // substring's end index is exclusive, so firstIndex itself is the
        // correct bound; "firstIndex - 1" dropped the last major digit
        dvDTO.setVersionNumber(Long.parseLong(versionNumber.substring(0, firstIndex)));
        dvDTO.setMinorVersionNumber(versionNumber.substring(firstIndex + 1));
    }
}
/**
 * Reads the DDI {@code serStmt} element and adds a {@code series} compound
 * field (series name + series information) to the citation block when the
 * closing {@code serStmt} tag is reached.
 * <p>
 * NOTE(review): the compound is added even when neither child was present, in
 * which case both members are {@code null} - confirm that is intended.
 *
 * @param xmlr     reader positioned on the opening {@code serStmt} tag
 * @param citation citation metadata block that receives the field
 * @throws XMLStreamException if the underlying stream cannot be read
 */
private void processSerStmt(XMLStreamReader xmlr, MetadataBlockDTO citation) throws XMLStreamException {
    FieldDTO nameField = null;
    FieldDTO infoField = null;
    for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
        if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("serStmt")) {
                citation.getFields().add(FieldDTO.createCompoundFieldDTO("series",nameField,infoField ));
                return;
            }
            continue;
        }
        if (event != XMLStreamConstants.START_ELEMENT) {
            continue;
        }
        switch (xmlr.getLocalName()) {
            case "serName":
                nameField = FieldDTO.createPrimitiveFieldDTO("seriesName", parseText(xmlr));
                break;
            case "serInfo":
                infoField = FieldDTO.createPrimitiveFieldDTO("seriesInformation", parseText(xmlr) );
                break;
            default:
                break;
        }
    }
}
/**
 * Reads the DDI {@code distStmt} element into the citation block, returning
 * at its closing tag (or end of document).
 * <p>
 * {@code distrbtr} and {@code contact} children are collected as compound
 * entries and added as the multi-valued {@code distributor} and
 * {@code datasetContact} fields when the closing tag is reached.
 * {@code depositr}, {@code depDate} and {@code distDate} are added directly
 * as primitive fields.
 * <p>
 * Entries for which no sub-field had a value are skipped, matching how
 * producers, authors and other ids are handled elsewhere in this class.
 *
 * @param xmlr     reader positioned on the opening {@code distStmt} tag
 * @param citation citation metadata block that receives the fields
 * @throws XMLStreamException if the underlying stream cannot be read
 */
private void processDistStmt(XMLStreamReader xmlr, MetadataBlockDTO citation) throws XMLStreamException {
    List<HashSet<FieldDTO>> distributors = new ArrayList<>();
    List<HashSet<FieldDTO>> datasetContacts = new ArrayList<>();
    for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
        if (event == XMLStreamConstants.START_ELEMENT) {
            if (xmlr.getLocalName().equals("distrbtr")) {
                HashSet<FieldDTO> set = new HashSet<>();
                // attributes must be read before the element content is consumed
                addToSet(set, "distributorAbbreviation", xmlr.getAttributeValue(null, "abbr"));
                addToSet(set, "distributorAffiliation", xmlr.getAttributeValue(null, "affiliation"));
                Map<String, String> distDetails = parseCompoundText(xmlr, "distrbtr");
                addToSet(set, "distributorName", distDetails.get("name"));
                addToSet(set, "distributorURL", distDetails.get("url"));
                addToSet(set, "distributorLogoURL", distDetails.get("logo"));
                if (!set.isEmpty()) {
                    distributors.add(set);
                }
            } else if (xmlr.getLocalName().equals("contact")) {
                HashSet<FieldDTO> set = new HashSet<>();
                addToSet(set, "datasetContactEmail", xmlr.getAttributeValue(null, "email"));
                addToSet(set, "datasetContactAffiliation", xmlr.getAttributeValue(null, "affiliation"));
                addToSet(set, "datasetContactName", parseText(xmlr));
                if (!set.isEmpty()) {
                    datasetContacts.add(set);
                }
            } else if (xmlr.getLocalName().equals("depositr")) {
                Map<String, String> depDetails = parseCompoundText(xmlr, "depositr");
                citation.getFields().add(FieldDTO.createPrimitiveFieldDTO("depositor", depDetails.get("name")));
            } else if (xmlr.getLocalName().equals("depDate")) {
                citation.getFields().add(FieldDTO.createPrimitiveFieldDTO("dateOfDeposit", parseDate(xmlr, "depDate")));
            } else if (xmlr.getLocalName().equals("distDate")) {
                citation.getFields().add(FieldDTO.createPrimitiveFieldDTO("distributionDate", parseDate(xmlr, "distDate")));
            }
        } else if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("distStmt")) {
                if (!distributors.isEmpty()) {
                    citation.addField(FieldDTO.createMultipleCompoundFieldDTO("distributor", distributors));
                }
                if (!datasetContacts.isEmpty()) {
                    citation.addField(FieldDTO.createMultipleCompoundFieldDTO("datasetContact", datasetContacts));
                }
                return;
            }
        }
    }
}
/**
 * Reads the DDI {@code prodStmt} element into the citation block, returning
 * at its closing tag (or end of document).
 * <p>
 * {@code producer}, {@code software} and {@code grantNo} children are
 * collected as compound entries (entries without any value are skipped) and
 * added as the multi-valued {@code software}, {@code grantNumber} and
 * {@code producer} fields when the closing tag is reached. {@code prodDate}
 * and {@code prodPlac} are added directly as primitive fields.
 * {@code fundAg} is recognised but currently not imported.
 * <p>
 * For {@code software}, the version comes from the {@code version} attribute
 * and the name from the element text.
 *
 * @param xmlr     reader positioned on the opening {@code prodStmt} tag
 * @param citation citation metadata block that receives the fields
 * @throws XMLStreamException if the underlying stream cannot be read
 */
private void processProdStmt(XMLStreamReader xmlr, MetadataBlockDTO citation) throws XMLStreamException {
    List<HashSet<FieldDTO>> producers = new ArrayList<>();
    List<HashSet<FieldDTO>> grants = new ArrayList<>();
    List<HashSet<FieldDTO>> software = new ArrayList<>();
    for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
        if (event == XMLStreamConstants.START_ELEMENT) {
            if (xmlr.getLocalName().equals("producer")) {
                HashSet<FieldDTO> set = new HashSet<>();
                // attributes must be read before the element content is consumed
                addToSet(set,"producerAbbreviation", xmlr.getAttributeValue(null, "abbr"));
                addToSet(set,"producerAffiliation", xmlr.getAttributeValue(null, "affiliation"));
                Map<String, String> prodDetails = parseCompoundText(xmlr, "producer");
                addToSet(set,"producerName", prodDetails.get("name"));
                addToSet(set,"producerURL", prodDetails.get("url" ));
                addToSet(set,"producerLogoURL", prodDetails.get("logo"));
                if (!set.isEmpty()) {
                    producers.add(set);
                }
            } else if (xmlr.getLocalName().equals("prodDate")) {
                citation.getFields().add(FieldDTO.createPrimitiveFieldDTO("productionDate", parseDate(xmlr, "prodDate")));
            } else if (xmlr.getLocalName().equals("prodPlac")) {
                // a place is free text, not a date, so read it as plain text
                citation.getFields().add(FieldDTO.createPrimitiveFieldDTO("productionPlace", parseText(xmlr, "prodPlac")));
            } else if (xmlr.getLocalName().equals("software")) {
                HashSet<FieldDTO> set = new HashSet<>();
                addToSet(set,"softwareVersion", xmlr.getAttributeValue(null, "version"));
                // the software name is the element text, not the version attribute
                addToSet(set,"softwareName", parseText(xmlr, "software"));
                if (!set.isEmpty()) {
                    software.add(set);
                }
            //TODO: ask Gustavo "fundAg"?TO
            } else if (xmlr.getLocalName().equals("fundAg")) {
                // save this in contributorName - member of compoundFieldContributor
                // metadata.setFundingAgency( parseText(xmlr) );
            } else if (xmlr.getLocalName().equals("grantNo")) {
                HashSet<FieldDTO> set = new HashSet<>();
                addToSet(set,"grantNumberAgency", xmlr.getAttributeValue(null, "agency"));
                addToSet(set,"grantNumberValue", parseText(xmlr));
                if (!set.isEmpty()) {
                    grants.add(set);
                }
            }
        } else if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("prodStmt")) {
                if (!software.isEmpty()) {
                    citation.addField(FieldDTO.createMultipleCompoundFieldDTO("software", software));
                }
                if (!grants.isEmpty()) {
                    citation.addField(FieldDTO.createMultipleCompoundFieldDTO("grantNumber", grants));
                }
                if (!producers.isEmpty()) {
                    citation.getFields().add(FieldDTO.createMultipleCompoundFieldDTO("producer", producers));
                }
                return;
            }
        }
    }
}
/**
 * Reads the DDI {@code titlStmt} element, returning at its closing tag (or
 * end of document).
 * <p>
 * {@code titl}, {@code subTitl} and {@code altTitl} become the citation
 * fields {@code title}, {@code subtitle} and {@code alternativeTitle}. An
 * {@code IDNo} whose {@code agency} is the handle, DOI or da|ra agency sets
 * the dataset's persistent identifier; any other {@code IDNo} is collected
 * and added as the multi-valued {@code otherId} compound field at the end.
 *
 * @param xmlr       reader positioned on the opening {@code titlStmt} tag
 * @param datasetDTO dataset that receives the identifier and citation fields
 * @throws XMLStreamException if the underlying stream cannot be read
 * @throws ImportException    if a DOI identifier cannot be parsed
 */
private void processTitlStmt(XMLStreamReader xmlr, DatasetDTO datasetDTO) throws XMLStreamException, ImportException {
    final MetadataBlockDTO citation = datasetDTO.getDatasetVersion().getMetadataBlocks().get("citation");
    final List<HashSet<FieldDTO>> otherIds = new ArrayList<>();
    for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
        if (event == XMLStreamConstants.END_ELEMENT) {
            if (!xmlr.getLocalName().equals("titlStmt")) {
                continue;
            }
            if (!otherIds.isEmpty()) {
                citation.addField(FieldDTO.createMultipleCompoundFieldDTO("otherId", otherIds));
            }
            return;
        }
        if (event != XMLStreamConstants.START_ELEMENT) {
            continue;
        }
        switch (xmlr.getLocalName()) {
            case "titl": {
                final FieldDTO title = FieldDTO.createPrimitiveFieldDTO("title", parseText(xmlr));
                citation.getFields().add(title);
                break;
            }
            case "subTitl": {
                final FieldDTO subtitle = FieldDTO.createPrimitiveFieldDTO("subtitle", parseText(xmlr));
                citation.getFields().add(subtitle);
                break;
            }
            case "altTitl": {
                final FieldDTO alternative = FieldDTO.createPrimitiveFieldDTO("alternativeTitle", parseText(xmlr));
                citation.getFields().add(alternative);
                break;
            }
            case "IDNo": {
                // read the attribute once, while still on the opening tag
                final String agency = xmlr.getAttributeValue(null, "agency");
                if (AGENCY_HANDLE.equals(agency)) {
                    parseStudyIdHandle( parseText(xmlr), datasetDTO );
                } else if (AGENCY_DOI.equals(agency)) {
                    parseStudyIdDOI( parseText(xmlr), datasetDTO );
                } else if (AGENCY_DARA.equals(agency)) {
                    /*
                    da|ra - "Registration agency for social and economic data"
                    (http://www.da-ra.de/en/home/)
                    ICPSR uses da|ra to register their DOIs; so they have agency="dara"
                    in their IDNo entries.
                    Also, their DOIs are formatted differently, without the
                    hdl: prefix.
                    */
                    parseStudyIdDoiICPSRdara( parseText(xmlr), datasetDTO );
                } else {
                    final HashSet<FieldDTO> otherId = new HashSet<>();
                    addToSet(otherId,"otherIdAgency", agency);
                    addToSet(otherId,"otherIdValue", parseText(xmlr));
                    if (!otherId.isEmpty()) {
                        otherIds.add(otherId);
                    }
                }
                break;
            }
            default:
                break;
        }
    }
}
/**
 * Reads the DDI {@code rspStmt} element and adds the collected
 * {@code AuthEnty} entries to the citation block as the multi-valued
 * {@code author} compound field. Returns at the closing {@code rspStmt} tag
 * (or end of document). Authors with neither name nor affiliation are skipped.
 *
 * @param xmlr     reader positioned on the opening {@code rspStmt} tag
 * @param citation citation metadata block that receives the field
 * @throws XMLStreamException if the underlying stream cannot be read
 */
private void processRspStmt(XMLStreamReader xmlr, MetadataBlockDTO citation) throws XMLStreamException {
    final List<HashSet<FieldDTO>> collectedAuthors = new ArrayList<>();
    for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
        if (event == XMLStreamConstants.END_ELEMENT) {
            if (!xmlr.getLocalName().equals("rspStmt")) {
                continue;
            }
            if (!collectedAuthors.isEmpty()) {
                final FieldDTO authorField = FieldDTO.createMultipleCompoundFieldDTO("author", collectedAuthors);
                citation.getFields().add(authorField);
            }
            return;
        }
        if (event == XMLStreamConstants.START_ELEMENT && xmlr.getLocalName().equals("AuthEnty")) {
            final HashSet<FieldDTO> author = new HashSet<>();
            // the affiliation attribute has to be read before the text is consumed
            addToSet(author,"authorAffiliation", xmlr.getAttributeValue(null, "affiliation"));
            addToSet(author,"authorName", parseText(xmlr));
            if (!author.isEmpty()) {
                collectedAuthors.add(author);
            }
        }
    }
}
/**
 * Parses an element that mixes plain text with {@code ExtLink} children
 * (as used for producers, distributors and depositors).
 * <p>
 * The returned map always contains {@code "name"}: the element's text chunks,
 * each trimmed with embedded newlines replaced by spaces, joined with a
 * newline. Whitespace-only chunks (for example the indentation around an
 * {@code ExtLink}) are ignored, so the name carries no leading or trailing
 * newline. Each {@code ExtLink} contributes its {@code URI} attribute under
 * {@code "logo"} (when {@code role} is "image" or {@code title} is "logo",
 * case-insensitively) or otherwise under {@code "url"}.
 *
 * @param xmlr   reader positioned on the opening tag of the element
 * @param endTag local name of the element, used to detect its closing tag
 * @return map with the keys described above
 * @throws XMLStreamException if the underlying stream cannot be read
 */
private Map<String,String> parseCompoundText (XMLStreamReader xmlr, String endTag) throws XMLStreamException {
    Map<String,String> returnMap = new HashMap<>();
    StringBuilder text = new StringBuilder();
    while (true) {
        int event = xmlr.next();
        if (event == XMLStreamConstants.CHARACTERS) {
            String chunk = xmlr.getText().trim().replace('\n',' ');
            // compare by content: a reference check against "" treats every
            // freshly concatenated empty string as non-empty
            if (!chunk.isEmpty()) {
                if (text.length() > 0) {
                    text.append('\n');
                }
                text.append(chunk);
            }
        } else if (event == XMLStreamConstants.START_ELEMENT) {
            if (xmlr.getLocalName().equals("ExtLink")) {
                String mapKey = ("image".equalsIgnoreCase( xmlr.getAttributeValue(null, "role") ) || "logo".equalsIgnoreCase(xmlr.getAttributeValue(null, "title")))? "logo" : "url";
                returnMap.put( mapKey, xmlr.getAttributeValue(null, "URI") );
                parseText(xmlr, "ExtLink"); // this line effectively just skips through until the end of the tag
            }
        } else if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals(endTag)) break;
        }
    }
    returnMap.put( "name", text.toString() );
    return returnMap;
}
/**
 * Parses the content of the current element up to the closing tag named
 * {@code endTag} and returns it as text.
 * <p>
 * Delegates to {@code parseTextNew} and casts the result to {@code String};
 * when that method returns a map instead (a DVN 3.0 citation section) the
 * cast fails with a {@code ClassCastException}.
 *
 * @param xmlr   reader positioned on the opening tag of the element
 * @param endTag local name of the element whose end terminates parsing
 * @return the parsed text
 * @throws XMLStreamException if the underlying stream cannot be read
 */
private String parseText(XMLStreamReader xmlr, String endTag) throws XMLStreamException {
    final Object parsed = parseTextNew(xmlr,endTag);
    return (String) parsed;
}
/**
 * Parses mixed DDI content up to the closing tag named {@code endTag},
 * converting the supported inline markup to HTML.
 * <p>
 * Text chunks are trimmed and have embedded newlines replaced by spaces; once
 * some output exists, a newline is inserted before each further parser event
 * and the final result is trimmed. Supported child elements:
 * {@code p}/{@code br}/{@code head} (wrapped in a paragraph),
 * {@code emph}/{@code em}/{@code i} (emphasis), {@code hi}/{@code b}
 * (strong), {@code ExtLink} and {@code a}/{@code A} (anchors), {@code list},
 * {@code citation} and {@code txt}.
 *
 * @param xmlr   reader positioned on the opening tag of the element
 * @param endTag local name of the element whose end terminates parsing
 * @return the map produced by {@code parseDVNCitation} when a
 *         {@code citation} child with the DVN 3.0 source was seen; otherwise
 *         the converted text as a {@code String}
 * @throws XMLStreamException if the underlying stream cannot be read
 * @throws EJBException       if an unsupported child element is encountered
 */
private Object parseTextNew(XMLStreamReader xmlr, String endTag) throws XMLStreamException {
    final StringBuilder out = new StringBuilder();
    Map dvnCitation = null;
    while (true) {
        // separator goes in before every event once there is any output
        if (out.length() > 0) {
            out.append("\n");
        }
        final int event = xmlr.next();
        if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals(endTag)) {
                break;
            }
            continue;
        }
        if (event == XMLStreamConstants.CHARACTERS) {
            out.append(xmlr.getText().trim().replace('\n',' '));
            continue;
        }
        if (event != XMLStreamConstants.START_ELEMENT) {
            continue;
        }
        final String tag = xmlr.getLocalName();
        switch (tag) {
            case "p":
            case "br":
            case "head":
                out.append("<p>").append(parseText(xmlr, tag)).append("</p>");
                break;
            case "emph":
            case "em":
            case "i":
                out.append("<em>").append(parseText(xmlr, tag)).append("</em>");
                break;
            case "hi":
            case "b":
                out.append("<strong>").append(parseText(xmlr, tag)).append("</strong>");
                break;
            case "ExtLink": {
                final String uri = xmlr.getAttributeValue(null, "URI");
                final String label = parseText(xmlr, "ExtLink").trim();
                out.append("<a href=\"").append(uri).append("\">")
                   .append(StringUtil.isEmpty(label) ? uri : label)
                   .append("</a>");
                break;
            }
            case "a":
            case "A": {
                String uri = xmlr.getAttributeValue(null, "URI");
                if (StringUtil.isEmpty(uri)) {
                    uri = xmlr.getAttributeValue(null, "HREF");
                }
                final String label = parseText(xmlr, tag).trim();
                out.append("<a href=\"").append(uri).append("\">")
                   .append(StringUtil.isEmpty(label) ? uri : label)
                   .append("</a>");
                break;
            }
            case "list":
                out.append(parseText_list(xmlr));
                break;
            case "citation":
                if (SOURCE_DVN_3_0.equals(xmlr.getAttributeValue(null, "source"))) {
                    dvnCitation = parseDVNCitation(xmlr);
                } else {
                    out.append(parseText_citation(xmlr));
                }
                break;
            case "txt":
                out.append(parseText(xmlr));
                break;
            default:
                throw new EJBException("ERROR occurred in mapDDI (parseText): tag not yet supported: <" + tag + ">" );
        }
    }
    if (dvnCitation != null) {
        // this is one of our new citation areas for DVN3.0
        return dvnCitation;
    }
    // otherwise it's a standard section and just return the String like we always did
    return out.toString().trim();
}
/**
 * Returns the text of the current note element when its {@code type}
 * attribute equals {@code type} (ignoring case), and {@code null} otherwise.
 * The element content is only consumed in the matching case.
 *
 * @param xmlr reader positioned on the opening tag of a note element
 * @param type note type to match
 * @return the note text, or {@code null} if the type does not match
 * @throws XMLStreamException if the underlying stream cannot be read
 */
private String parseNoteByType(XMLStreamReader xmlr, String type) throws XMLStreamException {
    final String actualType = xmlr.getAttributeValue(null, "type");
    if (!type.equalsIgnoreCase(actualType)) {
        return null;
    }
    return parseText(xmlr);
}
/**
 * Converts a DDI {@code list} element into an HTML list.
 * <p>
 * A {@code type} of "bulleted", or no type at all, produces an unordered
 * list; "ordered" produces an ordered list. Each {@code itm} child becomes
 * one list item.
 *
 * @param xmlr reader positioned on the opening {@code list} tag
 * @return the HTML markup for the list
 * @throws XMLStreamException if the underlying stream cannot be read
 * @throws EJBException       for any other list type, or when the list
 *                            contains a child other than {@code itm}
 */
private String parseText_list (XMLStreamReader xmlr) throws XMLStreamException {
    final String listType = xmlr.getAttributeValue(null, "type");
    final String openTag;
    final String closeTag;
    if (listType == null || "bulleted".equals(listType)) {
        openTag = "<ul>\n";
        closeTag = "</ul>";
    } else if ("ordered".equals(listType)) {
        openTag = "<ol>\n";
        closeTag = "</ol>";
    } else {
        // this includes the list type "simple"
        throw new EJBException("ERROR occurred in mapDDI (parseText): ListType of types other than {bulleted, ordered} not currently supported.");
    }

    final StringBuilder html = new StringBuilder(openTag);
    while (true) {
        final int event = xmlr.next();
        if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("list")) {
                break;
            }
        } else if (event == XMLStreamConstants.START_ELEMENT) {
            if (!xmlr.getLocalName().equals("itm")) {
                throw new EJBException("ERROR occurred in mapDDI (parseText): ListType does not currently supported contained LabelType.");
            }
            html.append("<li>").append(parseText(xmlr,"itm")).append("</li>\n");
        }
    }
    return html.append(closeTag).toString();
}
/**
 * Renders a nested DDI {@code citation} element as a short HTML fragment:
 * a marker comment, followed by the title(s) found in {@code titlStmt}, and,
 * when any {@code holdings} carry a URI or text, those holdings in
 * parentheses, separated by ", ". Holdings with a URI are rendered as links,
 * using the holdings text as the label when there is one.
 *
 * @param xmlr reader positioned on the opening {@code citation} tag
 * @return the HTML fragment
 * @throws XMLStreamException if the underlying stream cannot be read
 */
private String parseText_citation (XMLStreamReader xmlr) throws XMLStreamException {
    final StringBuilder rendered = new StringBuilder("<!-- parsed from DDI citation title and holdings -->");
    final StringBuilder holdingsList = new StringBuilder();
    boolean hasHoldings = false;
    while (true) {
        int event = xmlr.next();
        if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("citation")) {
                break;
            }
            continue;
        }
        if (event != XMLStreamConstants.START_ELEMENT) {
            continue;
        }
        final String tag = xmlr.getLocalName();
        if (tag.equals("titlStmt")) {
            // only the titles inside the title statement are of interest
            while (true) {
                event = xmlr.next();
                if (event == XMLStreamConstants.START_ELEMENT) {
                    if (xmlr.getLocalName().equals("titl")) {
                        rendered.append(parseText(xmlr));
                    }
                } else if (event == XMLStreamConstants.END_ELEMENT) {
                    if (xmlr.getLocalName().equals("titlStmt")) {
                        break;
                    }
                }
            }
        } else if (tag.equals("holdings")) {
            final String uri = xmlr.getAttributeValue(null, "URI");
            final String label = parseText(xmlr);
            final boolean noUri = StringUtil.isEmpty(uri);
            final boolean noLabel = StringUtil.isEmpty(label);
            if (noUri && noLabel) {
                continue;
            }
            if (hasHoldings) {
                holdingsList.append(", ");
            }
            hasHoldings = true;
            if (noUri) {
                holdingsList.append(label);
            } else {
                // fall back to the URI itself when there is no link text
                holdingsList.append("<a href=\"").append(uri).append("\">")
                            .append(noLabel ? uri : label)
                            .append("</a>");
            }
        }
    }
    if (hasHoldings) {
        rendered.append(" (").append(holdingsList).append(")");
    }
    return rendered.toString();
}
/**
 * Extracts the UNF signature from a string.
 *
 * @param unfString text that may contain a "UNF:" marker
 * @return the part of {@code unfString} starting at the first "UNF:", or
 *         {@code null} when the marker is absent
 */
private String parseUNF(String unfString) {
    final int markerAt = unfString.indexOf("UNF:");
    return (markerAt == -1) ? null : unfString.substring(markerAt);
}
/**
 * Parses a DVN 3.0 style {@code citation} element into a map, reading up to
 * the closing {@code citation} tag.
 * <p>
 * Keys that may be present in the result:
 * {@code "idType"} and {@code "idNumber"} (agency attribute and text of
 * {@code IDNo}), {@code "text"} (text of {@code biblCit}), {@code "url"}
 * ({@code URI} attribute of {@code holdings}) and {@code "replicationData"}
 * ({@code Boolean.TRUE} when a {@code notes} child has the replication-for
 * type).
 *
 * @param xmlr reader positioned on the opening {@code citation} tag
 * @return map of the values found; never {@code null}
 * @throws XMLStreamException if the underlying stream cannot be read
 */
private Map parseDVNCitation(XMLStreamReader xmlr) throws XMLStreamException {
    Map<String, Object> returnValues = new HashMap<>();
    while (true) {
        int event = xmlr.next();
        if (event == XMLStreamConstants.START_ELEMENT) {
            if (xmlr.getLocalName().equals("IDNo")) {
                // the agency attribute must be read before the text is consumed
                returnValues.put("idType", xmlr.getAttributeValue(null, "agency") );
                returnValues.put("idNumber", parseText(xmlr) );
            }
            else if (xmlr.getLocalName().equals("biblCit")) {
                returnValues.put("text", parseText(xmlr) );
            }
            else if (xmlr.getLocalName().equals("holdings")) {
                returnValues.put("url", xmlr.getAttributeValue(null, "URI") );
            }
            else if (xmlr.getLocalName().equals("notes")) {
                if (NOTE_TYPE_REPLICATION_FOR.equals(xmlr.getAttributeValue(null, "type")) ) {
                    // shared constant instead of the deprecated Boolean constructor
                    returnValues.put("replicationData", Boolean.TRUE);
                }
            }
        } else if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("citation")) break;
        }
    }
    return returnValues;
}
/**
 * Parses a handle identifier of the form {@code prefix:authority/identifier}
 * (for example {@code hdl:1902.1/12345}) into the dataset DTO.
 * <p>
 * The authority is the text between the first ':' and the first '/', the
 * identifier is everything after that '/', the separator is "/" and the
 * protocol is always the handle protocol, whatever prefix the string carries.
 * The string is validated completely before the DTO is touched, so a failed
 * parse leaves the DTO unchanged.
 *
 * @param _id        identifier text from the DDI
 * @param datasetDTO dataset that receives protocol, authority and identifier
 * @throws EJBException if ':' or '/' is missing, or '/' precedes ':'
 */
private void parseStudyIdHandle(String _id, DatasetDTO datasetDTO) {
    int index1 = _id.indexOf(':');
    if (index1 == -1) {
        throw new EJBException("Error parsing (Handle) IdNo: "+_id+". ':' not found in string");
    }
    int index2 = _id.indexOf('/');
    if (index2 == -1) {
        throw new EJBException("Error parsing (Handle) IdNo: "+_id+". '/' not found in string");
    }
    if (index2 < index1) {
        // would otherwise surface as a StringIndexOutOfBoundsException below
        throw new EJBException("Error parsing (Handle) IdNo: "+_id+". '/' found before ':'");
    }
    datasetDTO.setProtocol(NAMING_PROTOCOL_HANDLE);
    datasetDTO.setAuthority(_id.substring(index1 + 1, index2));
    datasetDTO.setDoiSeparator("/");
    datasetDTO.setIdentifier(_id.substring(index2 + 1));
}
/**
 * Parses a DOI identifier of the form {@code prefix:authority/identifier}
 * into the dataset DTO.
 * <p>
 * The authority is the text between the first ':' and the last '/', the
 * identifier is everything after that '/', the protocol is "doi" and the
 * separator is "/".
 *
 * @param _id        identifier text from the DDI
 * @param datasetDTO dataset that receives protocol, authority and identifier
 * @throws EJBException    if the string contains no ':'
 * @throws ImportException if the string contains no '/'
 */
private void parseStudyIdDOI(String _id, DatasetDTO datasetDTO) throws ImportException{
    final int colonAt = _id.indexOf(':');
    final int lastSlashAt = _id.lastIndexOf('/');
    if (colonAt == -1) {
        throw new EJBException("Error parsing (DOI) IdNo: "+_id+". ':' not found in string");
    }
    if (lastSlashAt == -1) {
        throw new ImportException("Error parsing (DOI) IdNo: "+_id+". '/' not found in string");
    }
    final String authority = _id.substring(colonAt + 1, lastSlashAt);
    datasetDTO.setAuthority(authority);
    datasetDTO.setProtocol("doi");
    datasetDTO.setDoiSeparator("/");
    datasetDTO.setIdentifier(_id.substring(lastSlashAt + 1));
}
/**
 * Splits a da|ra / ICPSR style DOI and stores the parts in the DTO.
 * <p>
 * These ids carry no protocol prefix, for example
 * {@code 10.3886/ICPSR06635.v1}: everything before the last '/' is taken as
 * the authority and everything after it as the identifier. The protocol is
 * set to "doi" and the separator to "/".
 *
 * @param _id        the raw id string taken from the DDI
 * @param datasetDTO the DTO that receives protocol, authority, separator
 *                   and identifier
 * @throws ImportException if the id contains no '/' or ends with one
 */
private void parseStudyIdDoiICPSRdara(String _id, DatasetDTO datasetDTO) throws ImportException{
    final int lastSlash = _id.lastIndexOf('/');

    if (lastSlash < 0) {
        throw new ImportException("Error parsing ICPSR/dara DOI IdNo: "+_id+". '/' not found in string");
    }
    if (lastSlash + 1 == _id.length()) {
        // a trailing slash would leave the identifier empty
        throw new ImportException("Error parsing ICPSR/dara DOI IdNo: "+_id+" ends with '/'");
    }

    final String authorityPart = _id.substring(0, lastSlash);
    final String identifierPart = _id.substring(lastSlash + 1);

    datasetDTO.setAuthority(authorityPart);
    datasetDTO.setProtocol("doi");
    datasetDTO.setDoiSeparator("/");
    datasetDTO.setIdentifier(identifierPart);
}
// Helper methods
/**
 * Looks up the metadata block stored under the key "citation".
 *
 * @param dvDTO the dataset version whose metadata blocks are searched
 * @return whatever the blocks map holds for "citation"
 */
private MetadataBlockDTO getCitation(DatasetVersionDTO dvDTO) {
    final String blockKey = "citation";
    return dvDTO.getMetadataBlocks().get(blockKey);
}
/**
 * Looks up the metadata block stored under the key "geospatial".
 *
 * @param dvDTO the dataset version whose metadata blocks are searched
 * @return whatever the blocks map holds for "geospatial"
 */
private MetadataBlockDTO getGeospatial(DatasetVersionDTO dvDTO) {
    final String blockKey = "geospatial";
    return dvDTO.getMetadataBlocks().get(blockKey);
}
/**
 * Looks up the metadata block stored under the key "socialscience".
 *
 * @param dvDTO the dataset version whose metadata blocks are searched
 * @return whatever the blocks map holds for "socialscience"
 */
private MetadataBlockDTO getSocialScience(DatasetVersionDTO dvDTO) {
    final String blockKey = "socialscience";
    return dvDTO.getMetadataBlocks().get(blockKey);
}
/**
 * Adds a primitive field built from {@code typeName} and {@code value} to
 * the set; does nothing when the value is null or blank after trimming.
 *
 * @param set      the set that receives the new field
 * @param typeName the field type name passed on to the field factory
 * @param value    the field value; stored untrimmed
 */
private void addToSet(HashSet<FieldDTO> set, String typeName, String value ) {
    if (value == null || value.trim().isEmpty()) {
        return;
    }
    FieldDTO primitiveField = FieldDTO.createPrimitiveFieldDTO(typeName, value);
    set.add(primitiveField);
}
/**
 * Reads one {@code otherMat} element and registers it as a file on the
 * dataset version.
 * <p>
 * The {@code URI} attribute of the element becomes the storage identifier;
 * {@code labl} supplies the label (file name), {@code txt} the description,
 * and a {@code notes} element of type NOTE_TYPE_CONTENTTYPE the content type
 * (when non-empty). Category notes ("vdc:category", "icpsr:category") are
 * read but not stored anywhere yet. If no usable label was found by the time
 * the closing tag is reached, the label defaults to "harvested file".
 *
 * @param xmlr       stream reader; presumably positioned on the otherMat
 *                   start tag (verify against the caller)
 * @param datasetDTO the dataset whose version receives the new file entry
 * @param filesMap   not used by this method
 * @throws XMLStreamException if the underlying reader fails
 */
private void processOtherMat(XMLStreamReader xmlr, DatasetDTO datasetDTO, Map filesMap) throws XMLStreamException {
    // Create the file list on first use, then register the new entry.
    if (datasetDTO.getDatasetVersion().getFileMetadatas() == null) {
        datasetDTO.getDatasetVersion().setFileMetadatas(new ArrayList<>());
    }
    FileMetadataDTO fmdDTO = new FileMetadataDTO();
    datasetDTO.getDatasetVersion().getFileMetadatas().add(fmdDTO);

    DataFileDTO dfDTO = new DataFileDTO();
    dfDTO.setStorageIdentifier(xmlr.getAttributeValue(null, "URI"));
    fmdDTO.setDataFile(dfDTO);

    // TODO: handle categories; note that multiple categories are allowed in Dataverse 4.
    // Until then the category notes are parsed into these locals and dropped.
    String vdcCategory = null;
    String icpsrCategoryDescription = null;
    String icpsrCategoryId = null;

    int event = xmlr.next();
    while (event != XMLStreamConstants.END_DOCUMENT) {
        if (event == XMLStreamConstants.START_ELEMENT) {
            switch (xmlr.getLocalName()) {
                case "labl":
                    // the label is the file name
                    // TODO: in DVN3 we used to make an attempt to determine
                    // the file type based on the file name.
                    fmdDTO.setLabel(parseText(xmlr));
                    break;
                case "txt":
                    fmdDTO.setDescription(parseText(xmlr));
                    break;
                case "notes":
                    String noteType = xmlr.getAttributeValue(null, "type");
                    if ("vdc:category".equalsIgnoreCase(noteType)) {
                        vdcCategory = parseText(xmlr);
                    } else if ("icpsr:category".equalsIgnoreCase(noteType)) {
                        String subjectType = xmlr.getAttributeValue(null, "subject");
                        if ("description".equalsIgnoreCase(subjectType)) {
                            icpsrCategoryDescription = parseText(xmlr);
                        } else if ("id".equalsIgnoreCase(subjectType)) {
                            icpsrCategoryId = parseText(xmlr);
                        }
                    } else if (NOTE_TYPE_CONTENTTYPE.equalsIgnoreCase(noteType)) {
                        String contentType = parseText(xmlr);
                        if (!StringUtil.isEmpty(contentType)) {
                            dfDTO.setContentType(contentType);
                        }
                    }
                    break;
                default:
                    break;
            }
        } else if (event == XMLStreamConstants.END_ELEMENT && xmlr.getLocalName().equals("otherMat")) {
            // post process: make sure the file has some label
            String currentLabel = fmdDTO.getLabel();
            if (currentLabel == null || currentLabel.trim().isEmpty()) {
                fmdDTO.setLabel("harvested file");
            }
            // TODO: categories
            return;
        }
        event = xmlr.next();
    }
}
/**
 * Extracts the minimal amount of file-level metadata from an ICPSR-supplied
 * DDI. ICPSR uses {@code fileDscr} rather than {@code otherMat} for general
 * file metadata, and the only field they populate is {@code fileName}. -- 4.6
 * <p>
 * A new file entry is registered on the dataset version with the content
 * type "data/various-formats". The cleaned-up {@code fileName} text becomes
 * the label; when the closing {@code fileDscr} tag is reached, a missing or
 * blank label defaults to "harvested file" and an empty storage identifier
 * is replaced with HARVESTED_FILE_STORAGE_PREFIX.
 *
 * @param xmlr       stream reader; presumably positioned on the fileDscr
 *                   start tag (verify against the caller)
 * @param datasetDTO the dataset whose version receives the new file entry
 * @param filesMap   not used by this method
 * @throws XMLStreamException if the underlying reader fails
 */
private void processFileDscrMinimal(XMLStreamReader xmlr, DatasetDTO datasetDTO, Map filesMap) throws XMLStreamException {
    // Create the file list on first use, then register the new entry.
    if (datasetDTO.getDatasetVersion().getFileMetadatas() == null) {
        datasetDTO.getDatasetVersion().setFileMetadatas(new ArrayList<>());
    }
    FileMetadataDTO fmdDTO = new FileMetadataDTO();
    datasetDTO.getDatasetVersion().getFileMetadatas().add(fmdDTO);

    DataFileDTO dfDTO = new DataFileDTO();
    dfDTO.setContentType("data/various-formats"); // reserved ICPSR content type identifier
    fmdDTO.setDataFile(dfDTO);

    int event = xmlr.next();
    while (event != XMLStreamConstants.END_DOCUMENT) {
        if (event == XMLStreamConstants.START_ELEMENT) {
            if (xmlr.getLocalName().equals("fileName")) {
                // this is the file name:
                fmdDTO.setLabel(cleanUpIcpsrFileLabel(parseText(xmlr)));
            }
        } else if (event == XMLStreamConstants.END_ELEMENT && xmlr.getLocalName().equals("fileDscr")) {
            String currentLabel = fmdDTO.getLabel();
            if (currentLabel == null || currentLabel.trim().isEmpty()) {
                fmdDTO.setLabel("harvested file");
            }
            if (StringUtil.isEmpty(fmdDTO.getDataFile().getStorageIdentifier())) {
                fmdDTO.getDataFile().setStorageIdentifier(HARVESTED_FILE_STORAGE_PREFIX);
            }
            return;
        }
        event = xmlr.next();
    }
}

/**
 * Cleans up a raw ICPSR file name so it can be used as a file label: keeps
 * only the text after the last ':' (or just drops the colons when the name
 * ends with one), removes the characters # ; < > ? | * and the double quote,
 * turns '/' into '-', and strips leading spaces and tabs.
 */
private String cleanUpIcpsrFileLabel(String rawName) {
    String cleaned = rawName;
    final int lastColon = cleaned.lastIndexOf(':');
    if (lastColon > -1) {
        cleaned = (lastColon < cleaned.length() - 1)
                ? cleaned.substring(lastColon + 1)
                : cleaned.replaceAll(":", "");
    }
    cleaned = cleaned.replaceAll("[#;<>\\?\\|\\*\"]", "");
    cleaned = cleaned.replaceAll("/", "-");
    // strip leading blanks:
    return cleaned.replaceFirst("^[ \t]*", "");
}
/**
 * Reads one {@code fileDscr} element and registers it on the dataset as a
 * file with an attached data table.
 * <p>
 * Every fileDscr is assumed to describe a subsettable file for now; the idea
 * is to convert it to an "other" file later if no variables reference it
 * -- L.A.
 * <p>
 * {@code fileTxt} children are delegated to processFileTxt; a {@code notes}
 * element of type NOTE_TYPE_UNF supplies the UNF for both the file and the
 * data table; category notes ("vdc:category", "icpsr:category") feed
 * determineFileCategory. When the closing tag is reached, a missing or blank
 * label defaults to "file", the category is set, and - if a DDI file id is
 * known - the file is recorded in {@code filesMap}.
 *
 * @param xmlr       stream reader; presumably positioned on the fileDscr
 *                   start tag, whose {@code ID} attribute is read here
 *                   (verify against the caller)
 * @param datasetDTO the dataset that receives the new file and file metadata
 * @param filesMap   receives, keyed by the DDI file id, a two-element list
 *                   holding the FileMetadataDTO and the DataTableDTO
 * @throws XMLStreamException if the underlying reader fails
 */
private void processFileDscr(XMLStreamReader xmlr, DatasetDTO datasetDTO, Map filesMap) throws XMLStreamException {
    FileMetadataDTO fmdDTO = new FileMetadataDTO();
    // processOtherMat and processFileDscrMinimal both create the list when it
    // is missing; do the same here rather than fail with a NullPointerException.
    if (datasetDTO.getDatasetVersion().getFileMetadatas() == null) {
        datasetDTO.getDatasetVersion().setFileMetadatas(new ArrayList<>());
    }
    datasetDTO.getDatasetVersion().getFileMetadatas().add(fmdDTO);

    DataFileDTO dfDTO = new DataFileDTO();
    DataTableDTO dtDTO = new DataTableDTO();
    dfDTO.getDataTables().add(dtDTO);
    fmdDTO.setDataFile(dfDTO);
    datasetDTO.getDataFiles().add(dfDTO);
    // EMK TODO: ask Gustavo about the file system location property
    // (the URI attribute of this element is currently not stored).

    // The ID on fileDscr wins; an ID found on a nested fileName is only a fallback.
    String ddiFileId = xmlr.getAttributeValue(null, "ID");

    // the following Strings are used to determine the category
    String catName = null;
    String icpsrDesc = null;
    String icpsrId = null;

    for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
        if (event == XMLStreamConstants.START_ELEMENT) {
            if (xmlr.getLocalName().equals("fileTxt")) {
                String tempDDIFileId = processFileTxt(xmlr, fmdDTO, dtDTO);
                ddiFileId = ddiFileId != null ? ddiFileId : tempDDIFileId;
            } else if (xmlr.getLocalName().equals("notes")) {
                String noteType = xmlr.getAttributeValue(null, "type");
                if (NOTE_TYPE_UNF.equalsIgnoreCase(noteType)) {
                    String unf = parseUNF(parseText(xmlr));
                    dfDTO.setUNF(unf);
                    dtDTO.setUnf(unf);
                } else if ("vdc:category".equalsIgnoreCase(noteType)) {
                    catName = parseText(xmlr);
                } else if ("icpsr:category".equalsIgnoreCase(noteType)) {
                    String subjectType = xmlr.getAttributeValue(null, "subject");
                    if ("description".equalsIgnoreCase(subjectType)) {
                        icpsrDesc = parseText(xmlr);
                    } else if ("id".equalsIgnoreCase(subjectType)) {
                        icpsrId = parseText(xmlr);
                    }
                }
            }
        } else if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("fileDscr")) {
                // post process
                if (fmdDTO.getLabel() == null || fmdDTO.getLabel().trim().equals("")) {
                    fmdDTO.setLabel("file");
                }
                fmdDTO.setCategory(determineFileCategory(catName, icpsrDesc, icpsrId));
                if (ddiFileId != null) {
                    // entry layout: [0] file metadata, [1] data table
                    List<Object> filesMapEntry = new ArrayList<>();
                    filesMapEntry.add(fmdDTO);
                    filesMapEntry.add(dtDTO);
                    filesMap.put(ddiFileId, filesMapEntry);
                }
                return;
            }
        }
    }
}
/**
 * Picks the category name for a file.
 * <p>
 * An explicit {@code catName} is returned as-is (even when empty).
 * Otherwise the ICPSR description is used, prefixed with
 * {@code "<icpsrId>. "} when the id is not blank. When neither a category
 * name nor a description is available the result is the empty string.
 *
 * @param catName   explicit category name, may be null
 * @param icpsrDesc ICPSR category description, may be null
 * @param icpsrId   ICPSR category id, may be null
 * @return the category name, never null
 */
private String determineFileCategory(String catName, String icpsrDesc, String icpsrId) {
    if (catName != null) {
        return catName;
    }
    if (icpsrDesc == null) {
        return "";
    }
    boolean hasIcpsrId = icpsrId != null && !icpsrId.trim().isEmpty();
    return hasIcpsrId ? icpsrId + ". " + icpsrDesc : icpsrDesc;
}
/**
 * Reads one {@code fileTxt} element.
 * <p>
 * Sets the label of {@code fmdDTO} from {@code fileName}, its description
 * from {@code fileCont}, and the content type of its data file from
 * {@code fileType} (only when that text is non-empty). A {@code dimensns}
 * child is delegated to processDimensns, which fills {@code dtDTO}.
 * <p>
 * When no {@code fileType} is supplied the content type is left untouched;
 * see the TODO inside the method.
 *
 * @param xmlr   stream reader; presumably positioned on the fileTxt start
 *               tag (verify against the caller)
 * @param fmdDTO file metadata to populate; its data file is read up front
 * @param dtDTO  data table passed on to processDimensns
 * @return the {@code ID} attribute of the last {@code fileName} element
 *         seen, or null when there was none (or it had no such attribute)
 * @throws XMLStreamException if the underlying reader fails
 */
private String processFileTxt(XMLStreamReader xmlr, FileMetadataDTO fmdDTO, DataTableDTO dtDTO) throws XMLStreamException {
    String ddiFileId = null;
    DataFileDTO dfDTO = fmdDTO.getDataFile();
    for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
        if (event == XMLStreamConstants.START_ELEMENT) {
            if (xmlr.getLocalName().equals("fileName")) {
                // Read the attribute first: parseText() presumably moves the
                // reader past this start tag.
                ddiFileId = xmlr.getAttributeValue(null, "ID");
                fmdDTO.setLabel(parseText(xmlr));
            } else if (xmlr.getLocalName().equals("fileType")) {
                String contentType = parseText(xmlr);
                if (!StringUtil.isEmpty(contentType)) {
                    dfDTO.setContentType(contentType);
                }
            } else if (xmlr.getLocalName().equals("fileCont")) {
                fmdDTO.setDescription(parseText(xmlr));
            } else if (xmlr.getLocalName().equals("dimensns")) {
                processDimensns(xmlr, dtDTO);
            }
        } else if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("fileTxt")) {
                // EMK TODO: ask Gustavo & Leonid how the guessed file type
                // should be stored. If the content type is still unknown at
                // this point (no explicit <fileType>), the intended educated
                // guess - this being a subsettable file - was
                // "text/x-fixed-field" when dtDTO.getRecordsPerCase() is set
                // and "text/tab-separated-values" otherwise. That value was
                // only ever computed into an unused local and never applied,
                // so the dead computation has been dropped.
                return ddiFileId;
            }
        }
    }
    return ddiFileId;
}
/**
 * Reads one {@code dimensns} element and copies its counts into
 * {@code dtDTO}: {@code caseQnty} into caseQuantity, {@code varQnty} into
 * varQuantity and {@code recPrCas} into recordsPerCase.
 * <p>
 * Parsing is best effort: a value that is not a valid long is skipped and
 * the corresponding property is left as it was.
 *
 * @param xmlr  stream reader; presumably positioned on the dimensns start
 *              tag (verify against the caller)
 * @param dtDTO the data table that receives the parsed counts
 * @throws XMLStreamException if the underlying reader fails
 */
private void processDimensns(XMLStreamReader xmlr, DataTableDTO dtDTO) throws XMLStreamException {
    for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
        if (event == XMLStreamConstants.START_ELEMENT) {
            if (xmlr.getLocalName().equals("caseQnty")) {
                try {
                    // Long.valueOf replaces the deprecated Long(String) constructor
                    dtDTO.setCaseQuantity(Long.valueOf(parseText(xmlr)));
                } catch (NumberFormatException ignored) {
                    // not a number: leave caseQuantity unset
                }
            } else if (xmlr.getLocalName().equals("varQnty")) {
                try {
                    dtDTO.setVarQuantity(Long.valueOf(parseText(xmlr)));
                } catch (NumberFormatException ignored) {
                    // not a number: leave varQuantity unset
                }
            } else if (xmlr.getLocalName().equals("recPrCas")) {
                try {
                    dtDTO.setRecordsPerCase(Long.valueOf(parseText(xmlr)));
                } catch (NumberFormatException ignored) {
                    // not a number: leave recordsPerCase unset
                }
            }
        } else if (event == XMLStreamConstants.END_ELEMENT) {
            if (xmlr.getLocalName().equals("dimensns")) {
                return;
            }
        }
    }
}
}