/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package edu.harvard.iq.dataverse.datavariable;
import edu.harvard.iq.dataverse.DataTable;
import edu.harvard.iq.dataverse.util.StringUtil;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
/**
 * Imports the variable-level metadata found in the {@code <dataDscr>} section
 * of a DDI document. The document is read through a StAX
 * {@link XMLStreamReader}; every {@code <var>} element becomes a
 * {@link DataVariable} that is attached to a {@link DataTable} selected by the
 * {@code fileid} attribute of the variable's {@code <location>} element.
 *
 * @author Leonid Andreev
 */
public class DataTableImportDDI {

    // Values of the "intrvl" attribute of <var>:
    public static final String VAR_INTERVAL_DISCRETE = "discrete";
    public static final String VAR_INTERVAL_CONTIN = "contin";
    public static final String VAR_INTERVAL_NOMINAL = "nominal";
    public static final String VAR_INTERVAL_DICHOTOMOUS = "dichotomous";

    // Values of the "type" attribute of <varFormat>:
    public static final String VAR_TYPE_NUMERIC = "numeric";
    public static final String VAR_TYPE_CHARACTER = "character";

    // Value of the "wgt" attribute of <var> that marks a weighted variable:
    public static final String VAR_WEIGHTED = "wgtd";

    // Values of the "level" attribute of <labl>:
    public static final String LEVEL_VARIABLE = "variable";
    public static final String LEVEL_CATEGORY = "category";

    // Value of the "type" attribute of <catStat> that denotes a frequency:
    public static final String CAT_STAT_TYPE_FREQUENCY = "freq";

    // Value of the "type" attribute of the <notes> element that carries the UNF:
    public static final String NOTE_TYPE_UNF = "VDC:UNF";

    /**
     * Method processDataDscr takes XMLStreamReader xmlr that has just
     * encountered the DDI tag {@code <dataDscr>}, processes all the variables
     * and returns a Map of DataTables mapped by the strings found in the
     * "location" attributes of the variables. The DataTables from the
     * Map will need to be linked to the corresponding DataFiles by these
     * file ids. The DataVariable objects found in this dataDscr section
     * have already been linked to the corresponding DataTables in the Map.
     * -- L.A. 4.0 beta 9
     *
     * NOTE(review): this method is private and is not called from anywhere
     * inside this class; confirm that the visibility is what is intended.
     *
     * @param xmlr reader positioned on the {@code <dataDscr>} start tag
     * @return the DataTables keyed by file id, or {@code null} when the end
     *         of the document is reached before {@code </dataDscr>}
     * @throws XMLStreamException if the underlying reader fails or a nested
     *         element cannot be processed
     */
    private Map<String, DataTable> processDataDscr(XMLStreamReader xmlr) throws XMLStreamException {
        Map<String, DataTable> dataTablesMap = new HashMap<>();
        Map<String, Integer> varsPerFileMap = new HashMap<>();

        for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
            if (event == XMLStreamConstants.START_ELEMENT) {
                if ("var".equals(xmlr.getLocalName())) {
                    processVar(xmlr, dataTablesMap, varsPerFileMap);
                }
            } else if (event == XMLStreamConstants.END_ELEMENT
                    && "dataDscr".equals(xmlr.getLocalName())) {
                for (String fileId : dataTablesMap.keySet()) {
                    Integer numberOfVariables = varsPerFileMap.get(fileId);
                    if (numberOfVariables == null || numberOfVariables <= 0) {
                        // TODO:
                        // the studyfile needs to be converted
                        // from TabularFile to OtherFile; i.e., it should
                        // be treated as non-subsettable, if there are
                        // no variables in the <dataDscr> section of the
                        // DDI referencing the file.
                        // This actually happens in real life. For example,
                        // Roper puts some of their files into the <fileDscr>
                        // section, even though there's no <dataDscr>
                        // provided for them.
                        // -- L.A.
                        // TODO: confirm that this works under 4.0 as is
                        // -- L.A. 4.0 beta 9
                    }
                    // Otherwise we have found variables for this data file
                    // entry and there is nothing else to do.
                }
                return dataTablesMap;
            }
        }
        // End of document reached without seeing </dataDscr>.
        return null;
    }

    /**
     * Processes one {@code <var>} element: creates a DataVariable, fills it
     * from the element's attributes and child elements, and returns once the
     * matching {@code </var>} end tag (or the end of the document) is reached.
     *
     * @param xmlr reader positioned on the {@code <var>} start tag
     * @param dataTablesMap DataTables found so far, keyed by file id
     * @param varsPerFileMap number of variables found so far, keyed by file id
     * @throws XMLStreamException if the underlying reader fails or a child
     *         element cannot be processed
     */
    private void processVar(XMLStreamReader xmlr, Map<String, DataTable> dataTablesMap,
            Map<String, Integer> varsPerFileMap) throws XMLStreamException {
        DataVariable dv = new DataVariable();
        dv.setInvalidRanges(new ArrayList<VariableRange>());
        dv.setSummaryStatistics(new ArrayList<SummaryStatistic>());
        dv.setCategories(new ArrayList<VariableCategory>());

        dv.setName(xmlr.getAttributeValue(null, "name"));

        // A missing or non-numeric "dcml" attribute is simply not recorded.
        Long decimalPoints = parseLongAttribute(xmlr, "dcml");
        if (decimalPoints != null) {
            dv.setNumberOfDecimalPoints(decimalPoints);
        }

        // interval type (DB value may be different than DDI value)
        String interval = xmlr.getAttributeValue(null, "intrvl");
        if (VAR_INTERVAL_CONTIN.equals(interval)) {
            dv.setIntervalContinuous();
        } else if (VAR_INTERVAL_NOMINAL.equals(interval)) {
            dv.setIntervalNominal();
        } else if (VAR_INTERVAL_DICHOTOMOUS.equals(interval)) {
            dv.setIntervalDichotomous();
        } else {
            // default is discrete
            dv.setIntervalDiscrete();
        }

        // default is not-wgtd, so null sets weighted to false
        dv.setWeighted(VAR_WEIGHTED.equals(xmlr.getAttributeValue(null, "wgt")));

        for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
            if (event == XMLStreamConstants.START_ELEMENT) {
                switch (xmlr.getLocalName()) {
                    case "location":
                        processLocation(xmlr, dv, dataTablesMap, varsPerFileMap);
                        break;
                    case "labl":
                        String label = processLabl(xmlr, LEVEL_VARIABLE);
                        if (label != null && !label.isEmpty()) {
                            dv.setLabel(label);
                        }
                        break;
                    case "universe":
                        dv.setUniverse(parseText(xmlr));
                        break;
                    case "invalrng":
                        processInvalrng(xmlr, dv);
                        break;
                    case "varFormat":
                        processVarFormat(xmlr, dv);
                        break;
                    case "sumStat":
                        processSumStat(xmlr, dv);
                        break;
                    case "catgry":
                        processCatgry(xmlr, dv);
                        break;
                    case "notes":
                        String note = parseNoteByType(xmlr, NOTE_TYPE_UNF);
                        if (note != null && !note.isEmpty()) {
                            dv.setUnf(parseUNF(note));
                        }
                        break;
                    default:
                        // any other child element is ignored
                        break;
                }
            } else if (event == XMLStreamConstants.END_ELEMENT
                    && "var".equals(xmlr.getLocalName())) {
                return;
            }
        }
    }

    /**
     * Processes a {@code <location>} element: records the start/end positions
     * and the record segment number on the variable and, when the element
     * carries a non-empty {@code fileid}, links the variable to the DataTable
     * registered under that id (creating the DataTable on first use) and
     * assigns the variable its order within that table.
     *
     * @param xmlr reader positioned on the {@code <location>} start tag
     * @param dv variable being populated
     * @param dataTablesMap DataTables found so far, keyed by file id
     * @param varsPerFileMap number of variables found so far, keyed by file id
     * @throws XMLStreamException if the variable is already linked to a
     *         DataTable when this element is encountered
     */
    private void processLocation(XMLStreamReader xmlr, DataVariable dv,
            Map<String, DataTable> dataTablesMap, Map<String, Integer> varsPerFileMap)
            throws XMLStreamException {
        // fileStartPos, FileEndPos, and RecSegNo
        // if these fields don't convert to Long, just leave blank
        Long startPosition = parseLongAttribute(xmlr, "StartPos");
        if (startPosition != null) {
            dv.setFileStartPosition(startPosition);
        }
        Long endPosition = parseLongAttribute(xmlr, "EndPos");
        if (endPosition != null) {
            dv.setFileEndPosition(endPosition);
        }
        Long recordSegment = parseLongAttribute(xmlr, "RecSegNo");
        if (recordSegment != null) {
            dv.setRecordSegmentNumber(recordSegment);
        }

        if (dv.getDataTable() != null) {
            // NOTE(review): this exception is raised when the variable is
            // already linked to a DataTable, although its message talks about
            // an empty location attribute; a <location> whose "fileid" is
            // missing or empty is silently ignored below. This looks like a
            // misplaced "else", but the behaviour is kept as is because
            // changing it could make currently accepted documents fail.
            // Confirm which behaviour is intended.
            throw new XMLStreamException("Empty or NULL location attribute in a variable section.");
        }

        String fileId = xmlr.getAttributeValue(null, "fileid");
        if (fileId == null || fileId.isEmpty()) {
            return;
        }

        DataTable dataTable = dataTablesMap.get(fileId);
        if (dataTable == null) {
            // first variable seen for this file id
            dataTable = new DataTable();
            dataTablesMap.put(fileId, dataTable);
            varsPerFileMap.put(fileId, 0);
        }

        dv.setDataTable(dataTable);
        if (dataTable.getDataVariables() == null) {
            dataTable.setDataVariables(new ArrayList<DataVariable>());
        }
        dataTable.getDataVariables().add(dv);

        // The variable's order in the file is the number of variables
        // registered for that file before it.
        int filePosition = varsPerFileMap.get(fileId);
        dv.setFileOrder(filePosition);
        varsPerFileMap.put(fileId, filePosition + 1);
    }

    /**
     * Reads an attribute of the current element as a Long.
     *
     * @param xmlr reader positioned on a start tag
     * @param attributeName local name of the attribute to read
     * @return the parsed value, or {@code null} when the attribute is absent
     *         or is not a valid long
     */
    private Long parseLongAttribute(XMLStreamReader xmlr, String attributeName) {
        String value = xmlr.getAttributeValue(null, attributeName);
        if (value == null) {
            return null;
        }
        try {
            return Long.valueOf(value);
        } catch (NumberFormatException ignored) {
            // these attributes are optional: an unparseable value is treated
            // the same way as a missing one
            return null;
        }
    }

    /**
     * Processes an {@code <invalrng>} element, adding one VariableRange to the
     * variable's invalid ranges for every {@code <item>} and {@code <range>}
     * child found before the matching end tag.
     *
     * @param xmlr reader positioned on the {@code <invalrng>} start tag
     * @param dv variable being populated
     * @throws XMLStreamException if the underlying reader fails
     */
    private void processInvalrng(XMLStreamReader xmlr, DataVariable dv) throws XMLStreamException {
        for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
            if (event == XMLStreamConstants.START_ELEMENT) {
                String elementName = xmlr.getLocalName();
                if ("item".equals(elementName)) {
                    // a single invalid value
                    VariableRange range = new VariableRange();
                    dv.getInvalidRanges().add(range);
                    range.setDataVariable(dv);

                    range.setBeginValue(xmlr.getAttributeValue(null, "VALUE"));
                    range.setBeginValueTypePoint();
                } else if ("range".equals(elementName)) {
                    VariableRange range = new VariableRange();
                    dv.getInvalidRanges().add(range);
                    range.setDataVariable(dv);

                    String min = xmlr.getAttributeValue(null, "min");
                    String minExclusive = xmlr.getAttributeValue(null, "minExclusive");
                    String max = xmlr.getAttributeValue(null, "max");
                    String maxExclusive = xmlr.getAttributeValue(null, "maxExclusive");

                    // the inclusive bound wins when both kinds are supplied
                    if (!StringUtil.isEmpty(min)) {
                        range.setBeginValue(min);
                        range.setBeginValueTypeMin();
                    } else if (!StringUtil.isEmpty(minExclusive)) {
                        range.setBeginValue(minExclusive);
                        range.setBeginValueTypeMinExcl();
                    }

                    if (!StringUtil.isEmpty(max)) {
                        range.setEndValue(max);
                        range.setEndValueTypeMax();
                    } else if (!StringUtil.isEmpty(maxExclusive)) {
                        range.setEndValue(maxExclusive);
                        range.setEndValueTypeMaxExcl();
                    }
                }
            } else if (event == XMLStreamConstants.END_ELEMENT
                    && "invalrng".equals(xmlr.getLocalName())) {
                return;
            }
        }
    }

    /**
     * Processes a {@code <varFormat>} element: sets the variable's type
     * (character or numeric), format name and format category.
     *
     * @param xmlr reader positioned on the {@code <varFormat>} start tag
     * @param dv variable being populated
     * @throws XMLStreamException if the element text cannot be read
     */
    private void processVarFormat(XMLStreamReader xmlr, DataVariable dv) throws XMLStreamException {
        // All attributes are read before parseText() moves the reader off the
        // start tag.
        if (VAR_TYPE_CHARACTER.equals(xmlr.getAttributeValue(null, "type"))) {
            dv.setTypeCharacter();
        } else {
            dv.setTypeNumeric(); // default is numeric
        }

        dv.setFormat(xmlr.getAttributeValue(null, "formatname"));
        String varFormatCategoryAtt = xmlr.getAttributeValue(null, "category");
        String varFormatText = parseText(xmlr);

        /*
         * A somewhat hackish way of recognizing "boolean" variables;
         * This is not a universally accepted convention - we (the DVN team)
         * simply decided to handle it this way. Booleans are treated simply
         * as categorical variables with integers 0 and 1 for the values, and
         * "FALSE" and "TRUE" for the labels. On top of that, we make a note
         * of the variable's "booleanness", in the DDI, like this:
         * <varFormat ...>Boolean</varFormat>
         * and in the database, by setting the value of dv.formatCategory to
         * "Boolean".
         * This information isn't used much in the application (as of May, 2013),
         * except in the subsetting: when the column is subset and re-imported
         * into an R data frame, we'll convert it into a logical vector.
         * TODO:
         * Add this to the export end! --L.A.
         */
        if ("Boolean".equalsIgnoreCase(varFormatText)) {
            dv.setFormatCategory("Boolean");
        } else {
            dv.setFormatCategory(varFormatCategoryAtt);
        }
    }

    /**
     * Processes a {@code <sumStat>} element, adding a SummaryStatistic built
     * from its "type" attribute and text content to the variable.
     *
     * @param xmlr reader positioned on the {@code <sumStat>} start tag
     * @param dv variable being populated
     * @throws XMLStreamException if the element text cannot be read
     */
    private void processSumStat(XMLStreamReader xmlr, DataVariable dv) throws XMLStreamException {
        SummaryStatistic ss = new SummaryStatistic();
        // the attribute has to be read before parseText() advances the reader
        ss.setTypeByLabel(xmlr.getAttributeValue(null, "type"));
        ss.setValue(parseText(xmlr));
        ss.setDataVariable(dv);
        dv.getSummaryStatistics().add(ss);
    }

    /**
     * Processes a {@code <catgry>} element, adding a VariableCategory with its
     * label, value, frequency and (optional) order to the variable.
     *
     * @param xmlr reader positioned on the {@code <catgry>} start tag
     * @param dv variable being populated
     * @throws XMLStreamException if the underlying reader fails, or if a
     *         frequency {@code <catStat>} does not contain a valid number
     */
    private void processCatgry(XMLStreamReader xmlr, DataVariable dv) throws XMLStreamException {
        VariableCategory cat = new VariableCategory();
        // default is N, so null sets missing to false
        cat.setMissing("Y".equals(xmlr.getAttributeValue(null, "missing")));
        cat.setDataVariable(dv);

        if (dv.getCategories() == null || dv.getCategories().isEmpty()) {
            // if this is the first category we encounter, we'll assume that this
            // categorical data/"factor" variable is ordered.
            // But we'll switch it back to unordered later, if we encounter
            // *any* categories with no order attribute defined.
            dv.setOrderedCategorical(true);
        }

        // Process extra level order values, if available;
        // Currently (as of 3.6) only available in R Data ingests.
        // TODO:
        // revisit this (for 4.0) - we've discussed encoding this order
        // simply by the order in which the categories appear in the
        // DDI. (-- L.A. 4.0 beta 9)
        Integer orderValue = null;
        String order = xmlr.getAttributeValue(null, "order");
        if (order != null) {
            try {
                orderValue = Integer.valueOf(order);
            } catch (NumberFormatException ignored) {
                // an unparseable order is treated as if it were not supplied
                orderValue = null;
            }
        }

        if (orderValue != null && orderValue >= 0) {
            cat.setOrder(orderValue);
        } else if (!cat.isMissing()) {
            // Every category of an ordered categorical ("factor") variable
            // must have the order rank defined. Which means that if we
            // encounter a single NON-MISSING category with no ordered attribute, it
            // will be processed as un-ordered.
            dv.setOrderedCategorical(false);
        }

        dv.getCategories().add(cat);

        for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
            if (event == XMLStreamConstants.START_ELEMENT) {
                String elementName = xmlr.getLocalName();
                if ("labl".equals(elementName)) {
                    String label = processLabl(xmlr, LEVEL_CATEGORY);
                    if (label != null && !label.isEmpty()) {
                        cat.setLabel(label);
                    }
                } else if ("catValu".equals(elementName)) {
                    // category values are stored verbatim (no scrubbing)
                    cat.setValue(parseText(xmlr, false));
                } else if ("catStat".equals(elementName)) {
                    String type = xmlr.getAttributeValue(null, "type");
                    // a catStat without a type is treated as a frequency
                    if (type == null || CAT_STAT_TYPE_FREQUENCY.equalsIgnoreCase(type)) {
                        String frequency = parseText(xmlr);
                        if (frequency != null && !frequency.isEmpty()) {
                            try {
                                cat.setFrequency(Double.valueOf(frequency));
                            } catch (NumberFormatException nfe) {
                                // Report malformed input through the declared
                                // parsing exception, with the position and the
                                // original cause, rather than letting an
                                // unchecked exception escape without context.
                                throw new XMLStreamException(
                                        "Invalid category frequency value: " + frequency,
                                        xmlr.getLocation(), nfe);
                            }
                        }
                    }
                }
            } else if (event == XMLStreamConstants.END_ELEMENT
                    && "catgry".equals(xmlr.getLocalName())) {
                return;
            }
        }
    }

    /**
     * Returns the text of the current {@code <labl>} element when its "level"
     * attribute matches the requested level (case-insensitively).
     *
     * @param xmlr reader positioned on the {@code <labl>} start tag
     * @param level the level to match against the "level" attribute
     * @return the scrubbed element text, or {@code null} (with the reader left
     *         on the start tag) when the level does not match
     * @throws XMLStreamException if the element text cannot be read
     */
    private String processLabl(XMLStreamReader xmlr, String level) throws XMLStreamException {
        if (!level.equalsIgnoreCase(xmlr.getAttributeValue(null, "level"))) {
            return null;
        }
        return parseText(xmlr);
    }

    /**
     * Returns the text of the current note element when its "type" attribute
     * matches the requested type (case-insensitively).
     *
     * @param xmlr reader positioned on the note's start tag
     * @param type the type to match against the "type" attribute
     * @return the scrubbed element text, or {@code null} (with the reader left
     *         on the start tag) when the type does not match
     * @throws XMLStreamException if the element text cannot be read
     */
    private String parseNoteByType(XMLStreamReader xmlr, String type) throws XMLStreamException {
        if (!type.equalsIgnoreCase(xmlr.getAttributeValue(null, "type"))) {
            return null;
        }
        return parseText(xmlr);
    }

    /**
     * Extracts the UNF signature from a note.
     *
     * @param unfString the text of the note
     * @return the part of the text starting at the first occurrence of
     *         "UNF:", or {@code null} when the text does not contain it
     */
    private String parseUNF(String unfString) {
        int unfStart = unfString.indexOf("UNF:");
        return unfStart == -1 ? null : unfString.substring(unfStart);
    }

    /**
     * Reads the text content of the current element, scrubbed as described in
     * {@link #parseText(XMLStreamReader, boolean)}.
     *
     * @param xmlr reader positioned on a start tag
     * @return the scrubbed text content of the element
     * @throws XMLStreamException if the element text cannot be read
     */
    private String parseText(XMLStreamReader xmlr) throws XMLStreamException {
        return parseText(xmlr, true);
    }

    /**
     * Reads the text content of the current element.
     *
     * @param xmlr reader positioned on a start tag
     * @param scrubText when true, the text is trimmed and every remaining
     *        newline character is replaced with a space
     * @return the text content of the element
     * @throws XMLStreamException if the element text cannot be read
     */
    private String parseText(XMLStreamReader xmlr, boolean scrubText) throws XMLStreamException {
        String text = getElementText(xmlr);
        if (scrubText) {
            text = text.trim().replace('\n', ' ');
        }
        return text;
    }

    /* We had to add this method because the ref getElementText has a bug where it
     * would append a null before the text, if there was an escaped apostrophe; it appears
     * that the code finds a null ENTITY_REFERENCE in this case which seems like a bug;
     * the workaround for the moment is to skip ENTITY_REFERENCE events in this case
     */
    /*
     * TODO: do we still need this method? ( -- L.A. 4.0 beta 9)
     */
    /**
     * Collects the character content of the current element, leaving the
     * reader on the element's end tag.
     *
     * @param xmlr reader positioned on a start tag
     * @return the concatenated CHARACTERS, CDATA and SPACE content; processing
     *         instructions, comments and entity references are skipped
     * @throws XMLStreamException if the reader is not on a start tag, if a
     *         nested start tag or an unexpected event is found, or if the
     *         document ends before the element is closed
     */
    private String getElementText(XMLStreamReader xmlr) throws XMLStreamException {
        if (xmlr.getEventType() != XMLStreamConstants.START_ELEMENT) {
            throw new XMLStreamException("parser must be on START_ELEMENT to read next text", xmlr.getLocation());
        }

        StringBuilder content = new StringBuilder();
        for (int eventType = xmlr.next();
                eventType != XMLStreamConstants.END_ELEMENT;
                eventType = xmlr.next()) {
            switch (eventType) {
                case XMLStreamConstants.CHARACTERS:
                case XMLStreamConstants.CDATA:
                case XMLStreamConstants.SPACE:
                    content.append(xmlr.getText());
                    break;
                case XMLStreamConstants.PROCESSING_INSTRUCTION:
                case XMLStreamConstants.COMMENT:
                case XMLStreamConstants.ENTITY_REFERENCE:
                    // skipping
                    break;
                case XMLStreamConstants.END_DOCUMENT:
                    throw new XMLStreamException("unexpected end of document when reading element text content");
                case XMLStreamConstants.START_ELEMENT:
                    throw new XMLStreamException("element text content may not contain START_ELEMENT", xmlr.getLocation());
                default:
                    throw new XMLStreamException("Unexpected event type " + eventType, xmlr.getLocation());
            }
        }
        return content.toString();
    }
}