/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.avro.xml; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.math.BigDecimal; import java.math.BigInteger; import java.net.URL; import java.util.ArrayList; import java.util.Calendar; import java.util.HashMap; import java.util.List; import java.util.concurrent.TimeUnit; import javax.xml.bind.DatatypeConverter; import javax.xml.datatype.DatatypeFactory; import javax.xml.datatype.Duration; import javax.xml.namespace.QName; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import javax.xml.transform.stream.StreamSource; import org.apache.avro.Schema; import org.apache.avro.io.DatumWriter; import org.apache.avro.io.Encoder; import org.apache.ws.commons.schema.XmlSchemaAttribute; import org.apache.ws.commons.schema.XmlSchemaCollection; import org.apache.ws.commons.schema.XmlSchemaElement; import org.apache.ws.commons.schema.constants.Constants; import org.apache.ws.commons.schema.docpath.SaxWalkerOverDom; import org.apache.ws.commons.schema.docpath.XmlSchemaDocumentNode; import org.apache.ws.commons.schema.docpath.XmlSchemaNamespaceContext; import 
org.apache.ws.commons.schema.docpath.XmlSchemaPathFinder;
import org.apache.ws.commons.schema.docpath.XmlSchemaPathNode;
import org.apache.ws.commons.schema.docpath.XmlSchemaStateMachineGenerator;
import org.apache.ws.commons.schema.docpath.XmlSchemaStateMachineNode;
import org.apache.ws.commons.schema.walker.XmlSchemaAttrInfo;
import org.apache.ws.commons.schema.walker.XmlSchemaBaseSimpleType;
import org.apache.ws.commons.schema.walker.XmlSchemaTypeInfo;
import org.apache.ws.commons.schema.walker.XmlSchemaWalker;
import org.w3c.dom.Document;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Reads an XML {@link Document} and writes it to an {@link Encoder}.
 * <p>
 * Generates an Avro {@link Schema} on the fly from the XML Schema itself.
 * That {@link Schema} can be retrieved by calling {@link #getSchema()}.
 * </p>
 */
public class XmlDatumWriter implements DatumWriter<Document> {

  /** Qualified name of the <code>xsi:nil</code> attribute. */
  private static final QName NIL_ATTR =
      new QName("http://www.w3.org/2001/XMLSchema-instance", "nil");

  private final XmlSchemaCollection xmlSchemaCollection;
  private final XmlSchemaStateMachineNode stateMachine;
  private Schema schema;

  /**
   * One open element on the {@link Writer}'s element stack, along
   * with whether anything has been written for its content yet.
   */
  private static class StackEntry {
    XmlSchemaDocumentNode<AvroRecordInfo> docNode;
    boolean receivedContent;

    StackEntry(XmlSchemaDocumentNode<AvroRecordInfo> docNode) {
      this.docNode = docNode;
      this.receivedContent = false;
    }
  }

  /**
   * SAX handler that follows a previously-computed path through the
   * XML Schema while the document is replayed, writing each element,
   * attribute, and piece of content to the {@link Encoder}.
   */
  private static class Writer extends DefaultHandler {

    private static final XmlSchemaTypeInfo XML_MIXED_CONTENT_TYPE =
        new XmlSchemaTypeInfo(XmlSchemaBaseSimpleType.STRING);

    private static final Schema AVRO_MIXED_CONTENT_SCHEMA =
        Schema.create(Schema.Type.STRING);

    // Current position in the path; advanced as SAX events arrive.
    private XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> currLocation;

    // Accumulates character data when content arrives in several chunks.
    private StringBuilder content;

    // Non-null while inside a wildcard (any) element, which is skipped.
    private QName currAnyElem;

    private ArrayList<StackEntry> stack;

    private final XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> path;
    private final Encoder out;
    private final XmlSchemaNamespaceContext nsContext;

    /**
     * @param path The start of the path through the schema for the document.
     * @param out  The encoder to write the document to.
     */
    Writer(
        XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> path,
        Encoder out) {

      this.path = path;
      this.out = out;

      nsContext = new XmlSchemaNamespaceContext();
      stack = new ArrayList<StackEntry>();
      currLocation = null;
      content = null;
      currAnyElem = null;
    }

    /** Resets the current location to the start of the path. */
    @Override
    public void startDocument() throws SAXException {
      currLocation = path;
    }

    /** Records the prefix mapping; it is used later to resolve QName values. */
    @Override
    public void startPrefixMapping(String prefix, String uri)
        throws SAXException {

      nsContext.addNamespace(prefix, uri);
    }

    /** Removes the prefix mapping from the namespace context. */
    @Override
    public void endPrefixMapping(String prefix) throws SAXException {
      nsContext.removeNamespace(prefix);
    }

    /**
     * Advances the path to the element being started, then writes the
     * record/map preamble, the element's attributes, and (when it can
     * already be determined) the start of its children or a null value.
     *
     * @throws IllegalStateException If the path does not line up with
     *                               the element being started.
     * @throws RuntimeException      If the element cannot be written.
     */
    @Override
    public void startElement(
        String uri,
        String localName,
        String qName,
        Attributes atts) throws SAXException {

      if (currAnyElem != null) {
        // We are inside an any element and not processing this one.
        return;
      }

      final QName elemQName = new QName(uri, localName);
      walkToElement(elemQName);

      if (!currLocation
             .getDirection()
             .equals(XmlSchemaPathNode.Direction.CHILD)
          && !currLocation
                .getDirection()
                .equals(XmlSchemaPathNode.Direction.SIBLING)) {
        throw new IllegalStateException(
            "We are starting an element, so our path node direction should be "
            + "to a CHILD or SIBLING, not "
            + currLocation.getDirection());
      }

      if (currLocation
            .getStateMachineNode()
            .getNodeType()
            .equals(XmlSchemaStateMachineNode.Type.ANY)) {

        // This is an any element; we are not processing it.
        currAnyElem = elemQName;
        return;
      }

      try {
        final XmlSchemaDocumentNode<AvroRecordInfo> doc =
            currLocation.getDocumentNode();
        final AvroRecordInfo recordInfo = doc.getUserDefinedContent();

        Schema avroSchema = recordInfo.getAvroSchema();

        final List<XmlSchemaAttrInfo> attributes =
            doc.getStateMachineNode().getAttributes();

        // Index the schema's attributes by name for the field lookups below.
        final HashMap<String, XmlSchemaTypeInfo> attrTypes =
            new HashMap<String, XmlSchemaTypeInfo>();

        final HashMap<String, XmlSchemaAttribute> schemaAttrs =
            new HashMap<String, XmlSchemaAttribute>();

        for (XmlSchemaAttrInfo attribute : attributes) {
          attrTypes.put(
              attribute.getAttribute().getName(),
              attribute.getType());

          schemaAttrs.put(
              attribute.getAttribute().getName(),
              attribute.getAttribute());
        }

        // If there are children, we want to start an array and end it later.
        final StackEntry entry =
            new StackEntry(currLocation.getDocumentNode());

        if (avroSchema.getType().equals(Schema.Type.RECORD)) {
          if ( !stack.isEmpty() ) {
            out.startItem();
          }

          if (recordInfo.getUnionIndex() >= 0) {
            out.writeIndex( recordInfo.getUnionIndex() );
          }

        } else if ( avroSchema.getType().equals(Schema.Type.MAP) ) {
          final AvroPathNode mapNode = currLocation.getUserDefinedContent();

          if (mapNode == null) {
            throw new IllegalStateException(
                "Reached "
                + elemQName
                + ", a MAP node, but there is no map information here.");
          }

          switch ( mapNode.getType() ) {
          case MAP_START:
            {
              if ( !stack.isEmpty() ) {
                out.startItem();
              }

              if (recordInfo.getUnionIndex() >= 0) {
                out.writeIndex( recordInfo.getUnionIndex() );
              }

              out.writeMapStart();
              out.setItemCount( mapNode.getMapSize() );
            }
            // Falls through: the element starting the map is also its first item.
          case ITEM_START:
            {
              out.startItem();

              avroSchema = avroSchema.getValueType();

              /* If the MAP value is another UNION, reach
               * into that one to fetch the schema.
               */
              final int mapUnionIndex = recordInfo.getMapUnionIndex();
              if (mapUnionIndex >= 0) {
                avroSchema = avroSchema.getTypes().get(mapUnionIndex);
              }

              // The map key is the value of the attribute of type xsd:ID.
              String key = null;

              for (int fieldIndex = 0;
                  fieldIndex < avroSchema.getFields().size() - 1;
                  ++fieldIndex) {

                final Schema.Field field =
                    avroSchema.getFields().get(fieldIndex);
                final XmlSchemaTypeInfo attrType =
                    attrTypes.get(field.name());
                final XmlSchemaAttribute xsa =
                    schemaAttrs.get(field.name());

                if ((attrType.getUserRecognizedType() != null)
                    && attrType
                         .getUserRecognizedType()
                         .equals(Constants.XSD_ID)) {

                  key =
                      getAttrValue(
                          atts,
                          xsa.getQName().getNamespaceURI(),
                          field.name());

                  if (key == null) {
                    throw new IllegalStateException(
                        "Attribute value for "
                        + xsa.getQName()
                        + " of element "
                        + elemQName
                        + " is null.");
                  }
                  break;
                }
              }

              if (key == null) {
                throw new IllegalStateException(
                    "Unable to find key for element " + elemQName);
              }

              out.writeString(key);

              /* If the MAP value is another UNION, write
               * the union index before continuing.
               */
              if (mapUnionIndex >= 0) {
                out.writeIndex(mapUnionIndex);
              }

              break;
            }
          case MAP_END:
          case CONTENT:
          default:
            throw new IllegalStateException(
                "Did not expect to find a map node of type "
                + mapNode.getType()
                + " when starting "
                + elemQName
                + ".");
          }

        } else {
          throw new IllegalStateException(
              "Elements are either MAPs or RECORDs, not "
              + avroSchema.getType()
              + "s.");
        }

        /* The last element in the set of fields is the children.  We want
         * to process the children separately as they require future calls
         * to characters() and/or startElement().
         */
        for (int fieldIndex = 0;
            fieldIndex < avroSchema.getFields().size() - 1;
            ++fieldIndex) {

          final Schema.Field field = avroSchema.getFields().get(fieldIndex);
          if (field.name().equals(elemQName.getLocalPart())) {
            // We reached the children field early ... not supposed to happen!
            throw new IllegalStateException(
                "The children field is indexed at "
                + fieldIndex
                + " when it was expected to be the last element, or "
                + (avroSchema.getFields().size() - 1)
                + ".");
          }

          final XmlSchemaTypeInfo typeInfo = attrTypes.get( field.name() );
          final QName attrQName = schemaAttrs.get( field.name() ).getQName();

          String value =
              getAttrValue(atts, attrQName.getNamespaceURI(), field.name());

          if (value == null) {
            // See if there is a default or fixed value instead.
            final XmlSchemaAttribute schemaAttr =
                schemaAttrs.get( field.name() );
            value = schemaAttr.getDefaultValue();
            if (value == null) {
              value = schemaAttr.getFixedValue();
            }
          }

          try {
            write(typeInfo, attrQName, field.schema(), value);
          } catch (Exception e) {
            throw new RuntimeException(
                "Could not write "
                + field.name()
                + " in "
                + field.schema().toString()
                + " to the output stream for element "
                + elemQName,
                e);
          }
        }

        final XmlSchemaTypeInfo elemType =
            doc.getStateMachineNode().getElementType();

        final boolean isComplexType =
            elemType.getType().equals(XmlSchemaTypeInfo.Type.COMPLEX);

        // The field named after the element holds its children / content.
        final Schema childSchema =
            avroSchema.getField( elemQName.getLocalPart() ).schema();

        if (childSchema.getType().equals(Schema.Type.ARRAY)
            && isComplexType) {

          out.writeArrayStart();

          if (recordInfo.getNumChildren() > 0) {
            out.setItemCount( recordInfo.getNumChildren() );
          } else {
            out.setItemCount(0);
          }

          /* We expect to receive child elements; no need to look
           * for a default or fixed value once this element exits.
           */
          entry.receivedContent = true;

        } else if ( childSchema.getType().equals(Schema.Type.NULL) ) {
          out.writeNull();
          entry.receivedContent = true;

        } else {
          // An element explicitly marked xsi:nil="true" is written as null.
          final int nilIndex =
              atts.getIndex(
                  NIL_ATTR.getNamespaceURI(),
                  NIL_ATTR.getLocalPart());

          if ((nilIndex >= 0)
              && Boolean.parseBoolean(atts.getValue(nilIndex))) {

            write(elemType, elemQName, childSchema, null);
            entry.receivedContent = true;
          }
        }

        stack.add(entry);

      } catch (Exception e) {
        throw new RuntimeException(
            "Unable to write " + elemQName + " to the output stream.",
            e);
      }
    }

    /**
     * Handles character data for the element on top of the stack.
     * Content that arrives in several chunks is buffered and written
     * once the path shows no more content for the owning element.
     *
     * @throws SAXException If content arrives with no open element, if
     *                      non-whitespace content arrives where the path
     *                      expects none, or if the path ends unexpectedly.
     */
    @Override
    public void characters(char[] ch, int start, int length)
        throws SAXException {

      if (currAnyElem != null) {
        // We do not process wildcard elements.
        return;
      }

      if (stack.isEmpty()) {
        throw new SAXException(
            "We are processing content, but the element stack is empty!");
      }

      final XmlSchemaDocumentNode<AvroRecordInfo> owningElem =
          stack.get(stack.size() - 1).docNode;

      final XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> contentPath =
          walkToContent(owningElem);

      if (contentPath == null) {
        // Whitespace between elements is fine; anything else is not.
        final String str = new String(ch, start, length).trim();

        if (str.isEmpty()) {
          return;
        }

        throw new SAXException(
            "We are processing characters \""
            + str
            + "\" for "
            + owningElem
                .getStateMachineNode()
                .getElement()
                .getQName()
            + " but the current direction is "
            + currLocation.getDirection()
            + " to "
            + currLocation.getStateMachineNode()
            + ", not CONTENT.");
      }

      currLocation = contentPath;

      if (currLocation.getNext() == null) {
        throw new SAXException(
            "We are processing characters for "
            + owningElem
                .getStateMachineNode()
                .getElement()
                .getQName()
            + " but somehow the path ends here!");
      }

      /* If characters() will be called multiple times, we want to collect
       * all of them in the "content" StringBuilder, then process it all
       * once the last bit of content has been collected.
       *
       * This includes where content is interspersed with any elements, which
       * are skipped anyway.
       *
       * If this is the last content node, we'll just write it all out here.
       */
      final boolean moreContentComing =
          hasMoreContent(currLocation.getNext(), owningElem);

      String result = null;
      if (moreContentComing
          || ((content != null) && (content.length() > 0))) {

        if (content == null) {
          content = new StringBuilder();
        }
        content.append(ch, start, length);

        if (!moreContentComing) {
          // If this is the last node, process the content.
          result = content.toString();
          content.delete(0, content.length());
        }
      } else {
        // This is the only content node - just write it.
        result = new String(ch, start, length);
      }

      if (result != null) {
        final StackEntry entry = stack.get(stack.size() - 1);
        final XmlSchemaDocumentNode<AvroRecordInfo> docNode = entry.docNode;

        final XmlSchemaTypeInfo elemType =
            docNode.getStateMachineNode().getElementType();

        final QName elemQName =
            docNode
              .getStateMachineNode()
              .getElement()
              .getQName();

        final Schema avroSchema =
            docNode
              .getUserDefinedContent()
              .getAvroSchema()
              .getField(elemQName.getLocalPart())
              .schema();

        try {
          final AvroPathNode contentPathNode =
              currLocation.getUserDefinedContent();

          if ((contentPathNode != null)
              && contentPathNode.getType().equals(AvroPathNode.Type.CONTENT)) {

            // Mixed content: written as a string item of the children array.
            out.startItem();
            out.writeIndex(contentPathNode.getContentUnionIndex());
            write(
                XML_MIXED_CONTENT_TYPE,
                elemQName,
                AVRO_MIXED_CONTENT_SCHEMA,
                result);

          } else {
            write(elemType, elemQName, avroSchema, result);
          }

          entry.receivedContent = true;

        } catch (Exception e) {
          throw new RuntimeException(
              "Unable to write the content \""
              + result
              + "\" for "
              + elemQName,
              e);
        }
      }
    }

    /**
     * Closes the element on top of the stack: writes its default or
     * fixed value if no content was received, then ends its child
     * array and/or the enclosing map where applicable.
     *
     * @throws IllegalStateException If the element being closed is not
     *                               the one on top of the stack.
     * @throws RuntimeException      If the encoder cannot be written to.
     */
    @Override
    public void endElement(
        String uri,
        String localName,
        String qName)
        throws SAXException {

      final QName elemQName = new QName(uri, localName);

      if (currAnyElem != null) {
        if (currAnyElem.equals(elemQName)) {
          // We are exiting an any element; prepare for the next one!
          currAnyElem = null;
        }
        return;
      }

      final StackEntry entry = stack.remove(stack.size() - 1);
      final XmlSchemaDocumentNode<AvroRecordInfo> docNode = entry.docNode;

      final XmlSchemaTypeInfo elemType =
          docNode.getStateMachineNode().getElementType();

      if (!entry.receivedContent) {
        /* Look for either the default value
         * or fixed value and apply it, if any.
         */
        String value =
            docNode.getStateMachineNode().getElement().getDefaultValue();
        if (value == null) {
          value = docNode.getStateMachineNode().getElement().getFixedValue();
        }

        final AvroRecordInfo record = docNode.getUserDefinedContent();
        Schema defaultSchema = record.getAvroSchema();
        if ( defaultSchema.getType().equals(Schema.Type.MAP) ) {
          defaultSchema = defaultSchema.getValueType();
          if (record.getMapUnionIndex() >= 0) {
            defaultSchema =
                defaultSchema.getTypes().get(record.getMapUnionIndex());
          }
        }

        defaultSchema = defaultSchema.getField(localName).schema();

        try {
          write(elemType, elemQName, defaultSchema, value);
        } catch (IOException e) {
          throw new RuntimeException(
              "Attempted to write a default value of \""
              + value
              + "\" for "
              + elemQName
              + " and failed.",
              e);
        }
      }

      final QName stackElemQName =
          docNode
            .getStateMachineNode()
            .getElement()
            .getQName();

      if (!stackElemQName.equals(elemQName)) {
        throw new IllegalStateException(
            "We are leaving "
            + elemQName
            + " but the element on the stack is "
            + stackElemQName
            + ".");
      }

      Schema avroSchema =
          docNode
            .getUserDefinedContent()
            .getAvroSchema();

      boolean isMapEnd = false;
      if (avroSchema.getType().equals(Schema.Type.MAP)) {
        avroSchema = avroSchema.getValueType();

        final int mapUnionIndex =
            docNode.getUserDefinedContent().getMapUnionIndex();
        if (mapUnionIndex >= 0) {
          avroSchema = avroSchema.getTypes().get(mapUnionIndex);
        }

        isMapEnd = isMapEnd();
      }

      final boolean isComplexType =
          elemType.getType().equals(XmlSchemaTypeInfo.Type.COMPLEX);

      final Schema childSchema =
          avroSchema.getField( elemQName.getLocalPart() ).schema();

      if (childSchema.getType().equals(Schema.Type.ARRAY)
          && isComplexType) {
        try {
          out.writeArrayEnd();
        } catch (Exception e) {
          throw new RuntimeException(
              "Unable to end the array for " + elemQName,
              e);
        }
      }

      if (isMapEnd) {
        try {
          out.writeMapEnd();
        } catch (Exception e) {
          throw new RuntimeException("Unable to process a MAP_END.", e);
        }
      }
    }

    /**
     * Confirms that everything left in the path is a walk
     * back up through PARENT nodes.
     *
     * @throws IllegalStateException If any remaining path
     *                               node is not a PARENT node.
     */
    @Override
    public void endDocument() throws SAXException {
      if (currLocation.getNext() != null) {
        currLocation = currLocation.getNext();
        while (currLocation != null) {
          if (!currLocation
                 .getDirection()
                 .equals(XmlSchemaPathNode.Direction.PARENT)) {
            throw new IllegalStateException(
                "Path has more nodes after document end: "
                + currLocation.getDirection()
                + " | "
                + currLocation.getStateMachineNode());
          }
          currLocation = currLocation.getNext();
        }
      }
    }

    /**
     * Advances {@link #currLocation} to the next ELEMENT or ANY node that
     * is not reached via a PARENT direction.  If the stack is empty and
     * the current location already is the named element, nothing moves.
     *
     * @throws IllegalStateException If the path ends first, or the next
     *                               element in the path has another name.
     */
    private void walkToElement(QName elemName) {
      if (stack.isEmpty()
          && currLocation
               .getStateMachineNode()
               .getNodeType()
               .equals(XmlSchemaStateMachineNode.Type.ELEMENT)
          && currLocation
               .getStateMachineNode()
               .getElement()
               .getQName()
               .equals(elemName)) {
        return;
      }

      // Skip PARENT steps and any node that is neither an ELEMENT nor an ANY.
      do {
        currLocation = currLocation.getNext();
      } while ((currLocation != null)
               && (currLocation
                     .getDirection()
                     .equals(XmlSchemaPathNode.Direction.PARENT)
                   || (!currLocation
                          .getStateMachineNode()
                          .getNodeType()
                          .equals(XmlSchemaStateMachineNode.Type.ELEMENT)
                       && !currLocation
                             .getStateMachineNode()
                             .getNodeType()
                             .equals(XmlSchemaStateMachineNode.Type.ANY))));

      if (currLocation == null) {
        throw new IllegalStateException(
            "Cannot find " + elemName + " in the path!");

      } else if (
          currLocation
            .getStateMachineNode()
            .getNodeType()
            .equals(XmlSchemaStateMachineNode.Type.ELEMENT)
          && !currLocation
                .getStateMachineNode()
                .getElement()
                .getQName()
                .equals(elemName)) {

        throw new IllegalStateException(
            "The next element in the path is "
            + currLocation.getStateMachineNode().getElement().getQName()
            + " ("
            + currLocation.getDirection()
            + "), not "
            + elemName
            + ".");
      }
    }

    /**
     * For a path to be exiting a particular element's
     * scope, it must be doing one of four things:
     *
     * 1. It is null, indicating the end of the document.
     * 2. It is a PARENT path to the owning element's parent.
     * 3. It is a CHILD path to the owning element's child (wildcard) element.
     * 4. It is a SIBLING path to a new element instance.
     *
     * When <code>ignoreAny</code> is set, wildcard (ANY)
     * nodes do not count for cases 3 and 4.
     */
    private static boolean pathExitsElementScope(
        XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> path,
        XmlSchemaDocumentNode<AvroRecordInfo> owningElem,
        boolean ignoreAny) {

      // 1. This is the end of the path.
      if (path == null) {
        return true;
      }

      // 2. This is a PARENT path to the owning element's parent.
      final XmlSchemaDocumentNode<AvroRecordInfo> parentElem =
          owningElem.getParent();

      if (path.getDirection().equals(XmlSchemaPathNode.Direction.PARENT)
          && path.getDocumentNode() == parentElem) {
        return true;
      }

      // 3. It is a CHILD path to the owning element's child.
      final XmlSchemaStateMachineNode.Type nodeType =
          path.getStateMachineNode().getNodeType();

      final boolean isElement =
          nodeType.equals(XmlSchemaStateMachineNode.Type.ELEMENT);

      final boolean isAny =
          nodeType.equals(XmlSchemaStateMachineNode.Type.ANY);

      if (path.getDirection().equals(XmlSchemaPathNode.Direction.CHILD)
          && (isElement || (isAny && !ignoreAny))) {
        return true;
      }

      // 4. It is a SIBLING path to a new element instance.
      if (path.getDirection().equals(XmlSchemaPathNode.Direction.SIBLING)
          && (isElement || (isAny && !ignoreAny))) {
        return true;
      }

      // It is none of these things; we are still in the scope.
      return false;
    }

    /**
     * Returns true if, starting at <code>path</code>, a CONTENT node is
     * reached before the path leaves the owning element's scope.
     * Wildcard elements are ignored when determining the scope.
     */
    private static boolean hasMoreContent(
        XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> path,
        XmlSchemaDocumentNode<AvroRecordInfo> owningElem) {

      if (path == null) {
        return false;
      }

      while (!pathExitsElementScope(path, owningElem, true)
              && !path
                    .getDirection()
                    .equals(XmlSchemaPathNode.Direction.CONTENT) ) {
        path = path.getNext();
      }

      return (path != null)
          && path
               .getDirection()
               .equals(XmlSchemaPathNode.Direction.CONTENT);
    }

    /**
     * Looks ahead from the current location for the next CONTENT node
     * within the owning element's scope, without moving the current
     * location.
     *
     * @return The CONTENT path node, or <code>null</code> if the path
     *         leaves the element's scope before reaching one.
     */
    private XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> walkToContent(
        XmlSchemaDocumentNode<AvroRecordInfo> owningElem) {

      if (currLocation == null) {
        return null;
      }

      XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> next =
          currLocation.getNext();

      while (!pathExitsElementScope(next, owningElem, false)
              && !next
                    .getDirection()
                    .equals(XmlSchemaPathNode.Direction.CONTENT) ) {
        next = next.getNext();
      }

      if ((next != null)
            && next
                 .getDirection()
                 .equals(XmlSchemaPathNode.Direction.CONTENT) ) {
        return next;
      }

      return null;
    }

    /**
     * Looks ahead from the current location, skipping path nodes with no
     * Avro information or of type CONTENT, and returns true if the first
     * remaining node is a MAP_END.
     */
    private boolean isMapEnd() {
      XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> position = currLocation;
      AvroPathNode pathInfo = null;

      do {
        position = position.getNext();
        if (position != null) {
          pathInfo = position.getUserDefinedContent();
        }
      } while ((position != null)
                && ((pathInfo == null)
                    || pathInfo.getType().equals(AvroPathNode.Type.CONTENT)));

      return ((position != null)
              && position
                   .getUserDefinedContent()
                   .getType()
                   .equals(AvroPathNode.Type.MAP_END));
    }

    /**
     * Fetches the attribute's value by namespace and name, falling back
     * to the first attribute with the same local name in any namespace.
     *
     * @return The attribute's value, or <code>null</code> if not found.
     */
    private static String getAttrValue(
        Attributes atts,
        String namespaceUri,
        String name) {

      /* Attributes in XML Schema each have their own namespace, which
       * is not supported in Avro.  So, we will see if we can find the
       * attribute using the existing namespace, and if not, we will
       * walk all of them to see which one has the same name.
       */
      String value = atts.getValue(namespaceUri, name);
      if (value == null) {
        for (int attrIndex = 0; attrIndex < atts.getLength(); ++attrIndex) {
          if (atts.getLocalName(attrIndex).equals(name)) {
            value = atts.getValue(attrIndex);
            break;
          }
        }
      }
      return value;
    }

    /** Writes the data against the schema when it is not a union member. */
    private void write(
        XmlSchemaTypeInfo xmlType,
        QName xmlQName,
        Schema schema,
        String data)
        throws IOException {

      write(xmlType, xmlQName, schema, data, -1);
    }

    /**
     * Parses <code>data</code> according to the XML type and Avro schema
     * and writes the result to the encoder.
     *
     * @param xmlType    The XML Schema type of the data.
     * @param xmlQName   The name of the element or attribute being written.
     * @param schema     The Avro schema to write the data as.
     * @param data       The text to write; may be <code>null</code> or empty.
     * @param unionIndex The index of <code>schema</code> in its owning
     *                   union, written ahead of the value; negative when
     *                   <code>schema</code> is not a union member.
     * @throws IOException If the data cannot be written as the schema type.
     */
    private void write(
        XmlSchemaTypeInfo xmlType,
        QName xmlQName,
        Schema schema,
        String data,
        int unionIndex)
        throws IOException {

      final XmlSchemaBaseSimpleType baseType = xmlType.getBaseType();

      /* If the data is empty or null, write
       * it as a null or string, if possible.
       */
      if ((data == null) || data.isEmpty()) {
        boolean isNullable = (schema.getType().equals(Schema.Type.NULL));
        boolean isString = (schema.getType().equals(Schema.Type.STRING));
        int nullUnionIndex = -1;
        int stringIndex = -1;

        if (!isNullable
            && !isString
            && schema.getType().equals(Schema.Type.UNION)) {

          for (int typeIndex = 0;
              typeIndex < schema.getTypes().size();
              ++typeIndex) {

            final Schema.Type type =
                schema.getTypes().get(typeIndex).getType();

            if (type.equals(Schema.Type.NULL)) {
              nullUnionIndex = typeIndex;
              isNullable = true;
              break;
            } else if (type.equals(Schema.Type.STRING)) {
              isString = true;
              stringIndex = typeIndex;
            }
          }
        }

        if (isString && (data != null) && data.isEmpty()) {
          // Preserve empty strings when possible.
          if (stringIndex >= 0) {
            out.writeIndex(stringIndex);
          }
          out.writeString(data);

        } else if (isNullable) {
          if (nullUnionIndex >= 0) {
            out.writeIndex(nullUnionIndex);
          }
          out.writeNull();

        } else if (isString) {
          if (stringIndex >= 0) {
            out.writeIndex(stringIndex);
          }
          out.writeString("");

        } else {
          throw new IOException(
              "Cannot write a null or empty string "
              + "as a non-null or non-string type.");
        }

        return;
      }

      switch ( schema.getType() ) {
      case ARRAY:
        {
          /* While unions of lists of different types are technically possible,
           * supporting them here would be difficult, to say the least.  For
           * now, only one array type will be supported in a union.
           */
          if (unionIndex >= 0) {
            out.writeIndex(unionIndex);
          }

          if ( XmlSchemaBaseSimpleType.DURATION.equals(baseType) ) {
            // Durations are written as three ints: months, days, millis.
            final DatatypeFactory datatypeFactory = Utils.getDatatypeFactory();
            final Duration xmlDuration = datatypeFactory.newDuration(data);

            final int months =
                xmlDuration.getYears() * 12 + xmlDuration.getMonths();
            final int days = xmlDuration.getDays();
            final int millis =
                ((((xmlDuration.getHours() * 60)
                    + xmlDuration.getMinutes()) * 60)
                    + xmlDuration.getSeconds()) * 1000;

            out.writeArrayStart();
            out.setItemCount(3);
            out.startItem();
            out.writeInt(months);
            out.startItem();
            out.writeInt(days);
            out.startItem();
            out.writeInt(millis);
            out.writeArrayEnd();
            break;
          }

          if ( xmlType.getType().equals(XmlSchemaTypeInfo.Type.UNION) ) {
            xmlType = Utils.chooseUnionType(xmlType, null, schema, unionIndex);
          }

          if ( xmlType.getType().equals(XmlSchemaTypeInfo.Type.LIST) ) {
            xmlType = xmlType.getChildTypes().get(0);
          }

          // Split on spaces, dropping the empty items between repeated spaces.
          final String[] items = data.split(" ");
          final List<String> itemList = new ArrayList<String>(items.length);
          for (String item : items) {
            if ( !item.isEmpty() ) {
              itemList.add(item);
            }
          }

          out.writeArrayStart();
          out.setItemCount( itemList.size() );
          for (String item : itemList) {
            out.startItem();
            write(xmlType, xmlQName, schema.getElementType(), item);
          }
          out.writeArrayEnd();
          break;
        }

      case UNION:
        {
          int textIndex = -1;
          int bytesIndex = -1;
          Schema bytesType = null;

          final List<Schema> subTypes = schema.getTypes();
          boolean written = false;

          for (int subTypeIndex = 0;
              subTypeIndex < subTypes.size();
              ++subTypeIndex) {

            // Try the text types last.
            final Schema subType = subTypes.get(subTypeIndex);
            if (subType.getType().equals(Schema.Type.BYTES)) {
              bytesIndex = subTypeIndex;
              bytesType = subType;
              continue;
            } else if (subType.getType().equals(Schema.Type.STRING)) {
              textIndex = subTypeIndex;
              continue;
            }

            // Determine the corresponding XML union type.
            XmlSchemaTypeInfo xmlSubType = xmlType;
            if ( xmlType.getType().equals(XmlSchemaTypeInfo.Type.UNION) ) {
              xmlSubType =
                  Utils.chooseUnionType(
                      xmlType,
                      xmlQName,
                      subType,
                      subTypeIndex);
            }

            if (xmlSubType != null) {
              try {
                write(xmlSubType, xmlQName, subType, data, subTypeIndex);
                written = true;
                break;
              } catch (Exception e) {
                /* Could not parse the value using the
                 * provided type; try the next one.
                 */
              }
            }
          }

          if (!written) {
            if (bytesIndex >= 0) {
              XmlSchemaTypeInfo subType = xmlType;
              if (xmlType.getType().equals(XmlSchemaTypeInfo.Type.UNION)) {
                subType =
                    Utils.chooseUnionType(
                        xmlType,
                        xmlQName,
                        schema.getTypes().get(bytesIndex),
                        bytesIndex);
              }

              // Only write the bytes if we know how.
              if (subType != null) {
                try {
                  write(subType, xmlQName, bytesType, data, bytesIndex);
                  written = true;
                } catch (Exception e) {
                  // Cannot write the data as bytes either.
                }
              }
            }

            if (!written && (textIndex >= 0)) {
              out.writeIndex(textIndex);
              out.writeString(data);

            } else if (!written) {
              throw new IOException(
                  "Cannot write \""
                  + data
                  + "\" as one of the types in "
                  + schema.toString());
            }
          }
          break;
        }

      case BYTES:
        {
          byte[] bytes = null;
          switch (baseType) {
          case BIN_BASE64:
            bytes = DatatypeConverter.parseBase64Binary(data);
            break;
          case BIN_HEX:
            bytes = DatatypeConverter.parseHexBinary(data);
            break;
          case DECIMAL:
            {
              final BigDecimal decimal =
                  Utils.createBigDecimalFrom(data, schema);
              final BigInteger unscaledValue = decimal.unscaledValue();
              bytes = unscaledValue.toByteArray();
              break;
            }
          default:
            throw new IllegalArgumentException(
                "Cannot generate bytes for data of a base type of "
                + baseType);
          }

          if (unionIndex >= 0) {
            out.writeIndex(unionIndex);
          }
          out.writeBytes(bytes);
          break;
        }

      case STRING:
        {
          if (unionIndex >= 0) {
            out.writeIndex(unionIndex);
          }
          out.writeString(data);
          break;
        }

      case ENUM:
        {
          if ( !schema.hasEnumSymbol(data) ) {
            final int numSymbols = schema.getEnumSymbols().size();

            StringBuilder errMsg = new StringBuilder("\"");
            errMsg.append(data);
            errMsg.append("\" is not a member of the symbols [\"");
            for (int symbolIndex = 0;
                symbolIndex < numSymbols - 1;
                ++symbolIndex) {
              errMsg.append( schema.getEnumSymbols().get(symbolIndex) );
              errMsg.append("\", \"");
            }
            errMsg.append( schema.getEnumSymbols().get(numSymbols - 1) );
            errMsg.append("\"].");

            throw new IOException( errMsg.toString() );
          }

          if (unionIndex >= 0) {
            out.writeIndex(unionIndex);
          }
          out.writeEnum( schema.getEnumOrdinal(data) );
          break;
        }

      case DOUBLE:
        {
          try {
            final double value = Double.parseDouble(data);
            if (unionIndex >= 0) {
              out.writeIndex(unionIndex);
            }
            out.writeDouble(value);
          } catch (NumberFormatException nfe) {
            throw new IOException("\"" + data + "\" is not a double.", nfe);
          }
          break;
        }

      case FLOAT:
        {
          try {
            final float value = Float.parseFloat(data);
            if (unionIndex >= 0) {
              out.writeIndex(unionIndex);
            }
            out.writeFloat(value);
          } catch (NumberFormatException nfe) {
            throw new IOException("\"" + data + "\" is not a float.", nfe);
          }
          break;
        }

      case LONG:
        {
          switch (baseType) {
          case DECIMAL:
            {
              try {
                final long value = Long.parseLong(data);
                if (unionIndex >= 0) {
                  out.writeIndex(unionIndex);
                }
                out.writeLong(value);
              } catch (NumberFormatException nfe) {
                throw new IOException("\"" + data + "\" is not a long.", nfe);
              }
              break;
            }
          case DATETIME:
            {
              try {
                Calendar timestampCal = DatatypeConverter.parseDateTime(data);
                timestampCal.setTimeZone( Utils.getGmtTimeZone() );
                final long value = timestampCal.getTimeInMillis();
                if (unionIndex >= 0) {
                  out.writeIndex(unionIndex);
                }
                out.writeLong(value);
              } catch (IllegalArgumentException e) {
                throw new IOException(
                    "\"" + data + "\" is not a datetime.", e);
              }
              break;
            }
          default:
            throw new IOException("Unrecognized long type: " + baseType);
          }
          break;
        }

      case INT:
        {
          switch (baseType) {
          case DECIMAL:
            {
              try {
                final int value = Integer.parseInt(data);
                if (unionIndex >= 0) {
                  out.writeIndex(unionIndex);
                }
                out.writeInt(value);
              } catch (NumberFormatException nfe) {
                throw new IOException("\"" + data + "\" is not an int.", nfe);
              }
              break;
            }
          case DATE:
            {
              try {
                final Calendar dateCal = DatatypeConverter.parseDate(data);
                dateCal.setTimeZone( Utils.getGmtTimeZone() );

                final long diffInMillis =
                    dateCal.getTimeInMillis()
                    - Utils.getUnixEpoch().getTimeInMillis();
                final long diffInDays =
                    TimeUnit.DAYS.convert(diffInMillis, TimeUnit.MILLISECONDS);

                if (unionIndex >= 0) {
                  out.writeIndex(unionIndex);
                }
                /* The value is written whether or not the schema is a
                 * union member; only the index is conditional.
                 */
                out.writeInt((int) diffInDays);
              } catch (IllegalArgumentException e) {
                throw new IOException("\"" + data + "\" is not a date.", e);
              }
              break;
            }
          case TIME:
            {
              try {
                final Calendar timeCal = DatatypeConverter.parseTime(data);
                timeCal.setTimeZone( Utils.getGmtTimeZone() );

                if (unionIndex >= 0) {
                  out.writeIndex(unionIndex);
                }
                /* The value is written whether or not the schema is a
                 * union member; only the index is conditional.
                 */
                out.writeInt((int) timeCal.getTimeInMillis());
              } catch (IllegalArgumentException e) {
                throw new IOException("\"" + data + "\" is not a time.", e);
              }
              break;
            }
          default:
            throw new IOException(
                "Unrecognized integer type " + baseType);
          }
          break;
        }

      case BOOLEAN:
        {
          if (data.equalsIgnoreCase("true")
              || data.equalsIgnoreCase("false")) {
            if (unionIndex >= 0) {
              out.writeIndex(unionIndex);
            }
            out.writeBoolean( Boolean.parseBoolean(data) );
          } else {
            throw new IOException('"' + data + "\" is not a boolean.");
          }
          break;
        }

      case RECORD:
        {
          switch (baseType) {
          case QNAME:
            {
              // A QName is written as two strings: namespace, then local part.
              try {
                final QName qName =
                    DatatypeConverter.parseQName(data, nsContext);
                if (unionIndex >= 0) {
                  out.writeIndex(unionIndex);
                }
                out.writeString( qName.getNamespaceURI() );
                out.writeString( qName.getLocalPart() );
              } catch (IllegalArgumentException e) {
                throw new IOException("\"" + data + "\" is not a QName.", e);
              }
              break;
            }
          default:
            throw new IOException(
                "Cannot write a record of XML Schema Type " + baseType);
          }
          break;
        }

      default:
        throw new IOException("Cannot write data of type " + schema.getType());
      }
    }
  }

  /**
   * Creates an <code>XmlDatumWriter</code> from the XML Schemas in the
   * {@link XmlDatumConfig}.  When <code>avroSchema</code> is
   * <code>null</code>, an Avro {@link Schema} is generated while
   * walking the XML Schema from the configured root tag.
   *
   * @param config     The XML Schema sources, base URI, and root tag name.
   * @param avroSchema The Avro schema to write against, or
   *                   <code>null</code> to generate one.
   * @throws IllegalArgumentException If <code>config</code> is null.
   * @throws IOException              Declared for callers; see the
   *                                  schema-loading calls below.
   */
  public XmlDatumWriter(XmlDatumConfig config, Schema avroSchema)
      throws IOException {

    if (config == null) {
      throw new IllegalArgumentException("XmlDatumConfig cannot be null.");
    }

    xmlSchemaCollection = new XmlSchemaCollection();
    xmlSchemaCollection.setBaseUri(config.getBaseUri());
    for (StreamSource source : config.getSources()) {
      xmlSchemaCollection.read(source);
    }

    final XmlSchemaStateMachineGenerator stateMachineGen =
        new XmlSchemaStateMachineGenerator();

    final XmlSchemaWalker walker =
        new XmlSchemaWalker(xmlSchemaCollection, stateMachineGen);

    walker.setUserRecognizedTypes( Utils.getAvroRecognizedTypes() );

    // Only generate an Avro schema if the caller did not supply one.
    AvroSchemaGenerator avroSchemaGen = null;
    if (avroSchema == null) {
      avroSchemaGen =
          new AvroSchemaGenerator(
              config.getBaseUri(),
              config.getSchemaUrls(),
              config.getSchemaFiles());
      walker.addVisitor(avroSchemaGen);
    }

    final XmlSchemaElement rootElement =
        xmlSchemaCollection.getElementByQName(config.getRootTagName());
    walker.walk(rootElement);

    stateMachine = stateMachineGen.getStartNode();

    if (avroSchema == null) {
      schema = avroSchemaGen.getSchema();
    } else {
      schema = avroSchema;
    }
  }

  /**
   * Creates an <code>XmlDatumWriter</code> that generates its Avro
   * {@link Schema} from the XML Schemas in the {@link XmlDatumConfig}.
   */
  public XmlDatumWriter(XmlDatumConfig config) throws IOException {
    this(config, null);
  }

  /**
   * Returns the {@link Schema} this <code>XmlDatumWriter</code> is
   * writing against - either the one automatically generated from
   * the {@link XmlDatumConfig} or the {@link Schema} set after that.
   */
  public Schema getSchema() {
    return schema;
  }

  /**
   * Sets the schema to use when writing the XML
   * {@link Document} to the {@link Encoder}.
   *
   * @throws IllegalArgumentException If <code>schema</code> is null.
   * @see org.apache.avro.io.DatumWriter#setSchema(org.apache.avro.Schema)
   */
  @Override
  public void setSchema(Schema schema) {
    if (schema == null) {
      throw new IllegalArgumentException("Avro schema cannot be null.");
    }
    this.schema = schema;
  }

  /**
   * Writes the {@link Document} to the {@link Encoder} in accordance
   * with the {@link Schema} set in {@link #setSchema(Schema)}.
   *
   * <p>
   * If no {@link Schema} was provided, the one generated from the XML
   * Schema at construction time is used.  It can be retrieved from
   * {@link #getSchema()}.
   * </p>
   *
   * @throws IOException If the document cannot be walked or encoded.
   * @see DatumWriter#write(java.lang.Object, org.apache.avro.io.Encoder)
   */
  @Override
  public void write(Document doc, Encoder out) throws IOException {
    // 1. Build the path through the schema that describes the document.
    final XmlSchemaPathFinder pathFinder =
        new XmlSchemaPathFinder(stateMachine);
    final SaxWalkerOverDom walker = new SaxWalkerOverDom(pathFinder);
    try {
      walker.walk(doc);
    } catch (Exception se) {
      throw new IOException("Unable to parse the document.", se);
    }
    final XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> path =
        pathFinder.getXmlSchemaTraversal();

    // 2. Apply Avro schema metadata on top of the document.
    final AvroSchemaApplier applier = new AvroSchemaApplier(schema, false);
    applier.apply(path);

    // 3. Encode the document.
    walker.removeContentHandler(pathFinder);
    walker.addContentHandler( new Writer(path, out) );

    try {
      walker.walk(doc);
    } catch (SAXException e) {
      throw new IOException("Unable to encode the document.", e);
    }
  }

  /**
   * Writes the XML in the provided {@link File} to the {@link Encoder} in
   * accordance with the {@link Schema} set in {@link #setSchema(Schema)}.
   *
   * <p>
   * If no {@link Schema} was provided, the one generated from the XML
   * Schema at construction time is used.  It can be retrieved from
   * {@link #getSchema()}.  The file is parsed twice: once to find the
   * path through the schema, and once to encode it.
   * </p>
   */
  public void write(File xmlFile, Encoder out)
      throws IOException, ParserConfigurationException, SAXException {

    /* NOTE(review): the parser factory uses its default settings.  If
     * xmlFile can hold untrusted XML, consider enabling secure processing
     * or disabling DTDs / external entities - confirm with callers.
     */
    final SAXParserFactory factory = SAXParserFactory.newInstance();
    factory.setNamespaceAware(true);

    // 1. Build the path through the schema that describes the document.
    final SAXParser pathFindingParser = factory.newSAXParser();
    final XmlSchemaPathFinder pathFinder =
        new XmlSchemaPathFinder(stateMachine);
    pathFindingParser.parse(xmlFile, pathFinder);
    final XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> path =
        pathFinder.getXmlSchemaTraversal();

    // 2. Apply Avro schema metadata on top of the document.
    final AvroSchemaApplier applier = new AvroSchemaApplier(schema, false);
    applier.apply(path);

    // 3. Encode the document.
    final SAXParser encodingParser = factory.newSAXParser();
    encodingParser.parse(xmlFile, new Writer(path, out));
  }

  /**
   * Writes the XML retrieved from the provided {@link URL} to the
   * {@link Encoder} in accordance with the {@link Schema} set in
   * {@link #setSchema(Schema)}.
   *
   * <p>
   * If no {@link Schema} was provided, the one generated from the XML
   * Schema at construction time is used.  It can be retrieved from
   * {@link #getSchema()}.  The URL is opened and read twice: once to
   * find the path through the schema, and once to encode it.
   * </p>
   */
  public void write(URL xmlUrl, Encoder out)
      throws IOException, ParserConfigurationException, SAXException {

    /* NOTE(review): the parser factory uses its default settings.  If
     * xmlUrl can serve untrusted XML, consider enabling secure processing
     * or disabling DTDs / external entities - confirm with callers.
     */
    final SAXParserFactory factory = SAXParserFactory.newInstance();
    factory.setNamespaceAware(true);

    // 1. Build the path through the schema that describes the document.
    final SAXParser pathFindingParser = factory.newSAXParser();
    final XmlSchemaPathFinder pathFinder =
        new XmlSchemaPathFinder(stateMachine);
    InputStream pathFindingStream = null;
    try {
      pathFindingStream = xmlUrl.openStream();
      pathFindingParser.parse(pathFindingStream, pathFinder);
    } finally {
      if (pathFindingStream != null) {
        try {
          pathFindingStream.close();
        } catch (IOException ioe) {
          // Best effort: a failure to close does not fail the write.
          ioe.printStackTrace();
        }
      }
    }
    final XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> path =
        pathFinder.getXmlSchemaTraversal();

    // 2. Apply Avro schema metadata on top of the document.
    final AvroSchemaApplier applier = new AvroSchemaApplier(schema, false);
    applier.apply(path);

    // 3. Encode the document.
    InputStream encodingStream = null;
    try {
      encodingStream = xmlUrl.openStream();
      final SAXParser encodingParser = factory.newSAXParser();
      encodingParser.parse(encodingStream, new Writer(path, out));
    } finally {
      if (encodingStream != null) {
        try {
          encodingStream.close();
        } catch (IOException ioe) {
          // Best effort: a failure to close does not fail the write.
          ioe.printStackTrace();
        }
      }
    }
  }
}