/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.avro.xml;

import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;

import javax.xml.namespace.QName;

import org.apache.avro.Schema;
import org.apache.ws.commons.schema.XmlSchemaAttribute;
import org.apache.ws.commons.schema.XmlSchemaElement;
import org.apache.ws.commons.schema.XmlSchemaUse;
import org.apache.ws.commons.schema.docpath.XmlSchemaDocumentNode;
import org.apache.ws.commons.schema.docpath.XmlSchemaPathNode;
import org.apache.ws.commons.schema.docpath.XmlSchemaStateMachineNode;
import org.apache.ws.commons.schema.walker.XmlSchemaAttrInfo;
import org.apache.ws.commons.schema.walker.XmlSchemaTypeInfo;

/**
 * Applies an Avro schema to a tree described by
 * {@link XmlSchemaDocumentNode}s and {@link XmlSchemaPathNode}s.
 *
 * <p>
 * Schema evolution is handled with the following conversions:
 * </p>
 * <ul>
 *   <li>STRING, BOOLEAN, ENUM, DOUBLE, FLOAT, LONG, INT -&gt; STRING</li>
 *   <li>DOUBLE, FLOAT, LONG, INT -&gt; DOUBLE</li>
 *   <li>FLOAT, LONG, INT -&gt; FLOAT</li>
 *   <li>LONG, INT -&gt; LONG</li>
 *   <li>INT -&gt; INT</li>
 *   <li>BOOLEAN -&gt; BOOLEAN</li>
 *   <li>BYTES -&gt; BYTES</li>
 *   <li>ENUM -&gt; ENUM when destination ENUM is a superset of the source.</li>
 *   <li>RECORD -&gt; RECORD when all the fields can be converted as well.</li>
 * </ul>
 *
 * <p>
 * Also joins sibling map elements under the same map,
 * and tracks the content nodes of a mixed element.
 * </p>
 */
final class AvroSchemaApplier {

  /** Stack of UNIONs listing the schemas valid for the next element. */
  private final List<Schema> unionOfValidElementsStack;

  /** Stack of the record information of the elements being processed. */
  private final List<AvroRecordInfo> avroRecordStack;

  private final Schema avroSchema;

  /** Maps a destination type to the set of types convertible to it. */
  private final Map<Schema.Type, Set<Schema.Type>> conversionCache;

  private final boolean xmlIsWritten;

  /**
   * {@link XmlSchemaPathNode} contain their destination
   * {@link XmlSchemaDocumentNode}, but not their originating
   * one.  Since we do not "leave" a {@link XmlSchemaDocumentNode}
   * until we traverse to its parent, we need to track the parent
   * node in addition to the current one.
   */
  private static class StackEntry {
    StackEntry(XmlSchemaDocumentNode<AvroRecordInfo> docNode) {
      this.docNode = docNode;
      this.parentNode = docNode.getParent();
    }

    final XmlSchemaDocumentNode<AvroRecordInfo> docNode;
    final XmlSchemaDocumentNode<AvroRecordInfo> parentNode;

    /** Index of the map cluster this element belongs to, if it is a map. */
    int occurrence;
  }

  /**
   * Creates a new <code>AvroSchemaApplier</code> for the provided schema.
   *
   * <p>
   * The schema must be a RECORD, a MAP, a UNION of RECORDs/MAPs, or (only
   * when XML is not being written) an ARRAY of UNION of RECORDs/MAPs.
   * </p>
   *
   * @param avroSchema   The Avro schema to apply to the document tree.
   * @param xmlIsWritten Whether the schema is used to write XML. When
   *                     <code>true</code>, every element must have a
   *                     corresponding Avro schema.
   * @throws IllegalArgumentException If the schema is not of a supported
   *                                  shape.
   */
  AvroSchemaApplier(Schema avroSchema, boolean xmlIsWritten) {
    this.avroSchema = avroSchema;
    this.xmlIsWritten = xmlIsWritten;

    conversionCache = new HashMap<Schema.Type, Set<Schema.Type>>();
    unionOfValidElementsStack = new ArrayList<Schema>();
    avroRecordStack = new ArrayList<AvroRecordInfo>();

    if ( avroSchema.getType().equals(Schema.Type.ARRAY) ) {
      // ARRAY of UNION of RECORDs/MAPs is not valid when writing XML.
      if (xmlIsWritten) {
        throw new IllegalArgumentException(
            "The Avro Schema cannot be an ARRAY of UNION of MAPs/RECORDs when "
            + "writing XML; it must conform to the corresponding XML schema.");
      }

      /* The user is only looking to retrieve specific elements from the XML
       * document.  Likewise, the next valid elements are only the ones in
       * that list.
       *
       * (The expected format is Array<Union<Type>>)
       */
      if ( !avroSchema.getElementType().getType().equals(Schema.Type.UNION) ) {
        throw new IllegalArgumentException(
            "If retrieving only a subset of elements in the document, the Avro"
            + " Schema must be an ARRAY of UNION of those types, not an ARRAY"
            + " of " + avroSchema.getElementType().getType());
      }

      // Confirm all of the elements in the UNION are either RECORDs or MAPs.
      verifyIsUnionOfMapsAndRecords(avroSchema.getElementType(), true);

      unionOfValidElementsStack.add(avroSchema.getElementType());

    } else if ( avroSchema.getType().equals(Schema.Type.UNION) ) {
      /* It is possible for the root element to actually be the root of a
       * substitution group.  If this happens, the root element could be
       * one of many different record types.
       *
       * This can only be valid if the schema is a union of records.
       */
      verifyIsUnionOfMapsAndRecords(avroSchema, true);

      unionOfValidElementsStack.add(avroSchema);

    } else if ( avroSchema.getType().equals(Schema.Type.RECORD)
        || avroSchema.getType().equals(Schema.Type.MAP) ) {
      // This is a definition of the root element.
      List<Schema> union = new ArrayList<Schema>(1);
      union.add(avroSchema);
      unionOfValidElementsStack.add( Schema.createUnion(union) );

    } else {
      throw new IllegalArgumentException(
          "The Avro Schema must be one of the following types: RECORD, MAP,"
          + " UNION of RECORDs/MAPs, or ARRAY of UNION of RECORDs/MAPs.");
    }
  }

  /**
   * Applies the Avro schema to the document tree reachable from the
   * provided path, then annotates the path with map and mixed-content
   * information.
   *
   * @param pathStart The first node of the path through the document.
   */
  void apply(
      XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> pathStart) {

    // Add schema information to the document tree.
    apply(pathStart.getDocumentNode());

    // Count maps.
    findMaps(pathStart);

    // Update child count for mixed elements.
    applyContent(pathStart);
  }

  /**
   * Dispatches a document node to the element or group handler
   * based on its state machine node type. ANY nodes are ignored.
   */
  private void apply(XmlSchemaDocumentNode<AvroRecordInfo> docNode) {
    switch (docNode.getStateMachineNode().getNodeType()) {
    case ELEMENT:
      processElement(docNode);
      break;
    case ALL:
    case CHOICE:
    case SEQUENCE:
    case SUBSTITUTION_GROUP:
      processGroup(docNode);
      break;
    case ANY:
      // Ignored
      break;
    default:
      throw new IllegalArgumentException(
          "Document node has an unrecognized type of "
          + docNode.getStateMachineNode().getNodeType() + '.');
    }
  }

  /**
   * Finds the Avro schema matching the element, validates its attributes
   * and children against that schema, attaches an {@link AvroRecordInfo}
   * to the document node, and then processes the element's children.
   *
   * @throws IllegalStateException If the node is not an element, if the
   *         Avro schema has an unsupported shape, or if XML is being
   *         written and the schema is missing required information.
   */
  private void processElement(XmlSchemaDocumentNode<AvroRecordInfo> doc) {
    if (!doc
           .getStateMachineNode()
           .getNodeType()
           .equals(XmlSchemaStateMachineNode.Type.ELEMENT)) {
      throw new IllegalStateException(
          "Attempted to process an element when the node type is "
          + doc.getStateMachineNode().getNodeType());
    }

    final XmlSchemaElement element = doc.getStateMachineNode().getElement();

    final List<Schema> validNextElements =
        unionOfValidElementsStack
          .get(unionOfValidElementsStack.size() - 1)
          .getTypes();

    Schema elemSchema = null;
    int schemaIndex = 0;
    int mapSchemaIndex = -1;

    if (validNextElements != null) {
      for (; schemaIndex < validNextElements.size(); ++schemaIndex) {
        Schema possibleSchema = validNextElements.get(schemaIndex);
        Schema valueType = possibleSchema;

        if ( possibleSchema.getType().equals(Schema.Type.MAP) ) {
          valueType = possibleSchema.getValueType();

          if ( valueType.getType().equals(Schema.Type.UNION) ) {
            /* This XML document has multiple sibling tags representable as
             * MAPs.  We need to cycle through them and find the best fit.
             */
            for (mapSchemaIndex = 0;
                mapSchemaIndex < valueType.getTypes().size();
                ++mapSchemaIndex) {

              final Schema unionType =
                  valueType.getTypes().get(mapSchemaIndex);

              if ( !unionType.getType().equals(Schema.Type.RECORD) ) {
                throw new IllegalStateException(
                    "MAPs in Avro Schemas for XML documents must have a value"
                    + " type of either RECORD or UNION of RECORD, not UNION"
                    + " with " + unionType.getType());
              }

              if (typeMatchesElement(unionType, element)) {
                elemSchema = possibleSchema;
                break;
              }
            }

            /* If we walked through all of the map elements and did
             * not find a matching UNION, reset the mapSchemaIndex
             * and check the next candidate.
             */
            if (elemSchema == null) {
              mapSchemaIndex = -1;
              continue;
            } else {
              // We found the element!  Stop looking.
              break;
            }
          }
        }

        if ( !valueType.getType().equals(Schema.Type.RECORD) ) {
          throw new IllegalStateException(
              "RECORD, MAP of RECORD, and MAP of UNION of RECORD are allowed. "
              + valueType.getType()
              + " cannot exist in any level of that hierarchy.");
        }

        /* If we reach here, we have not found the schema, and valueType is of
         * type RECORD (either the original RECORD or the child of a MAP) and
         * needs to be checked.
         */
        if (typeMatchesElement(valueType, element)) {
          elemSchema = possibleSchema;
          break;
        }
      }
    }

    if (xmlIsWritten && (elemSchema == null)) {
      throw new IllegalStateException(
          "Element \"" + element.getQName()
          + "\" does not have a corresponding Avro schema. One is needed when"
          + " writing XML.");
    }

    final XmlSchemaTypeInfo typeInfo =
        doc.getStateMachineNode().getElementType();

    Schema unionOfChildrenTypes = null;

    if (elemSchema != null) {
      final List<XmlSchemaAttrInfo> attributes =
          doc.getStateMachineNode().getAttributes();

      // Match the element's attributes against the element's schema.
      for (XmlSchemaAttrInfo attribute : attributes) {
        processAttribute(
            element.getQName(),
            elemSchema,
            attribute.getAttribute(),
            attribute.getType(),
            mapSchemaIndex);
      }

      /* Child elements are in a field under the same name as the element.
       *
       * In the Avro schema, they may be NULL (no children), a
       * primitive type, or an ARRAY of UNION of MAPs and RECORDs.
       */
      final Schema recordSchema = getRecordSchema(elemSchema, mapSchemaIndex);

      /* The error messages below describe the RECORD, not elemSchema:
       * elemSchema may be a MAP, and a MAP is not a named type, so asking
       * it for its name or namespace would fail with an unrelated error.
       */
      final String recordName = qualifiedName(recordSchema);

      Schema.Field childrenField = recordSchema.getField( element.getName() );

      /* If the element has no children, a NULL placeholder is used instead.
       * Likewise, if the children field is null, it means the children have
       * been removed in order to be filtered out.
       */
      if (xmlIsWritten && (childrenField == null)) {
        throw new IllegalStateException(
            "The children of " + element.getQName() + " in Avro Schema "
            + recordName
            + " must exist. If there are no children, an Avro NULL"
            + " placeholder is required.");
      }

      if (childrenField != null) {
        final Schema childrenSchema = childrenField.schema();
        switch (childrenSchema.getType()) {
        case ARRAY:
          {
            if (typeInfo.getType().equals(XmlSchemaTypeInfo.Type.LIST)) {
              break;
            }

            // All group types are ARRAY of UNION of MAP/RECORD.
            if ( !childrenSchema
                    .getElementType()
                    .getType()
                    .equals(Schema.Type.UNION) ) {
              throw new IllegalStateException(
                  "If the children of " + element.getQName()
                  + " in Avro Schema " + recordName
                  + " are in a group, the corresponding Avro Schema MUST BE an"
                  + " ARRAY of UNION of MAPs/RECORDs, not "
                  + childrenSchema.getElementType().getType());
            }

            verifyIsUnionOfMapsAndRecords(
                childrenSchema.getElementType(),
                typeInfo.isMixed());

            unionOfChildrenTypes = childrenSchema.getElementType();
          }
          break;
        case BOOLEAN:
        case BYTES:
        case DOUBLE:
        case ENUM:
        case FLOAT:
        case INT:
        case LONG:
        case STRING:
        case RECORD:
          {
            if (!confirmEquivalent(
                    typeInfo,
                    element.getQName(),
                    childrenSchema) ) {
              throw new IllegalStateException(
                  "Cannot convert between " + typeInfo + " and "
                  + childrenSchema + " for simple content of "
                  + element.getQName() + " in Avro Schema " + recordName);
            }
          }
          break;
        case NULL:
          // There are no children, so no further types are valid.
          break;
        case UNION:
          if (typeInfo.getType().equals(XmlSchemaTypeInfo.Type.UNION)) {
            break;

          } else if (element.isNillable()
                      && (childrenSchema.getTypes().size() == 2)) {
            break;
          }
          /* falls through */
        default:
          throw new IllegalStateException(
              "Children of element " + element.getQName()
              + " in Avro Schema " + recordName
              + " must be either an ARRAY of UNION of MAP/RECORD or a"
              + " primitive type, not " + childrenSchema.getType());
        }
      }

      AvroRecordInfo recordInfo = null;
      if (avroRecordStack.isEmpty() && (doc.getParent() == null)) {
        recordInfo = new AvroRecordInfo(elemSchema);
        avroRecordStack.add(recordInfo);

      } else {
        recordInfo =
            new AvroRecordInfo(elemSchema, schemaIndex, mapSchemaIndex);

        /* Maps will be counted separately, as their
         * children are not part of this array.
         *
         * The stack will be empty if the root element
         * is part of a substitution group.
         */
        if (!elemSchema.getType().equals(Schema.Type.MAP)
            && !avroRecordStack.isEmpty()) {
          for (int docIter = 0; docIter < doc.getIteration(); ++docIter) {
            avroRecordStack
              .get(avroRecordStack.size() - 1)
              .incrementChildCount();
          }
        }
        avroRecordStack.add(recordInfo);
      }

      doc.setUserDefinedContent(recordInfo);
    }

    /* If the root schema is an ARRAY of UNION, then the next valid
     * element will be one of its entries.  Otherwise, there are no
     * next valid entries.
     *
     * We want to push that on the stack for when we exit children
     * of the current element.
     */
    if ((unionOfChildrenTypes == null)
        && avroSchema.getType().equals(Schema.Type.ARRAY) ) {
      unionOfChildrenTypes = avroSchema.getElementType();
    }

    // Process the children, if any.
    if (unionOfChildrenTypes != null) {
      unionOfValidElementsStack.add(unionOfChildrenTypes);
      processChildren(doc);
      unionOfValidElementsStack.remove(unionOfValidElementsStack.size() - 1);
    }

    if (elemSchema != null) {
      avroRecordStack.remove(avroRecordStack.size() - 1);
    }
  }

  /**
   * Resolves the schema holding an element's fields: the schema itself
   * unless it is a MAP, in which case the MAP's value type or, when
   * <code>mapUnionIndex</code> is non-negative, the member of the value
   * type's UNION at that index.
   */
  private static Schema getRecordSchema(Schema schema, int mapUnionIndex) {
    if ( !schema.getType().equals(Schema.Type.MAP) ) {
      return schema;
    }
    final Schema valueType = schema.getValueType();
    if (mapUnionIndex >= 0) {
      return valueType.getTypes().get(mapUnionIndex);
    }
    return valueType;
  }

  /** Formats a named schema as <code>{namespace}name</code> for messages. */
  private static String qualifiedName(Schema namedSchema) {
    return "{" + namedSchema.getNamespace() + "}" + namedSchema.getName();
  }

  /**
   * Confirms an XML attribute's type can be converted to or from the type
   * of the Avro field of the same name, if such a field exists.
   *
   * @throws IllegalStateException If XML is being written and a field is
   *         missing for an attribute that is neither optional nor
   *         prohibited, or if the types cannot be converted.
   */
  private void processAttribute(
      QName elementName,
      Schema elementSchema,
      XmlSchemaAttribute attribute,
      XmlSchemaTypeInfo attributeType,
      int mapUnionIndex) {

    final Schema valueType = getRecordSchema(elementSchema, mapUnionIndex);

    final Schema.Field attrField = valueType.getField( attribute.getName() );

    if (xmlIsWritten
        && (attrField == null)
        && !attribute.getUse().equals(XmlSchemaUse.OPTIONAL)
        && !attribute.getUse().equals(XmlSchemaUse.PROHIBITED)) {
      throw new IllegalStateException(
          "Element " + elementName + " has a " + attribute.getUse()
          + " attribute named " + attribute.getQName()
          + " - when writing to XML, a field in the Avro record must exist.");
    }

    if (attrField != null) {
      Schema attrType = attrField.schema();

      if ( attribute.getUse().equals(XmlSchemaUse.OPTIONAL)
          && attrType.getType().equals(Schema.Type.UNION) ) {

        /* The XML Schema Attribute may have already been a union, so we
         * need to walk all of the subtypes and pull out the non-NULL ones.
         */
        final ArrayList<Schema> subset =
            new ArrayList<Schema>(attrType.getTypes().size());

        for (Schema unionSchema : attrType.getTypes()) {
          if ( !unionSchema.getType().equals(Schema.Type.NULL) ) {
            subset.add(unionSchema);
          }
        }

        if (subset.size() == 1) {
          attrType = subset.get(0);
        } else {
          attrType = Schema.createUnion(subset);
        }
      }

      if (!confirmEquivalent(
              attributeType,
              attribute.getQName(),
              attrType)) {
        throw new IllegalStateException(
            "Cannot convert element " + elementName + " attribute "
            + attribute.getQName() + " types between "
            + attributeType.getBaseType() + " and " + attrField.schema());
      }
    }
  }

  /** Applies the schema to every child in every iteration of the node. */
  private void processChildren(XmlSchemaDocumentNode<AvroRecordInfo> doc) {
    for (int iteration = 1; iteration <= doc.getIteration(); ++iteration) {
      final SortedMap<Integer, XmlSchemaDocumentNode<AvroRecordInfo>>
        children = doc.getChildren(iteration);

      if (children != null) {
        for (Map.Entry<Integer, XmlSchemaDocumentNode<AvroRecordInfo>> child :
              children.entrySet()) {
          apply(child.getValue());
        }
      }
    }
  }

  /**
   * Walks the children of a group node.
   *
   * @throws IllegalStateException If the node is not a group.
   */
  private void processGroup(XmlSchemaDocumentNode<AvroRecordInfo> doc) {
    /* The union of valid types is already on the stack from
     * the owning element.  We just need to walk the children.
     */
    switch( doc.getStateMachineNode().getNodeType() ) {
    case SUBSTITUTION_GROUP:
    case ALL:
    case CHOICE:
    case SEQUENCE:
      processChildren(doc);
      break;
    default:
      throw new IllegalStateException(
          "Attempted to process a group, but the document node is of type "
          + doc.getStateMachineNode().getNodeType());
    }
  }

  /**
   * Confirms every member of the UNION is a RECORD or a MAP whose value
   * type is a RECORD or a UNION of RECORDs.  When <code>isMixed</code> is
   * set, STRING members are accepted as well.
   *
   * @throws IllegalArgumentException If any member is of another type.
   */
  private static void verifyIsUnionOfMapsAndRecords(
      Schema schema,
      boolean isMixed) {

    for (Schema unionType : schema.getTypes()) {
      if (!unionType.getType().equals(Schema.Type.RECORD)
          && !unionType.getType().equals(Schema.Type.MAP)
          && !(isMixed && unionType.getType().equals(Schema.Type.STRING))) {
        throw new IllegalArgumentException(
            "The Avro Schema may either be a UNION or an ARRAY of UNION, but"
            + " only if all of the elements in the UNION are of either type"
            + " RECORD or MAP, not " + unionType.getType());

      } else if (unionType.getType().equals(Schema.Type.MAP)) {
        if ( unionType.getValueType().getType().equals(Schema.Type.UNION) ) {
          for (Schema mapUnionType : unionType.getValueType().getTypes()) {
            if (!mapUnionType.getType().equals(Schema.Type.RECORD)) {
              throw new IllegalArgumentException(
                  "If using a UNION of MAP of UNION, all of the UNION types"
                  + " must be RECORD, not " + mapUnionType.getType());
            }
          }

        } else if ( !unionType
                       .getValueType()
                       .getType()
                       .equals(Schema.Type.RECORD)) {
          throw new IllegalArgumentException(
              "If the Avro Schema is a UNION of MAPs or an ARRAY of UNION of"
              + " MAPs, all MAP value types must be RECORD or UNION of RECORD,"
              + " not " + unionType.getValueType().getType());
        }
      }
    }
  }

  /**
   * Checks whether the schema's name equals the element's name and, when
   * the element has a non-empty namespace, whether the Avro namespace
   * derived from it equals the schema's namespace.
   *
   * @throws IllegalStateException If the element's namespace is not a
   *         valid URI.
   */
  private static boolean typeMatchesElement(Schema type, XmlSchemaElement element) {
    boolean match = false;

    if (type.getName().equals( element.getName() )) {
      // Confirm the namespaces match.
      String ns = element.getQName().getNamespaceURI();
      if ((ns != null) && !ns.isEmpty()) {
        try {
          if (Utils.getAvroNamespaceFor(ns).equals(
                type.getNamespace())) {
            // Namespaces match.
            match = true;
          }
        } catch (URISyntaxException e) {
          throw new IllegalStateException(
              "Element \"" + element.getQName()
              + "\" has a namespace that is not a valid URI.",
              e);
        }
      } else {
        // There is no namespace; auto-match.
        match = true;
      }
    }

    return match;
  }

  /* Confirms an XML Schema type can be converted to or from an Avro type.
   *
   * The XML type is first translated to an Avro schema.  When writing XML
   * the Avro type is the source and the XML type the destination; otherwise
   * the XML type is the source and the Avro type the destination.
   *
   * Two missing schemas are equivalent; one missing schema is not.
   */
  private boolean confirmEquivalent(
      XmlSchemaTypeInfo xmlType,
      QName xmlTypeQName,
      Schema avroType) {

    final Schema xmlAvroType =
        Utils.getAvroSchemaFor(xmlType, xmlTypeQName, false);

    if ((avroType == null) && (xmlAvroType == null)) {
      return true;

    } else if ((avroType == null) || (xmlAvroType == null)) {
      return false;
    }

    if (xmlIsWritten) {
      return confirmEquivalent(avroType, xmlAvroType);
    } else {
      return confirmEquivalent(xmlAvroType, avroType);
    }
  }

  /* Confirms readerType (the source) can be converted to writerType (the
   * destination), following the conversion table in the class comment.
   *
   * ARRAYs are compared by element type.  For UNIONs, every member of the
   * source must be convertible to some member of the destination.
   *
   * Throws IllegalArgumentException when the destination is of a type
   * that is not covered by those conversions.
   */
  private boolean confirmEquivalent(Schema readerType, Schema writerType) {
    if (readerType.getType().equals(Schema.Type.ARRAY)
        && (writerType.getType().equals(Schema.Type.ARRAY))) {
      return confirmEquivalent(
          readerType.getElementType(),
          writerType.getElementType());

    } else if (readerType.getType().equals(Schema.Type.UNION)
        && writerType.getType().equals(Schema.Type.UNION)) {

      /* O(N^2) cross-examination: count the source members that have at
       * least one destination member they can be converted to.
       */
      int numFound = 0;
      for (Schema readerUnionType : readerType.getTypes()) {
        for (Schema writerUnionType : writerType.getTypes()) {
          if ( confirmEquivalent(readerUnionType, writerUnionType) ) {
            ++numFound;
            break;
          }
        }
      }
      return (readerType.getTypes().size() == numFound);
    }

    if ( conversionCache.containsKey(writerType.getType()) ) {
      return conversionCache.get( writerType.getType() )
                            .contains( readerType.getType() );
    }

    final HashSet<Schema.Type> convertibleFrom = new HashSet<Schema.Type>();
    switch ( writerType.getType() ) {
    case STRING:
      // STRING, BOOLEAN, ENUM, DOUBLE, FLOAT, LONG, INT -> STRING
      convertibleFrom.add(Schema.Type.STRING);
      convertibleFrom.add(Schema.Type.BOOLEAN);
      convertibleFrom.add(Schema.Type.ENUM);
      /* falls through */
    case DOUBLE:
      // DOUBLE, FLOAT, LONG, INT -> DOUBLE
      convertibleFrom.add(Schema.Type.DOUBLE);
      /* falls through */
    case FLOAT:
      // FLOAT, LONG, INT -> FLOAT
      convertibleFrom.add(Schema.Type.FLOAT);
      /* falls through */
    case LONG:
      // LONG, INT -> LONG
      convertibleFrom.add(Schema.Type.LONG);
      /* falls through */
    case INT:
      // INT -> INT
      convertibleFrom.add(Schema.Type.INT);
      break;

    case BOOLEAN:
      // BOOLEAN -> BOOLEAN
      convertibleFrom.add(Schema.Type.BOOLEAN);
      break;

    case BYTES:
      // BYTES -> BYTES
      convertibleFrom.add(Schema.Type.BYTES);
      break;

    case ENUM:
    case RECORD:
      // These are more complex.
      break;

    default:
      throw new IllegalArgumentException(
          "Cannot confirm the equivalency of a reader of type "
          + readerType.getType() + " and a writer of type "
          + writerType.getType());
    }

    if ( !convertibleFrom.isEmpty() ) {
      conversionCache.put(writerType.getType(), convertibleFrom);
      return convertibleFrom.contains( readerType.getType() );
    }

    /* If we're here, it's because the writer is either an ENUM or a RECORD.
     * For ENUMs, confirm the writer elements are a superset of the reader
     * elements.  For RECORDs, confirm the fields are convertible.
     */
    if (writerType.getType().equals(Schema.Type.ENUM)
        && readerType.getType().equals(Schema.Type.ENUM) ) {

      final List<String> writerSymbols = writerType.getEnumSymbols();
      final List<String> readerSymbols = readerType.getEnumSymbols();

      for (String readerSymbol : readerSymbols) {
        if ( !writerSymbols.contains(readerSymbol) ) {
          return false;
        }
      }

      return true;

    } else if ( writerType.getType().equals(Schema.Type.RECORD)
        && readerType.getType().equals(Schema.Type.RECORD) ) {

      final List<Schema.Field> writerFields = writerType.getFields();
      final List<Schema.Field> readerFields = readerType.getFields();

      // Fields are compared pairwise, by position.
      if (readerFields.size() == writerFields.size()) {
        boolean equivalent = true;

        for (int fieldIdx = 0; fieldIdx < writerFields.size(); ++fieldIdx) {
          equivalent =
              confirmEquivalent(
                  readerFields.get(fieldIdx).schema(),
                  writerFields.get(fieldIdx).schema());
          if (!equivalent) {
            break;
          }
        }

        return equivalent;
      }
    }

    return false;
  }

  /** Returns whether the record information exists and is for a MAP. */
  private static boolean isMap(AvroRecordInfo recordInfo) {
    return (recordInfo != null)
        && recordInfo.getAvroSchema().getType().equals(Schema.Type.MAP);
  }

  /**
   * Avro maps are tricky because they must be defined all at once, but
   * depending on the schema, their elements may be scattered all across
   * the document.
   *
   * This implementation looks for map nodes that are clustered together,
   * and counts them for when {@link XmlDatumWriter} takes over.  A cluster
   * starts the first time we reach a path node whose underlying Avro schema
   * is of type {@link Schema.Type#MAP}.  A cluster ends when the next
   * traversal out of a map node is to its parent element.  (Intermediary
   * groups do not count as the end of the cluster.)
   *
   * @param path The path to check if is a map node.
   */
  private static void findMaps(
      XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> path) {

    Map<QName, List<List<AvroPathNode>>> occurrencesByName =
        new HashMap<QName, List<List<AvroPathNode>>>();

    final ArrayList<StackEntry> docNodeStack = new ArrayList<StackEntry>();

    AvroPathNode mostRecentlyLeftMap = null;

    while (path != null) {
      final boolean isElement =
          path
            .getStateMachineNode()
            .getNodeType()
            .equals(XmlSchemaStateMachineNode.Type.ELEMENT);

      // Elements without a matching Avro schema carry no record information.
      final boolean isMapNode =
          isMap(path.getDocumentNode().getUserDefinedContent());

      switch (path.getDirection()) {
      case SIBLING:
        {
          if (isElement) {
            /* This is an element increasing its own occurrence.
             * This means we need to pop the previous element off
             * of the stack and start a new one.
             */
            final StackEntry stackEntry =
                docNodeStack.remove(docNodeStack.size() - 1);

            mostRecentlyLeftMap =
                leaveElement(
                    occurrencesByName,
                    mostRecentlyLeftMap,
                    stackEntry,
                    path);
          }
        }
        /* falls through */
      case CHILD:
        {
          if (isElement) {
            StackEntry entry = new StackEntry(path.getDocumentNode());

            if (isMapNode) {
              final QName currQName =
                  path
                    .getStateMachineNode()
                    .getElement()
                    .getQName();

              List<List<AvroPathNode>> occurrences = null;

              if ((mostRecentlyLeftMap == null)
                  || !currQName.equals( mostRecentlyLeftMap.getQName() )) {

                // A new cluster starts; close the previous one, if any.
                if (mostRecentlyLeftMap != null) {
                  addEndNode(occurrencesByName, mostRecentlyLeftMap);
                }

                final ArrayList<AvroPathNode> pathIndices =
                    new ArrayList<AvroPathNode>();
                pathIndices.add(
                    new AvroPathNode(
                        path,
                        AvroPathNode.Type.MAP_START));

                incrementMapParentChildCount(path);

                if (!occurrencesByName.containsKey(currQName)) {
                  occurrences = new ArrayList<List<AvroPathNode>>();
                  occurrencesByName.put(currQName, occurrences);
                } else {
                  occurrences = occurrencesByName.get(currQName);
                }
                occurrences.add(pathIndices);

              } else {
                // Another entry of the cluster that was just left.
                occurrences = occurrencesByName.get(currQName);
                occurrences
                  .get(occurrences.size() - 1)
                  .add(
                      new AvroPathNode(
                          path,
                          AvroPathNode.Type.ITEM_START));
              }

              entry.occurrence = occurrences.size() - 1;
              mostRecentlyLeftMap = null;
            }

            docNodeStack.add(entry);
          }
          break;
        }
      case PARENT:
        {
          final StackEntry stackEntry =
              docNodeStack.get(docNodeStack.size() - 1);

          // Only leave the element once we traverse to its own parent.
          if (stackEntry.parentNode == path.getDocumentNode()) {
            docNodeStack.remove(docNodeStack.size() - 1);

            mostRecentlyLeftMap =
                leaveElement(
                    occurrencesByName,
                    mostRecentlyLeftMap,
                    stackEntry,
                    path);
          }
          break;
        }
      case CONTENT:
        break;
      default:
        throw new IllegalStateException(
            "Path of " + path.getStateMachineNode()
            + " has an unrecognized direction of "
            + path.getDirection() + ".");
      }

      path = path.getNext();
    }

    /* The path may end before the element owning the last cluster is left.
     * Record the pending end node so the size computed below is correct.
     */
    if (mostRecentlyLeftMap != null) {
      addEndNode(occurrencesByName, mostRecentlyLeftMap);
    }

    /* Will be 1 if the root is an element,
     * and 0 if the root is a substitution group.
     */
    if (docNodeStack.size() > 1) {
      throw new IllegalStateException(
          "Expected the stack to have no more than one "
          + "element in it at the end, but found "
          + docNodeStack.size() + ".");
    }

    for (Map.Entry<QName, List<List<AvroPathNode>>> entry :
          occurrencesByName.entrySet()) {
      for (List<AvroPathNode> avroMapNodes : entry.getValue()) {
        // The MAP_END node doesn't count as a child.
        avroMapNodes.get(0).setMapSize(avroMapNodes.size() - 1);

        for (AvroPathNode avroMapNode : avroMapNodes) {
          avroMapNode.getPathNode().setUserDefinedContent(avroMapNode);
        }
      }
    }
  }

  /**
   * Handles leaving an element while searching for maps: records the
   * pending end node, if any, and returns a new pending end node if the
   * element being left is a map, or <code>null</code> otherwise.
   */
  private static AvroPathNode leaveElement(
      Map<QName, List<List<AvroPathNode>>> occurrencesByName,
      AvroPathNode mostRecentlyLeftMap,
      StackEntry leftEntry,
      XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> path) {

    if (mostRecentlyLeftMap != null) {
      addEndNode(occurrencesByName, mostRecentlyLeftMap);
    }

    if ( !isMap(leftEntry.docNode.getUserDefinedContent()) ) {
      return null;
    }

    return new AvroPathNode(
        path,
        AvroPathNode.Type.MAP_END,
        leftEntry
          .docNode
          .getStateMachineNode()
          .getElement()
          .getQName(),
        leftEntry.occurrence);
  }

  /** Appends the end node to the cluster identified by its name and occurrence. */
  private static void addEndNode(
      Map<QName, List<List<AvroPathNode>>> occurrencesByName,
      AvroPathNode mostRecentlyLeftMap) {

    final List<List<AvroPathNode>> occurrences =
        occurrencesByName.get(mostRecentlyLeftMap.getQName());

    final List<AvroPathNode> nodes =
        occurrences.get(mostRecentlyLeftMap.getOccurrence());

    nodes.add(mostRecentlyLeftMap);
  }

  /* All of the elements in a map are grouped together, and likewise cannot be
   * counted as part of the MAP's parent's children.  Likewise, each time we
   * find a new MAP, we only increment the parent's child count by one.
   *
   * Nothing is incremented when the map element has no owning element.
   */
  private static void incrementMapParentChildCount(
      XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> path) {

    if (!path.getStateMachineNode()
             .getNodeType()
             .equals(XmlSchemaStateMachineNode.Type.ELEMENT)) {
      throw new IllegalArgumentException(
          "Starting node should be at an element, not a "
          + path.getStateMachineNode().getNodeType() + '.');
    }

    // Skip over any groups between the map element and its owning element.
    XmlSchemaDocumentNode<AvroRecordInfo> docNode =
        path.getDocumentNode().getParent();

    while ((docNode != null)
        && !docNode
              .getStateMachineNode()
              .getNodeType()
              .equals(XmlSchemaStateMachineNode.Type.ELEMENT)) {
      docNode = docNode.getParent();
    }

    if (docNode == null) {
      // The map is the root; there is no parent whose children to count.
      return;
    }

    if (docNode.getUserDefinedContent() == null) {
      throw new IllegalStateException(
          "Reached a node representing " + docNode.getStateMachineNode()
          + ", but it contains no Avro record information.");
    }

    docNode.getUserDefinedContent().incrementChildCount();
  }

  /**
   * Walks the path and, for each run of CONTENT nodes inside a mixed
   * element whose children are an ARRAY of UNION, increments the owning
   * element's child count and tags the last CONTENT node of the run with
   * the index of the STRING type in that UNION.
   *
   * @throws IllegalStateException If the UNION of a mixed element has no
   *         STRING, or a CONTENT path node was already tagged.
   */
  private static void applyContent(
      XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> startNode) {

    XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> path = startNode;

    final ArrayList<StackEntry> docNodeStack = new ArrayList<StackEntry>();

    while (path != null) {
      final boolean isElement =
          path
            .getStateMachineNode()
            .getNodeType()
            .equals(XmlSchemaStateMachineNode.Type.ELEMENT);

      switch (path.getDirection()) {
      case SIBLING:
        if (isElement) {
          /* This is an element increasing its own occurrence.
           * This means we need to pop the previous element off
           * of the stack and start a new one.
           */
          docNodeStack.remove(docNodeStack.size() - 1);
        }
        /* falls through */
      case CHILD:
        if (isElement) {
          StackEntry entry = new StackEntry(path.getDocumentNode());
          docNodeStack.add(entry);
        }
        break;
      case PARENT:
        {
          final StackEntry stackEntry =
              docNodeStack.get(docNodeStack.size() - 1);

          if (stackEntry.parentNode == path.getDocumentNode()) {
            docNodeStack.remove(docNodeStack.size() - 1);
          }
          break;
        }
      case CONTENT:
        {
          if ((path.getNext() != null)
              && path
                   .getNext()
                   .getDirection()
                   .equals(XmlSchemaPathNode.Direction.CONTENT)) {
            /* The writer only writes one content entry, after all of the
             * individual content entries have been merged together. So
             * we should skip any content entries that are immediately
             * followed by another content entry.
             */
            break;
          }

          final StackEntry entry = docNodeStack.get(docNodeStack.size() - 1);

          final AvroRecordInfo recordInfo =
              entry.docNode.getUserDefinedContent();

          if (recordInfo == null) {
            // No Avro schema was matched to this element; nothing to track.
            break;
          }

          final Schema recordSchema =
              getRecordSchema(
                  recordInfo.getAvroSchema(),
                  recordInfo.getMapUnionIndex());

          final XmlSchemaElement elem =
              entry.docNode.getStateMachineNode().getElement();

          final XmlSchemaTypeInfo elemType =
              entry.docNode.getStateMachineNode().getElementType();

          final Schema.Field childField =
              recordSchema.getField(elem.getQName().getLocalPart());

          if (elemType.isMixed() && (childField != null)) {
            final Schema childSchema = childField.schema();
            if (childSchema.getType().equals(Schema.Type.ARRAY)
                && childSchema
                     .getElementType()
                     .getType()
                     .equals(Schema.Type.UNION)) {

              final List<Schema> unionTypes =
                  childSchema.getElementType().getTypes();

              // Find where the STRING holding the text content lives.
              int unionIdx = 0;
              for (; unionIdx < unionTypes.size(); ++unionIdx) {
                if (unionTypes
                      .get(unionIdx)
                      .getType()
                      .equals(Schema.Type.STRING)) {
                  break;
                }
              }

              if (unionIdx == unionTypes.size()) {
                throw new IllegalStateException(
                    "Element " + elem.getQName()
                    + " is a mixed type, but its internal"
                    + " union does not have a STRING!");
              }

              recordInfo.incrementChildCount();

              final AvroPathNode pathNode = path.getUserDefinedContent();
              if (pathNode == null) {
                path.setUserDefinedContent(new AvroPathNode(unionIdx));
              } else {
                throw new IllegalStateException(
                    "The path node is for CONTENT, but an "
                    + "AvroPathNode already exists!");
              }
            }
          }
          break;
        }
      default:
        throw new IllegalStateException(
            "Path of " + path.getStateMachineNode()
            + " has an unrecognized direction of "
            + path.getDirection() + ".");
      }

      path = path.getNext();
    }
  }
}