XmlDatumWriter.java example

Explorer
xml-to-avro-master
- avro-to-xml
  - src
    - main
      - java
        avro
        complex_schema
        allTheThings.java
        anyAndFriends.java
        avroEnum.java
        backtrack.java
        complexExtension.java
        enums
        avroEnum.java
        firstMap.java
        fixed.java
        listOfUnion.java
        mixedType.java
        prohibit.java
        qName.java
        realRoot.java
        secondMap.java
        simpleExtension.java
        simpleRestriction.java
        unsignedLongList.java
        value.java
        xmlEnum.java
        mpigott
        avro
        xml
        Main.java
        sql
        xml
        SqlAttribute.java
        SqlRelationship.java
        SqlSchema.java
        SqlSchemaGenerator.java
        SqlTable.java
        SqlType.java
        SqlXmlConfig.java
        package-info.java
        org
        apache
        avro
        xml
        AvroPathNode.java
        AvroRecordInfo.java
        AvroSchemaApplier.java
        AvroSchemaGenerator.java
        Utils.java
        XmlDatumConfig.java
        XmlDatumReader.java
        XmlDatumWriter.java
        package-info.java
        w3
        www
        _2001
        xmlschema
        qName.java
    - test
      - java
        org
        apache
        avro
        xml
        TestAvroSchemaApplier.java
        TestAvroSchemaGenerator.java
        TestUtils.java
        TestXmlDatumWriter.java
        TestXmlToAvroAndBack.java
        UtilsForTests.java
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.avro.xml;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.net.URL;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.List;
import java.util.concurrent.TimeUnit;

import javax.xml.bind.DatatypeConverter;
import javax.xml.datatype.DatatypeFactory;
import javax.xml.datatype.Duration;
import javax.xml.namespace.QName;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.stream.StreamSource;

import org.apache.avro.Schema;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.Encoder;
import org.apache.ws.commons.schema.XmlSchemaAttribute;
import org.apache.ws.commons.schema.XmlSchemaCollection;
import org.apache.ws.commons.schema.XmlSchemaElement;
import org.apache.ws.commons.schema.constants.Constants;
import org.apache.ws.commons.schema.docpath.SaxWalkerOverDom;
import org.apache.ws.commons.schema.docpath.XmlSchemaDocumentNode;
import org.apache.ws.commons.schema.docpath.XmlSchemaNamespaceContext;
import org.apache.ws.commons.schema.docpath.XmlSchemaPathFinder;
import org.apache.ws.commons.schema.docpath.XmlSchemaPathNode;
import org.apache.ws.commons.schema.docpath.XmlSchemaStateMachineGenerator;
import org.apache.ws.commons.schema.docpath.XmlSchemaStateMachineNode;
import org.apache.ws.commons.schema.walker.XmlSchemaAttrInfo;
import org.apache.ws.commons.schema.walker.XmlSchemaBaseSimpleType;
import org.apache.ws.commons.schema.walker.XmlSchemaTypeInfo;
import org.apache.ws.commons.schema.walker.XmlSchemaWalker;
import org.w3c.dom.Document;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Reads an XML {@link Document} and writes it to an {@link Encoder}.
 * <p>
 * Generates an Avro {@link Schema} on the fly from the XML Schema itself. 
 * That {@link Schema} can be retrieved by calling {@link #getSchema()}.
 * </p>
 */
public class XmlDatumWriter implements DatumWriter<Document> {

  // Qualified name of the XML Schema instance "nil" attribute; the SAX
  // handler below looks it up on each element to detect explicit nils.
  private static final QName NIL_ATTR =
      new QName("http://www.w3.org/2001/XMLSchema-instance", "nil");

  // NOTE(review): the uses of these three fields lie outside this excerpt.
  // Per the class Javadoc, the Avro schema is generated from the XML Schema
  // and exposed through getSchema() - presumably via the "schema" field.
  private final XmlSchemaCollection xmlSchemaCollection;
  private final XmlSchemaStateMachineNode stateMachine;
  private Schema schema;

  /**
   * One open element on the handler's element stack: the document node the
   * element maps to, plus a flag recording whether any value has already
   * been written for it.
   */
  private static class StackEntry {
    XmlSchemaDocumentNode<AvroRecordInfo> docNode;

    // Starts out false; flipped once content (or a substitute) is written.
    boolean receivedContent = false;

    StackEntry(XmlSchemaDocumentNode<AvroRecordInfo> node) {
      docNode = node;
    }
  }

  private static class Writer extends DefaultHandler {
    // XML type used when writing text that belongs to mixed content
    // (see characters()): always treated as a plain string.
    private static final XmlSchemaTypeInfo XML_MIXED_CONTENT_TYPE =
        new XmlSchemaTypeInfo(XmlSchemaBaseSimpleType.STRING);

    // Avro counterpart of XML_MIXED_CONTENT_TYPE: a plain Avro string.
    private static final Schema AVRO_MIXED_CONTENT_SCHEMA =
        Schema.create(Schema.Type.STRING);

    // Current position in the path; set to "path" by startDocument() and
    // advanced as elements and content are received.
    private XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> currLocation;
    // Accumulates text when characters() is invoked more than once for the
    // same element; null until first needed.
    private StringBuilder content;
    // Name of the wildcard ("any") element currently being skipped, or null
    // when the handler is not inside one.
    private QName currAnyElem;
    // Elements that have been started but not yet ended; last entry is the
    // innermost one.
    private ArrayList<StackEntry> stack;

    // First node of the path to follow through the document.
    private final XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> path;
    // Destination of all Avro output.
    private final Encoder out;
    // Updated from startPrefixMapping()/endPrefixMapping().
    private final XmlSchemaNamespaceContext nsContext;

    /**
     * Creates a handler that follows <code>path</code> through the document
     * and sends its output to <code>out</code>.
     *
     * @param path The first node of the path to follow.
     * @param out  The encoder to write to.
     */
    Writer(
        XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> path,
        Encoder out) {

      // Traversal state: nothing has been visited or collected yet.
      this.currLocation = null;
      this.content = null;
      this.currAnyElem = null;

      this.stack = new ArrayList<StackEntry>();
      this.nsContext = new XmlSchemaNamespaceContext();

      this.out = out;
      this.path = path;
    }

    /**
     * Resets the current location to the first node of the path.
     */
    @Override
    public void startDocument() throws SAXException {
      this.currLocation = this.path;
    }

    /**
     * Registers a prefix-to-namespace binding with the namespace context.
     *
     * @param prefix The prefix being declared.
     * @param uri    The namespace URI the prefix is bound to.
     */
    @Override
    public void startPrefixMapping(String prefix, String uri)
        throws SAXException {
      this.nsContext.addNamespace(prefix, uri);
    }

    /**
     * Removes a prefix binding from the namespace context.
     *
     * @param prefix The prefix going out of scope.
     */
    @Override
    public void endPrefixMapping(String prefix) throws SAXException {
      this.nsContext.removeNamespace(prefix);
    }

    /**
     * Handles the start of an element by advancing along the path and
     * writing everything that can be written before the element's content
     * arrives.
     * <p>
     * In order, this method:
     * </p>
     * <ol>
     *   <li>Returns immediately while inside a skipped wildcard element.</li>
     *   <li>Walks the path forward to this element and checks that the
     *       path node's direction is CHILD or SIBLING.</li>
     *   <li>If the path node is a wildcard (ANY), remembers the element
     *       name so it and its descendants are skipped, and returns.</li>
     *   <li>For a RECORD, writes the item start (when nested) and the
     *       union index (when non-negative).  For a MAP, additionally
     *       writes the map start/item start and the key, which is taken
     *       from the attribute whose user-recognized type is XSD ID.</li>
     *   <li>Writes every field except the last one from the element's
     *       attributes, falling back on the attribute's default and then
     *       fixed value when the attribute is absent.</li>
     *   <li>Looks at the field named after the element's local name: a
     *       complex-typed ARRAY starts an Avro array, a NULL writes null,
     *       and otherwise a true xsi:nil attribute writes a null value.</li>
     *   <li>Pushes a stack entry for the element.</li>
     * </ol>
     *
     * @throws IllegalStateException if the path node reached is not a CHILD
     *                               or SIBLING step.
     * @throws RuntimeException if anything fails once writing has begun;
     *                          the original exception is kept as the cause.
     */
    @Override
    public void startElement(
        String uri,
        String localName,
        String qName,
        Attributes atts) throws SAXException {

      if (currAnyElem != null) {
        // We are inside an any element and not processing this one.
        return;
      }

      final QName elemQName = new QName(uri, localName);

      // Moves currLocation forward; throws if the path disagrees.
      walkToElement(elemQName);

      if (!currLocation
            .getDirection()
            .equals(XmlSchemaPathNode.Direction.CHILD)
          && !currLocation
               .getDirection()
               .equals(XmlSchemaPathNode.Direction.SIBLING)) {
        throw new IllegalStateException(
            "We are starting an element, so our path node direction should be "
            + "to a CHILD or SIBLING, not "
            + currLocation.getDirection());
      }

      if (currLocation
            .getStateMachineNode()
            .getNodeType()
            .equals(XmlSchemaStateMachineNode.Type.ANY)) {

        // This is an any element; we are not processing it.
        currAnyElem = elemQName;
        return;
      }

      try {
        final XmlSchemaDocumentNode<AvroRecordInfo> doc =
            currLocation.getDocumentNode();
        final AvroRecordInfo recordInfo = doc.getUserDefinedContent();

        // May be replaced below by the MAP's value (record) schema.
        Schema avroSchema = recordInfo.getAvroSchema();

        final List<XmlSchemaAttrInfo> attributes =
            doc.getStateMachineNode().getAttributes();

        // Both lookups are keyed by the attribute's name, which is matched
        // against Avro field names further down.
        final HashMap<String, XmlSchemaTypeInfo> attrTypes =
            new HashMap<String, XmlSchemaTypeInfo>();

        final HashMap<String, XmlSchemaAttribute> schemaAttrs =
            new HashMap<String, XmlSchemaAttribute>();

        for (XmlSchemaAttrInfo attribute : attributes) {
          attrTypes.put(
              attribute.getAttribute().getName(),
              attribute.getType());

          schemaAttrs.put(
              attribute.getAttribute().getName(),
              attribute.getAttribute());
        }

        // If there are children, we want to start an array and end it later.
        final StackEntry entry =
            new StackEntry(currLocation.getDocumentNode());

        if (avroSchema.getType().equals(Schema.Type.RECORD)) {
          // An empty stack means this is the first element written, which
          // is not an item of any enclosing array.
          if ( !stack.isEmpty() ) {
            out.startItem();
          }
          if (recordInfo.getUnionIndex() >= 0) {
            out.writeIndex( recordInfo.getUnionIndex() );
          }

        } else if ( avroSchema.getType().equals(Schema.Type.MAP) ) {
          final AvroPathNode mapNode = currLocation.getUserDefinedContent();
          if (mapNode == null) {
            throw new IllegalStateException(
                "Reached "
                + elemQName
                + ", a MAP node, but there is no map information here.");
          }

          switch ( mapNode.getType() ) {
          case MAP_START:
            {
              if ( !stack.isEmpty() ) {
                out.startItem();
              }
              if (recordInfo.getUnionIndex() >= 0) {
                out.writeIndex( recordInfo.getUnionIndex() );
              }
              out.writeMapStart();
              out.setItemCount( mapNode.getMapSize() );
            }
            // No break here: control falls through to ITEM_START, so the
            // element that opens the map is also written as its first
            // entry.  NOTE(review): appears intentional - confirm.
          case ITEM_START:
            {
              out.startItem();

              // From here on, avroSchema is the schema of the map's values.
              avroSchema = avroSchema.getValueType();

              /* If the MAP value is another UNION, reach
               * into that one to fetch the schema.
               */
              final int mapUnionIndex = recordInfo.getMapUnionIndex();
              if (mapUnionIndex >= 0) {
                avroSchema = avroSchema.getTypes().get(mapUnionIndex);
              }

              String key = null;

              // Search the attribute fields (all but the last field) for
              // the one typed as an XSD ID; its value is the map key.
              for (int fieldIndex = 0;
                  fieldIndex < avroSchema.getFields().size() - 1;
                  ++fieldIndex) {

                final Schema.Field field =
                    avroSchema.getFields().get(fieldIndex);

                final XmlSchemaTypeInfo attrType = attrTypes.get(field.name());

                final XmlSchemaAttribute xsa = schemaAttrs.get(field.name());

                if ((attrType.getUserRecognizedType() != null)
                    && attrType
                         .getUserRecognizedType()
                         .equals(Constants.XSD_ID)) {
                  key =
                      getAttrValue(
                          atts,
                          xsa.getQName().getNamespaceURI(),
                          field.name());

                  if (key == null) {
                    throw new IllegalStateException(
                        "Attribute value for "
                        + xsa.getQName()
                        + " of element "
                        + elemQName
                        + " is null.");
                  }
                  break;
                }
              }

              if (key == null) {
                throw new IllegalStateException(
                    "Unable to find key for element " + elemQName);
              }

              out.writeString(key);

              /* If the MAP value is another UNION, write
               * the union index before continuing.
               */
              if (mapUnionIndex >= 0) {
                out.writeIndex(mapUnionIndex);
              }
              break;
            }
          case MAP_END:
          case CONTENT:
          default:
            throw new IllegalStateException(
                "Did not expect to find a map node of type "
                + mapNode.getType()
                + " when starting "
                + elemQName
                + ".");
          }

        } else {
          throw new IllegalStateException(
              "Elements are either MAPs or RECORDs, not "
              + avroSchema.getType()
              + "s.");
        }

        /* The last element in the set of fields is the children.  We want
         * to process the children separately as they require future calls
         * to characters() and/or startElement().
         */
        for (int fieldIndex = 0;
            fieldIndex < avroSchema.getFields().size() - 1;
            ++fieldIndex) {

          final Schema.Field field = avroSchema.getFields().get(fieldIndex);
          if (field.name().equals(elemQName.getLocalPart())) {
            // We reached the children field early ... not supposed to happen!
            throw new IllegalStateException(
                "The children field is indexed at "
                + fieldIndex
                + " when it was expected to be the last element, or "
                + (avroSchema.getFields().size() - 1)
                + ".");
          }

          final XmlSchemaTypeInfo typeInfo = attrTypes.get( field.name() );
          final QName attrQName = schemaAttrs.get( field.name() ).getQName();

          String value =
              getAttrValue(
                  atts,
                  attrQName.getNamespaceURI(),
                  field.name());

          if (value == null) {
            // See if there is a default or fixed value instead.
            final XmlSchemaAttribute schemaAttr =
                schemaAttrs.get( field.name() );

            value = schemaAttr.getDefaultValue();
            if (value == null) {
              value = schemaAttr.getFixedValue();
            }
          }

          try {
            write(typeInfo, attrQName, field.schema(), value);
          } catch (Exception e) {
            throw new RuntimeException(
                "Could not write "
                + field.name()
                + " in "
                + field.schema().toString()
                + " to the output stream for element "
                + elemQName,
                e);
          }
        }

        final XmlSchemaTypeInfo elemType =
            doc.getStateMachineNode().getElementType();

        boolean isComplexType = true;
        if ( !elemType.getType().equals(XmlSchemaTypeInfo.Type.COMPLEX) ) {
          isComplexType = false;
        }

        // The field holding the element's own content/children is looked
        // up by the element's local name.
        if (avroSchema
              .getField( elemQName.getLocalPart() )
              .schema()
              .getType()
              .equals(Schema.Type.ARRAY)
            && isComplexType) {
          out.writeArrayStart();

          // Never hand a negative count to the encoder.
          if (recordInfo.getNumChildren() > 0) {
            out.setItemCount( recordInfo.getNumChildren() );
          } else {
            out.setItemCount(0);
          }

          /* We expect to receive child elements; no need to look
           * for a default or fixed value once this element exits.
           */
          entry.receivedContent = true;

        } else if (avroSchema
                     .getField( elemQName.getLocalPart() )
                     .schema()
                     .getType()
                     .equals(Schema.Type.NULL) ) {
          out.writeNull();
          entry.receivedContent = true;

        } else {
          final int nilIndex =
              atts.getIndex(
                  NIL_ATTR.getNamespaceURI(),
                  NIL_ATTR.getLocalPart()); 

          // An explicit nil is written as null data right away, so
          // endElement() does not apply a default or fixed value.
          if ((nilIndex >= 0)
              && Boolean.parseBoolean(atts.getValue(nilIndex))) {

            write(doc.getStateMachineNode().getElementType(),
                  elemQName,
                  avroSchema.getField( elemQName.getLocalPart() ).schema(),
                  null);
            entry.receivedContent = true;
          }
        }

        stack.add(entry);

      } catch (Exception e) {
        // Also wraps the IllegalStateExceptions thrown inside the try block.
        throw new RuntimeException(
            "Unable to write "
            + elemQName
            + " to the output stream.",
            e);
      }
    }

    /**
     * Handles character data for the element on top of the stack.
     * <p>
     * Character data inside a skipped wildcard element is ignored.
     * Otherwise the path is walked forward to the next CONTENT node within
     * the element; when there is none, whitespace-only text is dropped and
     * anything else is an error.  Text is buffered while more CONTENT nodes
     * for the same element lie ahead, and written once the last one has
     * been received.
     * </p>
     * <p>
     * The Avro schema of the content field is resolved the same way
     * startElement() and endElement() resolve it: when the element is
     * represented as a MAP, the field is looked up in the map's value
     * schema (and in the selected union branch, if any) rather than in the
     * MAP schema itself, which has no fields.
     * </p>
     *
     * @param ch     The buffer holding the characters.
     * @param start  The index of the first character to use.
     * @param length The number of characters to use.
     * @throws SAXException if no element is open, if non-whitespace text
     *                      arrives where the path expects no content, or if
     *                      the path ends at the content node.
     * @throws RuntimeException if the content cannot be written; the
     *                          original exception is kept as the cause.
     */
    @Override
    public void characters(char[] ch, int start, int length)
        throws SAXException {

      if (currAnyElem != null) {
        // We do not process wildcard elements.
        return;
      }

      if (stack.isEmpty()) {
        throw new SAXException(
            "We are processing content, but the element stack is empty!");
      }

      final StackEntry entry = stack.get(stack.size() - 1);
      final XmlSchemaDocumentNode<AvroRecordInfo> owningElem = entry.docNode;

      // walkToContent() is declared with a raw return type; the nodes it
      // returns come from currLocation's own chain, so the conversion is
      // safe.  The local is deliberately not called "path" to avoid hiding
      // the field of that name.
      @SuppressWarnings("unchecked")
      final XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> contentPath =
          walkToContent(owningElem);

      if (contentPath == null) {
        final String str = new String(ch, start, length).trim();

        if (str.isEmpty()) {
          // Whitespace between elements; nothing to write.
          return;
        }

        throw new SAXException(
            "We are processing characters \""
            + str
            + "\" for "
            + owningElem
                .getStateMachineNode()
                .getElement()
                .getQName()
            + " but the current direction is "
            + currLocation.getDirection()
            + " to "
            + currLocation.getStateMachineNode()
            + ", not CONTENT.");
      }

      currLocation = contentPath;

      if (currLocation.getNext() == null) {
        throw new SAXException(
            "We are processing characters for "
            + owningElem
                .getStateMachineNode()
                .getElement()
                .getQName()
            + " but somehow the path ends here!");
      }

      /* If characters() will be called multiple times, we want to collect
       * all of them in the "content" StringBuilder, then process it all
       * once the last bit of content has been collected.
       *
       * This includes where content is interspersed with any elements, which
       * are skipped anyway.
       *
       * If this is the last content node, we'll just write it all out here.
       */
      final boolean moreContentComing =
          hasMoreContent(currLocation.getNext(), owningElem);

      String result = null;
      if (moreContentComing
          || ((content != null) && (content.length() > 0))) {

        if (content == null) {
          content = new StringBuilder();
        }
        content.append(ch, start, length);

        if (!moreContentComing) {
          // If this is the last node, process the content.
          result = content.toString();
          content.delete(0, content.length());
        }
      } else {
        // This is the only content node - just write it.
        result = new String(ch, start, length);
      }

      if (result == null) {
        // Still buffering; the text is written by a later call.
        return;
      }

      final XmlSchemaTypeInfo elemType =
          owningElem.getStateMachineNode().getElementType();

      final QName elemQName =
          owningElem
            .getStateMachineNode()
            .getElement()
            .getQName();

      // Find the record schema that actually declares the content field.
      final AvroRecordInfo recordInfo = owningElem.getUserDefinedContent();

      Schema recordSchema = recordInfo.getAvroSchema();
      if (recordSchema.getType().equals(Schema.Type.MAP)) {
        recordSchema = recordSchema.getValueType();

        final int mapUnionIndex = recordInfo.getMapUnionIndex();
        if (mapUnionIndex >= 0) {
          recordSchema = recordSchema.getTypes().get(mapUnionIndex);
        }
      }

      final Schema avroSchema =
          recordSchema.getField(elemQName.getLocalPart()).schema();

      try {
        final AvroPathNode contentPathNode =
            currLocation.getUserDefinedContent();

        if ((contentPathNode != null)
            && contentPathNode.getType().equals(AvroPathNode.Type.CONTENT)) {

          // Mixed content: the text is one item of the element's array of
          // children, written as a string under its union index.
          out.startItem();
          out.writeIndex(contentPathNode.getContentUnionIndex());

          write(
              XML_MIXED_CONTENT_TYPE,
              elemQName,
              AVRO_MIXED_CONTENT_SCHEMA,
              result);

        } else {
          write(elemType, elemQName, avroSchema, result);
        }
        entry.receivedContent = true;
      } catch (Exception e) {
        throw new RuntimeException(
            "Unable to write the content \""
            + result
            + "\" for "
            + elemQName,
            e);
      }
    }

    /**
     * Handles the end of an element: pops it off the stack, writes a
     * default or fixed value if nothing was written for it, and closes any
     * Avro array or map that the element opened.
     * <p>
     * The popped stack entry is checked against the element being closed
     * <em>before</em> anything is written, so a mismatch is reported
     * without first emitting a value that belongs to a different element.
     * </p>
     *
     * @param uri       The namespace URI of the element.
     * @param localName The local name of the element.
     * @param qName     The qualified name of the element (unused).
     * @throws SAXException if no element is open.
     * @throws IllegalStateException if the element being closed is not the
     *                               one on top of the stack, or its Avro
     *                               record has no field named after it.
     * @throws RuntimeException if writing to the encoder fails; the
     *                          original exception is kept as the cause.
     */
    @Override
    public void endElement(
        String uri,
        String localName,
        String qName)
        throws SAXException
    {
      final QName elemQName = new QName(uri, localName);

      if (currAnyElem != null) {
        if (currAnyElem.equals(elemQName)) {
          // We are exiting an any element; prepare for the next one!
          currAnyElem = null;
        }
        return;
      }

      if (stack.isEmpty()) {
        throw new SAXException(
            "We are leaving "
            + elemQName
            + ", but the element stack is empty!");
      }

      final StackEntry entry = stack.remove(stack.size() - 1);
      final XmlSchemaDocumentNode<AvroRecordInfo> docNode = entry.docNode;

      // Validate before touching the encoder.
      final QName stackElemQName =
          docNode
            .getStateMachineNode()
            .getElement()
            .getQName();

      if (!stackElemQName.equals(elemQName)) {
        throw new IllegalStateException(
            "We are leaving "
            + elemQName
            + " but the element on the stack is "
            + stackElemQName + ".");
      }

      final XmlSchemaTypeInfo elemType =
          docNode.getStateMachineNode().getElementType();

      final AvroRecordInfo recordInfo = docNode.getUserDefinedContent();

      /* A MAP stores its elements as values, possibly inside a UNION;
       * reach through both to find the record describing this element.
       */
      Schema avroSchema = recordInfo.getAvroSchema();

      final boolean isMap = avroSchema.getType().equals(Schema.Type.MAP);
      if (isMap) {
        avroSchema = avroSchema.getValueType();

        final int mapUnionIndex = recordInfo.getMapUnionIndex();
        if (mapUnionIndex >= 0) {
          avroSchema = avroSchema.getTypes().get(mapUnionIndex);
        }
      }

      final Schema.Field contentField =
          avroSchema.getField( elemQName.getLocalPart() );

      if (contentField == null) {
        throw new IllegalStateException(
            "The Avro record for "
            + elemQName
            + " has no field named "
            + elemQName.getLocalPart()
            + ".");
      }

      if (!entry.receivedContent) {

        /* Look for either the default value
         * or fixed value and apply it, if any.
         */
        String value =
            docNode.getStateMachineNode().getElement().getDefaultValue();

        if (value == null) {
          value = docNode.getStateMachineNode().getElement().getFixedValue();
        }

        try {
          write(elemType, elemQName, contentField.schema(), value);
        } catch (IOException e) {
          throw new RuntimeException(
              "Attempted to write a default value of \""
              + value
              + "\" for "
              + elemQName
              + " and failed.",
              e);
        }
      }

      // Only elements stored in a MAP can be the last entry of one.
      final boolean isMapEnd = isMap && isMapEnd();

      final boolean isComplexType =
          elemType.getType().equals(XmlSchemaTypeInfo.Type.COMPLEX);

      // Mirrors the condition under which startElement() opened the array.
      if (isComplexType
          && contentField.schema().getType().equals(Schema.Type.ARRAY)) {
        try {
          out.writeArrayEnd();
        } catch (Exception e) {
          throw new RuntimeException(
              "Unable to end the array for " + elemQName, e);
        }
      }

      if (isMapEnd) {
        try {
          out.writeMapEnd();
        } catch (Exception e) {
          throw new RuntimeException("Unable to process a MAP_END.", e);
        }
      }
    }

    /**
     * Checks that whatever remains of the path after the current location
     * consists solely of PARENT steps.  When there were remaining nodes,
     * the current location is left at the end of the path (null).
     *
     * @throws IllegalStateException if a remaining path node is not a
     *                               PARENT step.
     */
    @Override
    public void endDocument() throws SAXException {
      final XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> remainder =
          currLocation.getNext();

      if (remainder == null) {
        // Already at the end of the path.
        return;
      }

      for (currLocation = remainder;
          currLocation != null;
          currLocation = currLocation.getNext()) {

        final boolean headsUp =
            currLocation
              .getDirection()
              .equals(XmlSchemaPathNode.Direction.PARENT);

        if (!headsUp) {
          throw new IllegalStateException(
              "Path has more nodes after document end: "
              + currLocation.getDirection()
              + " | "
              + currLocation.getStateMachineNode());
        }
      }
    }

    /**
     * Advances the current location to the path node for the element that
     * is starting.
     * <p>
     * When no element is open and the current location already is the
     * element node for <code>elemName</code>, nothing moves.  Otherwise
     * the path is followed, skipping PARENT steps and any step that leads
     * to neither an element nor a wildcard, until an element or wildcard
     * node is reached.
     * </p>
     *
     * @param elemName The name of the element being started.
     * @throws IllegalStateException if the path runs out, or if the element
     *                               node reached has a different name.
     */
    private void walkToElement(QName elemName) {
      if (stack.isEmpty()) {
        final XmlSchemaStateMachineNode here =
            currLocation.getStateMachineNode();

        if (here.getNodeType().equals(XmlSchemaStateMachineNode.Type.ELEMENT)
            && here.getElement().getQName().equals(elemName)) {
          // Already standing on the first element of the document.
          return;
        }
      }

      while (true) {
        currLocation = currLocation.getNext();

        if (currLocation == null) {
          throw new IllegalStateException(
              "Cannot find " + elemName + " in the path!");
        }

        if (currLocation
              .getDirection()
              .equals(XmlSchemaPathNode.Direction.PARENT)) {
          // Steps back up the tree never start an element.
          continue;
        }

        final XmlSchemaStateMachineNode.Type nodeType =
            currLocation.getStateMachineNode().getNodeType();

        if (nodeType.equals(XmlSchemaStateMachineNode.Type.ELEMENT)
            || nodeType.equals(XmlSchemaStateMachineNode.Type.ANY)) {
          break;
        }
      }

      final XmlSchemaStateMachineNode reached =
          currLocation.getStateMachineNode();

      // Wildcard nodes carry no element name to compare against.
      if (reached.getNodeType().equals(XmlSchemaStateMachineNode.Type.ELEMENT)
          && !reached.getElement().getQName().equals(elemName)) {
        throw new IllegalStateException(
            "The next element in the path is "
            + reached.getElement().getQName()
            + " ("
            + currLocation.getDirection()
            + "), not "
            + elemName
            + ".");
      }
    }

    /**
     * Tells whether following <code>path</code> would leave the scope of
     * <code>owningElem</code>.  That is the case when the path node:
     *
     * 1. Is null, meaning the path has ended.
     * 2. Is a PARENT step whose document node is the owning element's parent.
     * 3. Is a CHILD step to an element, or to a wildcard that is not ignored.
     * 4. Is a SIBLING step to an element, or to a wildcard that is not
     *    ignored.
     *
     * @param path       The path node to examine; may be null.
     * @param owningElem The element whose scope is being tested.
     * @param ignoreAny  When true, steps to wildcard nodes do not count as
     *                   leaving the scope.
     * @return true if the path node leaves the element's scope.
     */
    private static boolean pathExitsElementScope(
        XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> path,
        XmlSchemaDocumentNode<AvroRecordInfo> owningElem,
        boolean ignoreAny) {

      if (path == null) {
        // Case 1: nothing left to follow.
        return true;
      }

      final XmlSchemaDocumentNode<AvroRecordInfo> ownersParent =
          owningElem.getParent();

      final XmlSchemaPathNode.Direction heading = path.getDirection();

      if (heading.equals(XmlSchemaPathNode.Direction.PARENT)
          && (path.getDocumentNode() == ownersParent)) {
        // Case 2: climbing out to the owning element's parent.
        return true;
      }

      final XmlSchemaStateMachineNode.Type targetType =
          path.getStateMachineNode().getNodeType();

      final boolean targetIsElement =
          targetType.equals(XmlSchemaStateMachineNode.Type.ELEMENT);

      final boolean targetIsAny =
          targetType.equals(XmlSchemaStateMachineNode.Type.ANY);

      final boolean beginsNewElement =
          targetIsElement || (targetIsAny && !ignoreAny);

      // Cases 3 and 4 differ only in the direction of travel.
      final boolean descendsOrSidesteps =
          heading.equals(XmlSchemaPathNode.Direction.CHILD)
          || heading.equals(XmlSchemaPathNode.Direction.SIBLING);

      return descendsOrSidesteps && beginsNewElement;
    }

    /**
     * Looks ahead from <code>path</code> for a CONTENT step that still
     * belongs to <code>owningElem</code>.  Wildcard nodes are not treated
     * as the end of the element's scope during the search.
     *
     * @param path       The first path node to examine; may be null.
     * @param owningElem The element whose content is being collected.
     * @return true if a CONTENT step is found before the path leaves the
     *         element's scope.
     */
    private static boolean hasMoreContent(
        XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> path,
        XmlSchemaDocumentNode<AvroRecordInfo> owningElem) {

      for (XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> step = path;
          !pathExitsElementScope(step, owningElem, true);
          step = step.getNext()) {

        if (step.getDirection().equals(XmlSchemaPathNode.Direction.CONTENT)) {
          return true;
        }
      }

      // Ran off the path or out of the element without finding content.
      return false;
    }

    /**
     * Searches forward from the node after the current location for the
     * next CONTENT step that still belongs to <code>owningElem</code>.
     * The current location itself is not changed.  Unlike
     * hasMoreContent(), a step to a wildcard node ends the search.
     *
     * @param owningElem The element whose content is expected.
     * @return The CONTENT path node, or null when there is no current
     *         location or the path leaves the element's scope first.
     */
    private XmlSchemaPathNode walkToContent(
        XmlSchemaDocumentNode<AvroRecordInfo> owningElem) {

      if (currLocation == null) {
        return null;
      }

      for (XmlSchemaPathNode step = currLocation.getNext();
          !pathExitsElementScope(step, owningElem, false);
          step = step.getNext()) {

        if (step.getDirection().equals(XmlSchemaPathNode.Direction.CONTENT)) {
          return step;
        }
      }

      return null;
    }

    /**
     * Looks ahead, without moving the current location, for the next path
     * node that carries Avro path information other than CONTENT, and
     * reports whether that node marks the end of a map.
     *
     * @return true if the first such node is a MAP_END; false if it is
     *         anything else or if the path ends first.
     */
    private boolean isMapEnd() {
      for (XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> step =
               currLocation.getNext();
          step != null;
          step = step.getNext()) {

        final AvroPathNode info = step.getUserDefinedContent();

        final boolean skippable =
            (info == null)
            || info.getType().equals(AvroPathNode.Type.CONTENT);

        if (!skippable) {
          return step
                   .getUserDefinedContent()
                   .getType()
                   .equals(AvroPathNode.Type.MAP_END);
        }
      }

      return false;
    }

    /**
     * Fetches the value of an attribute, first by namespace and name, and
     * failing that by local name alone.
     *
     * @param atts         The attributes of the element.
     * @param namespaceUri The namespace the attribute is expected in.
     * @param name         The local name of the attribute.
     * @return The attribute's value, or null if no attribute matches.
     */
    private static String getAttrValue(
        Attributes atts,
        String namespaceUri,
        String name) {

      /* Attributes in XML Schema each have their own namespace, which
       * is not supported in Avro.  Try the exact namespace first ...
       */
      final String exactMatch = atts.getValue(namespaceUri, name);
      if (exactMatch != null) {
        return exactMatch;
      }

      /* ... and otherwise settle for the first attribute
       * with the same local name, whatever its namespace.
       */
      final int attrCount = atts.getLength();
      for (int pos = 0; pos < attrCount; ++pos) {
        if (atts.getLocalName(pos).equals(name)) {
          return atts.getValue(pos);
        }
      }

      return null;
    }

    /**
     * Writes a value that is not being written as a specific member of a
     * union; delegates to the five-argument overload with a union index
     * of -1.
     *
     * @param xmlType  The XML Schema type of the value.
     * @param xmlQName The name of the element or attribute being written.
     * @param schema   The Avro schema to write the value as.
     * @param data     The value in its XML text form; may be null.
     * @throws IOException if the value cannot be written.
     */
    private void write(
        XmlSchemaTypeInfo xmlType,
        QName xmlQName,
        Schema schema,
        String data) throws IOException {

      final int noUnionIndex = -1;
      write(xmlType, xmlQName, schema, data, noUnionIndex);
    }

    /**
     * Encodes the text <code>data</code> as a value of the Avro
     * <code>schema</code>, guided by the XML Schema type information.
     *
     * <p>
     * Null or empty text is handled up front: it is written as an empty
     * string or a null when the schema (or one of the branches of a union
     * schema) permits it, and rejected otherwise.  Any other text is
     * parsed according to the Avro type of <code>schema</code> and, where
     * the Avro type alone is ambiguous, the XML Schema base type.
     * </p>
     *
     * <p>
     * When <code>unionIndex</code> is non-negative it is written with
     * {@link Encoder#writeIndex(int)} immediately before the value; the
     * value itself is always written.
     * </p>
     *
     * @param xmlType    The XML Schema type of the data.
     * @param xmlQName   The qualified name handed to union type selection
     *                   and to nested writes.
     * @param schema     The Avro schema to encode the data as.
     * @param data       The text to encode; may be null or empty.
     * @param unionIndex The index of <code>schema</code> in its enclosing
     *                   union, or a negative value if there is none.
     * @throws IOException If the text cannot be represented as
     *                     <code>schema</code>.
     * @throws IllegalArgumentException If bytes are requested for a base
     *                     type other than base64, hex, or decimal.
     */
    private void write(
        XmlSchemaTypeInfo xmlType,
        QName xmlQName,
        Schema schema,
        String data,
        int unionIndex)
        throws IOException {

      // Captured up front: xmlType may be narrowed in the ARRAY case below.
      final XmlSchemaBaseSimpleType baseType = xmlType.getBaseType();

      /* If the data is empty or null, write
       * it as a null or string, if possible.
       */
      if ((data == null) || data.isEmpty()) {
        boolean isNullable = (schema.getType().equals(Schema.Type.NULL));
        boolean isString = (schema.getType().equals(Schema.Type.STRING));
        int nullUnionIndex = -1;
        int stringIndex = -1;
        if (!isNullable
            && !isString
            && schema.getType().equals(Schema.Type.UNION)) {

          /* NOTE(review): the scan stops at the first NULL branch, so a
           * STRING branch listed after it is never seen and an empty
           * string is then written as null.  Left as-is; confirm whether
           * that is intended.
           */
          for (int typeIndex = 0;
              typeIndex < schema.getTypes().size();
              ++typeIndex) {

            final Schema.Type type =
                schema.getTypes().get(typeIndex).getType();

            if (type.equals(Schema.Type.NULL)) {
              nullUnionIndex = typeIndex;
              isNullable = true;
              break;
            } else if (type.equals(Schema.Type.STRING)) {
              isString = true;
              stringIndex = typeIndex;
            }
          }
        }

        if (isString && (data != null) && data.isEmpty()) {
          // Preserve empty strings when possible.
          if (stringIndex >= 0) {
            out.writeIndex(stringIndex);
          }
          out.writeString(data);

        } else if (isNullable) {
          if (nullUnionIndex >= 0) {
            out.writeIndex(nullUnionIndex);
          }
          out.writeNull();

        } else if (isString) {
          // Null data with no null branch available: fall back to "".
          if (stringIndex >= 0) {
            out.writeIndex(stringIndex);
          }
          out.writeString("");

        } else {
            throw new IOException(
                "Cannot write a null or empty string "
                + "as a non-null or non-string type.");
        }

        return;
      }

      switch ( schema.getType() ) {
      case ARRAY:
        {
          /* While unions of lists of different types are technically possible,
           * supporting them here would be difficult, to say the least.  For
           * now, only one array type will be supported in a union.
           */
          if (unionIndex >= 0) {
            out.writeIndex(unionIndex);
          }

          // A duration becomes a three-item array: months, days, millis.
          if ( XmlSchemaBaseSimpleType.DURATION.equals(baseType) ) {
            final DatatypeFactory datatypeFactory = Utils.getDatatypeFactory();
            final Duration xmlDuration = datatypeFactory.newDuration(data);
            final int months =
                xmlDuration.getYears() * 12 + xmlDuration.getMonths();
            final int days = xmlDuration.getDays();
            final int millis =
                ((((xmlDuration.getHours() * 60)
                    + xmlDuration.getMinutes()) * 60)
                    + xmlDuration.getSeconds()) * 1000;
            out.writeArrayStart();
            out.setItemCount(3);
            out.startItem();
            out.writeInt(months);
            out.startItem();
            out.writeInt(days);
            out.startItem();
            out.writeInt(millis);
            out.writeArrayEnd();
            break;
          }

          // Narrow the XML type down to the type of a single list item.
          if ( xmlType.getType().equals(XmlSchemaTypeInfo.Type.UNION) ) {
            xmlType = Utils.chooseUnionType(xmlType, null, schema, unionIndex);
          }
          if ( xmlType.getType().equals(XmlSchemaTypeInfo.Type.LIST) ) {
            xmlType = xmlType.getChildTypes().get(0);
          }

          // Items are space-separated; runs of spaces yield empty
          // tokens, which are dropped before counting.
          final String[] items = data.split(" ");
          final List<String> itemList = new ArrayList<String>(items.length);
          for (String item : items) {
            if ( !item.isEmpty() ) {
              itemList.add(item);
            }
          }
          out.writeArrayStart();
          out.setItemCount( itemList.size() );
          for (String item : itemList) {
            out.startItem();
            write(xmlType, xmlQName, schema.getElementType(), item);
          }
          out.writeArrayEnd();
          break;
        }
      case UNION:
        {
          int textIndex = -1;
          int bytesIndex = -1;

          Schema bytesType = null;

          final List<Schema> subTypes = schema.getTypes();
          boolean written = false;
          for (int subTypeIndex = 0;
              subTypeIndex < subTypes.size();
              ++subTypeIndex) {
            // Try the text types last.
            final Schema subType = subTypes.get(subTypeIndex);
            if (subType.getType().equals(Schema.Type.BYTES)) {
              bytesIndex = subTypeIndex;
              bytesType = subType;
              continue;
            } else if (subType.getType().equals(Schema.Type.STRING)) {
              textIndex = subTypeIndex;
              continue;
            }

            // Determine the corresponding XML union type.
            XmlSchemaTypeInfo xmlSubType = xmlType;
            if ( xmlType.getType().equals(XmlSchemaTypeInfo.Type.UNION) ) {
              xmlSubType =
                  Utils.chooseUnionType(
                      xmlType,
                      xmlQName,
                      subType,
                      subTypeIndex);
            }

            if (xmlSubType != null) {
              try {
                write(xmlSubType, xmlQName, subType, data, subTypeIndex);
                written = true;
                break;
              } catch (Exception e) {
                /* Could not parse the value using the
                 * provided type; try the next one.
                 *
                 * NOTE(review): an ARRAY branch writes its index and
                 * array start before parsing its items, so a failure
                 * there may leave partial output behind.
                 */
              }
            }
          }

          if (!written) {
            if (bytesIndex >= 0) {
              XmlSchemaTypeInfo subType = xmlType;
              if (xmlType.getType().equals(XmlSchemaTypeInfo.Type.UNION)) {
                subType =
                    Utils.chooseUnionType(
                        xmlType,
                        xmlQName,
                        schema.getTypes().get(bytesIndex),
                        bytesIndex);
              }

              // Only write the bytes if we know how.
              if (subType != null) {
                try {
                  write(subType, xmlQName, bytesType, data, bytesIndex);
                  written = true;
                } catch (Exception e) {
                  // Cannot write the data as bytes either.
                }
              }
            }
            if (!written && (textIndex >= 0)) {
              // Last resort: any text can be stored as a string.
              out.writeIndex(textIndex);
              out.writeString(data);

            } else if (!written) {
              throw new IOException(
                  "Cannot write \""
                  + data
                  + "\" as one of the types in "
                  + schema.toString());
            }
          }
          break;
        }
      case BYTES:
        {
          byte[] bytes = null;
          switch (baseType) {
          case BIN_BASE64:
            bytes = DatatypeConverter.parseBase64Binary(data);
            break;
          case BIN_HEX:
            bytes = DatatypeConverter.parseHexBinary(data);
            break;
          case DECIMAL:
            {
              // Decimals are stored as the bytes of their unscaled value.
              final BigDecimal decimal =
                  Utils.createBigDecimalFrom(data, schema);
              final BigInteger unscaledValue =
                  decimal.unscaledValue();
              bytes = unscaledValue.toByteArray();
              break;
            }
          default:
            throw new IllegalArgumentException(
                "Cannot generate bytes for data of a base type of "
                + baseType);
          }
          if (unionIndex >= 0) {
            out.writeIndex(unionIndex);
          }
          out.writeBytes(bytes);
          break;
        }
      case STRING:
        {
          if (unionIndex >= 0) {
            out.writeIndex(unionIndex);
          }
          out.writeString(data);
          break;
        }
      case ENUM:
        {
          if ( !schema.hasEnumSymbol(data) ) {
            // List every symbol, quoted and comma-separated; this also
            // copes with an enum that declares no symbols at all.
            final StringBuilder errMsg = new StringBuilder("\"");
            errMsg.append(data);
            errMsg.append("\" is not a member of the symbols [");
            String separator = "";
            for (String symbol : schema.getEnumSymbols()) {
              errMsg.append(separator);
              errMsg.append('"').append(symbol).append('"');
              separator = ", ";
            }
            errMsg.append("].");

            throw new IOException( errMsg.toString() );
          }
          if (unionIndex >= 0) {
            out.writeIndex(unionIndex);
          }
          out.writeEnum( schema.getEnumOrdinal(data) );
          break;
        }
      case DOUBLE:
        {
          try {
            final double value = Double.parseDouble(data);
            if (unionIndex >= 0) {
              out.writeIndex(unionIndex);
            }
            out.writeDouble(value);
          } catch (NumberFormatException nfe) {
            throw new IOException("\"" + data + "\" is not a double.", nfe);
          }
          break;
        }
      case FLOAT:
        {
          try {
            final float value = Float.parseFloat(data);
            if (unionIndex >= 0) {
              out.writeIndex(unionIndex);
            }
            out.writeFloat(value);
          } catch (NumberFormatException nfe) {
            throw new IOException("\"" + data + "\" is not a float.", nfe);
          }
          break;
        }
      case LONG:
        {
          switch (baseType) {
          case DECIMAL:
            {
              try {
                final long value = Long.parseLong(data);
                if (unionIndex >= 0) {
                  out.writeIndex(unionIndex);
                }
                out.writeLong(value);
              } catch (NumberFormatException nfe) {
                throw new IOException("\"" + data + "\" is not a long.", nfe);
              }
              break;
            }
          case DATETIME:
            {
              // Date-times are stored as their time in milliseconds.
              try {
                Calendar timestampCal = DatatypeConverter.parseDateTime(data);
                timestampCal.setTimeZone( Utils.getGmtTimeZone() );
                final long value = timestampCal.getTimeInMillis();
                if (unionIndex >= 0) {
                  out.writeIndex(unionIndex);
                }
                out.writeLong(value);
              } catch (IllegalArgumentException e) {
                throw new IOException("\"" + data + "\" is not a datetime.", e);
              }
              break;
            }
          default:
            throw new IOException("Unrecognized long type: " + baseType);
          }
          break;
        }
      case INT:
        {
          switch (baseType) {
          case DECIMAL:
            {
              try {
                final int value = Integer.parseInt(data);
                if (unionIndex >= 0) {
                  out.writeIndex(unionIndex);
                }
                out.writeInt(value);
              } catch (NumberFormatException nfe) {
                throw new IOException("\"" + data + "\" is not an int.", nfe);
              }
              break;
            }
          case DATE:
            {
              // Dates are stored as whole days since the Unix epoch.
              try {
                final Calendar dateCal = DatatypeConverter.parseDate(data);
                dateCal.setTimeZone( Utils.getGmtTimeZone() );
                final long diffInMillis =
                    dateCal.getTimeInMillis()
                    - Utils.getUnixEpoch().getTimeInMillis();
                final long diffInDays =
                    TimeUnit.DAYS.convert(diffInMillis, TimeUnit.MILLISECONDS);

                if (unionIndex >= 0) {
                  out.writeIndex(unionIndex);
                }
                // The value is written whether or not it sits in a union.
                out.writeInt((int) diffInDays);
              } catch (IllegalArgumentException e) {
                throw new IOException("\"" + data + "\" is not a date.", e);
              }
              break;
            }
          case TIME:
            {
              // Times are stored as the parsed calendar's milliseconds.
              try {
                final Calendar timeCal = DatatypeConverter.parseTime(data);
                timeCal.setTimeZone( Utils.getGmtTimeZone() );
                if (unionIndex >= 0) {
                  out.writeIndex(unionIndex);
                }
                // The value is written whether or not it sits in a union.
                out.writeInt((int) timeCal.getTimeInMillis());
              } catch (IllegalArgumentException e) {
                throw new IOException("\"" + data + "\" is not a time.", e);
              }
              break;
            }
          default:
            throw new IOException(
                "Unrecognized integer type " + baseType);
          }
          break;
        }
      case BOOLEAN:
        {
          if (data.equalsIgnoreCase("true")
              || data.equalsIgnoreCase("false")) {
            if (unionIndex >= 0) {
              out.writeIndex(unionIndex);
            }
            out.writeBoolean( Boolean.parseBoolean(data) );
          } else {
            throw new IOException('"' + data + "\" is not a boolean.");
          }
          break;
        }
      case RECORD:
        {
          switch (baseType) {
          case QNAME:
            {
              // A QName is written as two strings: namespace, local part.
              try {
                final QName qName =
                    DatatypeConverter.parseQName(data, nsContext);

                if (unionIndex >= 0) {
                  out.writeIndex(unionIndex);
                }
                out.writeString( qName.getNamespaceURI() );
                out.writeString( qName.getLocalPart() );

              } catch (IllegalArgumentException e) {
                throw new IOException("\"" + data + "\" is not a QName.", e);
              }
              break;
            }
           default:
             throw new IOException(
                 "Cannot write a record of XML Schema Type " + baseType);
          }
          break;
        }
      default:
        throw new IOException("Cannot write data of type " + schema.getType());
      }
    }
  }

  /**
   * Creates a writer for XML documents conforming to the XML Schemas
   * described by <code>config</code>.
   *
   * <p>
   * The XML Schemas are read and walked from the configured root tag to
   * build the state machine used when writing documents.  If
   * <code>avroSchema</code> is <code>null</code>, an Avro {@link Schema}
   * is generated during that same walk; otherwise the provided schema
   * is used as-is.
   * </p>
   *
   * @param config     The XML Schema sources, base URI, and root tag.
   * @param avroSchema The Avro schema to write against, or
   *                   <code>null</code> to generate one.
   * @throws IllegalArgumentException If <code>config</code> is
   *         <code>null</code>, or its root tag does not name a global
   *         element in the XML Schemas.
   */
  public XmlDatumWriter(XmlDatumConfig config, Schema avroSchema)
      throws IOException {

    if (config == null) {
      throw new IllegalArgumentException("XmlDatumConfig cannot be null.");
    }

    xmlSchemaCollection = new XmlSchemaCollection();
    xmlSchemaCollection.setBaseUri(config.getBaseUri());
    for (StreamSource source : config.getSources()) {
      xmlSchemaCollection.read(source);
    }

    final XmlSchemaStateMachineGenerator stateMachineGen =
        new XmlSchemaStateMachineGenerator();

    final XmlSchemaWalker walker =
        new XmlSchemaWalker(xmlSchemaCollection, stateMachineGen);
    walker.setUserRecognizedTypes( Utils.getAvroRecognizedTypes() );

    // Only generate an Avro schema when the caller did not supply one.
    AvroSchemaGenerator avroSchemaGen = null;
    if (avroSchema == null) {
      avroSchemaGen =
          new AvroSchemaGenerator(
              config.getBaseUri(),
              config.getSchemaUrls(),
              config.getSchemaFiles());
      walker.addVisitor(avroSchemaGen);
    }

    final XmlSchemaElement rootElement =
        xmlSchemaCollection.getElementByQName(config.getRootTagName());
    if (rootElement == null) {
      // Fail with a descriptive message rather than walking a null root.
      throw new IllegalArgumentException(
          "Root element "
          + config.getRootTagName()
          + " was not found in the provided XML Schemas.");
    }
    walker.walk(rootElement);

    stateMachine = stateMachineGen.getStartNode();

    if (avroSchema == null) {
      schema = avroSchemaGen.getSchema();
    } else {
      schema = avroSchema;
    }
  }

  /**
   * Creates a writer from the XML Schema configuration alone, delegating
   * to {@link #XmlDatumWriter(XmlDatumConfig, Schema)} with a
   * <code>null</code> Avro schema.
   *
   * @param config The XML Schema configuration; passed through unchanged.
   */
  public XmlDatumWriter(XmlDatumConfig config) throws IOException {
    this(config, null);
  }

  /**
   * Retrieves the Avro {@link Schema} currently used by this
   * <code>XmlDatumWriter</code>: the one established at construction
   * time, unless it has since been replaced through
   * {@link #setSchema(Schema)}.
   *
   * @return The schema documents are written against.
   */
  public Schema getSchema() {
    return this.schema;
  }

  /**
   * Replaces the Avro {@link Schema} used when encoding
   * XML to the {@link Encoder}.
   *
   * @param schema The new schema; must not be <code>null</code>.
   * @throws IllegalArgumentException If <code>schema</code> is
   *                                  <code>null</code>.
   * @see org.apache.avro.io.DatumWriter#setSchema(org.apache.avro.Schema)
   */
  @Override
  public void setSchema(Schema schema) {
    if (schema != null) {
      this.schema = schema;
      return;
    }
    throw new IllegalArgumentException("Avro schema cannot be null.");
  }

  /**
   * Encodes the {@link Document} to the {@link Encoder} using the
   * {@link Schema} currently held by this writer (see
   * {@link #getSchema()} and {@link #setSchema(Schema)}).
   *
   * <p>
   * The document is walked twice: once to find its path through the
   * XML Schema state machine, and once more to encode it along that
   * path after the Avro schema has been applied to it.
   * </p>
   *
   * @throws IOException If either walk over the document fails.
   * @see DatumWriter#write(java.lang.Object, org.apache.avro.io.Encoder)
   */
  @Override
  public void write(Document doc, Encoder out) throws IOException {
    final XmlSchemaPathFinder schemaPathFinder =
        new XmlSchemaPathFinder(stateMachine);
    final SaxWalkerOverDom domWalker =
        new SaxWalkerOverDom(schemaPathFinder);

    // First walk: discover how the document traverses the schema.
    try {
      domWalker.walk(doc);
    } catch (Exception pathFindingFailure) {
      throw new IOException(
          "Unable to parse the document.",
          pathFindingFailure);
    }

    // Decorate the discovered path with the Avro schema information.
    final XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> docPath =
        schemaPathFinder.getXmlSchemaTraversal();
    new AvroSchemaApplier(schema, false).apply(docPath);

    // Second walk: swap the path finder for the encoder and replay.
    domWalker.removeContentHandler(schemaPathFinder);
    domWalker.addContentHandler( new Writer(docPath, out) );

    try {
      domWalker.walk(doc);
    } catch (SAXException encodingFailure) {
      throw new IOException(
          "Unable to encode the document.",
          encodingFailure);
    }
  }

  /**
   * Encodes the XML held in the given {@link File} to the {@link Encoder}
   * using the {@link Schema} currently held by this writer (see
   * {@link #getSchema()} and {@link #setSchema(Schema)}).
   *
   * <p>
   * The file is parsed twice with namespace-aware SAX parsers: once to
   * find the document's path through the XML Schema state machine, and
   * once more to encode it along that path.
   * </p>
   */
  public void write(File xmlFile, Encoder out)
      throws IOException, ParserConfigurationException, SAXException {

    final SAXParserFactory parserFactory = SAXParserFactory.newInstance();
    parserFactory.setNamespaceAware(true);

    // First parse: discover how the document traverses the schema.
    final SAXParser firstPassParser = parserFactory.newSAXParser();
    final XmlSchemaPathFinder schemaPathFinder =
        new XmlSchemaPathFinder(stateMachine);
    firstPassParser.parse(xmlFile, schemaPathFinder);

    // Decorate the discovered path with the Avro schema information.
    final XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> docPath =
        schemaPathFinder.getXmlSchemaTraversal();
    new AvroSchemaApplier(schema, false).apply(docPath);

    // Second parse: encode the document along the decorated path.
    parserFactory.newSAXParser().parse(xmlFile, new Writer(docPath, out));
  }

  /**
   * Writes the XML retrieved from the provided {@link URL} to the
   * {@link Encoder} using the {@link Schema} currently held by this
   * writer (see {@link #getSchema()} and {@link #setSchema(Schema)}).
   *
   * <p>
   * The URL is opened and parsed twice: once to find the document's path
   * through the XML Schema state machine, and once more to encode it
   * along that path.  Both parses are given the URL's external form as
   * the system identifier so the parser can resolve relative URIs in
   * the document against its location.
   * </p>
   *
   * @param xmlUrl The location of the XML document to encode.
   * @param out    The encoder to write the document to.
   * @throws IOException If the URL cannot be opened or read.
   * @throws ParserConfigurationException If a SAX parser cannot be created.
   * @throws SAXException If the document cannot be parsed or encoded.
   */
  public void write(URL xmlUrl, Encoder out)
      throws IOException, ParserConfigurationException, SAXException {

    final SAXParserFactory factory = SAXParserFactory.newInstance();
    factory.setNamespaceAware(true);

    // 1. Build the path through the schema that describes the document.
    final SAXParser pathFindingParser = factory.newSAXParser();

    final XmlSchemaPathFinder pathFinder =
        new XmlSchemaPathFinder(stateMachine);

    // Base location for any relative URIs inside the document.
    final String systemId = xmlUrl.toExternalForm();

    InputStream pathFindingStream = null;
    try {
      pathFindingStream = xmlUrl.openStream();
      pathFindingParser.parse(pathFindingStream, pathFinder, systemId);
    } finally {
      if (pathFindingStream != null) {
        try {
          pathFindingStream.close();
        } catch (IOException ioe) {
          // Best effort: a failed close must not mask the parse outcome.
          ioe.printStackTrace();
        }
      }
    }

    final XmlSchemaPathNode<AvroRecordInfo, AvroPathNode> path =
        pathFinder.getXmlSchemaTraversal();

    // 2. Apply Avro schema metadata on top of the document.
    final AvroSchemaApplier applier = new AvroSchemaApplier(schema, false);
    applier.apply(path);

    // 3. Encode the document, re-reading it from the URL.
    InputStream encodingStream = null;
    try {
      encodingStream = xmlUrl.openStream();
      final SAXParser encodingParser = factory.newSAXParser();
      encodingParser.parse(encodingStream, new Writer(path, out), systemId);
    } finally {
      if (encodingStream != null) {
        try {
          encodingStream.close();
        } catch (IOException ioe) {
          // Best effort: a failed close must not mask the parse outcome.
          ioe.printStackTrace();
        }
      }
    }
  }
}