/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.avro.xml;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.math.MathContext;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TimeZone;
import javax.xml.datatype.DatatypeConfigurationException;
import javax.xml.datatype.DatatypeFactory;
import javax.xml.namespace.QName;
import org.apache.avro.Schema;
import org.apache.ws.commons.schema.constants.Constants;
import org.apache.ws.commons.schema.walker.XmlSchemaBaseSimpleType;
import org.apache.ws.commons.schema.walker.XmlSchemaRestriction;
import org.apache.ws.commons.schema.walker.XmlSchemaTypeInfo;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.node.IntNode;
import org.codehaus.jackson.node.NumericNode;
import org.codehaus.jackson.node.TextNode;
/**
* A set of utilities for encoding and
* decoding XML Documents and Avro data.
*/
class Utils {
private static final int UNDERSCORE_CP = '_';
private static final int PERIOD_CP = '.';
private static final String LOGICAL_TYPE = "logicalType";
// We need to set all time zones to GMT to avoid date conversions.
private static final TimeZone GMT_TIME_ZONE = TimeZone.getTimeZone("GMT");
private static final Calendar UNIX_EPOCH =
Calendar.getInstance(GMT_TIME_ZONE);
private static final Map<QName, Schema.Type> XML_TO_AVRO_TYPE_MAP =
new HashMap<QName, Schema.Type>();
private static DatatypeFactory xmlDatatypeFactory = null;
static {
XML_TO_AVRO_TYPE_MAP.put(Constants.XSD_ANYTYPE, Schema.Type.STRING);
XML_TO_AVRO_TYPE_MAP.put(Constants.XSD_BOOLEAN, Schema.Type.BOOLEAN);
XML_TO_AVRO_TYPE_MAP.put(Constants.XSD_DECIMAL, Schema.Type.BYTES);
XML_TO_AVRO_TYPE_MAP.put(Constants.XSD_DOUBLE, Schema.Type.DOUBLE);
XML_TO_AVRO_TYPE_MAP.put(Constants.XSD_FLOAT, Schema.Type.FLOAT);
XML_TO_AVRO_TYPE_MAP.put(Constants.XSD_BASE64, Schema.Type.BYTES);
XML_TO_AVRO_TYPE_MAP.put(Constants.XSD_HEXBIN, Schema.Type.BYTES);
XML_TO_AVRO_TYPE_MAP.put(Constants.XSD_LONG, Schema.Type.LONG);
XML_TO_AVRO_TYPE_MAP.put(Constants.XSD_ID, Schema.Type.STRING);
XML_TO_AVRO_TYPE_MAP.put(Constants.XSD_INT, Schema.Type.INT);
XML_TO_AVRO_TYPE_MAP.put(Constants.XSD_UNSIGNEDINT, Schema.Type.LONG);
XML_TO_AVRO_TYPE_MAP.put(Constants.XSD_UNSIGNEDSHORT, Schema.Type.INT);
XML_TO_AVRO_TYPE_MAP.put(Constants.XSD_QNAME, Schema.Type.RECORD);
XML_TO_AVRO_TYPE_MAP.put(Constants.XSD_DATE, Schema.Type.INT);
XML_TO_AVRO_TYPE_MAP.put(Constants.XSD_TIME, Schema.Type.INT);
XML_TO_AVRO_TYPE_MAP.put(Constants.XSD_DATETIME, Schema.Type.LONG);
XML_TO_AVRO_TYPE_MAP.put(Constants.XSD_DURATION, Schema.Type.ARRAY);
UNIX_EPOCH.set(1970, 0, 1, 0, 0, 0);
UNIX_EPOCH.set(Calendar.MILLISECOND, 0);
}
static TimeZone getGmtTimeZone() {
return GMT_TIME_ZONE;
}
static Calendar getUnixEpoch() {
return UNIX_EPOCH;
}
static DatatypeFactory getDatatypeFactory(){
if (xmlDatatypeFactory == null) {
try {
xmlDatatypeFactory = DatatypeFactory.newInstance();
} catch (DatatypeConfigurationException e) {
throw new IllegalStateException(
"Unable to create the DatatypeFactory for writing XML Schema "
+ "durations.",
e);
}
}
return xmlDatatypeFactory;
}
static Set<QName> getAvroRecognizedTypes() {
return XML_TO_AVRO_TYPE_MAP.keySet();
}
static Schema.Type getAvroSchemaTypeFor(QName qName) {
return XML_TO_AVRO_TYPE_MAP.get(qName);
}
static Schema getAvroSchemaFor(
XmlSchemaTypeInfo typeInfo,
QName qName,
boolean isOptional) {
switch ( typeInfo.getType() ) {
case ATOMIC:
{
Schema schema = null;
if ( isValidEnum(typeInfo) ) {
// This is an enumeration!
final HashMap<XmlSchemaRestriction.Type, List<XmlSchemaRestriction>>
facets = typeInfo.getFacets();
String ns = null;
try {
ns = Utils.getAvroNamespaceFor(qName.getNamespaceURI()) + ".enums";
} catch (URISyntaxException e) {
throw new IllegalArgumentException(
qName + " does not have a valid namespace.", e);
}
final List<XmlSchemaRestriction> enumFacet =
facets.get(XmlSchemaRestriction.Type.ENUMERATION);
final ArrayList<String> symbols =
new ArrayList<String>( enumFacet.size() );
for (XmlSchemaRestriction enumSym : enumFacet) {
symbols.add( enumSym.getValue().toString() );
}
schema =
Schema.createEnum(
qName.getLocalPart(),
"Enumeration of symbols in " + qName,
ns,
symbols);
} else if (
typeInfo.getBaseType().equals(XmlSchemaBaseSimpleType.QNAME)) {
/* QNames will be represented as a RECORD
* with a namespace and a local part.
*/
final List<Schema.Field> fields = new ArrayList<Schema.Field>(2);
fields.add(
new Schema.Field(
"namespace",
Schema.create(Schema.Type.STRING),
"The namespace of this qualified name.",
null) );
fields.add(
new Schema.Field(
"localPart",
Schema.create(Schema.Type.STRING),
"The local part of this qualified name.",
null) );
String ns = null;
try {
ns = Utils.getAvroNamespaceFor( Constants.URI_2001_SCHEMA_XSD );
} catch (URISyntaxException e) {
throw new IllegalStateException(
"Cannot create an Avro namespace for "
+ Constants.URI_2001_SCHEMA_XSD);
}
schema =
Schema.createRecord("qName", "Qualified Name", ns, false);
schema.setFields(fields);
} else if (typeInfo
.getBaseType()
.equals(XmlSchemaBaseSimpleType.DURATION)) {
// Duration is a logical type.
schema = Schema.createArray(Schema.create(Schema.Type.INT));
schema.addProp(LOGICAL_TYPE, new TextNode("duration"));
} else {
final Schema.Type avroType =
XML_TO_AVRO_TYPE_MAP.get( typeInfo.getUserRecognizedType() );
if (avroType == null) {
throw new IllegalArgumentException(
"No Avro type recognized for "
+ typeInfo.getUserRecognizedType());
}
schema = Schema.create(avroType);
// DECIMAL is a logical type.
if (schema.getType().equals(Schema.Type.BYTES)
&& typeInfo
.getBaseType()
.equals(XmlSchemaBaseSimpleType.DECIMAL)) {
/* If there is a restriction on the number of fraction
* and/or total digits, we need to respect it.
*/
int scale = 8;
int precision = MathContext.DECIMAL128.getPrecision();
HashMap<XmlSchemaRestriction.Type, List<XmlSchemaRestriction>>
facets = typeInfo.getFacets();
if (facets != null) {
// Fraction Digits are the scale
final List<XmlSchemaRestriction> fractionDigitsFacets =
facets.get(XmlSchemaRestriction.Type.DIGITS_FRACTION);
if ((fractionDigitsFacets != null)
&& !fractionDigitsFacets.isEmpty()) {
final XmlSchemaRestriction fractionDigitsFacet =
fractionDigitsFacets.get(0);
final Object value = fractionDigitsFacet.getValue();
if (value instanceof Number) {
scale = ((Number) value).intValue();
} else {
try {
scale = Integer.parseInt(value.toString());
} catch (NumberFormatException nfe) {
throw new IllegalStateException(
"Fraction digits facet is not a number: " + value);
}
}
}
// Total Digits are the precision
final List<XmlSchemaRestriction> totalDigitsFacets =
facets.get(XmlSchemaRestriction.Type.DIGITS_TOTAL);
if ((totalDigitsFacets != null)
&& !totalDigitsFacets.isEmpty()) {
final XmlSchemaRestriction totalDigitsFacet =
fractionDigitsFacets.get(0);
final Object value = totalDigitsFacet.getValue();
if (value instanceof Number) {
precision = ((Number) value).intValue();
} else {
try {
precision = Integer.parseInt(value.toString());
} catch (NumberFormatException nfe) {
throw new IllegalStateException(
"Total digits facet is not a number: " + value);
}
}
}
}
schema.addProp(LOGICAL_TYPE, new TextNode("decimal"));
schema.addProp("scale", new IntNode(scale));
schema.addProp("precision", new IntNode(precision));
} else if (schema.getType().equals(Schema.Type.INT)) {
// DATE and TIME are logical types.
switch (typeInfo.getBaseType()) {
case DATE:
schema.addProp(LOGICAL_TYPE, new TextNode("date"));
break;
case TIME:
schema.addProp(LOGICAL_TYPE, new TextNode("time"));
break;
default:
// Not a logical type.
}
} else if (schema.getType().equals(Schema.Type.LONG)
&& typeInfo
.getBaseType()
.equals(XmlSchemaBaseSimpleType.DATETIME)) {
// DATETIME is a logical type.
schema.addProp(LOGICAL_TYPE, new TextNode("timestamp"));
}
}
return createSchemaOf(schema, isOptional, typeInfo.isMixed());
}
case LIST:
{
Schema schema =
Schema.createArray(
getAvroSchemaFor(
typeInfo.getChildTypes().get(0), qName, false) );
return createSchemaOf(schema, isOptional, typeInfo.isMixed());
}
case UNION:
{
List<XmlSchemaTypeInfo> unionTypes = typeInfo.getChildTypes();
List<Schema> avroTypes =
new ArrayList<Schema>(unionTypes.size() + 2);
for (XmlSchemaTypeInfo unionType : unionTypes) {
final Schema avroSchema = getAvroSchemaFor(unionType, qName, false);
if ( !avroTypes.contains(avroSchema) ) {
avroTypes.add(avroSchema);
}
}
if (isOptional) {
final Schema avroSchema = Schema.create(Schema.Type.NULL);
if ( !avroTypes.contains(avroSchema) ) {
avroTypes.add(avroSchema);
}
}
if ( typeInfo.isMixed() ) {
final Schema avroSchema = Schema.create(Schema.Type.STRING);
if ( !avroTypes.contains(avroSchema) ) {
avroTypes.add(avroSchema);
}
}
return Schema.createUnion(avroTypes);
}
case COMPLEX:
{
if ( typeInfo.isMixed() ) {
return Schema.create(Schema.Type.STRING);
}
/* falls through */
}
default:
throw new IllegalArgumentException(
"Cannot create an Avro schema for a "
+ typeInfo.getType()
+ " type.");
}
}
private static Schema createSchemaOf(
Schema schema,
boolean isOptional,
boolean isMixed) {
if (!isOptional && !isMixed) {
return schema;
}
List<Schema> unionTypes = new ArrayList<Schema>(2);
unionTypes.add(schema);
if (isOptional) {
unionTypes.add( Schema.create(Schema.Type.NULL) );
}
if (isMixed) {
unionTypes.add( Schema.create(Schema.Type.STRING) );
}
schema = Schema.createUnion(unionTypes);
return schema;
}
private static boolean isValidEnum(XmlSchemaTypeInfo typeInfo) {
final HashMap<XmlSchemaRestriction.Type, List<XmlSchemaRestriction>>
facets = typeInfo.getFacets();
if (facets == null) {
return false;
}
final List<XmlSchemaRestriction> enumFacets =
facets.get(XmlSchemaRestriction.Type.ENUMERATION);
if (enumFacets == null) {
return false;
}
for (XmlSchemaRestriction enumFacet : enumFacets) {
final String symbol = enumFacet.getValue().toString();
final int length = symbol.length();
for (int offset = 0; offset < length; ) {
final int codepoint = symbol.codePointAt(offset);
if (!Character.isLetterOrDigit(codepoint)
&& (codepoint != UNDERSCORE_CP)) {
return false;
}
offset += Character.charCount(codepoint);
}
}
return true;
}
static XmlSchemaTypeInfo chooseUnionType(
XmlSchemaTypeInfo xmlType,
QName typeQName,
Schema elemType,
int unionIndex) {
XmlSchemaTypeInfo xmlElemType = xmlType;
if (xmlType.getChildTypes().size() <= unionIndex) {
xmlElemType = null;
} else {
for (XmlSchemaTypeInfo childType : xmlType.getChildTypes()) {
final Schema avroSchemaOfChildType =
Utils.getAvroSchemaFor(childType, typeQName, false);
if ( avroSchemaOfChildType.equals(elemType) ) {
xmlElemType = childType;
break;
}
}
}
return xmlElemType;
}
static String getAvroNamespaceFor(String xmlSchemaNamespace)
throws URISyntaxException {
return getAvroNamespaceFor(new URI(xmlSchemaNamespace));
}
static String getAvroNamespaceFor(URL xmlSchemaNamespace)
throws URISyntaxException {
return getAvroNamespaceFor( xmlSchemaNamespace.toURI() );
}
static String getAvroNamespaceFor(java.net.URI uri) {
final ArrayList<String> components = new ArrayList<String>();
// xsd.example.org -> org.example.xsd
final String host = uri.getHost();
if (host != null) {
String[] hostParts = host.split("\\.");
for (int hpIdx = hostParts.length - 1; hpIdx >= 0; --hpIdx) {
if ( !hostParts[hpIdx].isEmpty() ) {
try {
// Java packages can't have numeric components.
Long.parseLong(hostParts[hpIdx]);
components.add("_" + hostParts[hpIdx]);
} catch (NumberFormatException nfe) {
components.add(hostParts[hpIdx].toLowerCase());
}
}
}
}
// path/to/schema.xsd -> path.to.schema.xsd
final String path = uri.getPath();
if (path != null) {
final String[] pathParts = path.split("/");
for (String pathPart : pathParts) {
if ( !pathPart.isEmpty() ) {
try {
// Java packages can't have numeric components.
Long.parseLong(pathPart);
components.add("_" + pathPart);
} catch (NumberFormatException nfe) {
components.add(pathPart.toLowerCase());
}
}
}
}
/* This URI is of the form a:b:c:d:e.
* We can convert that to a.b.c.d.e.
*/
if ( components.isEmpty() ) {
final String schemeSpecificPart = uri.getSchemeSpecificPart();
final String[] schemeParts = schemeSpecificPart.split("\\:");
for (String schemePart : schemeParts) {
if ( !schemePart.isEmpty() ) {
try {
// Java packages can't have numeric components.
Long.parseLong(schemePart);
components.add("_" + schemePart);
} catch (NumberFormatException nfe) {
components.add(schemePart.toLowerCase());
}
}
}
}
if ( components.isEmpty() ) {
throw new IllegalArgumentException(
"URI \"" + uri.toString()
+ "\" does not have enough content to create a namespace for it.");
}
StringBuilder namespace = new StringBuilder(components.get(0));
for (int c = 1; c < components.size(); ++c) {
namespace.append('.').append( createValidName( components.get(c) ) );
}
return namespace.toString();
}
private static String createValidName(String component) {
StringBuilder str = new StringBuilder();
final int length = component.length();
for (int offset = 0; offset < length; ) {
final int codepoint = component.codePointAt(offset);
if (!Character.isLetterOrDigit(codepoint)
&& (codepoint != UNDERSCORE_CP)
&& (codepoint != PERIOD_CP)) {
str.append('_');
} else {
str.append( Character.toChars(codepoint) );
}
offset += Character.charCount(codepoint);
}
return str.toString();
}
static BigDecimal createBigDecimalFrom(byte[] bytes, Schema schema) {
confirmIsValidDecimal(schema);
return new BigDecimal(
new BigInteger(bytes),
getScaleFrom(schema),
getMathContextFrom(schema));
}
static BigDecimal createBigDecimalFrom(String text, Schema schema) {
confirmIsValidDecimal(schema);
final int scale = getScaleFrom(schema);
final MathContext mathContext = getMathContextFrom(schema);
BigDecimal decimal = new BigDecimal(text, mathContext);
if (decimal.scale() != scale) {
decimal = decimal.setScale(scale, mathContext.getRoundingMode());
}
return decimal;
}
private static void confirmIsValidDecimal(Schema schema) {
final JsonNode logicalTypeNode = schema.getJsonProp(LOGICAL_TYPE);
if (logicalTypeNode == null) {
throw new IllegalStateException(
"Attempted to read an XML Schema DECIMAL as an Avro "
+ "logical type, but the logical type is missing!");
} else if (!"decimal".equals(logicalTypeNode.asText())) {
throw new IllegalStateException(
"Attempted to read an XML Schema DECIMAL as an Avro logical "
+ "type, but the logical type is " + logicalTypeNode);
}
}
private static int getScaleFrom(Schema schema) {
int scale = 0;
final JsonNode scaleNode = schema.getJsonProp("scale");
if (scaleNode != null) {
if (!(scaleNode instanceof NumericNode)) {
throw new IllegalStateException(
"Attempted to read an XML Schema DECIMAL as an Avro logical "
+ "type, but the scale is not a number! Found: "
+ scaleNode);
}
scale = scaleNode.asInt();
}
return scale;
}
private static MathContext getMathContextFrom(Schema schema) {
final JsonNode precisionNode = schema.getJsonProp("precision");
if (precisionNode == null) {
throw new IllegalArgumentException(
"Attempted to read an XML Schema DECIMAL as an Avro "
+ "logical type, but the precision is missing!");
} else if (!(precisionNode instanceof NumericNode)) {
throw new IllegalArgumentException(
"Attempted to read an XML Schema DECIMAL as an Avro logical "
+ "type, but the precision is not a number! Found: "
+ precisionNode);
}
return new MathContext( precisionNode.asInt() );
}
}