/*******************************************************************************
* Copyright 2017 Capital One Services, LLC and Bitwise, Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License
*******************************************************************************/
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package hydrograph.engine.cascading.scheme.avro;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Fixed;
import org.apache.avro.generic.GenericData.Record;
import org.apache.hadoop.hive.serde2.avro.AvroSerDe;
import org.apache.hadoop.io.BytesWritable;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.util.*;
/**
 * Utility methods for converting Cascading {@link Tuple}/{@link TupleEntry}
 * values into Avro generic data, and for generating Avro {@link Schema}s from
 * Cascading {@link Fields} and Java class information.
 *
 * <p>Conventions: Cascading represents both Avro ARRAY and MAP values as
 * nested {@link Tuple}s; {@code java.util.Date} is carried as an Avro long
 * with the {@code date} logical type and {@code BigDecimal} as Avro bytes
 * with the {@code decimal} logical type.
 */
public class CustomCascadingToAvro {

    /**
     * Lookup table from Java value classes to the Avro {@link Type} used to
     * represent them.
     */
    @SuppressWarnings("serial")
    private static Map<Class<?>, Type> TYPE_MAP = new HashMap<Class<?>, Type>() {
        {
            put(Integer.class, Type.INT);
            put(Long.class, Type.LONG);
            put(Boolean.class, Type.BOOLEAN);
            put(Double.class, Type.DOUBLE);
            put(Float.class, Type.FLOAT);
            put(String.class, Type.STRING);
            put(Date.class, Type.INT);
            put(BigDecimal.class, Type.BYTES);
            // Note : Cascading field type for Array and Map is really a Tuple
            put(List.class, Type.ARRAY);
            put(Map.class, Type.MAP);
        }
    };

    /**
     * Converts every field of {@code tupleEntry} to its Avro representation,
     * positionally matched against {@code writerSchema}.
     *
     * @throws AvroRuntimeException if the tuple arity differs from the number
     *                              of schema fields
     */
    public static Object[] parseTupleEntry(TupleEntry tupleEntry,
            Schema writerSchema) {
        if (!(writerSchema.getFields().size() == tupleEntry.size())) {
            throw new AvroRuntimeException(
                    "Arity mismatch between incoming tuple and schema");
        }
        return parseTuple(tupleEntry.getTuple(), writerSchema);
    }

    /**
     * Converts each positional element of {@code tuple} to its Avro
     * representation using the corresponding field schema of
     * {@code writerSchema}. Assumes tuple order matches schema field order.
     */
    public static Object[] parseTuple(Tuple tuple, Schema writerSchema) {
        List<Field> schemaFields = writerSchema.getFields();
        Object[] result = new Object[schemaFields.size()];
        for (int i = 0; i < schemaFields.size(); i++) {
            Field field = schemaFields.get(i);
            Object obj = tuple.getObject(i);
            result[i] = toAvro(obj, field.schema());
        }
        return result;
    }

    /**
     * Converts a single Cascading value to the Avro object mandated by
     * {@code schema}. Primitive types pass through unchanged; complex types
     * are recursively converted.
     *
     * @throws AvroRuntimeException for schema types with no known conversion
     */
    protected static Object toAvro(Object obj, Schema schema) {
        switch (schema.getType()) {
        case ARRAY:
            return toAvroArray(obj, schema);
        case STRING:
            return obj.toString();
        case ENUM:
            return toAvroEnum(obj, schema);
        case FIXED:
            return toAvroFixed(obj, schema);
        case RECORD:
            // Nested records arrive either as a raw Tuple or a TupleEntry.
            Object[] objs;
            if (obj instanceof Tuple) {
                objs = parseTuple((Tuple) obj, schema);
            } else {
                objs = parseTupleEntry((TupleEntry) obj, schema);
            }
            Record record = new Record(schema);
            for (int i = 0; i < objs.length; i++) {
                record.put(i, objs[i]);
            }
            return record;
        case BYTES:
            // Decimals travel as BYTES with the "decimal" logical type.
            if (schema.getJsonProp("logicalType") != null
                    && schema.getJsonProp("logicalType").getValueAsText()
                            .equalsIgnoreCase("decimal")) {
                return decimalToBinary((BigDecimal) obj, schema);
            }
            // Fix: plain (or non-decimal logical-type) BYTES previously fell
            // through into the UNION case and threw a misleading error.
            return toAvroBytes(obj);
        case UNION:
            return toAvroUnion(obj, schema);
        case NULL:
        case BOOLEAN:
        case DOUBLE:
        case FLOAT:
        case INT:
        case LONG:
            return obj;
        default:
            throw new AvroRuntimeException("Can't convert from type "
                    + schema.getType().toString());
        }
    }

    /**
     * PRECISION_TO_BYTE_COUNT[p - 1] is the number of bytes needed for the
     * two's-complement unscaled value of a decimal of precision p (1..38).
     */
    public static final int PRECISION_TO_BYTE_COUNT[] = new int[38];

    static {
        for (int prec = 1; prec <= 38; prec++) {
            // ceil((bits to hold 10^prec - 1, plus a sign bit) / 8)
            PRECISION_TO_BYTE_COUNT[prec - 1] = (int) Math.ceil((Math.log(Math
                    .pow(10, prec) - 1) / Math.log(2) + 1) / 8);
        }
    }

    /**
     * Encodes a BigDecimal as the sign-extended, fixed-width big-endian
     * two's-complement byte buffer expected by the Avro/Hive decimal
     * representation. The target width is derived from the value's precision.
     *
     * <p>NOTE(review): assumes precision is within 1..38 and that rescaling
     * to the schema scale does not grow the unscaled value beyond the
     * estimated byte count — TODO confirm against producing components.
     */
    private static Object decimalToBinary(final BigDecimal bigDecimal,
            Schema schema) {
        int prec = bigDecimal.precision();
        int scale = schema.getJsonProp("scale").asInt();
        byte[] decimalBytes = bigDecimal.setScale(scale).unscaledValue()
                .toByteArray();
        // Estimated number of bytes needed.
        int precToBytes = PRECISION_TO_BYTE_COUNT[prec - 1];
        if (precToBytes == decimalBytes.length) {
            // No padding needed.
            return ByteBuffer.wrap(decimalBytes);
        }
        byte[] tgt = new byte[precToBytes];
        if (bigDecimal.signum() == -1) {
            // For negative number, initializing bits to 1
            for (int i = 0; i < precToBytes; i++) {
                tgt[i] |= 0xFF;
            }
        }
        // Padding leading zeroes/ones (sign extension).
        System.arraycopy(decimalBytes, 0, tgt, precToBytes
                - decimalBytes.length, decimalBytes.length);
        return ByteBuffer.wrap(tgt);
    }

    /** Converts a value to an Avro enum symbol via its string form. */
    protected static Object toAvroEnum(Object obj, Schema schema) {
        return new GenericData.EnumSymbol(schema, obj.toString());
    }

    /**
     * Converts a {@link BytesWritable} to an Avro {@link Fixed}, copying only
     * the valid prefix (getBytes() may return a larger backing array).
     */
    protected static Object toAvroFixed(Object obj, Schema schema) {
        BytesWritable bytes = (BytesWritable) obj;
        return new Fixed(schema, Arrays.copyOfRange(bytes.getBytes(), 0,
                bytes.getLength()));
    }

    /**
     * Converts a {@link BytesWritable} to a {@link ByteBuffer}, copying only
     * the valid prefix of the backing array.
     */
    protected static Object toAvroBytes(Object obj) {
        BytesWritable inBytes = (BytesWritable) obj;
        return ByteBuffer.wrap(Arrays.copyOfRange(inBytes.getBytes(), 0,
                inBytes.getLength()));
    }

    /**
     * Converts an {@link Iterable} to an Avro array, converting each element
     * with the schema's element type.
     *
     * @throws AvroRuntimeException if {@code obj} is not iterable
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    protected static Object toAvroArray(Object obj, Schema schema) {
        if (obj instanceof Iterable) {
            Schema elementSchema = schema.getElementType();
            List<Object> array = new ArrayList<Object>();
            for (Object element : (Iterable<Object>) obj) {
                array.add(toAvro(element, elementSchema));
            }
            return new GenericData.Array(schema, array);
        } else
            throw new AvroRuntimeException(
                    "Can't convert from non-iterable to array");
    }

    /**
     * Converts a value against a union schema. Only unions of exactly
     * [null, concrete] (in either order) or a single type are supported.
     */
    protected static Object toAvroUnion(Object obj, Schema schema) {
        if (obj == null) {
            return null;
        }
        List<Schema> types = schema.getTypes();
        if (types.size() < 1) {
            throw new AvroRuntimeException(
                    "Union in writer schema has no types");
        } else if (types.size() == 1) {
            return toAvro(obj, types.get(0));
        } else if (types.size() > 2) {
            throw new AvroRuntimeException(
                    "Unions may only consist of a concrete type and null in cascading.avro");
        } else if (!types.get(0).getType().equals(Type.NULL)
                && !types.get(1).getType().equals(Type.NULL)) {
            throw new AvroRuntimeException(
                    "Unions may only consist of a concrete type and null in cascading.avro");
        } else {
            // Exactly one branch is NULL; convert with the other one.
            int concreteIndex = (types.get(0).getType() == Type.NULL) ? 1 : 0;
            return toAvro(obj, types.get(concreteIndex));
        }
    }

    /**
     * Infers a record schema for {@code tupleEntry} by inspecting the runtime
     * class of each field value. Field names must be strings.
     */
    @SuppressWarnings("rawtypes")
    protected static Schema generateAvroSchemaFromTupleEntry(
            TupleEntry tupleEntry, String recordName, boolean isNullable) {
        Fields tupleFields = tupleEntry.getFields();
        List<Field> avroFields = new ArrayList<Field>();
        for (Comparable fieldName : tupleFields) {
            if (!(fieldName instanceof String)) {
                throw new AvroRuntimeException(
                        "Can't generate schema from non-string named fields");
            }
            Schema fieldSchema = generateAvroSchemaFromElement(
                    tupleEntry.getObject(fieldName), (String) fieldName,
                    isNullable);
            avroFields.add(new Field((String) fieldName, fieldSchema, null,
                    null));
        }
        Schema outputSchema = Schema.createRecord(recordName,
                "auto-generated by cascading.avro", null, false);
        outputSchema.setFields(avroFields);
        return outputSchema;
    }

    /**
     * Infers the Avro schema for a single value from its runtime type.
     * When {@code isNullable} is true, the result is wrapped in a
     * [null, type] union.
     *
     * @throws AvroRuntimeException for null values or unsupported types
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    protected static Schema generateAvroSchemaFromElement(Object element,
            String name, boolean isNullable) {
        if (element == null) {
            throw new AvroRuntimeException(
                    "Can't infer schema from null valued element");
        } else if (isNullable)
            return generateUnionSchema(element, name);
        else if (element instanceof TupleEntry)
            return generateAvroSchemaFromTupleEntry((TupleEntry) element, name,
                    isNullable);
        else if (element instanceof Map)
            return generateAvroSchemaFromMap((Map<String, Object>) element,
                    name);
        else if (element instanceof Iterable)
            return generateAvroSchemaFromIterable((Iterable) element, name);
        else if (element instanceof BytesWritable)
            return Schema.create(Type.BYTES);
        else if (element instanceof String)
            return Schema.create(Type.STRING);
        else if (element instanceof Double)
            return Schema.create(Type.DOUBLE);
        else if (element instanceof Float)
            return Schema.create(Type.FLOAT);
        else if (element instanceof Integer)
            return Schema.create(Type.INT);
        else if (element instanceof Long)
            return Schema.create(Type.LONG);
        else if (element instanceof Boolean)
            return Schema.create(Type.BOOLEAN);
        else
            throw new AvroRuntimeException("Can't create schema from type "
                    + element.getClass());
    }

    /**
     * Infers an array schema from the first element of a non-empty iterable.
     * All elements are assumed to share that type.
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    private static Schema generateAvroSchemaFromIterable(Iterable element,
            String name) {
        Iterator<Object> iterator = element.iterator();
        if (!iterator.hasNext()) {
            throw new AvroRuntimeException(
                    "Can't infer list schema from empty iterable");
        } else {
            Schema itemSchema = generateAvroSchemaFromElement(iterator.next(),
                    name + "ArrayElement", false);
            return Schema.createArray(itemSchema);
        }
    }

    /**
     * Infers a map schema from the first value of a non-empty map. All
     * values are assumed to share that type; keys are Avro strings.
     */
    private static Schema generateAvroSchemaFromMap(
            Map<String, Object> element, String name) {
        if (element.isEmpty()) {
            throw new AvroRuntimeException(
                    "Can't infer map schema from empty map");
        } else {
            Iterator<Object> iterator = element.values().iterator();
            Schema valueSchema = generateAvroSchemaFromElement(iterator.next(),
                    name + "MapValue", false);
            return Schema.createMap(valueSchema);
        }
    }

    /** Wraps the inferred schema of {@code element} in a [null, type] union. */
    private static Schema generateUnionSchema(Object element, String name) {
        List<Schema> types = new ArrayList<Schema>();
        types.add(Schema.create(Type.NULL));
        types.add(generateAvroSchemaFromElement(element, name, false));
        return Schema.createUnion(types);
    }

    /**
     * Builds an Avro record schema from parallel arrays of field metadata.
     * For List/Map fields, {@code schemeTypes} carries an extra entry for the
     * (primitive-only) element/value type, so it may be longer than
     * {@code schemeFields}.
     *
     * @param fieldPrecision per-type-entry decimal precision (-999 = unset)
     * @param fieldScale     per-type-entry decimal scale
     * @throws IllegalArgumentException on arity mismatch or non-primitive
     *                                  array/map element types
     */
    public static Schema generateAvroSchemaFromFieldsAndTypes(
            String recordName, Fields schemeFields, Class<?>[] schemeTypes,
            int[] fieldPrecision, int[] fieldScale) {
        if (schemeFields.size() == 0) {
            throw new IllegalArgumentException(
                    "There must be at least one field");
        }
        // Count logical fields: List/Map consume two schemeTypes entries.
        int schemeTypesSize = 0;
        for (int i = 0; i < schemeTypes.length; i++, schemeTypesSize++) {
            if ((schemeTypes[i] == List.class) || (schemeTypes[i] == Map.class)) {
                i++;
            }
        }
        if (schemeTypesSize != schemeFields.size()) {
            throw new IllegalArgumentException(
                    "You must have a schemeType for every field");
        }
        for (int i = 0; i < schemeTypes.length; i++) {
            if ((schemeTypes[i] == List.class) || (schemeTypes[i] == Map.class)) {
                ++i;
                if (!isPrimitiveType(schemeTypes[i])) {
                    throw new IllegalArgumentException(
                            "Only primitive types are allowed for an Array or Map");
                }
            }
        }
        return generateSchema(recordName, schemeFields, schemeTypes, 0,
                fieldPrecision, fieldScale);
    }

    /** Appends raw bytes to a tuple, wrapped as {@link BytesWritable}. */
    public static void addToTuple(Tuple t, byte[] bytes) {
        t.add(new BytesWritable(bytes));
    }

    /** Appends an enum to a tuple as its string name. */
    @SuppressWarnings("rawtypes")
    public static void addToTuple(Tuple t, Enum e) {
        t.add(e.toString());
    }

    /** Appends a list to a tuple as a nested tuple of its elements. */
    public static void addToTuple(Tuple t, List<?> list) {
        Tuple listTuple = new Tuple();
        for (Object item : list) {
            listTuple.add(item);
        }
        t.add(listTuple);
    }

    /**
     * Appends a map to a tuple as a nested tuple of alternating
     * key/value entries.
     */
    public static void addToTuple(Tuple t, Map<String, ?> map) {
        Tuple mapTuple = new Tuple();
        for (Map.Entry<String, ?> entry : map.entrySet()) {
            mapTuple.add(entry.getKey());
            mapTuple.add(entry.getValue());
        }
        t.add(mapTuple);
    }

    /** True for the types permitted as Array/Map element types. */
    private static boolean isPrimitiveType(Class<?> arrayType) {
        // only primitive types are allowed for arrays
        return (arrayType == Boolean.class || arrayType == Integer.class
                || arrayType == Long.class || arrayType == Float.class
                || arrayType == Double.class || arrayType == String.class
                || arrayType == BytesWritable.class);
    }

    /**
     * Creates a record schema where every field is a nullable union
     * [null, type]. Since arrays and maps are supported, nested records may
     * be generated; those get a depth suffix to keep record names unique.
     */
    private static Schema generateSchema(String recordName,
            Fields schemeFields, Class<?>[] schemeTypes, int depth,
            int[] fieldPrecision, int[] fieldScale) {
        List<Field> fields = new ArrayList<Field>();
        // typeIndex walks schemeTypes (List/Map use two slots); fieldIndex
        // walks schemeFields one per logical field.
        for (int typeIndex = 0, fieldIndex = 0; typeIndex < schemeTypes.length; typeIndex++, fieldIndex++) {
            String fieldName = schemeFields.get(fieldIndex).toString();
            // At most 2 entries, since only primitive element types are
            // allowed for arrays and maps.
            Class<?>[] subSchemeTypes = new Class[2];
            subSchemeTypes[0] = schemeTypes[typeIndex];
            if ((schemeTypes[typeIndex] == List.class)
                    || (schemeTypes[typeIndex] == Map.class)) {
                typeIndex++;
                subSchemeTypes[1] = schemeTypes[typeIndex];
            }
            final Schema schema = createAvroSchema(recordName, schemeFields,
                    subSchemeTypes, depth + 1, fieldPrecision[typeIndex],
                    fieldScale[typeIndex]);
            final Schema nullSchema = Schema.create(Type.NULL);
            List<Schema> schemas = new LinkedList<Schema>() {
                {
                    add(nullSchema);
                    add(schema);
                }
            };
            fields.add(new Field(fieldName, Schema.createUnion(schemas),
                    "", null));
        }
        // Avro doesn't like anonymous records - so create a named one.
        if (depth > 0) {
            recordName = recordName + depth;
        }
        Schema schema = Schema.createRecord(recordName, "auto generated", "",
                false);
        schema.setFields(fields);
        return schema;
    }

    /**
     * Creates the schema for one field. {@code fieldTypes[0]} is the field's
     * type; for List/Map fields {@code fieldTypes[1]} carries the
     * element/value type. Date and BigDecimal are emitted with their
     * respective Avro logical types.
     */
    private static Schema createAvroSchema(String recordName,
            Fields schemeFields, Class<?>[] fieldTypes, int depth,
            int fieldPrecision, int fieldScale) {
        Map<Class<?>, Type> avroType = toAvroSchemaType(fieldTypes[0]);
        int remainingFields = schemeFields.size() - 1;
        if (avroType.get(fieldTypes[0]) == Type.ARRAY) {
            Schema schema;
            if (remainingFields == 0) {
                // Fix: the element schema must be looked up by the element
                // class (fieldTypes[1]); the old lookup used a Type as a
                // Class key and always produced null.
                schema = Schema.createArray(Schema
                        .create(toAvroSchemaType(fieldTypes[1]).get(
                                fieldTypes[1])));
            } else {
                Class<?> arrayTypes[] = { fieldTypes[1] };
                schema = Schema.createArray(createAvroSchema(recordName,
                        Fields.offsetSelector(schemeFields.size() - 1, 1),
                        arrayTypes, depth + 1, fieldPrecision, fieldScale));
            }
            return schema;
        } else if (avroType.get(fieldTypes[0]) == Type.MAP) {
            Schema schema;
            if (remainingFields == 0) {
                // Fix: same element-type lookup correction as for ARRAY.
                schema = Schema.createMap(Schema
                        .create(toAvroSchemaType(fieldTypes[1]).get(
                                fieldTypes[1])));
            } else {
                Class<?> mapTypes[] = { fieldTypes[1] };
                schema = Schema.createMap(createAvroSchema(recordName,
                        Fields.offsetSelector(schemeFields.size() - 1, 1),
                        mapTypes, depth + 1, fieldPrecision, fieldScale));
            }
            return schema;
        } else if (avroType.get(fieldTypes[0]) == Type.ENUM) {
            Class<?> clazz = fieldTypes[0];
            Object[] names = clazz.getEnumConstants();
            List<String> enumNames = new ArrayList<String>(names.length);
            for (Object name : names) {
                enumNames.add(name.toString());
            }
            return Schema.createEnum(fieldTypes[0].getName(), null, null,
                    enumNames);
        } else if (fieldTypes[0] == Date.class) {
            // Dates travel as Avro long with the Hive "date" logical type.
            return AvroSchemaUtils.getSchemaFor("{" + "\"type\":\""
                    + AvroSerDe.AVRO_LONG_TYPE_NAME + "\","
                    + "\"logicalType\":\"" + AvroSerDe.DATE_TYPE_NAME + "\"}");
        } else if (fieldTypes[0] == BigDecimal.class) {
            // Fix: precision was previously built from fieldScale when set,
            // yielding a wrong decimal schema whenever precision != scale.
            // (-999 is the "not specified" sentinel and stringifies as-is.)
            String precision = String.valueOf(fieldPrecision);
            String scale = String.valueOf(fieldScale);
            return AvroSchemaUtils.getSchemaFor("{" + "\"type\":\"bytes\","
                    + "\"logicalType\":\"decimal\"," + "\"precision\":"
                    + precision + "," + "\"scale\":" + scale + "}");
        } else {
            return Schema.create(avroType.get(fieldTypes[0]));
        }
    }

    /**
     * Returns the class-to-Avro-type table after verifying {@code clazz} is
     * supported.
     *
     * @throws UnsupportedOperationException for unmapped classes
     */
    private static Map<Class<?>, Type> toAvroSchemaType(Class<?> clazz) {
        if (TYPE_MAP.containsKey(clazz)) {
            return TYPE_MAP;
        } else {
            throw new UnsupportedOperationException("The class type " + clazz
                    + " is currently unsupported");
        }
    }
}