/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.data.spi;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.JsonMappingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.BinaryNode;
import com.fasterxml.jackson.databind.node.BooleanNode;
import com.fasterxml.jackson.databind.node.MissingNode;
import com.fasterxml.jackson.databind.node.NullNode;
import com.fasterxml.jackson.databind.node.NumericNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.databind.node.TextNode;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.kitesdk.data.DatasetIOException;
import org.kitesdk.data.DatasetRecordException;
import org.kitesdk.data.ValidationException;
public class JsonUtil {
// Shared Jackson factory; parser(InputStream) uses it to create streaming parsers.
private static final JsonFactory FACTORY = new JsonFactory();
/**
 * Creates an iterator over the JSON values contained in a stream, with each
 * value read as a {@link JsonNode} tree.
 *
 * @param stream the stream to read JSON values from
 * @return an iterator that yields one {@link JsonNode} per JSON value
 * @throws DatasetIOException if creating the parser or the value iterator
 *                            fails with an {@link IOException}
 */
public static Iterator<JsonNode> parser(final InputStream stream) {
  final JsonParser jsonParser;
  try {
    jsonParser = FACTORY.createParser(stream);
    // a codec is required before values can be bound to JsonNode
    jsonParser.setCodec(new ObjectMapper());
    Iterator<JsonNode> nodes = jsonParser.readValuesAs(JsonNode.class);
    return nodes;
  } catch (IOException e) {
    throw new DatasetIOException("Cannot read from stream", e);
  }
}
/**
 * Parses a JSON string into a {@link JsonNode} tree.
 *
 * @param json the JSON text to parse
 * @return the parsed tree
 */
public static JsonNode parse(String json) {
  Class<JsonNode> treeType = JsonNode.class;
  return parse(json, treeType);
}
/**
 * Parses a JSON string and binds it to an instance of {@code returnType}.
 *
 * @param json       the JSON text to parse
 * @param returnType the class to bind the parsed content to
 * @param <T>        the type of the returned object
 * @return the parsed content as a {@code T}
 * @throws ValidationException if Jackson reports a parse or mapping error
 * @throws DatasetIOException  if any other {@link IOException} is raised
 */
public static <T> T parse(String json, Class<T> returnType) {
  try {
    ObjectMapper objectMapper = new ObjectMapper();
    T parsed = objectMapper.readValue(json, returnType);
    return parsed;
  } catch (JsonMappingException e) {
    throw new ValidationException("Invalid JSON", e);
  } catch (JsonParseException e) {
    throw new ValidationException("Invalid JSON", e);
  } catch (IOException e) {
    throw new DatasetIOException("Cannot initialize JSON parser", e);
  }
}
/**
 * Parses the JSON content of a file into a {@link JsonNode} tree.
 *
 * @param file the file to read JSON from
 * @return the parsed tree
 */
public static JsonNode parse(File file) {
  Class<JsonNode> treeType = JsonNode.class;
  return parse(file, treeType);
}
/**
 * Parses the JSON content of a file and binds it to an instance of
 * {@code returnType}.
 *
 * @param file       the file to read JSON from
 * @param returnType the class to bind the parsed content to
 * @param <T>        the type of the returned object
 * @return the parsed content as a {@code T}
 * @throws ValidationException if Jackson reports a parse or mapping error
 * @throws DatasetIOException  if any other {@link IOException} is raised
 */
public static <T> T parse(File file, Class<T> returnType) {
  try {
    ObjectMapper objectMapper = new ObjectMapper();
    T parsed = objectMapper.readValue(file, returnType);
    return parsed;
  } catch (JsonMappingException e) {
    throw new ValidationException("Invalid JSON", e);
  } catch (JsonParseException e) {
    throw new ValidationException("Invalid JSON", e);
  } catch (IOException e) {
    throw new DatasetIOException("Cannot initialize JSON parser", e);
  }
}
/**
 * Parses the JSON content of a stream into a {@link JsonNode} tree.
 *
 * @param in the stream to read JSON from
 * @return the parsed tree
 */
public static JsonNode parse(InputStream in) {
  Class<JsonNode> treeType = JsonNode.class;
  return parse(in, treeType);
}
/**
 * Parses the JSON content of a stream and binds it to an instance of
 * {@code returnType}.
 *
 * @param in         the stream to read JSON from
 * @param returnType the class to bind the parsed content to
 * @param <T>        the type of the returned object
 * @return the parsed content as a {@code T}
 * @throws ValidationException if Jackson reports a parse or mapping error
 * @throws DatasetIOException  if any other {@link IOException} is raised
 */
public static <T> T parse(InputStream in, Class<T> returnType) {
  try {
    ObjectMapper objectMapper = new ObjectMapper();
    T parsed = objectMapper.readValue(in, returnType);
    return parsed;
  } catch (JsonMappingException e) {
    throw new ValidationException("Invalid JSON", e);
  } catch (JsonParseException e) {
    throw new ValidationException("Invalid JSON", e);
  } catch (IOException e) {
    throw new DatasetIOException("Cannot initialize JSON parser", e);
  }
}
/**
 * Base class for callbacks invoked by {@code visit(JsonNode, JsonTreeVisitor)}.
 * <p>
 * Every callback returns {@code null} by default, so subclasses only override
 * the node kinds they care about.
 *
 * @param <T> the result type produced for each visited node
 */
public abstract static class JsonTreeVisitor<T> {
  /**
   * Names of the object fields currently being descended into; the most
   * recently entered field is at the head of the list.
   */
  protected LinkedList<String> recordLevels = new LinkedList<String>();

  /**
   * Called for a JSON object after all of its field values were visited.
   *
   * @param object the object node
   * @param fields the visit result for each field, keyed by field name
   * @return the result for the object; {@code null} by default
   */
  public T object(ObjectNode object, Map<String, T> fields) {
    return null;
  }

  /**
   * Called for a JSON array after all of its elements were visited.
   *
   * @param array    the array node
   * @param elements the visit result for each element, in array order
   * @return the result for the array; {@code null} by default
   */
  public T array(ArrayNode array, List<T> elements) {
    return null;
  }

  /** Called for a binary node; returns {@code null} by default. */
  public T binary(BinaryNode binary) {
    return null;
  }

  /** Called for a text node; returns {@code null} by default. */
  public T text(TextNode text) {
    return null;
  }

  /** Called for a numeric node; returns {@code null} by default. */
  public T number(NumericNode number) {
    return null;
  }

  /** Called for a boolean node; returns {@code null} by default. */
  public T bool(BooleanNode bool) {
    return null;
  }

  /** Called for a missing node; returns {@code null} by default. */
  public T missing(MissingNode missing) {
    return null;
  }

  /** Called for a JSON null node; returns {@code null} by default. */
  public T nullNode(NullNode nullNode) {
    return null;
  }
}
/**
 * Walks a JSON tree and reports each node to {@code visitor}, children first.
 * <p>
 * For objects, every field value is visited before {@code visitor.object} is
 * called; while a field value is being visited, the field's name is pushed
 * onto {@code visitor.recordLevels} and it is popped again afterwards. Array
 * elements are visited in order before {@code visitor.array} is called.
 *
 * @param node    the root of the (sub)tree to visit
 * @param visitor the callbacks to invoke
 * @param <T>     the result type produced by the visitor
 * @return the value returned by the visitor callback for {@code node}
 * @throws IllegalArgumentException if the node's reported type does not match
 *                                  its class, or the node type is unknown
 */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(
    value="BC_UNCONFIRMED_CAST",
    justification="Uses precondition to validate casts")
public static <T> T visit(JsonNode node, JsonTreeVisitor<T> visitor) {
  switch (node.getNodeType()) {
    case OBJECT: {
      Preconditions.checkArgument(node instanceof ObjectNode,
          "Expected instance of ObjectNode: " + node);
      // a LinkedHashMap keeps the results in the object's field order
      Map<String, T> visitedFields = Maps.newLinkedHashMap();
      for (Iterator<Map.Entry<String, JsonNode>> it = node.fields();
           it.hasNext();) {
        Map.Entry<String, JsonNode> field = it.next();
        String fieldName = field.getKey();
        visitor.recordLevels.push(fieldName);
        T visited = visit(field.getValue(), visitor);
        visitedFields.put(fieldName, visited);
        visitor.recordLevels.pop();
      }
      ObjectNode objectNode = (ObjectNode) node;
      return visitor.object(objectNode, visitedFields);
    }

    case ARRAY: {
      Preconditions.checkArgument(node instanceof ArrayNode,
          "Expected instance of ArrayNode: " + node);
      List<T> visitedElements =
          Lists.newArrayListWithExpectedSize(node.size());
      for (JsonNode child : node) {
        T visited = visit(child, visitor);
        visitedElements.add(visited);
      }
      ArrayNode arrayNode = (ArrayNode) node;
      return visitor.array(arrayNode, visitedElements);
    }

    case BINARY: {
      Preconditions.checkArgument(node instanceof BinaryNode,
          "Expected instance of BinaryNode: " + node);
      BinaryNode binaryNode = (BinaryNode) node;
      return visitor.binary(binaryNode);
    }

    case STRING: {
      Preconditions.checkArgument(node instanceof TextNode,
          "Expected instance of TextNode: " + node);
      TextNode textNode = (TextNode) node;
      return visitor.text(textNode);
    }

    case NUMBER: {
      Preconditions.checkArgument(node instanceof NumericNode,
          "Expected instance of NumericNode: " + node);
      NumericNode numericNode = (NumericNode) node;
      return visitor.number(numericNode);
    }

    case BOOLEAN: {
      Preconditions.checkArgument(node instanceof BooleanNode,
          "Expected instance of BooleanNode: " + node);
      BooleanNode booleanNode = (BooleanNode) node;
      return visitor.bool(booleanNode);
    }

    case MISSING: {
      Preconditions.checkArgument(node instanceof MissingNode,
          "Expected instance of MissingNode: " + node);
      MissingNode missingNode = (MissingNode) node;
      return visitor.missing(missingNode);
    }

    case NULL: {
      Preconditions.checkArgument(node instanceof NullNode,
          "Expected instance of NullNode: " + node);
      NullNode nullNode = (NullNode) node;
      return visitor.nullNode(nullNode);
    }

    default:
      throw new IllegalArgumentException(
          "Unknown node type: " + node.getNodeType() + ": " + node);
  }
}
/**
 * Converts a parsed JSON value to the Avro in-memory representation described
 * by {@code schema}, using {@code model} to create records, enums, and fixed
 * values.
 * <p>
 * A {@code null} datum converts to {@code null} without consulting the
 * schema, and a NULL schema always produces {@code null}. Unions are resolved
 * to a single branch with {@code resolveUnion} before converting.
 *
 * @param model  the Avro data model used to instantiate values
 * @param datum  the JSON value to convert; may be {@code null}
 * @param schema the Avro schema the result must conform to
 * @return the converted value, or {@code null} for a {@code null} datum or a
 *         NULL schema
 * @throws DatasetRecordException   if the datum's JSON type is not accepted
 *                                  for the schema type, the binary content
 *                                  cannot be read, or a fixed value has fewer
 *                                  bytes than the schema's fixed size
 * @throws IllegalArgumentException if the schema type is not handled
 */
public static Object convertToAvro(GenericData model, JsonNode datum,
                                   Schema schema) {
  if (datum == null) {
    return null;
  }
  switch (schema.getType()) {
    case RECORD:
      DatasetRecordException.check(datum.isObject(),
          "Cannot convert non-object to record: %s", datum);
      Object record = model.newRecord(null, schema);
      for (Schema.Field field : schema.getFields()) {
        // convertField also handles defaults and adds the field name to errors
        model.setField(record, field.name(), field.pos(),
            convertField(model, datum.get(field.name()), field));
      }
      return record;
    case MAP:
      DatasetRecordException.check(datum.isObject(),
          "Cannot convert non-object to map: %s", datum);
      // LinkedHashMap keeps the entries in the JSON object's field order
      Map<String, Object> map = Maps.newLinkedHashMap();
      Iterator<Map.Entry<String, JsonNode>> iter = datum.fields();
      while (iter.hasNext()) {
        Map.Entry<String, JsonNode> entry = iter.next();
        map.put(entry.getKey(), convertToAvro(
            model, entry.getValue(), schema.getValueType()));
      }
      return map;
    case ARRAY:
      DatasetRecordException.check(datum.isArray(),
          "Cannot convert to array: %s", datum);
      List<Object> list = Lists.newArrayListWithExpectedSize(datum.size());
      for (JsonNode element : datum) {
        list.add(convertToAvro(model, element, schema.getElementType()));
      }
      return list;
    case UNION:
      return convertToAvro(model, datum,
          resolveUnion(datum, schema.getTypes()));
    case BOOLEAN:
      DatasetRecordException.check(datum.isBoolean(),
          "Cannot convert to boolean: %s", datum);
      return datum.booleanValue();
    case FLOAT:
      DatasetRecordException.check(datum.isFloat() || datum.isInt(),
          "Cannot convert to float: %s", datum);
      return datum.floatValue();
    case DOUBLE:
      DatasetRecordException.check(
          datum.isDouble() || datum.isFloat() ||
          datum.isLong() || datum.isInt(),
          "Cannot convert to double: %s", datum);
      return datum.doubleValue();
    case INT:
      DatasetRecordException.check(datum.isInt(),
          "Cannot convert to int: %s", datum);
      return datum.intValue();
    case LONG:
      DatasetRecordException.check(datum.isLong() || datum.isInt(),
          "Cannot convert to long: %s", datum);
      return datum.longValue();
    case STRING:
      DatasetRecordException.check(datum.isTextual(),
          "Cannot convert to string: %s", datum);
      return datum.textValue();
    case ENUM:
      DatasetRecordException.check(datum.isTextual(),
          "Cannot convert to string: %s", datum);
      return model.createEnum(datum.textValue(), schema);
    case BYTES:
      DatasetRecordException.check(datum.isBinary(),
          "Cannot convert to binary: %s", datum);
      try {
        return ByteBuffer.wrap(datum.binaryValue());
      } catch (IOException e) {
        throw new DatasetRecordException("Failed to read JSON binary", e);
      }
    case FIXED:
      DatasetRecordException.check(datum.isBinary(),
          "Cannot convert to fixed: %s", datum);
      byte[] bytes;
      try {
        bytes = datum.binaryValue();
      } catch (IOException e) {
        throw new DatasetRecordException("Failed to read JSON binary", e);
      }
      // The check passes when its condition holds, so the condition must
      // describe valid data: at least as many bytes as the fixed size.
      DatasetRecordException.check(bytes.length >= schema.getFixedSize(),
          "Binary data is too short: %s bytes for %s", bytes.length, schema);
      return model.createFixed(null, bytes, schema);
    case NULL:
      return null;
    default:
      // don't use DatasetRecordException because this is a Schema problem
      throw new IllegalArgumentException("Unknown schema type: " + schema);
  }
}
/**
 * Converts the JSON value of one record field, falling back to the field's
 * default when the conversion yields {@code null} and the field's schema does
 * not allow null.
 *
 * @param model the Avro data model used to instantiate values
 * @param datum the JSON value for the field; may be {@code null}
 * @param field the record field being populated
 * @return the converted value or the field's default value
 * @throws DatasetRecordException if conversion fails; the message names the
 *                                field and the original exception is the cause
 */
private static Object convertField(GenericData model, JsonNode datum,
                                   Schema.Field field) {
  final Schema fieldSchema = field.schema();
  try {
    Object converted = convertToAvro(model, datum, fieldSchema);
    if (converted == null && !SchemaUtil.nullOk(fieldSchema)) {
      // null is not a legal value here, so use the default instead
      return model.getDefaultValue(field);
    }
    return converted;
  } catch (DatasetRecordException e) {
    // add the field name to the error message
    String message = String.format("Cannot convert field %s", field.name());
    throw new DatasetRecordException(message, e);
  } catch (AvroRuntimeException e) {
    String message = String.format(
        "Field %s: cannot make %s value: '%s'",
        field.name(), fieldSchema, String.valueOf(datum));
    throw new DatasetRecordException(message, e);
  }
}
/**
 * Selects the union branch to use for a JSON value.
 * <p>
 * Null, numeric, and boolean data is first matched against the union's
 * primitive branches (those with a type in {@code PRIMITIVES}), trying the
 * candidate types in the order listed for the datum's kind. If that finds
 * nothing, the first remaining branch accepted by {@code matches} is used.
 *
 * @param datum   the JSON value being converted; may be {@code null}
 * @param schemas the branches of the union
 * @return the schema of the selected branch
 * @throws DatasetRecordException if no branch fits the datum
 */
private static Schema resolveUnion(JsonNode datum, Collection<Schema> schemas) {
  // split the branches into primitive types and everything else
  Set<Schema.Type> primitiveTypes = Sets.newHashSet();
  List<Schema> otherBranches = Lists.newArrayList();
  for (Schema branch : schemas) {
    Schema.Type branchType = branch.getType();
    if (PRIMITIVES.containsKey(branchType)) {
      primitiveTypes.add(branchType);
    } else {
      otherBranches.add(branch);
    }
  }

  // Try to identify specific primitive types
  final Schema primitiveMatch;
  if (datum == null || datum.isNull()) {
    primitiveMatch = closestPrimitive(primitiveTypes, Schema.Type.NULL);
  } else if (datum.isShort() || datum.isInt()) {
    primitiveMatch = closestPrimitive(primitiveTypes,
        Schema.Type.INT, Schema.Type.LONG,
        Schema.Type.FLOAT, Schema.Type.DOUBLE);
  } else if (datum.isLong()) {
    primitiveMatch = closestPrimitive(primitiveTypes,
        Schema.Type.LONG, Schema.Type.DOUBLE);
  } else if (datum.isFloat()) {
    primitiveMatch = closestPrimitive(primitiveTypes,
        Schema.Type.FLOAT, Schema.Type.DOUBLE);
  } else if (datum.isDouble()) {
    primitiveMatch = closestPrimitive(primitiveTypes, Schema.Type.DOUBLE);
  } else if (datum.isBoolean()) {
    primitiveMatch = closestPrimitive(primitiveTypes, Schema.Type.BOOLEAN);
  } else {
    primitiveMatch = null;
  }
  if (primitiveMatch != null) {
    return primitiveMatch;
  }

  // otherwise, select the first schema that matches the datum
  for (Schema branch : otherBranches) {
    if (matches(datum, branch)) {
      return branch;
    }
  }

  String message = String.format(
      "Cannot resolve union: %s not in %s", datum, schemas);
  throw new DatasetRecordException(message);
}
// Schema instances for the primitive types that resolveUnion can pick directly
// from the kind of a JSON datum. This does not contain string, bytes, or fixed
// because the datum type doesn't necessarily determine the schema.
// Declared final: the table is a constant and must never be reassigned.
private static final ImmutableMap<Schema.Type, Schema> PRIMITIVES = ImmutableMap
    .<Schema.Type, Schema>builder()
    .put(Schema.Type.NULL, Schema.create(Schema.Type.NULL))
    .put(Schema.Type.BOOLEAN, Schema.create(Schema.Type.BOOLEAN))
    .put(Schema.Type.INT, Schema.create(Schema.Type.INT))
    .put(Schema.Type.LONG, Schema.create(Schema.Type.LONG))
    .put(Schema.Type.FLOAT, Schema.create(Schema.Type.FLOAT))
    .put(Schema.Type.DOUBLE, Schema.create(Schema.Type.DOUBLE))
    .build();
/**
 * Returns the schema for the first candidate type that is both allowed and
 * present in {@code PRIMITIVES}.
 *
 * @param possible the primitive types that may be selected
 * @param types    candidate types, in order of preference
 * @return the schema of the first acceptable candidate, or {@code null} if
 *         none of the candidates is acceptable
 */
private static Schema closestPrimitive(Set<Schema.Type> possible, Schema.Type... types) {
  for (Schema.Type candidate : types) {
    if (!possible.contains(candidate)) {
      continue;
    }
    // the immutable map holds no null values, so null means "no entry"
    Schema primitive = PRIMITIVES.get(candidate);
    if (primitive != null) {
      return primitive;
    }
  }
  return null;
}
/**
 * Tests whether a JSON value has a shape that can be converted using
 * {@code schema}.
 * <p>
 * The test is shallow for containers: any JSON object matches a map and any
 * JSON array matches an array. An object matches a record when every record
 * field is either present in the object or has a default value.
 *
 * @param datum  the JSON value to test
 * @param schema the candidate schema
 * @return {@code true} if the datum fits the schema
 * @throws IllegalArgumentException if the schema type is not handled
 */
private static boolean matches(JsonNode datum, Schema schema) {
  switch (schema.getType()) {
    case RECORD:
      if (!datum.isObject()) {
        return false;
      }
      // check that each field is present or has a default
      for (Schema.Field field : schema.getFields()) {
        if (!datum.has(field.name()) && field.defaultValue() == null) {
          return false;
        }
      }
      return true;

    case UNION:
      return resolveUnion(datum, schema.getTypes()) != null;

    case MAP:
      return datum.isObject();

    case ARRAY:
      return datum.isArray();

    case BOOLEAN:
      return datum.isBoolean();

    case FLOAT:
      return datum.isFloat() || datum.isInt();

    case DOUBLE:
      return datum.isDouble() || datum.isFloat() ||
          datum.isLong() || datum.isInt();

    case INT:
      return datum.isInt();

    case LONG:
      return datum.isLong() || datum.isInt();

    case STRING:
      return datum.isTextual();

    case ENUM:
      return datum.isTextual() && schema.hasEnumSymbol(datum.textValue());

    case BYTES:
    case FIXED:
      return datum.isBinary();

    case NULL:
      return datum == null || datum.isNull();

    default: // UNION or unknown
      throw new IllegalArgumentException("Unsupported schema: " + schema);
  }
}
/**
 * Infers an Avro schema from the JSON values in a stream by inferring a
 * schema for each value and merging the results.
 * <p>
 * The first value is always used; further values are merged in while the
 * stream has more values and fewer than {@code numRecords} have been used.
 *
 * @param incoming   the stream of JSON values
 * @param name       the name passed to the per-value schema inference
 * @param numRecords the maximum number of values to examine
 * @return the merged schema, or {@code null} if the stream has no values
 */
public static Schema inferSchema(InputStream incoming, final String name,
                                 int numRecords) {
  Function<JsonNode, Schema> toSchema = new Function<JsonNode, Schema>() {
    @Override
    public Schema apply(JsonNode node) {
      return inferSchema(node, name);
    }
  };
  Iterator<Schema> recordSchemas =
      Iterators.transform(parser(incoming), toSchema);

  if (!recordSchemas.hasNext()) {
    return null;
  }

  Schema merged = recordSchemas.next();
  int used = 1;
  // hasNext() is evaluated first, exactly as many times as before
  while (recordSchemas.hasNext() && used < numRecords) {
    merged = SchemaUtil.merge(merged, recordSchemas.next());
    used += 1;
  }
  return merged;
}
/**
 * Infers an Avro schema for a JSON tree, representing JSON objects as
 * records.
 *
 * @param node the JSON tree to inspect
 * @param name the name given to the top-level record
 * @return the inferred schema
 */
public static Schema inferSchema(JsonNode node, String name) {
  JsonSchemaVisitor schemaVisitor = new JsonSchemaVisitor(name);
  return visit(node, schemaVisitor);
}
/**
 * Infers an Avro schema for a JSON tree, representing nested JSON objects as
 * maps. A top-level object is still represented as a record.
 *
 * @param node the JSON tree to inspect
 * @param name the name given to the top-level record
 * @return the inferred schema
 */
public static Schema inferSchemaWithMaps(JsonNode node, String name) {
  JsonSchemaVisitor schemaVisitor = new JsonSchemaVisitor(name).useMaps();
  return visit(node, schemaVisitor);
}
/**
 * Visitor that builds an Avro schema describing the visited JSON tree.
 * <p>
 * JSON objects become records unless {@link #useMaps()} was called, in which
 * case only the top-level object becomes a record and nested objects become
 * maps.
 */
private static class JsonSchemaVisitor extends JsonTreeVisitor<Schema> {
  private static final Joiner DOT = Joiner.on('.');

  // name used for the record created when no field has been entered yet
  private final String name;
  // when false, objects below the top level are translated to maps
  private boolean objectsToRecords = true;

  public JsonSchemaVisitor(String name) {
    this.name = name;
  }

  /**
   * Switches this visitor to represent nested objects as maps.
   *
   * @return this visitor, for chaining
   */
  public JsonSchemaVisitor useMaps() {
    objectsToRecords = false;
    return this;
  }

  /**
   * Picks the schema shared by a group of values: NULL for an empty group,
   * the only schema for a single value, otherwise the merge or union of all.
   */
  private static Schema commonSchema(Collection<Schema> schemas) {
    int count = schemas.size();
    if (count == 0) {
      return Schema.create(Schema.Type.NULL);
    }
    if (count == 1) {
      return Iterables.getOnlyElement(schemas);
    }
    return SchemaUtil.mergeOrUnion(schemas);
  }

  @Override
  public Schema object(ObjectNode object, Map<String, Schema> fields) {
    boolean atTopLevel = recordLevels.size() < 1;

    if (!objectsToRecords && !atTopLevel) {
      // translate to a map whose value type covers every field's schema
      return Schema.createMap(commonSchema(fields.values()));
    }

    List<Schema.Field> recordFields = Lists.newArrayListWithExpectedSize(
        fields.size());
    for (Map.Entry<String, Schema> field : fields.entrySet()) {
      String fieldName = field.getKey();
      String doc = "Type inferred from '" + object.get(fieldName) + "'";
      recordFields.add(new Schema.Field(
          fieldName, field.getValue(), doc, null));
    }

    // nested records are named after the field names currently being visited
    String recordName = atTopLevel ? name : DOT.join(recordLevels);
    Schema recordSchema = Schema.createRecord(recordName, null, null, false);
    recordSchema.setFields(recordFields);
    return recordSchema;
  }

  @Override
  public Schema array(ArrayNode ignored, List<Schema> elementSchemas) {
    return Schema.createArray(commonSchema(elementSchemas));
  }

  @Override
  public Schema binary(BinaryNode ignored) {
    return Schema.create(Schema.Type.BYTES);
  }

  @Override
  public Schema text(TextNode ignored) {
    return Schema.create(Schema.Type.STRING);
  }

  @Override
  public Schema number(NumericNode number) {
    if (number.isInt()) {
      return Schema.create(Schema.Type.INT);
    }
    if (number.isLong()) {
      return Schema.create(Schema.Type.LONG);
    }
    if (number.isFloat()) {
      return Schema.create(Schema.Type.FLOAT);
    }
    if (number.isDouble()) {
      return Schema.create(Schema.Type.DOUBLE);
    }
    throw new UnsupportedOperationException(
        number.getClass().getName() + " is not supported");
  }

  @Override
  public Schema bool(BooleanNode ignored) {
    return Schema.create(Schema.Type.BOOLEAN);
  }

  @Override
  public Schema nullNode(NullNode ignored) {
    return Schema.create(Schema.Type.NULL);
  }

  @Override
  public Schema missing(MissingNode ignored) {
    throw new UnsupportedOperationException("MissingNode is not supported.");
  }
}
}