package tap.formats.avro; import java.io.IOException; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Set; import java.util.TreeSet; import org.apache.avro.Schema; import org.apache.avro.Schema.Field; import org.apache.avro.generic.GenericArray; import org.apache.avro.generic.GenericContainer; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; import org.codehaus.jackson.JsonNode; import org.codehaus.jackson.map.ObjectMapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class JsonToGenericRecord { private static ObjectMapper m = new ObjectMapper(); private static final Logger LOG = LoggerFactory .getLogger(JsonToGenericRecord.class); public static GenericContainer jsonToRecord(String line, Schema schema) throws IOException { JsonNode rootNode = m.readTree(line); if (rootNode.isArray()) { GenericArray<GenericRecord> arr = new GenericData.Array<GenericRecord>( rootNode.size(), schema); for (int i = 0; i < rootNode.size(); i++) { arr.add(recordFromNode(rootNode.get(i), schema, line)); } return arr; } else if (rootNode.isObject()) { GenericRecord r = recordFromNode(rootNode, schema, line); return r; } else { LOG.debug("no container?"); } return null; } private static Map<Schema, Set<String>> errors = new HashMap<Schema, Set<String>>(); private static GenericRecord recordFromNode(JsonNode node, Schema schema, String container) { schema = getObject(schema); GenericRecord r = new GenericData.Record(schema); Iterator<JsonNode> it = node.getElements(); Iterator<String> itn = node.getFieldNames(); while (it.hasNext()) { JsonNode child = it.next(); String name = replaceInvalidNameChars(itn.next()); Field field = schema.getField(name); if (field == null) { if (addToErrors(schema, name)) LOG.warn("skipping unmapped field " + name + " contained in " + container); continue; } Schema childSchema = field.schema(); if (child.isArray()) { GenericArray<Object> o = recordFromArrayNode(child, childSchema, name); r.put(name, o); } else if (child.isObject()) { GenericRecord o = recordFromNode(child, childSchema, name); r.put(name, o); } else if (child.isInt()) { if (acceptsInt(childSchema)) { r.put(name, child.getIntValue()); } else if (acceptsLong(childSchema)) { r.put(name, (long) child.getIntValue()); } else if (acceptsFloat(childSchema)) { r.put(name, (float) child.getIntValue()); } else if (acceptsDouble(childSchema)) { r.put(name, (double) child.getIntValue()); } else { if (addToErrors(schema, name)) LOG.error("Can't store an int in field " + name); } } else if (child.isLong()) { if (acceptsLong(childSchema)) { r.put(name, (long) child.getLongValue()); } else if (acceptsFloat(childSchema)) { r.put(name, (float) child.getLongValue()); } else if (acceptsDouble(childSchema)) { r.put(name, (double) child.getLongValue()); } else { if (addToErrors(schema, name)) LOG.error("Can't store a long in field " + name); } } else if (child.isBinary()) { try { r.put(name, child.getBinaryValue()); } catch (IOException e) { throw new RuntimeException(e); } } else if (child.isBoolean()) { r.put(name, child.getBooleanValue()); } else if (child.isDouble()) { if (acceptsDouble(childSchema)) { r.put(name, child.getDoubleValue()); } else if (acceptsFloat(childSchema)) { r.put(name, (float) child.getDoubleValue()); } else { if (addToErrors(schema, name)) LOG.error("Can't store a double in field " + name); } } else if (child.isTextual()) { r.put(name, child.getTextValue()); // new // org.apache.avro.util.Utf8(child.getTextValue())); } } return r; } private static boolean addToErrors(Schema schema, String name) { Set<String> err = errors.get(schema); if (err == null) { err = new TreeSet<String>(); errors.put(schema, err); } return err.add(name); } private static GenericArray<Object> recordFromArrayNode(JsonNode node, Schema schema, String container) { schema = getArray(schema); GenericArray<Object> r = new GenericData.Array<Object>(node.size(), schema); Schema childSchema = schema.getElementType(); for (int i = 0; i < node.size(); i++) { JsonNode child = node.get(i); if (child.isArray()) { GenericArray<Object> o = recordFromArrayNode(child, childSchema, container); r.add(o); } else if (child.isObject()) { GenericRecord o = recordFromNode(child, childSchema, container); r.add(o); } else if (child.isInt()) { if (acceptsInt(childSchema)) { r.add(child.getIntValue()); } else if (acceptsLong(childSchema)) { r.add((long) child.getIntValue()); } else if (acceptsFloat(childSchema)) { r.add((float) child.getIntValue()); } else if (acceptsDouble(childSchema)) { r.add((double) child.getIntValue()); } else { LOG.error("Can't store an int in field " + childSchema); } } else if (child.isLong()) { if (acceptsLong(childSchema)) { r.add((long) child.getLongValue()); } else if (acceptsFloat(childSchema)) { r.add((float) child.getLongValue()); } else if (acceptsDouble(childSchema)) { r.add((double) child.getLongValue()); } else { LOG.error("Can't store a long in field " + childSchema); } } else if (child.isBinary()) { try { r.add(child.getBinaryValue()); } catch (IOException e) { throw new RuntimeException(e); } } else if (child.isBoolean()) { r.add(child.getBooleanValue()); } else if (child.isDouble()) { if (acceptsDouble(childSchema)) { r.add(child.getDoubleValue()); } else if (acceptsFloat(childSchema)) { r.add((float) child.getDoubleValue()); } else { LOG.error("Can't store a double in field " + childSchema); } } else if (child.isTextual()) { r.add(new org.apache.avro.util.Utf8(child.getTextValue())); } } return r; } private static Schema getObject(Schema schema) { if (schema.getType() == Schema.Type.RECORD) return schema; if (schema.getType() == Schema.Type.UNION) { for (Schema s : schema.getTypes()) { if (s.getType() == Schema.Type.RECORD) { return s; } } } return null; } private static Schema getArray(Schema schema) { if (schema.getType() == Schema.Type.ARRAY) return schema; if (schema.getType() == Schema.Type.UNION) { for (Schema s : schema.getTypes()) { if (s.getType() == Schema.Type.ARRAY) { return s; } } } return null; } private static boolean accepts(Schema childSchema, Schema.Type type) { if (childSchema.getType() == type) return true; if (childSchema.getType() == Schema.Type.UNION) { for (Schema s : childSchema.getTypes()) { if (s.getType() == type) { return true; } } } return false; } private static boolean acceptsFloat(Schema childSchema) { return accepts(childSchema, Schema.Type.FLOAT); } private static boolean acceptsDouble(Schema childSchema) { return accepts(childSchema, Schema.Type.DOUBLE); } private static boolean acceptsLong(Schema childSchema) { return accepts(childSchema, Schema.Type.LONG); } private static boolean acceptsInt(Schema childSchema) { return accepts(childSchema, Schema.Type.INT); } // hand-written for speed - regexps are 100x slower static String replaceInvalidNameChars(String name) { int len = name.length(); // scan for first invalid char int i = 0; for (; i < len; i++) { char ch = name.charAt(i); if (!(ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z' || ch >= '0' && ch <= '9' || ch == '_')) break; } if (i == len) return name; StringBuilder copy = new StringBuilder(len); copy.append(name, 0, i); copy.append('_'); i++; for (; i < len; i++) { char ch = name.charAt(i); if (ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z' || ch >= '0' && ch <= '9' || ch == '_') { copy.append(ch); } else { copy.append('_'); } } return copy.toString(); } }