package com.esri.hadoop.hive.serde; import java.io.IOException; import java.io.StringWriter; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Properties; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.serde2.SerDe; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.SerDeStats; import org.apache.hadoop.hive.serde2.io.ByteWritable; import org.apache.hadoop.hive.serde2.io.DoubleWritable; import org.apache.hadoop.hive.serde2.io.ShortWritable; import org.apache.hadoop.hive.serde2.lazy.LazyPrimitive; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.io.BooleanWritable; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.codehaus.jackson.JsonFactory; import org.codehaus.jackson.JsonGenerationException; import org.codehaus.jackson.JsonGenerator; import org.codehaus.jackson.JsonParseException; import org.codehaus.jackson.JsonParser; import org.codehaus.jackson.JsonProcessingException; import org.codehaus.jackson.JsonToken; import com.esri.core.geometry.ogc.OGCGeometry; import com.esri.hadoop.hive.GeometryUtils; import com.esri.hadoop.shims.HiveShims; abstract public class BaseJsonSerDe implements SerDe { static final Log LOG = LogFactory.getLog(BaseJsonSerDe.class.getName()); static protected JsonFactory jsonFactory = new JsonFactory(); protected int numColumns; protected int geometryColumn = -1; protected ArrayList<String> columnNames; protected ArrayList<ObjectInspector> columnOIs; protected boolean [] columnSet; protected StructObjectInspector rowOI; // contains the type information for the fields returned protected String attrLabel = "attributes"; // "properties" /* rowBase keeps a base copy of the Writable for each field so they can be reused for * all records. When deserialize is called, row is initially nulled out. Then for each attribute * found in the JSON record the Writable reference is copied from rowBase to row * and set to the appropriate value. Then row is returned. This why values don't linger from * previous records. */ ArrayList<Writable> rowBase; ArrayList<Writable> row; @Override public void initialize(Configuration cfg, Properties tbl) throws SerDeException { geometryColumn = -1; // Read the configuration parameters String columnNameProperty = tbl.getProperty(HiveShims.serdeConstants.LIST_COLUMNS); String columnTypeProperty = tbl.getProperty(HiveShims.serdeConstants.LIST_COLUMN_TYPES); ArrayList<TypeInfo> typeInfos = TypeInfoUtils .getTypeInfosFromTypeString(columnTypeProperty); columnNames = new ArrayList<String>(); columnNames.addAll(Arrays.asList(columnNameProperty.toLowerCase().split(","))); numColumns = columnNames.size(); columnOIs = new ArrayList<ObjectInspector>(numColumns); columnSet = new boolean[numColumns]; for (int c = 0; c < numColumns; c++) { TypeInfo colTypeInfo = typeInfos.get(c); if (colTypeInfo.getCategory() != Category.PRIMITIVE){ throw new SerDeException("Only primitive field types are accepted"); } if (colTypeInfo.getTypeName().equals("binary")) { if (geometryColumn >= 0) { // only one column can be defined as binary for geometries throw new SerDeException( "Multiple binary columns defined. Define only one binary column for geometries"); } columnOIs.add(GeometryUtils.geometryTransportObjectInspector); geometryColumn = c; } else { columnOIs.add(TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(colTypeInfo)); } } // standardStruct uses ArrayList to store the row. rowOI = ObjectInspectorFactory.getStandardStructObjectInspector( columnNames, columnOIs); // constructing the row objects, etc, which will be reused for all rows. rowBase = new ArrayList<Writable>(numColumns); row = new ArrayList<Writable>(numColumns); // set each value in rowBase to the writable that corresponds with its PrimitiveObjectInspector for (int c = 0; c < numColumns; c++) { PrimitiveObjectInspector poi = (PrimitiveObjectInspector)columnOIs.get(c); Writable writable; try { writable = (Writable)poi.getPrimitiveWritableClass().newInstance(); } catch (InstantiationException e) { throw new SerDeException("Error creating Writable from ObjectInspector", e); } catch (IllegalAccessException e) { throw new SerDeException("Error creating Writable from ObjectInspector", e); } rowBase.add(writable); row.add(null); // default all values to null } } // /initialize @Override public Object deserialize(Writable json_in) throws SerDeException { Text json = (Text) json_in; // null out array because we reuse it and we don't want values persisting // from the last record for (int i=0;i<numColumns;i++) row.set(i, null); try { JsonParser parser = jsonFactory.createJsonParser(json.toString()); JsonToken token = parser.nextToken(); while (token != null) { if (token == JsonToken.START_OBJECT) { if ("geometry".equals(parser.getCurrentName())) { if (geometryColumn > -1) { // create geometry and insert into geometry field OGCGeometry ogcGeom = parseGeom(parser); row.set(geometryColumn, ogcGeom == null ? null : GeometryUtils.geometryToEsriShapeBytesWritable(ogcGeom)); } else { // no geometry in select field set, don't even bother parsing parser.skipChildren(); } } else if (attrLabel.equals(parser.getCurrentName())) { token = parser.nextToken(); while (token != JsonToken.END_OBJECT && token != null) { // hive makes all column names in the queries column list lower case String name = parser.getText().toLowerCase(); parser.nextToken(); // figure out which column index corresponds with the attribute name int fieldIndex = columnNames.indexOf(name); if (fieldIndex >= 0) { setRowFieldFromParser(fieldIndex, parser); } token = parser.nextToken(); } token = parser.nextToken(); } } token = parser.nextToken(); } } catch (JsonParseException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return row; } @Override public ObjectInspector getObjectInspector() throws SerDeException { return rowOI; } @Override public SerDeStats getSerDeStats() { return null; } @Override public Class<? extends Writable> getSerializedClass() { return Text.class; } @Override public Writable serialize(Object obj, ObjectInspector oi) throws SerDeException { StandardStructObjectInspector structOI = (StandardStructObjectInspector) oi; // get list of writables, one for each field in the row List<Object> fieldWritables = structOI.getStructFieldsDataAsList(obj); StringWriter writer = new StringWriter(); try { JsonGenerator jsonGen = jsonFactory.createJsonGenerator(writer); jsonGen.writeStartObject(); // first write attributes jsonGen.writeObjectFieldStart(attrLabel); for (int i = 0; i < fieldWritables.size(); i++) { if (i == geometryColumn) continue; // skip geometry, it comes later try { generateJsonFromValue(fieldWritables.get(i), i, jsonGen); } catch (JsonProcessingException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } jsonGen.writeEndObject(); // if geometry column exists, write it if (geometryColumn > -1) { Object got = fieldWritables.get(geometryColumn); if (got == null) { jsonGen.writeObjectField("geometry", null); } else { BytesWritable bytesWritable = null; if (got instanceof BytesWritable) bytesWritable = (BytesWritable)got; else // SparkSQL, #97 bytesWritable = new BytesWritable((byte[])got); // idea: avoid extra object OGCGeometry ogcGeometry = GeometryUtils.geometryFromEsriShape(bytesWritable); jsonGen.writeRaw(",\"geometry\":" + outGeom(ogcGeometry)); } } jsonGen.writeEndObject(); jsonGen.close(); } catch (JsonGenerationException e) { LOG.error("Error generating JSON", e); return null; } catch (IOException e) { LOG.error("Error generating JSON", e); return null; } return new Text(writer.toString()); } /** * Send to the generator, the value of the cell, using column type * * @param value The attribute value as the object given by Hive * @param fieldIndex column index of field in row * @param jsonGen JsonGenerator * @throws JsonProcessingException * @throws IOException */ private void generateJsonFromValue(Object value, int fieldIndex, JsonGenerator jsonGen) throws JsonProcessingException, IOException { String label = columnNames.get(fieldIndex); PrimitiveObjectInspector poi = (PrimitiveObjectInspector)this.columnOIs.get(fieldIndex); if (value == null) { jsonGen.writeObjectField(label, null); } else if (value instanceof LazyPrimitive<?,?>) { // have seen LazyString, #25 generateJsonFromLazy((LazyPrimitive<?,?>)value, fieldIndex, label, poi, jsonGen); } else if (value instanceof Writable) { generateJsonFromWritable((Writable)value, fieldIndex, label, poi, jsonGen); } else { // SparkSQL, #97 jsonGen.writeObjectField(label, value); } } private void generateJsonFromLazy(LazyPrimitive<?,?> value, int fieldIndex, String label, PrimitiveObjectInspector poi, JsonGenerator jsonGen) throws JsonProcessingException, IOException { generateJsonFromWritable(value.getWritableObject(), fieldIndex, label, poi, jsonGen); } private void generateJsonFromWritable(Writable value, int fieldIndex, String label, PrimitiveObjectInspector poi, JsonGenerator jsonGen) throws JsonProcessingException, IOException { jsonGen.writeObjectField(label, poi.getPrimitiveJavaObject(value)); } // Write OGCGeometry to JSON abstract protected String outGeom(OGCGeometry geom); // Parse OGCGeometry from JSON abstract protected OGCGeometry parseGeom(JsonParser parser); /** * Copies the Writable at fieldIndex from rowBase to row, then sets the value of the Writable * to the value in parser * * @param fieldIndex column index of field in row * @param parser JsonParser pointing to the attribute * @throws JsonParseException * @throws IOException */ private void setRowFieldFromParser(int fieldIndex, JsonParser parser) throws JsonParseException, IOException{ PrimitiveObjectInspector poi = (PrimitiveObjectInspector)this.columnOIs.get(fieldIndex); if (JsonToken.VALUE_NULL == parser.getCurrentToken()) return; // leave the row-cell as null // set the field in the row to the writable from rowBase row.set(fieldIndex, rowBase.get(fieldIndex)); switch (poi.getPrimitiveCategory()){ case BYTE: ((ByteWritable)row.get(fieldIndex)).set(parser.getByteValue()); break; case SHORT: ((ShortWritable)row.get(fieldIndex)).set(parser.getShortValue()); break; case INT: ((IntWritable)row.get(fieldIndex)).set(parser.getIntValue()); break; case LONG: ((LongWritable)row.get(fieldIndex)).set(parser.getLongValue()); break; case DOUBLE: ((DoubleWritable)row.get(fieldIndex)).set(parser.getDoubleValue()); break; case FLOAT: ((FloatWritable)row.get(fieldIndex)).set(parser.getFloatValue()); break; case BOOLEAN: ((BooleanWritable)row.get(fieldIndex)).set(parser.getBooleanValue()); break; default: // STRING/unrecognized ((Text)row.get(fieldIndex)).set(parser.getText()); break; } } }