package net.iponweb.hadoop.streaming.parquet;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Progressable;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.hadoop.ParquetRecordWriter;
import org.apache.parquet.io.InvalidRecordException;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Stack;

/**
 * Record writer that parses the record key as a JSON document and writes it
 * out as a Parquet {@link SimpleGroup}.
 *
 * <p>The JSON tree is walked in the order given by the inherited
 * {@code recorder} sequence of {@code PathAction}s: group start/end actions
 * descend into and return from nested JSON objects, and field actions copy a
 * single primitive (or every element of a JSON array, for repeated fields)
 * into the current group. The record value is not used.
 */
public class JsonRecordWriterWrapper extends TextRecordWriterWrapper {

    ObjectMapper mapper;

    JsonRecordWriterWrapper(ParquetRecordWriter<SimpleGroup> w, FileSystem fs, JobConf conf,
                            String name, Progressable progress) throws IOException {
        super(w, fs, conf, name, progress);
        mapper = new ObjectMapper();
    }

    /**
     * Parses {@code key} as JSON, converts it into a Parquet group and hands it
     * to the underlying writer.
     *
     * @param key   JSON text of the record
     * @param value ignored
     * @throws IOException if the JSON cannot be parsed, if a field that is
     *                     neither optional nor repeated is missing or null, if
     *                     a value cannot be appended, or if the underlying
     *                     write is interrupted
     */
    @Override
    public void write(Text key, Text value) throws IOException {
        try {
            // parse K as JSON and convert into parquet group
            Group grp = factory.newGroup();
            JsonNode node = mapper.readTree(key.toString());

            Iterator<PathAction> ai = recorder.iterator();
            // java.util.Stack is kept on purpose: the node stack may hold null
            // entries (absent JSON objects), which ArrayDeque would reject.
            Stack<Group> savedGroup = new Stack<>();
            Stack<JsonNode> savedNode = new Stack<>();

            while (ai.hasNext()) {
                PathAction a = ai.next();

                switch (a.getAction()) {
                    case GROUPSTART:
                        savedGroup.push(grp);
                        grp = grp.addGroup(a.getName());
                        savedNode.push(node);
                        // An absent parent yields an absent child; fields below it
                        // then go through the regular optional/required check
                        // instead of failing with a NullPointerException.
                        node = (node == null) ? null : node.get(a.getName());
                        break;

                    case GROUPEND:
                        grp = savedGroup.pop();
                        node = savedNode.pop();
                        break;

                    case FIELD:
                        try {
                            appendField(grp, node, a);
                        } catch (RuntimeException e) {
                            // Surface conversion problems as IOException, keeping the cause.
                            throw new IOException(e);
                        }
                        break;
                }
            }

            realWriter.write(null, (SimpleGroup) grp);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can see it.
            Thread.currentThread().interrupt();
            throw new IOException(e);
        }
    }

    /**
     * Copies the JSON value of one schema field from {@code parent} into {@code grp}.
     *
     * <p>A missing or JSON-null value is skipped for optional and repeated
     * fields and rejected otherwise. For repeated fields the JSON value is
     * iterated and every non-null element is appended (no array-of-objects).
     *
     * @param grp    group receiving the value(s)
     * @param parent JSON node expected to contain the field; may be {@code null}
     *               when the enclosing JSON object is absent
     * @param a      action describing the field name, primitive type and repetition
     * @throws InvalidRecordException if the value is missing or null and the
     *                                field is neither optional nor repeated
     */
    private static void appendField(Group grp, JsonNode parent, PathAction a) {
        String colName = a.getName();
        JsonNode stubNode = (parent == null) ? null : parent.get(colName);
        Type.Repetition repetition = a.getRepetition();

        if (stubNode == null || stubNode.isNull()) {
            if (repetition == Type.Repetition.OPTIONAL || repetition == Type.Repetition.REPEATED) {
                return;
            }
            throw new InvalidRecordException("json column '" + colName
                    + "' is null, while defined as non-optional in parquet schema");
        }

        PrimitiveType.PrimitiveTypeName primType = a.getType();

        if (repetition == Type.Repetition.REPEATED) {
            // If we have 'repeated' field, assume that we should expect JSON-encoded array
            Iterator<JsonNode> itr = stubNode.iterator();
            while (itr.hasNext()) {
                JsonNode element = itr.next();
                if (element == null || element.isNull()) {
                    continue;
                }
                appendValue(grp, colName, primType, element);
            }
        } else {
            appendValue(grp, colName, primType, stubNode);
        }
    }

    /**
     * Appends a single JSON scalar to {@code grp} under {@code colName},
     * converted according to the Parquet primitive type.
     *
     * @throws RuntimeException if {@code primType} is not one of the handled types
     */
    private static void appendValue(Group grp, String colName,
                                    PrimitiveType.PrimitiveTypeName primType, JsonNode valueNode) {
        switch (primType) {
            case INT32:
                grp.append(colName, valueNode.getIntValue());
                break;
            case INT64:
            case INT96:
                grp.append(colName, valueNode.getLongValue());
                break;
            case DOUBLE:
                grp.append(colName, valueNode.getDoubleValue());
                break;
            case FLOAT:
                grp.append(colName, (float) valueNode.getDoubleValue());
                break;
            case BOOLEAN:
                grp.append(colName, valueNode.getBooleanValue());
                break;
            case BINARY:
                grp.append(colName, valueNode.getTextValue());
                break;
            default:
                throw new RuntimeException("Can't handle type " + primType);
        }
    }
}