/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package sensim; // package org.apache.pig.piggybank.storage

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.pig.LoadFunc;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigFileInputFormat;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigTextInputFormat;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DefaultBagFactory;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.node.ArrayNode;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/**
 * Parses JSON data from a JSON file one line at a time. Each line is expected
 * to contain a separate JSON record.
 *
 * Usage:
 * e.g. JSON that looks like:
 * --------------------------------------------------------------
 * {"menu": {
 *   "id": "file",
 *   "value": "File",
 *   "popup": {
 *     "menuitem": [
 *       {"value": "New", "onclick": "CreateNewDoc()"},
 *       {"value": "Open", "onclick": "OpenDoc()"},
 *       {"value": "Close", "onclick": "CloseDoc()"}
 *     ]
 *   }
 * }}
 * --------------------------------------------------------------
 * **The above json record is expanded for readability. This entire record should be
 * condensed to one line in your json file.
 *
 * register the jar containing this class (e.g. piggybank.jar)
 * a = load '/tmp/jsontest' using org.pig.piggybank.storage.JsonLoader() as (json:map[]);
 * b = foreach a generate flatten(json#'menu') as menu;
 * c = foreach b generate flatten(menu#'popup') as popup;
 * d = foreach c generate flatten(popup#'menuitem') as menu;
 * e = foreach d generate flatten(menu#'value') as val;
 */
public class JsonLoader extends LoadFunc {

    private static final TupleFactory tupleFactory = TupleFactory.getInstance();

    /** Jackson parser; one instance per loader, reused for every input line. */
    private final ObjectMapper mapper;

    /** Record reader handed to us by Pig in prepareToRead(). */
    private LineRecordReader in = null;

    public JsonLoader() {
        super();
        mapper = new ObjectMapper();
    }

    /** Reads input line-by-line via Pig's text input format. */
    @SuppressWarnings("unchecked")
    @Override
    public InputFormat getInputFormat() throws IOException {
        return new PigTextInputFormat();
    }

    /**
     * Returns the next input line parsed into a tuple containing a single
     * Map, or null when the split is exhausted or the line is empty.
     */
    @Override
    public Tuple getNext() throws IOException {
        // nextKeyValue() returning false signals end-of-split; Pig expects null.
        if (!in.nextKeyValue()) {
            return null;
        }
        Text val = in.getCurrentValue();
        if (val == null) {
            return null;
        }
        String line = val.toString();
        if (line.length() > 0) {
            return parseStringToTuple(line);
        }
        // Blank lines produce no record.
        return null;
    }

    /**
     * Parses one line of JSON into a tuple whose single field is a Map of
     * the record's top-level keys.
     *
     * @param line one complete JSON record
     * @return tuple wrapping the flattened key/value map
     * @throws ExecException (an IOException) when the line is not valid JSON
     *         or contains a number too large for a long
     */
    protected Tuple parseStringToTuple(String line) throws IOException {
        try {
            Map<String, Object> values = new HashMap<String, Object>();
            // Parse exactly once; the previous version re-parsed the line a
            // second time solely for a debug println.
            JsonNode node = mapper.readTree(line);
            flattenValue(node, values);
            return tupleFactory.newTuple(values);
        } catch (NumberFormatException e) {
            int errCode = 6018;
            String errMsg = "Error while reading input - Very big number exceeds the scale of long: " + line;
            throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
        } catch (JsonParseException e) {
            int errCode = 6018;
            String errMsg = "Error while reading input - Could not json-decode string: " + line;
            throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
        }
    }

    /**
     * Recursively flattens a JSON object node into {@code values}:
     * arrays become DataBags, nested objects become tuples wrapping maps,
     * and scalars become Strings.
     */
    private void flattenValue(JsonNode node, Map<String, Object> values) {
        Iterator<String> keys = node.getFieldNames();
        Iterator<JsonNode> nodes = node.getElements();
        while (keys.hasNext()) {
            String key = keys.next();
            JsonNode value = nodes.next();
            if (value.isArray()) {
                DataBag bag = DefaultBagFactory.getInstance().newDefaultBag();
                for (JsonNode element : (ArrayNode) value) {
                    flattenArray(element, bag);
                }
                values.put(key, bag);
            } else if (value.isObject()) {
                Map<String, Object> nested = new HashMap<String, Object>();
                flattenValue(value, nested);
                values.put(key, tupleFactory.newTuple(nested));
            } else {
                values.put(key, scalarToString(value));
            }
        }
    }

    /**
     * Recursively flattens a JSON array element into {@code bag}: nested
     * arrays are merged in, objects become tuples wrapping maps, and scalars
     * become single-field tuples of Strings.
     */
    private void flattenArray(JsonNode value, DataBag bag) {
        if (value.isArray()) {
            DataBag inner = DefaultBagFactory.getInstance().newDefaultBag();
            for (JsonNode element : (ArrayNode) value) {
                flattenArray(element, inner);
            }
            bag.addAll(inner);
        } else if (value.isObject()) {
            Map<String, Object> nested = new HashMap<String, Object>();
            flattenValue(value, nested);
            bag.add(tupleFactory.newTuple(nested));
        } else {
            // Convert scalars the same way flattenValue does; the previous
            // version stored the raw JsonNode, inconsistent with object fields.
            bag.add(tupleFactory.newTuple(scalarToString(value)));
        }
    }

    /**
     * Converts a scalar JSON node to its String form. Textual nodes use
     * getTextValue(), which preserves quote characters that are part of the
     * string content; the old toString().replaceAll("[\"]", "") approach
     * stripped every quote, corrupting values like {"k": "a\"b"}.
     * Numbers, booleans and null render via toString() ("null" for JSON null,
     * matching the prior behavior).
     */
    private static String scalarToString(JsonNode value) {
        if (value.isTextual()) {
            return value.getTextValue();
        }
        return value.toString();
    }

    /** Stores the framework-supplied reader for use in getNext(). */
    @SuppressWarnings("unchecked")
    @Override
    public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
        in = (LineRecordReader) reader;
    }

    /** Points the underlying input format at the load location. */
    @Override
    public void setLocation(String location, Job job) throws IOException {
        PigFileInputFormat.setInputPaths(job, location);
    }
}