/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.builtin;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.JsonToken;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import org.apache.pig.Expression;
import org.apache.pig.LoadCaster;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.PigWarning;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;
import org.apache.pig.parser.ParserException;

/**
 * A loader for data stored using {@link JsonStorage}.  This is not a generic
 * JSON loader.  It depends on the schema being stored with the data;
 * conceivably one could instead write a loader that determines the schema
 * from the JSON itself.
 */
public class JsonLoader extends LoadFunc implements LoadMetadata {

    protected RecordReader reader = null;
    protected ResourceSchema schema = null;

    private String udfcSignature = null;
    private JsonFactory jsonFactory = null;
    private TupleFactory tupleFactory = TupleFactory.getInstance();
    private BagFactory bagFactory = BagFactory.getInstance();

    private static final String SCHEMA_SIGNATURE = "pig.jsonloader.schema";

    public JsonLoader() {
    }

    public JsonLoader(String schemaString) throws IOException {
        schema = new ResourceSchema(Utils.parseSchema(schemaString));
    }

    public void setLocation(String location, Job job) throws IOException {
        // Tell our input format where we will be reading from
        FileInputFormat.setInputPaths(job, location);
    }

    @SuppressWarnings("unchecked")
    public InputFormat getInputFormat() throws IOException {
        // We will use TextInputFormat, the default Hadoop input format for
        // text.  It has a LongWritable key that we will ignore, and the value
        // is a Text (a string writable) that the JSON data is in.
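        //
        // For illustration only (the field names below are hypothetical, not
        // taken from this file): a line written by JsonStorage for a schema of
        // (a:int, b:chararray) looks roughly like
        //     {"a":1,"b":"hello"}
        // and TextInputFormat hands each such line to getNext() as one Text value.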
        return new TextInputFormat();
    }

    public LoadCaster getLoadCaster() throws IOException {
        // We do not expect to do casting of byte arrays, because we will be
        // returning typed data.
        return null;
    }

    @SuppressWarnings("unchecked")
    public void prepareToRead(RecordReader reader, PigSplit split)
    throws IOException {
        this.reader = reader;

        // Get the schema string from the UDFContext object.
        UDFContext udfc = UDFContext.getUDFContext();
        Properties p =
            udfc.getUDFProperties(this.getClass(), new String[]{udfcSignature});
        String strSchema = p.getProperty(SCHEMA_SIGNATURE);
        if (strSchema == null) {
            throw new IOException("Could not find schema in UDF context");
        }

        // Parse the schema from the string stored in the properties object.
        schema = new ResourceSchema(Utils.getSchemaFromString(strSchema));

        jsonFactory = new JsonFactory();
    }

    public Tuple getNext() throws IOException {
        Text val = null;
        try {
            // Read the next key value pair from the record reader.  If it's
            // finished, return null
            if (!reader.nextKeyValue()) return null;

            // Get the current value.  We don't use the key.
            val = (Text)reader.getCurrentValue();
        } catch (InterruptedException ie) {
            throw new IOException(ie);
        }

        // Create a parser specific for this input line.  This may not be the
        // most efficient approach.
        byte[] newBytes = new byte[val.getLength()];
        System.arraycopy(val.getBytes(), 0, newBytes, 0, val.getLength());
        ByteArrayInputStream bais = new ByteArrayInputStream(newBytes);
        JsonParser p = jsonFactory.createJsonParser(bais);

        // Create the tuple we will be returning.  We create it with the right
        // number of fields, as the Tuple object is optimized for this case.
        ResourceFieldSchema[] fields = schema.getFields();
        Tuple t = tupleFactory.newTuple(fields.length);

        // Read the start object marker.  Throughout this file if the parsing
        // isn't what we expect we return a tuple with null fields rather than
        // throwing an exception.  That way a few mangled lines don't fail the
        // job.
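        //
        // As a sketch of what the parser sees: for a record such as
        // {"a":1,"b":"x"} (illustrative), Jackson emits the token stream
        // START_OBJECT, FIELD_NAME, VALUE_NUMBER_INT, FIELD_NAME, VALUE_STRING,
        // END_OBJECT.  We consume START_OBJECT here; each readField() call
        // below consumes one FIELD_NAME token plus the value (or nested
        // structure) that follows it.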
        if (p.nextToken() != JsonToken.START_OBJECT) {
            warn("Bad record, could not find start of record " +
                val.toString(), PigWarning.UDF_WARNING_1);
            return t;
        }

        // Read each field in the record
        for (int i = 0; i < fields.length; i++) {
            t.set(i, readField(p, fields[i], i));
        }

        if (p.nextToken() != JsonToken.END_OBJECT) {
            warn("Bad record, could not find end of record " +
                val.toString(), PigWarning.UDF_WARNING_1);
            return t;
        }
        p.close();
        return t;
    }

    private Object readField(JsonParser p,
                             ResourceFieldSchema field,
                             int fieldnum) throws IOException {
        // Read the next token
        JsonToken tok = p.nextToken();
        if (tok == null) {
            warn("Early termination of record, expected " +
                schema.getFields().length + " fields but found " + fieldnum,
                PigWarning.UDF_WARNING_1);
            return null;
        }

        // Check to see if this value was null
        if (tok == JsonToken.VALUE_NULL) return null;

        // Read based on our expected type
        switch (field.getType()) {
        case DataType.INTEGER:
            // Read the field value
            tok = p.nextToken();
            if (tok == JsonToken.VALUE_NULL) return null;
            return p.getIntValue();

        case DataType.LONG:
            tok = p.nextToken();
            if (tok == JsonToken.VALUE_NULL) return null;
            return p.getLongValue();

        case DataType.FLOAT:
            tok = p.nextToken();
            if (tok == JsonToken.VALUE_NULL) return null;
            return p.getFloatValue();

        case DataType.DOUBLE:
            tok = p.nextToken();
            if (tok == JsonToken.VALUE_NULL) return null;
            return p.getDoubleValue();

        case DataType.BYTEARRAY:
            tok = p.nextToken();
            if (tok == JsonToken.VALUE_NULL) return null;
            byte[] b = p.getText().getBytes();
            // Use the DBA constructor that copies the bytes so that we own
            // the memory
            return new DataByteArray(b, 0, b.length);

        case DataType.CHARARRAY:
            tok = p.nextToken();
            if (tok == JsonToken.VALUE_NULL) return null;
            return p.getText();

        case DataType.MAP:
            // Should be a start of the map object
            if (p.nextToken() != JsonToken.START_OBJECT) {
                warn("Bad map field, could not find start of object, field "
                    + fieldnum, PigWarning.UDF_WARNING_1);
                return null;
            }
            Map<String, String> m = new HashMap<String, String>();
            while (p.nextToken() != JsonToken.END_OBJECT) {
                String k = p.getCurrentName();
                String v = p.getText();
                m.put(k, v);
            }
            return m;

        case DataType.TUPLE:
            if (p.nextToken() != JsonToken.START_OBJECT) {
                warn("Bad tuple field, could not find start of object, "
                    + "field " + fieldnum, PigWarning.UDF_WARNING_1);
                return null;
            }

            ResourceSchema s = field.getSchema();
            ResourceFieldSchema[] fs = s.getFields();
            Tuple t = tupleFactory.newTuple(fs.length);

            for (int j = 0; j < fs.length; j++) {
                t.set(j, readField(p, fs[j], j));
            }

            if (p.nextToken() != JsonToken.END_OBJECT) {
                warn("Bad tuple field, could not find end of object, "
                    + "field " + fieldnum, PigWarning.UDF_WARNING_1);
                return null;
            }
            return t;

        case DataType.BAG:
            if (p.nextToken() != JsonToken.START_ARRAY) {
                warn("Bad bag field, could not find start of array, "
                    + "field " + fieldnum, PigWarning.UDF_WARNING_1);
                return null;
            }
            s = field.getSchema();
            fs = s.getFields();
            // Drill down the next level to the tuple's schema.
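            // In a Pig schema the bag's only field is the tuple wrapping each
            // element, so fs[0] is that tuple field and its schema lists the
            // element's fields.  For example (illustrative), a field declared
            // as b:{(x:int, y:chararray)} drills down to (x:int, y:chararray).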
            s = fs[0].getSchema();
            fs = s.getFields();

            DataBag bag = bagFactory.newDefaultBag();

            JsonToken innerTok;
            while ((innerTok = p.nextToken()) != JsonToken.END_ARRAY) {
                if (innerTok != JsonToken.START_OBJECT) {
                    warn("Bad bag tuple field, could not find start of "
                        + "object, field " + fieldnum, PigWarning.UDF_WARNING_1);
                    return null;
                }

                t = tupleFactory.newTuple(fs.length);
                for (int j = 0; j < fs.length; j++) {
                    t.set(j, readField(p, fs[j], j));
                }

                if (p.nextToken() != JsonToken.END_OBJECT) {
                    warn("Bad bag tuple field, could not find end of "
                        + "object, field " + fieldnum, PigWarning.UDF_WARNING_1);
                    return null;
                }
                bag.add(t);
            }
            return bag;

        default:
            throw new IOException("Unknown type in input schema: " +
                field.getType());
        }
    }

    //------------------------------------------------------------------------

    public void setUDFContextSignature(String signature) {
        udfcSignature = signature;
    }

    public ResourceSchema getSchema(String location, Job job)
    throws IOException {

        ResourceSchema s;
        if (schema != null) {
            s = schema;
        } else {
            // Parse the schema
            s = (new JsonMetadata()).getSchema(location, job, true);

            if (s == null) {
                throw new IOException("Unable to parse schema found in file at "
                    + location);
            }
        }

        // Now that we have determined the schema, store it in our
        // UDFContext properties object so we have it when we need it on the
        // backend
        UDFContext udfc = UDFContext.getUDFContext();
        Properties p =
            udfc.getUDFProperties(this.getClass(), new String[]{udfcSignature});
        p.setProperty(SCHEMA_SIGNATURE, s.toString());

        return s;
    }

    public ResourceStatistics getStatistics(String location, Job job)
    throws IOException {
        // We don't implement this one.
        return null;
    }

    public String[] getPartitionKeys(String location, Job job)
    throws IOException {
        // We don't have partitions
        return null;
    }

    public void setPartitionFilter(Expression partitionFilter)
    throws IOException {
        // We don't have partitions
    }
}
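
/*
 * Usage sketch (illustrative only; the file name, alias, and schema below are
 * hypothetical and not taken from this file):
 *
 *   -- supply the schema explicitly
 *   a = LOAD 'students.json' USING org.apache.pig.builtin.JsonLoader(
 *           'name:chararray, age:int, gpa:double');
 *
 *   -- or omit it and let getSchema() recover the schema that JsonStorage
 *   -- stored alongside the data (via JsonMetadata)
 *   b = LOAD 'students.json' USING org.apache.pig.builtin.JsonLoader();
 */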