/**
* Copyright (C) 2011 Metamarkets http://metamx.com
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.metamx.milano.pig;
import com.google.protobuf.ByteString;
import com.google.protobuf.DescriptorProtos;
import com.google.protobuf.Descriptors;
import com.google.protobuf.DynamicMessage;
import com.google.protobuf.Message;
import com.metamx.milano.hadoop.MilanoProtoFileOutputFormat;
import com.metamx.milano.proto.MilanoTool;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Logger;
import org.apache.pig.ResourceSchema;
import org.apache.pig.StoreFunc;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.UDFContext;
import java.io.IOException;
import java.util.Iterator;
import java.util.Properties;
/**
* A Pig {@link StoreFunc} that writes Tuples out as Protocol Buffer messages
* via {@link MilanoProtoFileOutputFormat}. The ProtoBuf schema is derived
* from the Pig schema when the store is set up.
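*
* <p>A minimal usage sketch from a Pig script (the jar name and paths are
* illustrative, not part of this class):
* <pre>
* REGISTER milano-pig.jar;
* events = LOAD 'input' AS (name:chararray, count:long);
* STORE events INTO 'output' USING com.metamx.milano.pig.MilanoStoreFunc();
* </pre>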
*/
public class MilanoStoreFunc extends StoreFunc
{
private static final Logger log = Logger.getLogger(MilanoStoreFunc.class);
private RecordWriter<String, Message> recordWriter;
private String udfSignature;
private Descriptors.Descriptor schemaDescriptor;
/**
* Sets the UDF signature, which is used to key the ProtoBuf schema shared between the client and the mapper/reducer.
* Called in both the client and the mapper/reducer contexts.
*
* @param signature The UDF signature.
*/
@Override
public void setStoreFuncUDFContextSignature(String signature)
{
udfSignature = signature;
}
/* *** Client side calls *** */
/**
* Takes the ResourceSchema, generates a ProtoBuf Descriptor from it,
* base64-encodes the Descriptor, and stores the result in the UDFContext.
* The majority of the work is done in {@link #getProtoSchema}.
*
* @param schema The ResourceSchema supplied by Pig.
*
* @throws IOException Thrown if the descriptor cannot be validated.
*/
@Override
public void checkSchema(ResourceSchema schema) throws IOException
{
Properties props = getUDFProps();
log.debug("Generating Descriptor and storing base64 into \"milano.pig.proto.schema.base64\"");
DescriptorProtos.DescriptorProto protoSchema = getProtoSchema(schema);
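// Wrap the generated message type in a FileDescriptorProto so MilanoTool has the full type context to encode.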
DescriptorProtos.FileDescriptorProto fileDescriptorProto = DescriptorProtos.FileDescriptorProto
.newBuilder()
.addMessageType(protoSchema)
// .setName("milano_dynamic_type.proto")
// .setPackage("metamx.milano.dynamic")
.build();
props.setProperty(
"milano.pig.proto.schema.base64",
MilanoTool.with(
protoSchema,
fileDescriptorProto,
DescriptorProtos.FileDescriptorSet.getDefaultInstance()
).getBase64()
);
}
/**
* Takes a Pig ResourceSchema and builds a ProtoBuf DescriptorProto from it.
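*
* <p>As a rough sketch, a Pig schema of {@code (name:chararray, count:long)}
* produces a DescriptorProto equivalent to:
* <pre>
* message PigTupleSchema {
*   optional string name = 1;
*   optional int64 count = 2;
* }
* </pre>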
*
* @param schema The ResourceSchema to create a DescriptorProto for.
*
* @return A DescriptorProto matching the ResourceSchema. Note that this is not a Descriptor but the Proto Object representing one.
*
* @throws IOException Thrown if the ResourceSchema contains an unsupported or unknown type.
*/
private DescriptorProtos.DescriptorProto getProtoSchema(ResourceSchema schema) throws IOException
{
DescriptorProtos.DescriptorProto.Builder builder = DescriptorProtos.DescriptorProto.newBuilder();
builder.setName("PigTupleSchema");
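// ProtoBuf field numbers are 1-based, so indexing starts at 1.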
int index = 1;
for (ResourceSchema.ResourceFieldSchema fieldSchema : schema.getFields()) {
String fieldName = fieldSchema.getName();
fieldName = fieldName.substring(fieldName.lastIndexOf(':') + 1);
log.debug(
String.format(
"Starting field [%s] of type [%s] for index [%s] with full name [%s]",
fieldName,
DataType.findTypeName(fieldSchema.getType()),
index,
fieldSchema.getName()
)
);
switch (fieldSchema.getType()) {
// String Types handled with addStringField
case DataType.CHARARRAY:
addField(builder, DescriptorProtos.FieldDescriptorProto.Type.TYPE_STRING, index, fieldName);
break;
case DataType.BYTEARRAY:
addField(builder, DescriptorProtos.FieldDescriptorProto.Type.TYPE_BYTES, index, fieldName);
break;
case DataType.DOUBLE:
addField(builder, DescriptorProtos.FieldDescriptorProto.Type.TYPE_DOUBLE, index, fieldName);
break;
case DataType.FLOAT:
addField(builder, DescriptorProtos.FieldDescriptorProto.Type.TYPE_FLOAT, index, fieldName);
break;
case DataType.INTEGER:
addField(builder, DescriptorProtos.FieldDescriptorProto.Type.TYPE_INT32, index, fieldName);
break;
case DataType.LONG:
addField(builder, DescriptorProtos.FieldDescriptorProto.Type.TYPE_INT64, index, fieldName);
break;
// Container types. These are totally untested.
case DataType.TUPLE:
// Recurse: register the tuple's schema as a nested type and reference it from a message field.
log.debug(String.format("Adding tuple field [%s]", fieldName));
builder.addNestedType(getProtoSchema(fieldSchema.getSchema()).toBuilder().setName(fieldName + "_type").build());
builder.addFieldBuilder()
.setType(DescriptorProtos.FieldDescriptorProto.Type.TYPE_MESSAGE)
.setTypeName(fieldName + "_type")
.setName(fieldName)
.setNumber(index);
break;
case DataType.BAG:
// A bag becomes a repeated message field; the single inner tuple supplies the nested type.
// Repeated message fields cannot be packed, so no packed option is set.
log.debug(String.format("Adding bag field [%s]", fieldName));
ResourceSchema.ResourceFieldSchema[] fs = fieldSchema.getSchema().getFields();
if (fs.length != 1 || fs[0].getType() != DataType.TUPLE) {
throw logAndReturnIOE("Found a bag without a tuple inside!");
}
builder.addNestedType(getProtoSchema(fs[0].getSchema()).toBuilder().setName(fieldName + "_type").build());
builder.addFieldBuilder()
.setLabel(DescriptorProtos.FieldDescriptorProto.Label.LABEL_REPEATED)
.setType(DescriptorProtos.FieldDescriptorProto.Type.TYPE_MESSAGE)
.setTypeName(fieldName + "_type")
.setName(fieldName)
.setNumber(index);
break;
// Unsupported Types
case DataType.MAP:
throw logAndReturnIOE("The Pig MAP type is currently not supported.");
case DataType.NULL:
throw logAndReturnIOE("The Pig NULL type is currently not supported.");
case DataType.ERROR:
throw logAndReturnIOE("The Pig ERROR type is currently not supported.");
case DataType.BIGCHARARRAY:
case DataType.GENERIC_WRITABLECOMPARABLE:
throw new UnsupportedOperationException("Pig datatype not supported.");
// Pig Internal Types
case DataType.BOOLEAN:
case DataType.BYTE:
case DataType.INTERNALMAP:
throw logAndReturnIOE("Use of internal type in schema definition.");
// Unknown (duh)
case DataType.UNKNOWN:
default:
throw logAndReturnIOE("Unknown data type.");
}
index++;
}
return builder.build();
}
/**
* This adds a field to a DescriptorProto being built.
*
* @param builder The builder to add the field to.
* @param fieldType The Type of field to add.
* @param index The number to assign the field.
* @param name The name to give the field.
*/
private void addField(
final DescriptorProtos.DescriptorProto.Builder builder,
final DescriptorProtos.FieldDescriptorProto.Type fieldType,
final int index,
final String name
)
{
log.debug(String.format("Adding field [%s] of type [%s]", name, fieldType.name()));
builder.addFieldBuilder()
.setType(fieldType)
.setName(name)
.setNumber(index)
.build();
}
/* *** Mapper/Reducer side calls *** */
/**
* This does the setup for the mapper/reducer side.
*
* @param location The output path.
* @param job The job config.
*
* @throws IOException Currently not thrown, but is part of the overridden signature.
*/
@Override
public void setStoreLocation(String location, Job job) throws IOException
{
FileOutputFormat.setOutputPath(job, new Path(location));
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
Properties props = getUDFProps();
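// Hand the base64-encoded schema from the front-end UDFContext to the back end via the job configuration.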
job.getConfiguration()
.set("com.metamx.milano.proto.descriptor.base64", (String) props.get("milano.pig.proto.schema.base64"));
}
/**
* This retrieves the OutputFormat for use by Hadoop.
*
* @return An {@link com.metamx.milano.hadoop.MilanoProtoFileOutputFormat}
*
* @throws IOException Currently not thrown, but part of the overridden signature.
*/
@Override
public OutputFormat getOutputFormat() throws IOException
{
assert udfSignature != null;
log.debug("Getting OutputFormat");
return new MilanoProtoFileOutputFormat();
}
/**
* Prepares this StoreFunc for writing to the OutputFormat.
* This expects "milano.pig.proto.schema.base64" to be present in the UDFContext.
*
* @param writer The RecordWriter supplied by the OutputFormat returned from getOutputFormat.
*
* @throws IOException Thrown when the UDFContext contains invalid or missing base64-encoded DescriptorProto bytes.
*/
@Override
public void prepareToWrite(RecordWriter writer) throws IOException
{
log.debug("Preparing to write");
@SuppressWarnings("unchecked")
RecordWriter<String, Message> tempWriter = writer;
recordWriter = tempWriter;
Properties props = getUDFProps();
schemaDescriptor = MilanoTool.withBase64(props.getProperty("milano.pig.proto.schema.base64")).getDescriptor();
}
/**
* This takes a Pig Tuple, converts it to a ProtoBuf Message, and writes it to the RecordWriter.
* The majority of the work for this function is done by serializeTuple.
*
* @param tuple The Tuple to convert and write out.
*
* @throws IOException Thrown when the write is interrupted or when serializeTuple throws one.
*/
@Override
public void putNext(Tuple tuple) throws IOException
{
DynamicMessage.Builder builder = DynamicMessage.newBuilder(schemaDescriptor);
serializeTuple(tuple, builder, schemaDescriptor);
try {
recordWriter.write("", builder.build());
}
catch (InterruptedException e) {
throw new IOException(e);
}
}
/**
* Takes a Tuple and populates messageBuilder from it.
* The messageBuilder could be for a different type than the tupleDescriptor.
* The tupleDescriptor is used to parse the incoming Tuples.
* The messageBuilder is the type to actually write out to the file.
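*
* <p>For example, the tuple {@code (foo, 42)} serialized against a descriptor
* with fields {@code name} (string) and {@code count} (int64) yields a message
* with {@code name = "foo"} and {@code count = 42}.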
*
* @param tuple The Tuple to process.
* @param messageBuilder A message builder for the output Message type.
* @param tupleDescriptor A descriptor used to decode the Tuple (as generated by getProtoSchema).
*
* @throws IOException Thrown when an unsupported type is encountered in the tupleDescriptor.
*/
private void serializeTuple(
final Tuple tuple,
final Message.Builder messageBuilder,
final Descriptors.Descriptor tupleDescriptor
) throws IOException
{
Iterator<Object> tupleIterator = tuple.getAll().iterator();
Descriptors.Descriptor messageDescriptor = messageBuilder.getDescriptorForType();
// Here we process the two descriptors in parallel. This is unnecessary for the current version,
// but it anticipates being able to specify the actual message type without using the dynamic TypeMetadata.
for (Descriptors.FieldDescriptor tupleFieldDescriptor : tupleDescriptor.getFields()) {
Object item = tupleIterator.next();
Message.Builder itemBuilder;
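// Null Pig fields are left unset on the message.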
if (item == null) {
continue;
}
Descriptors.FieldDescriptor messageFieldDescriptor = messageDescriptor.findFieldByName(tupleFieldDescriptor.getName());
switch (tupleFieldDescriptor.getJavaType()) {
case INT:
case LONG:
case FLOAT:
case DOUBLE:
case STRING:
messageBuilder.setField(messageFieldDescriptor, item);
break;
case BYTE_STRING:
// Here we convert the Pig object into bytes to be serialized, this could contain arbitrary binary data so it must be handled differently.
messageBuilder.setField(messageFieldDescriptor, ByteString.copyFrom(DataType.toBytes(item)));
break;
// This functionality is totally untested.
case MESSAGE:
Descriptors.Descriptor subTupleDescriptor = tupleFieldDescriptor.getMessageType();
if (tupleFieldDescriptor.isRepeated()) {
//This is a bag
for (Object subTuple : ((DataBag) item)) {
itemBuilder = messageBuilder.newBuilderForField(messageFieldDescriptor);
serializeTuple((Tuple) subTuple, itemBuilder, subTupleDescriptor);
messageBuilder.addRepeatedField(messageFieldDescriptor, itemBuilder.build());
}
} else {
//This is a tuple
itemBuilder = messageBuilder.newBuilderForField(messageFieldDescriptor);
serializeTuple((Tuple) item, itemBuilder, subTupleDescriptor);
messageBuilder.setField(messageFieldDescriptor, itemBuilder.build());
}
break;
case BOOLEAN:
throw logAndReturnIOE("Boolean field in generated tuple descriptor.");
case ENUM:
throw logAndReturnIOE("Enum field in generated tuple descriptor.");
default:
throw logAndReturnIOE("Unknown data type.");
}
}
}
/**
* Helper function to log and then return an IOE.
*
* @param message The message to report to both the logger and set in the IOE.
*
* @return An IOE with the given message.
*/
private IOException logAndReturnIOE(String message)
{
IOException ioe = new IOException(message);
log.error(message, ioe);
return ioe;
}
/**
* Retrieves the Properties object from the UDFContext.
*
* @return The properties for this UDF.
*/
private Properties getUDFProps()
{
UDFContext udfContext = UDFContext.getUDFContext();
return udfContext.getUDFProperties(this.getClass(), new String[]{udfSignature});
}
}