/**
* Copyright (C) 2011 Metamarkets http://metamx.com
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.metamx.milano.pig;
import com.google.common.base.Joiner;
import com.google.common.base.Throwables;
import com.google.protobuf.ByteString;
import com.google.protobuf.Descriptors;
import com.google.protobuf.Message;
import com.metamx.milano.generated.io.MilanoTypeMetadata;
import com.metamx.milano.hadoop.MilanoProtoFileInputFormat;
import com.metamx.milano.hadoop.MilanoProtoFileRecordReader;
import com.metamx.milano.io.MilanoProtoFile;
import com.metamx.milano.proto.MilanoTool;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.log4j.Logger;
import org.apache.pig.Expression;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
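/*
 * Example usage from a Pig script (a minimal sketch; the alias and input path
 * are hypothetical):
 *
 *   events = LOAD '/path/to/proto/files' USING com.metamx.milano.pig.MilanoLoadFunc();
 *   DESCRIBE events;  -- the schema is derived from the TypeMetadata embedded in the files
 */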
/**
 * A Pig LoadFunc for reading MilanoProtoFile data. Each ProtoBuf Message in the
 * input is decoded into a Pig Tuple, and when TypeMetadata is present the Pig
 * schema is derived from it automatically via getSchema.
 */
public class MilanoLoadFunc extends LoadFunc implements LoadMetadata //, LoadPushDown -- Not yet supported
{
private static final Logger log = Logger.getLogger(MilanoLoadFunc.class);
private String udfSignature;
private MilanoProtoFileRecordReader recordReader;
  private final TupleFactory tupleFactory = TupleFactory.getInstance();
  private final BagFactory bagFactory = BagFactory.getInstance();
private MilanoTypeMetadata.TypeMetadata typeMetadata;
private Descriptors.Descriptor descriptor;
/**
   * Sets the UDF context signature, which is used to key state (such as the ProtoBuf schema)
   * shared between the client and the mapper/reducer contexts.
   * Called in both the client and mapper/reducer contexts.
*
* @param signature The UDF signature.
*/
@Override
public void setUDFContextSignature(String signature)
{
udfSignature = signature;
}
@Override
public void setLocation(String location, Job job) throws IOException
{
Path basePath = new Path(location);
FileSystem fileSystem = basePath.getFileSystem(job.getConfiguration());
Set<Path> paths = new TreeSet<Path>();
if (fileSystem.getFileStatus(basePath).isDir()) {
getPaths(basePath, paths, fileSystem);
} else {
paths.add(basePath);
}
log.info("Setting input to " + paths);
FileInputFormat.setInputPaths(job, Joiner.on(',').join(paths));
}
  /**
   * Recursively collects all regular file paths under the given directory.
   */
  private void getPaths(Path baseDirectory, Set<Path> paths, FileSystem fileSystem) throws IOException
  {
    FileStatus[] files = fileSystem.listStatus(baseDirectory);
    for (FileStatus file : files) {
      Path path = file.getPath();
      if (file.isDir()) {
        getPaths(path, paths, fileSystem);
      } else {
        // Add the file itself rather than its parent directory.
        paths.add(path);
      }
    }
  }
@Override
public InputFormat getInputFormat() throws IOException
{
log.debug("Getting InputFormat");
return new MilanoProtoFileInputFormat();
}
@Override
public void prepareToRead(RecordReader reader, PigSplit split) throws IOException
{
log.debug("Preparing to read");
recordReader = (MilanoProtoFileRecordReader) reader;
typeMetadata = recordReader.getMetadata();
descriptor = MilanoTool.with(typeMetadata).getDescriptor();
}
@Override
public Tuple getNext() throws IOException
{
try {
if (!recordReader.nextKeyValue()) {
return null;
}
return buildTuple(recordReader.getCurrentValue(), descriptor);
}
catch (InterruptedException e) {
log.error("Interrupted", e);
throw Throwables.propagate(e);
}
}
/**
* This recursively builds a Pig Tuple from a Message and a Descriptor.
*
* @param message The Message to decode into a Tuple.
* @param descriptor The Descriptor to use in decoding the Message.
*
* @return The new Tuple.
*
* @throws IOException Thrown when we receive an unsupported type in the Message/Descriptor (ENUM/BOOLEAN).
*/
private Tuple buildTuple(Message message, final Descriptors.Descriptor descriptor) throws IOException
{
List<Object> tuple = new ArrayList<Object>();
for (Descriptors.FieldDescriptor fieldDescriptor : descriptor.getFields()) {
      // HACK: Message.getField() requires a FieldDescriptor from the Message's own Descriptor,
      // HACK: and ours was rebuilt from TypeMetadata, so we re-resolve each field by name.
      Descriptors.FieldDescriptor messageFieldDescriptor = message.getDescriptorForType()
                                                                  .findFieldByName(fieldDescriptor.getName());
switch (fieldDescriptor.getJavaType()) {
case INT:
case LONG:
case FLOAT:
case DOUBLE:
case STRING:
tuple.add(message.getField(messageFieldDescriptor));
break;
case BYTE_STRING:
// Pig doesn't understand ByteString. Here we convert to a byte[] which Pig does understand.
tuple.add(new DataByteArray(((ByteString) message.getField(messageFieldDescriptor)).toByteArray()));
break;
// This functionality is totally untested.
case MESSAGE:
if (fieldDescriptor.isRepeated()) {
// We have a bag.
List<Tuple> bag = new ArrayList<Tuple>();
int count = message.getRepeatedFieldCount(messageFieldDescriptor);
for (int i = 0; i < count; i++) {
bag.add(
buildTuple(
(Message) message.getRepeatedField(messageFieldDescriptor, i),
fieldDescriptor.getMessageType()
)
);
}
tuple.add(bagFactory.newDefaultBag(bag));
} else {
// Just a tuple.
tuple.add(
buildTuple(
(Message) message.getField(messageFieldDescriptor),
messageFieldDescriptor.getMessageType()
)
);
}
break;
case ENUM:
case BOOLEAN:
throw new IOException(String.format("Type %s not supported.", fieldDescriptor.getJavaType().toString()));
}
}
return tupleFactory.newTuple(tuple);
}
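  /*
   * For illustration, given this (hypothetical) message type:
   *
   *   message Event {
   *     required int64  timestamp = 1;
   *     required string name      = 2;
   *     repeated Event  children  = 3;
   *   }
   *
   * buildTuple would produce (timestamp, name, {(...), ...}): the scalar fields map
   * directly, and the repeated message field becomes a bag of recursively built tuples.
   */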
  /**
   * Builds a Pig ResourceSchema from the input file(s). This relies on the existence of TypeMetadata,
   * and is how field names and types are passed to Pig without the user having to specify them by hand.
   *
   * @param location As passed to relativeToAbsolutePath.
   * @param job      The job.
   *
   * @return A ResourceSchema representing the incoming file(s), or null if TypeMetadata does not exist.
   *
   * @throws IOException Not thrown directly, but propagated from getMessageSchema where it indicates an unsupported type.
   */
@Override
public ResourceSchema getSchema(String location, Job job) throws IOException
{
Configuration conf = job.getConfiguration();
// HACK: Here we open the file directly to read the TypeMetadata.
// HACK: There may be a better more direct way to do this, but it works for now.
Path path = new Path(location);
FileSystem fileSystem = path.getFileSystem(conf);
FileStatus fileStatus = fileSystem.getFileStatus(path);
if (fileStatus.isDir()) {
      log.debug(String.format("Path %s is a directory.", path));
path = getFilePath(path, fileSystem);
if (path == null) {
return null;
}
} else if (!fileSystem.exists(path)) {
return null;
}
MilanoProtoFile.Reader reader = MilanoProtoFile.createReader(fileSystem.open(path));
typeMetadata = reader.getMetadata();
reader.close();
if (typeMetadata == null) {
return null;
}
descriptor = MilanoTool.with(typeMetadata).getDescriptor();
return new ResourceSchema(getMessageSchema(descriptor));
}
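  /**
   * Depth-first search for the first regular file under the given path.
   *
   * @return The first file found, or null if the tree contains no files.
   */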
private Path getFilePath(Path path, FileSystem fileSystem) throws IOException
{
Path newPath = null;
FileStatus[] files = fileSystem.listStatus(path);
for (FileStatus file : files) {
if (file.isDir()) {
newPath = getFilePath(file.getPath(), fileSystem);
if (newPath != null) {
break;
}
} else {
newPath = file.getPath();
break;
}
}
return newPath;
}
/**
* This takes a Descriptor and recursively creates a Pig Schema out of it.
*
* @param descriptor The descriptor to use.
*
* @return A Schema representing the structure of the descriptor.
*
* @throws IOException Thrown if an unsupported type is encountered.
*/
private Schema getMessageSchema(final Descriptors.Descriptor descriptor) throws IOException
{
Schema schema = new Schema();
for (Descriptors.FieldDescriptor fieldDescriptor : descriptor.getFields()) {
String name = fieldDescriptor.getName();
switch (fieldDescriptor.getJavaType()) {
case INT:
schema.add(new Schema.FieldSchema(name, DataType.INTEGER));
break;
case LONG:
schema.add(new Schema.FieldSchema(name, DataType.LONG));
break;
case FLOAT:
schema.add(new Schema.FieldSchema(name, DataType.FLOAT));
break;
case DOUBLE:
schema.add(new Schema.FieldSchema(name, DataType.DOUBLE));
break;
case STRING:
schema.add(new Schema.FieldSchema(name, DataType.CHARARRAY));
break;
case BYTE_STRING:
schema.add(new Schema.FieldSchema(name, DataType.BYTEARRAY));
break;
case MESSAGE:
Descriptors.Descriptor messageType = fieldDescriptor.getMessageType();
if (fieldDescriptor.isRepeated()) {
// We have a bag.
schema.add(new Schema.FieldSchema(name, getMessageSchema(messageType), DataType.BAG));
} else {
// Just a tuple.
schema.add(new Schema.FieldSchema(name, getMessageSchema(messageType), DataType.TUPLE));
}
break;
case ENUM:
case BOOLEAN:
default:
throw new IOException("Unsupported data type.");
}
}
return schema;
}
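  /*
   * Continuing the hypothetical Event message sketched above buildTuple, getMessageSchema
   * would yield a Pig schema along the lines of:
   *
   *   timestamp: long, name: chararray, children: {(timestamp: long, name: chararray, ...)}
   *
   * with the repeated message field rendered as a bag of tuples.
   */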
@Override
public ResourceStatistics getStatistics(String location, Job job) throws IOException
{
return null; //Not supported.
}
@Override
public String[] getPartitionKeys(String location, Job job) throws IOException
{
return null; //Not supported.
}
@Override
public void setPartitionFilter(Expression partitionFilter) throws IOException
{
// No-Op not supported.
}
/* // Not yet supported, scheduled for later release.
@Override
public List<LoadPushDown.OperatorSet> getFeatures()
{
return null;
}
@Override
public LoadPushDown.RequiredFieldResponse pushProjection(LoadPushDown.RequiredFieldList requiredFieldList)
throws FrontendException
{
return null;
}
*/
}