AvroSerdeUtils.java example

Explorer
hive_blinkdb-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.serde2.avro;


import org.apache.avro.Schema;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.mapred.JobConf;

import java.io.IOException;
import java.net.URL;
import java.util.List;
import java.util.Properties;

/**
 * Utilities useful only to the AvroSerde itself.  Not mean to be used by
 * end-users but public for interop to the ql package.
 */
public class AvroSerdeUtils {
  private static final Log LOG = LogFactory.getLog(AvroSerdeUtils.class);

  public static final String SCHEMA_LITERAL = "avro.schema.literal";
  public static final String SCHEMA_URL = "avro.schema.url";
  public static final String SCHEMA_NONE = "none";
  public static final String EXCEPTION_MESSAGE = "Neither " + SCHEMA_LITERAL + " nor "
          + SCHEMA_URL + " specified, can't determine table schema";
  public static final String AVRO_SERDE_SCHEMA = "avro.serde.schema";

  /**
   * Determine the schema to that's been provided for Avro serde work.
   * @param properties containing a key pointing to the schema, one way or another
   * @return schema to use while serdeing the avro file
   * @throws IOException if error while trying to read the schema from another location
   * @throws AvroSerdeException if unable to find a schema or pointer to it in the properties
   */
  public static Schema determineSchemaOrThrowException(Properties properties)
          throws IOException, AvroSerdeException {
    String schemaString = properties.getProperty(SCHEMA_LITERAL);
    if(schemaString != null && !schemaString.equals(SCHEMA_NONE))
      return Schema.parse(schemaString);

    // Try pulling directly from URL
    schemaString = properties.getProperty(SCHEMA_URL);
    if(schemaString == null || schemaString.equals(SCHEMA_NONE))
      throw new AvroSerdeException(EXCEPTION_MESSAGE);

    try {
      if(schemaString.toLowerCase().startsWith("hdfs://"))
        return getSchemaFromHDFS(schemaString, new Configuration());
    } catch(IOException ioe) {
      throw new AvroSerdeException("Unable to read schema from HDFS: " + schemaString, ioe);
    }

    return Schema.parse(new URL(schemaString).openStream());
  }

  /**
   * Attempt to determine the schema via the usual means, but do not throw
   * an exception if we fail.  Instead, signal failure via a special
   * schema.  This is used because Hive calls init on the serde during
   * any call, including calls to update the serde properties, meaning
   * if the serde is in a bad state, there is no way to update that state.
   */
  public static Schema determineSchemaOrReturnErrorSchema(Properties props) {
    try {
      return determineSchemaOrThrowException(props);
    } catch(AvroSerdeException he) {
      LOG.warn("Encountered AvroSerdeException determining schema. Returning " +
              "signal schema to indicate problem", he);
      return SchemaResolutionProblem.SIGNAL_BAD_SCHEMA;
    } catch (Exception e) {
      LOG.warn("Encountered exception determining schema. Returning signal " +
              "schema to indicate problem", e);
      return SchemaResolutionProblem.SIGNAL_BAD_SCHEMA;
    }
  }
  // Protected for testing and so we can pass in a conf for testing.
  protected static Schema getSchemaFromHDFS(String schemaHDFSUrl,
                                            Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FSDataInputStream in = null;

    try {
      in = fs.open(new Path(schemaHDFSUrl));
      Schema s = Schema.parse(in);
      return s;
    } finally {
      if(in != null) in.close();
    }
  }

  /**
   * Determine if an Avro schema is of type Union[T, NULL].  Avro supports nullable
   * types via a union of type T and null.  This is a very common use case.
   * As such, we want to silently convert it to just T and allow the value to be null.
   *
   * @return true if type represents Union[T, Null], false otherwise
   */
  public static boolean isNullableType(Schema schema) {
    return schema.getType().equals(Schema.Type.UNION) &&
           schema.getTypes().size() == 2 &&
             (schema.getTypes().get(0).getType().equals(Schema.Type.NULL) ||
              schema.getTypes().get(1).getType().equals(Schema.Type.NULL));
      // [null, null] not allowed, so this check is ok.
  }

  /**
   * In a nullable type, get the schema for the non-nullable type.  This method
   * does no checking that the provides Schema is nullable.
   */
  public static Schema getOtherTypeFromNullableType(Schema schema) {
    List<Schema> types = schema.getTypes();

    return types.get(0).getType().equals(Schema.Type.NULL) ? types.get(1) : types.get(0);
  }

  /**
   * Determine if we're being executed from within an MR job or as part
   * of a select * statement.  The signals for this varies between Hive versions.
   * @param job that contains things that are or are not set in a job
   * @return Are we in a job or not?
   */
  public static boolean insideMRJob(JobConf job) {
    return job != null
           && (HiveConf.getVar(job, HiveConf.ConfVars.PLAN) != null)
           && (!HiveConf.getVar(job, HiveConf.ConfVars.PLAN).isEmpty());
  }
}