// KafkaAvroMessageDecoder.java example
//
// Explorer
// pinot-master
/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.core.realtime.impl.kafka;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import javax.annotation.concurrent.NotThreadSafe;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DecoderFactory;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.utils.retry.RetryPolicies;
import com.linkedin.pinot.core.data.GenericRow;


/**
 * Kafka message decoder for Avro-encoded payloads.
 *
 * <p>Each message is read as a header of {@code HEADER_LENGTH} bytes (one magic byte followed by a 16-byte schema
 * hash, treated here as an MD5) followed by the binary-encoded Avro record. The hash is used to look up the writer
 * schema, first in an in-memory cache and then from the schema registry REST endpoint configured in {@code init}.
 *
 * <p>Instances are not thread safe: a byte buffer is reused across calls to {@code decode}.
 */
@NotThreadSafe
public class KafkaAvroMessageDecoder implements KafkaMessageDecoder {
  private static final Logger LOGGER = LoggerFactory.getLogger(KafkaAvroMessageDecoder.class);

  // Property key holding the base URL of the schema registry REST service.
  private static final String SCHEMA_REGISTRY_REST_URL = "schema.registry.rest.url";
  // Property key holding an optional schema name; when absent or empty the topic name is used instead.
  private static final String SCHEMA_REGISTRY_SCHEMA_NAME = "schema.registry.schema.name";
  // Schema fetched at init time; used as a fallback when the schema for a message's hash cannot be fetched.
  private org.apache.avro.Schema defaultAvroSchema;
  // Cache of schemas already fetched from the registry, keyed by the hash found in the message header.
  private MD5AvroSchemaMap md5ToAvroSchemaMap;

  // Reusable byte[] to read MD5 from payload. This is OK as this class is used only by a single thread.
  private final byte[] reusableMD5Bytes = new byte[SCHEMA_HASH_LENGTH];

  // Base URL of the schema registry, read from the SCHEMA_REGISTRY_REST_URL property.
  private String schemaRegistryBaseUrl;
  // Factory used to create the binary decoder for each message body.
  private DecoderFactory decoderFactory;
  // Converts a decoded Avro record into a Pinot row (the field name carries a historical misspelling).
  private AvroRecordToPinotRowGenerator avroRecordConvetrer;

  // Message layout: [magic byte][schema hash][Avro binary body].
  private static final int MAGIC_BYTE_LENGTH = 1;
  private static final int SCHEMA_HASH_LENGTH = 16;
  private static final int HEADER_LENGTH = MAGIC_BYTE_LENGTH + SCHEMA_HASH_LENGTH;

  // The schema hash starts immediately after the magic byte.
  private static final int SCHEMA_HASH_START_OFFSET = MAGIC_BYTE_LENGTH;

  // Arguments handed to the exponential backoff retry policy used when fetching a schema
  // (presumably attempt count, initial delay in milliseconds, and delay multiplier -- confirm against RetryPolicies).
  private static final int MAXIMUM_SCHEMA_FETCH_RETRY_COUNT = 5;
  private static final int MINIMUM_SCHEMA_FETCH_RETRY_TIME_MILLIS = 500;
  private static final float SCHEMA_FETCH_RETRY_EXPONENTIAL_BACKOFF_FACTOR = 2.0f;

  /**
   * Initializes the decoder: records the schema registry base URL, fetches the default Avro schema and creates the
   * helpers used by {@code decode}.
   *
   * @param props decoder properties; {@code schema.registry.rest.url} supplies the registry base URL and
   *              {@code schema.registry.schema.name} optionally overrides the schema name
   * @param indexingSchema Pinot schema handed to the Avro-record-to-row converter
   * @param topicName Kafka topic name, used as the schema name when no non-empty override is configured
   * @throws Exception if the registry URL is malformed or the default schema cannot be fetched
   */
  @Override
  public void init(Map<String, String> props, Schema indexingSchema, String topicName) throws Exception {
    // Strings are immutable: chomp returns the trimmed value, so it must be assigned back. Without the assignment a
    // configured trailing "/" survives and every registry URL is built with a double slash.
    schemaRegistryBaseUrl = StringUtils.chomp(props.get(SCHEMA_REGISTRY_REST_URL), "/");

    // Fall back to the topic name unless a non-null, non-empty schema name is configured.
    String configuredSchemaName = props.get(SCHEMA_REGISTRY_SCHEMA_NAME);
    String avroSchemaName = StringUtils.isNotEmpty(configuredSchemaName) ? configuredSchemaName : topicName;

    defaultAvroSchema = fetchSchema(new URL(schemaRegistryBaseUrl + "/latest_with_type=" + avroSchemaName));
    avroRecordConvetrer = new AvroRecordToPinotRowGenerator(indexingSchema);
    decoderFactory = new DecoderFactory();
    md5ToAvroSchemaMap = new MD5AvroSchemaMap();
  }

  /**
   * Decodes an entire payload array into a row.
   *
   * @param payload raw Kafka message bytes; may be null
   * @param destination row object passed through to the record converter
   * @return the decoded row, or null when the payload is null or cannot be decoded
   */
  @Override
  public GenericRow decode(byte[] payload, GenericRow destination) {
    // Guard before dereferencing payload.length so a null payload yields null, matching the ranged overload,
    // instead of throwing a NullPointerException.
    if (payload == null) {
      return null;
    }
    return decode(payload, 0, payload.length, destination);
  }

  /**
   * Decodes the slice {@code [offset, offset + length)} of a payload into a row.
   *
   * <p>The slice is expected to start with the message header (magic byte plus schema hash) followed by the Avro
   * binary body. The schema for the hash is taken from the local cache or fetched from the registry; if the fetch
   * fails, decoding is attempted with the default schema obtained at init time.
   *
   * @param payload raw Kafka message bytes; may be null
   * @param offset index of the first byte of the message within {@code payload}
   * @param length number of bytes belonging to the message, header included
   * @param destination row object passed through to the record converter
   * @return the decoded row, or null when the payload is null/empty, the slice is invalid or shorter than the
   *         header, or reading the Avro record fails with an {@link IOException}
   */
  @Override
  public GenericRow decode(byte[] payload, int offset, int length, GenericRow destination) {
    if (payload == null || payload.length == 0 || length == 0) {
      return null;
    }

    // A slice that lies outside the array or cannot even hold the header would make the arraycopy below throw (or
    // read bytes belonging to something else). Treat it like any other undecodable message and skip it.
    if (offset < 0 || length < HEADER_LENGTH || offset > payload.length - length) {
      LOGGER.error("Skipping message with invalid bounds: offset {}, length {}, payload size {}, header size {}",
          offset, length, payload.length, HEADER_LENGTH);
      return null;
    }

    // Copy the schema hash out of the header into the reusable buffer (safe: single-threaded use only).
    System.arraycopy(payload, SCHEMA_HASH_START_OFFSET + offset, reusableMD5Bytes, 0, SCHEMA_HASH_LENGTH);

    boolean schemaUpdateFailed = false;
    org.apache.avro.Schema schema = md5ToAvroSchemaMap.getSchema(reusableMD5Bytes);
    if (schema == null) {
      // Unknown hash: fetch the schema from the registry and cache it for subsequent messages.
      final String schemaUri = schemaRegistryBaseUrl + "/id=" + hex(reusableMD5Bytes);
      try {
        schema = fetchSchema(new URL(schemaUri));
        md5ToAvroSchemaMap.addSchema(reusableMD5Bytes, schema);
      } catch (Exception e) {
        // Best effort: fall back to the default schema. The failure is not cached, so the next message carrying
        // this hash triggers another fetch attempt.
        schema = defaultAvroSchema;
        LOGGER.error("Error fetching schema using url {}. Attempting to continue with previous schema", schemaUri, e);
        schemaUpdateFailed = true;
      }
    }

    DatumReader<Record> reader = new GenericDatumReader<Record>(schema);
    try {
      // The Avro body starts right after the header within the slice.
      GenericData.Record avroRecord =
          reader.read(null, decoderFactory.createBinaryDecoder(payload, HEADER_LENGTH + offset,
              length - HEADER_LENGTH, null));
      return avroRecordConvetrer.transform(avroRecord, schema, destination);
    } catch (IOException e) {
      LOGGER.error("Caught exception while reading message using schema {}{}", (schema==null ? "null" : schema.getName()),
          (schemaUpdateFailed? "(possibly due to schema update failure)" : ""), e);
      return null;
    }
  }

  /**
   * Renders a byte array as a lowercase hexadecimal string, two characters per byte.
   *
   * @param bytes bytes to render
   * @return hexadecimal representation, {@code 2 * bytes.length} characters long
   */
  private String hex(byte[] bytes) {
    final char[] digits = new char[2 * bytes.length];
    int pos = 0;
    for (int i = 0; i < bytes.length; i++) {
      // Mask to treat the byte as unsigned, then emit the high nibble followed by the low nibble.
      final int unsigned = bytes[i] & 0xFF;
      digits[pos++] = Character.forDigit(unsigned >>> 4, 16);
      digits[pos++] = Character.forDigit(unsigned & 0x0F, 16);
    }
    return new String(digits);
  }

  /**
   * Retryable task that downloads the content of a URL and parses it as an Avro schema.
   *
   * <p>{@link #call()} reports success or failure through its return value rather than by throwing, so it can be
   * handed to a retry policy; the parsed schema is then available from {@link #getSchema()}.
   */
  private static class SchemaFetcher implements Callable<Boolean> {
    // Result of the most recent successful call(); null until a fetch succeeds.
    private org.apache.avro.Schema _schema;
    private final URL url;

    SchemaFetcher(URL url) {
      this.url = url;
    }

    /**
     * Attempts one fetch-and-parse of the schema.
     *
     * @return {@code Boolean.TRUE} if the schema was fetched and parsed, {@code Boolean.FALSE} on any failure
     */
    @Override
    public Boolean call() throws Exception {
      // try-with-resources closes the reader (and with it the URL stream) on every path; this task may run several
      // times per schema, so leaving the stream open would leak a connection per attempt.
      try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream(), "UTF-8"))) {
        StringBuilder queryResp = new StringBuilder();
        for (String respLine; (respLine = reader.readLine()) != null; ) {
          queryResp.append(respLine);
        }
        _schema = org.apache.avro.Schema.parse(queryResp.toString());

        return Boolean.TRUE;
      } catch (Exception e) {
        LOGGER.warn("Caught exception while fetching schema", e);
        return Boolean.FALSE;
      }
    }

    /**
     * @return the schema parsed by the last successful {@link #call()}, or null if none succeeded
     */
    public org.apache.avro.Schema getSchema() {
      return _schema;
    }
  }

  /**
   * Fetches and parses an Avro schema from the given URL, retrying under an exponential backoff policy.
   *
   * @param url schema registry URL to read the schema from
   * @return the parsed Avro schema
   * @throws Exception a {@link RuntimeException} if the retry policy reports that no attempt succeeded, or whatever
   *         the retry policy itself throws
   */
  private org.apache.avro.Schema fetchSchema(URL url) throws Exception {
    SchemaFetcher schemaFetcher = new SchemaFetcher(url);

    boolean successful = RetryPolicies.exponentialBackoffRetryPolicy(MAXIMUM_SCHEMA_FETCH_RETRY_COUNT,
        MINIMUM_SCHEMA_FETCH_RETRY_TIME_MILLIS, SCHEMA_FETCH_RETRY_EXPONENTIAL_BACKOFF_FACTOR).attempt(schemaFetcher);

    if (successful) {
      return schemaFetcher.getSchema();
    } else {
      // Note the leading space before "retries": without it the message reads e.g. "after 5retries".
      throw new RuntimeException(
          "Failed to fetch schema from " + url + " after " + MAXIMUM_SCHEMA_FETCH_RETRY_COUNT + " retries");
    }
  }

  /**
   * Small associative store from schema MD5 to Avro schema.
   * <ul>
   *   <li> Backed by two parallel lists: one of MD5 byte arrays, one of schemas. </li>
   *   <li> The entry at position i of the MD5 list belongs to the schema at position i of the schema list. </li>
   * </ul>
   */
  private static class MD5AvroSchemaMap {
    private List<byte[]> md5s = new ArrayList<>();
    private List<org.apache.avro.Schema> schemas = new ArrayList<>();

    /**
     * Creates an empty map.
     */
    private MD5AvroSchemaMap() {
    }

    /**
     * Looks up the Avro schema registered for an MD5.
     *
     * @param md5ForSchema MD5 whose schema is requested.
     * @return the matching Avro schema, or null when the MD5 has not been added.
     */
    private org.apache.avro.Schema getSchema(byte[] md5ForSchema) {
      int position = 0;
      for (byte[] knownMd5 : md5s) {
        // Compare by content; the caller reuses a single buffer, so identity comparison would never match.
        if (Arrays.equals(knownMd5, md5ForSchema)) {
          return schemas.get(position);
        }
        position++;
      }
      return null;
    }

    /**
     * Registers a schema under an MD5.
     * The caller must make sure each MD5-schema pair is added only once.
     *
     * @param md5 MD5 of the schema; a private copy is stored, so the caller may reuse the array.
     * @param schema Avro schema to associate with the MD5.
     */
    private void addSchema(byte[] md5, org.apache.avro.Schema schema) {
      byte[] md5Copy = md5.clone();
      md5s.add(md5Copy);
      schemas.add(schema);
    }
  }
}