/*
 * Copyright © 2014 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.hive.stream;

import co.cask.cdap.api.data.format.FormatSpecification;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.data.schema.UnsupportedTypeException;
import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.data2.transaction.stream.StreamConfig;
import co.cask.cdap.format.RecordFormats;
import co.cask.cdap.hive.context.ContextManager;
import co.cask.cdap.hive.serde.ObjectDeserializer;
import co.cask.cdap.internal.io.SchemaTypeAdapter;
import co.cask.cdap.proto.Id;
import co.cask.cdap.spi.stream.AbstractStreamEventRecordFormat;
import com.google.common.collect.Lists;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.List;
import java.util.Properties;

/**
 * SerDe to deserialize Stream Events. It MUST implement the deprecated SerDe interface instead of extending the
 * abstract SerDe class, otherwise we get ClassNotFound exceptions on cdh4.x.
 */
@SuppressWarnings("deprecation")
public class StreamSerDe implements SerDe {
  private static final Logger LOG = LoggerFactory.getLogger(StreamSerDe.class);

  // timestamp and headers are guaranteed to be the first columns in a stream table.
  // the rest of the columns are for the stream body.
  private static final int BODY_OFFSET = 2;

  // A GSON object that knows how to serialize the Schema type.
  private static final Gson GSON = new GsonBuilder()
    .registerTypeAdapter(Schema.class, new SchemaTypeAdapter())
    .create();

  private ObjectInspector inspector;
  private AbstractStreamEventRecordFormat<?> streamFormat;
  private ObjectDeserializer deserializer;

  // initialize gets called multiple times by Hive. It may seem like a good idea to put additional settings into
  // the conf, but be very careful when doing so. If there are multiple hive tables involved in a query, initialize
  // for each table is called before input splits are fetched for any table. It is therefore not safe to put anything
  // the input format may need into the conf in this method. Rather, use the StorageHandler's methods to place needed
  // config into the properties map there, which will get passed here and also copied into the job conf for the input
  // format to consume.
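  // In particular, the stream name, the stream namespace, and (optionally) the format specification are
  // read from that properties map below; see getFormatSpec() for the fallback behavior when the format
  // specification property is absent.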
  @Override
  public void initialize(Configuration conf, Properties properties) throws SerDeException {
    // The columns property comes from the Hive metastore, which has it from the create table statement.
    // It is therefore important that this schema be accurate and in the right order - the same order in which
    // the object inspectors will reflect them.
    String streamName = properties.getProperty(Constants.Explore.STREAM_NAME);
    String streamNamespace = properties.getProperty(Constants.Explore.STREAM_NAMESPACE);

    // no namespace SHOULD be an exception, but... Hive calls initialize in several places, one of which is
    // when you try to drop a table.
    // When updating to CDAP 2.8, old tables will not have the namespace as a serde property. So in order
    // to avoid a null pointer exception that prevents dropping a table, we handle the null namespace case here.
    if (streamNamespace == null) {
      // we still need an ObjectInspector, as Hive uses it to check what columns the table has.
      this.inspector = new ObjectDeserializer(properties, null).getInspector();
      return;
    }

    Id.Stream streamId = Id.Stream.from(streamNamespace, streamName);
    try (ContextManager.Context context = ContextManager.getContext(conf)) {
      Schema schema = null;
      // the conf (and thus the context) can be null in some versions of Hive,
      // which call initialize just to get the object inspector.
      if (context != null) {
        // Get the stream format from the stream config.
        FormatSpecification formatSpec = getFormatSpec(properties, streamId, context);
        this.streamFormat = (AbstractStreamEventRecordFormat) RecordFormats.createInitializedFormat(formatSpec);
        schema = formatSpec.getSchema();
      }
      this.deserializer = new ObjectDeserializer(properties, schema, BODY_OFFSET);
      this.inspector = deserializer.getInspector();
    } catch (UnsupportedTypeException e) {
      // this should have been validated up front when the schema was set on the stream.
      // if we hit this, something went wrong much earlier.
      LOG.error("Schema unsupported by format.", e);
      throw new SerDeException("Schema unsupported by format.", e);
    } catch (IOException e) {
      LOG.error("Could not get the config for stream {}.", streamName, e);
      throw new SerDeException("Could not get the config for stream " + streamName, e);
    } catch (Exception e) {
      LOG.error("Could not create the format for stream {}.", streamName, e);
      throw new SerDeException("Could not create the format for stream " + streamName, e);
    }
  }

  @Override
  public Class<? extends Writable> getSerializedClass() {
    return Text.class;
  }

  @Override
  public Writable serialize(Object o, ObjectInspector objectInspector) throws SerDeException {
    // should not be writing to streams through this
    throw new SerDeException("Stream serialization through Hive is not supported.");
  }

  @Override
  public SerDeStats getSerDeStats() {
    return new SerDeStats();
  }

  @Override
  public Object deserialize(Writable writable) throws SerDeException {
    // The writable should always contain a StreamEvent object provided by the StreamRecordReader.
    ObjectWritable objectWritable = (ObjectWritable) writable;
    StreamEvent streamEvent = (StreamEvent) objectWritable.get();

    // timestamp and headers are always guaranteed to be first.
    List<Object> event = Lists.newArrayList();
    event.add(streamEvent.getTimestamp());
    event.add(streamEvent.getHeaders());

    try {
      // The format should always format the stream event into a record.
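      // streamFormat.read() parses the event body according to the format's schema, and
      // translateRecord() turns the resulting record into the column values that follow
      // the timestamp and headers columns.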
      event.addAll(deserializer.translateRecord(streamFormat.read(streamEvent)));
      return event;
    } catch (Throwable t) {
      LOG.info("Unable to format the stream body.", t);
      throw new SerDeException("Unable to format the stream body.", t);
    }
  }

  @Override
  public ObjectInspector getObjectInspector() throws SerDeException {
    return inspector;
  }

  /**
   * Gets the {@link FormatSpecification} for the given stream based on the SerDe properties.
   * For backward compatibility, if the format specification is not set in the SerDe properties, it will be
   * fetched from the {@link StreamConfig}.
   */
  private FormatSpecification getFormatSpec(Properties properties, Id.Stream streamId,
                                            ContextManager.Context context) throws IOException {
    String formatSpec = properties.getProperty(Constants.Explore.FORMAT_SPEC);
    if (formatSpec == null) {
      StreamConfig config = context.getStreamConfig(streamId);
      return config.getFormat();
    }
    return GSON.fromJson(formatSpec, FormatSpecification.class);
  }
}