/*
 * Copyright © 2014 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.hive.datasets;

import co.cask.cdap.api.data.batch.RecordScannable;
import co.cask.cdap.api.data.batch.RecordWritable;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.data.schema.UnsupportedTypeException;
import co.cask.cdap.api.dataset.Dataset;
import co.cask.cdap.api.dataset.DatasetManagementException;
import co.cask.cdap.api.dataset.DatasetSpecification;
import co.cask.cdap.common.ServiceUnavailableException;
import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.data.dataset.SystemDatasetInstantiator;
import co.cask.cdap.hive.context.ContextManager;
import co.cask.cdap.hive.serde.ObjectDeserializer;
import co.cask.cdap.hive.serde.ObjectSerializer;
import co.cask.cdap.internal.io.ReflectionSchemaGenerator;
import co.cask.cdap.internal.io.SchemaGenerator;
import co.cask.cdap.proto.Id;
import com.google.common.collect.Lists;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.lang.reflect.Type;
import java.util.ArrayList;
import java.util.Properties;

/**
 * SerDe to serialize Dataset Objects. It MUST implement the deprecated SerDe interface instead of extending the
 * abstract SerDe class, otherwise we get ClassNotFound exceptions on cdh4.x.
 */
public class DatasetSerDe implements SerDe {
  private static final Logger LOG = LoggerFactory.getLogger(DatasetSerDe.class);
  private static final SchemaGenerator schemaGenerator = new ReflectionSchemaGenerator();

  private ObjectInspector objectInspector;
  private ObjectDeserializer deserializer;
  private ObjectSerializer serializer;
  private Schema schema;

  @Override
  public void initialize(Configuration conf, Properties properties) throws SerDeException {
    // The column names are saved because the inspector given to #serialize doesn't preserve them,
    // maybe because it's an external table.
    // The 'columns' property comes from the Hive metastore, which gets it from the create table statement.
    // It is therefore important that this schema be accurate and in the right order - the same order in which
    // the object inspectors will reflect them.
    String datasetName = properties.getProperty(Constants.Explore.DATASET_NAME);
    String namespace = properties.getProperty(Constants.Explore.DATASET_NAMESPACE);

    // No namespace SHOULD be an exception, but Hive calls initialize in several places, one of which is
    // when you try to drop a table.
    // When upgrading from CDAP 2.6, old tables will not have the namespace as a serde property, so to avoid
    // a NullPointerException that prevents dropping a table, we handle the null namespace case here.
    if (namespace == null) {
      // We still need an ObjectInspector, as Hive uses it to check what columns the table has.
      this.objectInspector = new ObjectDeserializer(properties, null).getInspector();
      return;
    }

    if (datasetName == null || datasetName.isEmpty()) {
      throw new SerDeException("Dataset name not found in serde properties.");
    }

    // Hive may call initialize multiple times, so remember the schema to avoid instantiating the dataset
    // more than once.
    if (schema == null) {
      Id.DatasetInstance datasetId = Id.DatasetInstance.from(namespace, datasetName);
      getDatasetSchema(conf, datasetId);
    }

    this.deserializer = new ObjectDeserializer(properties, schema);
    ArrayList<String> columnNames = Lists.newArrayList(StringUtils.split(properties.getProperty("columns"), ","));
    this.serializer = new ObjectSerializer(columnNames);
    this.objectInspector = deserializer.getInspector();
  }
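
  // For orientation, a hedged sketch (not taken from this codebase) of the kind of table definition that
  // produces the serde properties read by initialize() above. The table and column names are illustrative;
  // the dataset coordinates travel as serde properties under whatever keys Constants.Explore.DATASET_NAME
  // and Constants.Explore.DATASET_NAMESPACE resolve to, and the 'columns' property is filled in by the
  // Hive metastore from the create table statement:
  //
  //   CREATE EXTERNAL TABLE my_dataset_table (id bigint, name string)
  //   ROW FORMAT SERDE 'co.cask.cdap.hive.datasets.DatasetSerDe'
  //   WITH SERDEPROPERTIES (...)   -- dataset name and namespace carried here
  //   ...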

  private void getDatasetSchema(Configuration conf, Id.DatasetInstance datasetId) throws SerDeException {
    try (ContextManager.Context hiveContext = ContextManager.getContext(conf)) {
      // apparently the conf can be null in some versions of Hive,
      // because it calls initialize just to get the object inspector
      if (hiveContext == null) {
        LOG.info("Hive provided a null conf, will not be able to get dataset schema.");
        return;
      }

      // some datasets like Table and ObjectMappedTable have schema in the dataset properties
      try {
        DatasetSpecification datasetSpec = hiveContext.getDatasetSpec(datasetId);
        String schemaStr = datasetSpec.getProperty("schema");
        if (schemaStr != null) {
          schema = Schema.parseJson(schemaStr);
          return;
        }
      } catch (DatasetManagementException | ServiceUnavailableException e) {
        throw new SerDeException("Could not instantiate dataset " + datasetId, e);
      } catch (IOException e) {
        throw new SerDeException("Exception getting schema for dataset " + datasetId, e);
      }

      // other datasets must be instantiated to get their schema
      // conf is null if this is a query that writes to a dataset
      ClassLoader parentClassLoader = conf == null ? null : conf.getClassLoader();
      try (SystemDatasetInstantiator datasetInstantiator = hiveContext.createDatasetInstantiator(parentClassLoader)) {
        Dataset dataset = datasetInstantiator.getDataset(datasetId);
        if (dataset == null) {
          throw new SerDeException("Could not find dataset " + datasetId);
        }
        Type recordType;
        if (dataset instanceof RecordScannable) {
          recordType = ((RecordScannable) dataset).getRecordType();
        } else if (dataset instanceof RecordWritable) {
          recordType = ((RecordWritable) dataset).getRecordType();
        } else {
          throw new SerDeException("Dataset " + datasetId + " is not explorable.");
        }
        schema = schemaGenerator.generate(recordType);
      } catch (UnsupportedTypeException e) {
        throw new SerDeException("Dataset " + datasetId + " has an unsupported schema.", e);
      } catch (IOException e) {
        throw new SerDeException("Exception while trying to instantiate dataset " + datasetId, e);
      }
    } catch (IOException e) {
      throw new SerDeException("Could not get hive context from configuration.", e);
    }
  }
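
  // A minimal sketch of the "schema in dataset properties" fast path above, with hypothetical values:
  // Table-like datasets store an Avro-style schema JSON that Schema.parseJson() understands, e.g.
  //
  //   String schemaStr = "{\"type\":\"record\",\"name\":\"rec\",\"fields\":["
  //     + "{\"name\":\"id\",\"type\":\"long\"},{\"name\":\"name\",\"type\":\"string\"}]}";
  //   Schema schema = Schema.parseJson(schemaStr);  // throws IOException on malformed JSON
  //
  // Only when no such property exists does the code fall back to instantiating the dataset and
  // generating a schema from its record type via reflection.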

  @Override
  public Class<? extends Writable> getSerializedClass() {
    return Text.class;
  }

  @Override
  public Writable serialize(Object o, ObjectInspector objectInspector) throws SerDeException {
    // NOTE: the object inspector here is not one that we build. It's a default one that Hive built,
    // that contains generic names for columns. The object is a list of objects, each element
    // representing one attribute of the Record type.
    // The object and the objectInspector represent one row of a query result to write into a dataset.
    // Therefore, it is not guaranteed that the object exactly matches the schema of the dataset
    // we want to write into.
    if (!(objectInspector instanceof StructObjectInspector)) {
      throw new SerDeException("Trying to serialize with unknown object inspector type "
                                 + objectInspector.getClass().getName() + ". Expected StructObjectInspector.");
    }
    return serializer.serialize(o, objectInspector);
  }

  @Override
  public SerDeStats getSerDeStats() {
    // TODO: add real Dataset stats - CDAP-12
    return new SerDeStats();
  }

  @Override
  public Object deserialize(Writable writable) throws SerDeException {
    ObjectWritable objectWritable = (ObjectWritable) writable;
    Object obj = objectWritable.get();
    try {
      return deserializer.deserialize(obj);
    } catch (Throwable t) {
      LOG.error("Unable to deserialize object {}.", obj, t);
      throw new SerDeException("Unable to deserialize an object.", t);
    }
  }

  @Override
  public ObjectInspector getObjectInspector() throws SerDeException {
    return objectInspector;
  }
}
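
// A rough, hypothetical sketch of the lifecycle Hive drives against this SerDe, for orientation only
// (names and values below are made up; in practice Hive supplies the Configuration and Properties):
//
//   DatasetSerDe serDe = new DatasetSerDe();
//   Properties props = new Properties();                     // normally populated by the Hive metastore
//   props.setProperty(Constants.Explore.DATASET_NAME, "purchases");
//   props.setProperty(Constants.Explore.DATASET_NAMESPACE, "default");
//   props.setProperty("columns", "id,name");
//   serDe.initialize(conf, props);                           // resolves and caches the dataset schema
//   ObjectInspector inspector = serDe.getObjectInspector();  // used by Hive to inspect the table's columns
//   Object row = serDe.deserialize(someObjectWritable);      // read path: ObjectWritable -> row object
//   Writable out = serDe.serialize(row, inspector);          // write path: row object -> Text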