/*
* Copyright © 2015-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.explore.service;

import co.cask.cdap.api.data.batch.RecordScannable;
import co.cask.cdap.api.data.batch.RecordWritable;
import co.cask.cdap.api.data.format.FormatSpecification;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.data.schema.UnsupportedTypeException;
import co.cask.cdap.api.dataset.Dataset;
import co.cask.cdap.api.dataset.DatasetProperties;
import co.cask.cdap.api.dataset.DatasetSpecification;
import co.cask.cdap.api.dataset.lib.FileSet;
import co.cask.cdap.api.dataset.lib.FileSetProperties;
import co.cask.cdap.api.dataset.lib.ObjectMappedTable;
import co.cask.cdap.api.dataset.lib.PartitionDetail;
import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.Partitioning;
import co.cask.cdap.api.dataset.table.Table;
import co.cask.cdap.common.DatasetNotFoundException;
import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.data.dataset.SystemDatasetInstantiator;
import co.cask.cdap.data.dataset.SystemDatasetInstantiatorFactory;
import co.cask.cdap.data2.dataset2.lib.table.ObjectMappedTableModule;
import co.cask.cdap.explore.table.CreateStatementBuilder;
import co.cask.cdap.explore.utils.ExploreTableNaming;
import co.cask.cdap.hive.datasets.DatasetStorageHandler;
import co.cask.cdap.hive.objectinspector.ObjectInspectorFactory;
import co.cask.cdap.hive.stream.StreamStorageHandler;
import co.cask.cdap.internal.io.ReflectionSchemaGenerator;
import co.cask.cdap.internal.io.SchemaTypeAdapter;
import co.cask.cdap.proto.Id;
import co.cask.cdap.proto.QueryHandle;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.inject.Inject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.twill.filesystem.Location;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.reflect.Type;
import java.sql.SQLException;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Executes enabling and disabling of exploration for datasets and streams, as well as adding and
 * dropping of partitions.
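 *
 * <p>Each operation submits a Hive DDL statement through the {@link ExploreService} and returns a
 * {@link QueryHandle} that can be used to track the status of the statement.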
*/
public class ExploreTableManager {
  private static final Logger LOG = LoggerFactory.getLogger(ExploreTableManager.class);

  // A Gson object that knows how to serialize the Schema type.
  private static final Gson GSON = new GsonBuilder()
    .registerTypeAdapter(Schema.class, new SchemaTypeAdapter())
    .create();

  private final ExploreService exploreService;
  private final SystemDatasetInstantiatorFactory datasetInstantiatorFactory;
  private final ExploreTableNaming tableNaming;
@Inject
public ExploreTableManager(ExploreService exploreService,
SystemDatasetInstantiatorFactory datasetInstantiatorFactory,
ExploreTableNaming tableNaming) {
this.exploreService = exploreService;
this.datasetInstantiatorFactory = datasetInstantiatorFactory;
this.tableNaming = tableNaming;
  }

/**
* Enable exploration on a stream by creating a corresponding Hive table. Enabling exploration on a
* stream that has already been enabled is a no-op. Assumes the stream actually exists.
*
* @param tableName name of the Hive table to create
* @param streamID the ID of the stream
* @param formatSpec the format specification for the table
* @return query handle for creating the Hive table for the stream
* @throws UnsupportedTypeException if the stream schema is not compatible with Hive
* @throws ExploreException if there was an exception submitting the create table statement
* @throws SQLException if there was a problem with the create table statement
*/
public QueryHandle enableStream(String tableName, Id.Stream streamID, FormatSpecification formatSpec)
throws UnsupportedTypeException, ExploreException, SQLException {
String streamName = streamID.getId();
LOG.debug("Enabling explore for stream {} with table {}", streamID, tableName);
    // The schema of a stream is always the timestamp, the headers, and then the schema of the body.
List<Schema.Field> fields = Lists.newArrayList(
Schema.Field.of("ts", Schema.of(Schema.Type.LONG)),
Schema.Field.of("headers", Schema.mapOf(Schema.of(Schema.Type.STRING), Schema.of(Schema.Type.STRING))));
if (formatSpec.getSchema() != null) {
fields.addAll(formatSpec.getSchema().getFields());
}
Schema schema = Schema.recordOf("streamEvent", fields);
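    // These properties tell the stream storage handler which stream to read and how to interpret its events.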
Map<String, String> serdeProperties = ImmutableMap.of(
Constants.Explore.STREAM_NAME, streamName,
Constants.Explore.STREAM_NAMESPACE, streamID.getNamespaceId(),
Constants.Explore.FORMAT_SPEC, GSON.toJson(formatSpec));
String createStatement = new CreateStatementBuilder(streamName, tableName)
.setSchema(schema)
.setTableComment("CDAP Stream")
.buildWithStorageHandler(StreamStorageHandler.class.getName(), serdeProperties);
LOG.debug("Running create statement for stream {} with table {}: {}", streamName, tableName, createStatement);
return exploreService.execute(streamID.getNamespace(), createStatement);
  }

/**
* Disable exploration on the given stream by dropping the Hive table for the stream.
*
* @param tableName name of the table to delete
* @param streamID the ID of the stream to disable
* @return the query handle for disabling the stream
* @throws ExploreException if there was an exception dropping the table
* @throws SQLException if there was a problem with the drop table statement
*/
public QueryHandle disableStream(String tableName, Id.Stream streamID) throws ExploreException, SQLException {
LOG.debug("Disabling explore for stream {} with table {}", streamID, tableName);
String deleteStatement = generateDeleteTableStatement(tableName);
return exploreService.execute(streamID.getNamespace(), deleteStatement);
  }

/**
* Enable ad-hoc exploration on the given dataset by creating a corresponding Hive table. If exploration has
* already been enabled on the dataset, this will be a no-op. Assumes the dataset actually exists.
*
* @param datasetId the ID of the dataset to enable
* @param spec the specification for the dataset to enable
* @return query handle for creating the Hive table for the dataset
* @throws IllegalArgumentException if some required dataset property like schema is not set
* @throws UnsupportedTypeException if the schema of the dataset is not compatible with Hive
* @throws ExploreException if there was an exception submitting the create table statement
* @throws SQLException if there was a problem with the create table statement
* @throws DatasetNotFoundException if the dataset had to be instantiated, but could not be found
   * @throws ClassNotFoundException if there was a missing class when instantiating the dataset
*/
public QueryHandle enableDataset(Id.DatasetInstance datasetId, DatasetSpecification spec)
throws IllegalArgumentException, ExploreException, SQLException,
UnsupportedTypeException, DatasetNotFoundException, ClassNotFoundException {
String datasetName = datasetId.getId();
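    // These properties tell the dataset storage handler which dataset the Hive table is backed by.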
Map<String, String> serdeProperties = ImmutableMap.of(
Constants.Explore.DATASET_NAME, datasetName,
Constants.Explore.DATASET_NAMESPACE, datasetId.getNamespaceId());
String createStatement = null;
// explore should only have logic related to exploration and not dataset logic.
// TODO: refactor exploration (CDAP-1573)
try (SystemDatasetInstantiator datasetInstantiator = datasetInstantiatorFactory.create()) {
Dataset dataset = datasetInstantiator.getDataset(datasetId);
if (dataset == null) {
throw new DatasetNotFoundException(datasetId);
}
// CDAP-1573: all these instanceofs are a sign that this logic really belongs in each dataset instead of here
// To be enabled for explore, a dataset must either be RecordScannable/Writable,
      // or it must be a FileSet or a PartitionedFileSet with explore enabled in its properties.
if (dataset instanceof Table) {
        // It is valid for a table not to have a schema property; this logic should really be in Table.
return createFromSchemaProperty(spec, datasetId, serdeProperties, false);
}
if (dataset instanceof ObjectMappedTable) {
return createFromSchemaProperty(spec, datasetId, serdeProperties, true);
}
boolean isRecordScannable = dataset instanceof RecordScannable;
boolean isRecordWritable = dataset instanceof RecordWritable;
if (isRecordScannable || isRecordWritable) {
Type recordType = isRecordScannable ?
((RecordScannable) dataset).getRecordType() : ((RecordWritable) dataset).getRecordType();
// if the type is a structured record, use the schema property to create the table
        // Use == because we want exactly the StructuredRecord class, not a subclass.
if (StructuredRecord.class == recordType) {
return createFromSchemaProperty(spec, datasetId, serdeProperties, true);
}
// otherwise, derive the schema from the record type
LOG.debug("Enabling explore for dataset instance {}", datasetName);
createStatement = new CreateStatementBuilder(datasetName, tableNaming.getTableName(datasetId))
.setSchema(hiveSchemaFor(recordType))
.setTableComment("CDAP Dataset")
.buildWithStorageHandler(DatasetStorageHandler.class.getName(), serdeProperties);
} else if (dataset instanceof FileSet || dataset instanceof PartitionedFileSet) {
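        // file sets are explorable only if explore is explicitly enabled in their properties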
Map<String, String> properties = spec.getProperties();
if (FileSetProperties.isExploreEnabled(properties)) {
LOG.debug("Enabling explore for dataset instance {}", datasetName);
createStatement = generateFileSetCreateStatement(datasetId, dataset, properties);
}
}
} catch (IOException e) {
LOG.error("Exception instantiating dataset {}.", datasetId, e);
throw new ExploreException("Exception while trying to instantiate dataset " + datasetId);
}
if (createStatement != null) {
return exploreService.execute(datasetId.getNamespace(), createStatement);
} else {
      // if the dataset is not explorable, this is a no-op.
return QueryHandle.NO_OP;
}
}
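
  /**
   * Creates the Hive table for the given dataset, based on the schema property of its specification.
   * If the schema property is not set, this either throws an {@link IllegalArgumentException} or
   * returns a no-op handle, depending on {@code shouldErrorOnMissingSchema}.
   */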
private QueryHandle createFromSchemaProperty(DatasetSpecification spec, Id.DatasetInstance datasetID,
Map<String, String> serdeProperties, boolean shouldErrorOnMissingSchema)
throws ExploreException, SQLException, UnsupportedTypeException {
String schemaStr = spec.getProperty(DatasetProperties.SCHEMA);
    // If there is no schema property, we cannot create the table. Whether that is an error depends on the dataset type.
if (schemaStr == null) {
if (shouldErrorOnMissingSchema) {
throw new IllegalArgumentException(String.format(
"Unable to enable exploration on dataset %s because the %s property is not set.",
datasetID.getId(), DatasetProperties.SCHEMA));
} else {
return QueryHandle.NO_OP;
}
}
try {
Schema schema = Schema.parseJson(schemaStr);
String createStatement = new CreateStatementBuilder(datasetID.getId(), tableNaming.getTableName(datasetID))
.setSchema(schema)
.setTableComment("CDAP Dataset")
.buildWithStorageHandler(DatasetStorageHandler.class.getName(), serdeProperties);
return exploreService.execute(datasetID.getNamespace(), createStatement);
} catch (IOException e) {
// shouldn't happen because datasets are supposed to verify this, but just in case
      throw new IllegalArgumentException("Unable to parse schema for dataset " + datasetID, e);
}
  }

/**
* Disable exploration on the given dataset by dropping the Hive table for the dataset.
*
* @param datasetID the ID of the dataset to disable
* @param spec the specification for the dataset to disable
* @return the query handle for disabling the dataset
* @throws ExploreException if there was an exception dropping the table
* @throws SQLException if there was a problem with the drop table statement
* @throws DatasetNotFoundException if the dataset had to be instantiated, but could not be found
   * @throws ClassNotFoundException if there was a missing class when instantiating the dataset
*/
public QueryHandle disableDataset(Id.DatasetInstance datasetID, DatasetSpecification spec)
throws ExploreException, SQLException, DatasetNotFoundException, ClassNotFoundException {
LOG.debug("Disabling explore for dataset instance {}", datasetID);
String tableName = tableNaming.getTableName(datasetID);
    // If the table does not exist, there is nothing to do.
try {
exploreService.getTableInfo(datasetID.getNamespaceId(), tableName);
} catch (TableNotFoundException e) {
      // Ignore the exception, since it just means the table was not found.
return QueryHandle.NO_OP;
}
String deleteStatement = null;
String datasetType = spec.getType();
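    // ObjectMappedTables can be dropped by table name alone, without having to instantiate the dataset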
if (ObjectMappedTableModule.FULL_NAME.equals(datasetType) ||
ObjectMappedTableModule.SHORT_NAME.equals(datasetType)) {
deleteStatement = generateDeleteTableStatement(tableName);
LOG.debug("Running delete statement for dataset {} - {}", datasetID, deleteStatement);
return exploreService.execute(datasetID.getNamespace(), deleteStatement);
}
try (SystemDatasetInstantiator datasetInstantiator = datasetInstantiatorFactory.create()) {
Dataset dataset = datasetInstantiator.getDataset(datasetID);
if (dataset == null) {
throw new DatasetNotFoundException(datasetID);
}
if (dataset instanceof RecordScannable || dataset instanceof RecordWritable) {
deleteStatement = generateDeleteTableStatement(tableName);
} else if (dataset instanceof FileSet || dataset instanceof PartitionedFileSet) {
Map<String, String> properties = spec.getProperties();
if (FileSetProperties.isExploreEnabled(properties)) {
deleteStatement = generateDeleteTableStatement(tableName);
}
}
} catch (IOException e) {
LOG.error("Exception creating dataset classLoaderProvider for dataset {}.", datasetID, e);
throw new ExploreException("Exception instantiating dataset " + datasetID);
}
if (deleteStatement != null) {
LOG.debug("Running delete statement for dataset {} - {}", datasetID, deleteStatement);
return exploreService.execute(datasetID.getNamespace(), deleteStatement);
} else {
return QueryHandle.NO_OP;
}
  }

/**
* Adds a partition to the Hive table for the given dataset.
*
* @param datasetID the ID of the dataset to add a partition to
* @param partitionKey the partition key to add
* @param fsPath the path of the partition
   * @return the query handle for adding the partition to the dataset
* @throws ExploreException if there was an exception adding the partition
* @throws SQLException if there was a problem with the add partition statement
*/
public QueryHandle addPartition(Id.DatasetInstance datasetID, PartitionKey partitionKey, String fsPath)
throws ExploreException, SQLException {
String addPartitionStatement = String.format(
"ALTER TABLE %s ADD PARTITION %s LOCATION '%s'",
tableNaming.getTableName(datasetID), generateHivePartitionKey(partitionKey), fsPath);
LOG.debug("Add partition for key {} dataset {} - {}", partitionKey, datasetID, addPartitionStatement);
return exploreService.execute(datasetID.getNamespace(), addPartitionStatement);
  }

/**
* Adds multiple partitions to the Hive table for the given dataset.
*
* @param datasetID the ID of the dataset to add partitions to
* @param partitionDetails a map of partition key to partition path
* @return the query handle for adding partitions to the dataset
   * @throws ExploreException if there was an exception adding the partitions
   * @throws SQLException if there was a problem with the add partitions statement
*/
public QueryHandle addPartitions(Id.DatasetInstance datasetID,
Set<PartitionDetail> partitionDetails) throws ExploreException, SQLException {
if (partitionDetails.isEmpty()) {
return QueryHandle.NO_OP;
}
StringBuilder statement = new StringBuilder()
.append("ALTER TABLE ")
.append(tableNaming.getTableName(datasetID))
.append(" ADD");
for (PartitionDetail partitionDetail : partitionDetails) {
statement.append(" PARTITION")
.append(generateHivePartitionKey(partitionDetail.getPartitionKey()))
.append(" LOCATION '")
.append(partitionDetail.getRelativePath())
.append("'");
}
LOG.debug("Adding partitions for dataset {}", datasetID);
return exploreService.execute(datasetID.getNamespace(), statement.toString());
  }

/**
* Drop a partition from the Hive table for the given dataset.
*
* @param datasetID the ID of the dataset to drop the partition from
* @param partitionKey the partition key to drop
* @return the query handle for dropping the partition from the dataset
* @throws ExploreException if there was an exception dropping the partition
* @throws SQLException if there was a problem with the drop partition statement
*/
public QueryHandle dropPartition(Id.DatasetInstance datasetID, PartitionKey partitionKey)
throws ExploreException, SQLException {
String dropPartitionStatement = String.format(
"ALTER TABLE %s DROP PARTITION %s",
tableNaming.getTableName(datasetID), generateHivePartitionKey(partitionKey));
LOG.debug("Drop partition for key {} dataset {} - {}", partitionKey, datasetID, dropPartitionStatement);
return exploreService.execute(datasetID.getNamespace(), dropPartitionStatement);
}
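
  /**
   * Generates the create table statement for a {@link FileSet} or {@link PartitionedFileSet}, based
   * on its explore properties: either one of the native formats (text, csv, or parquet) must be
   * given, or else a SerDe together with an input format and an output format.
   *
   * @throws IllegalArgumentException if the explore properties are not a valid combination
   */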
private String generateFileSetCreateStatement(Id.DatasetInstance datasetID, Dataset dataset,
Map<String, String> properties) throws IllegalArgumentException {
String tableName = tableNaming.getTableName(datasetID);
Map<String, String> tableProperties = FileSetProperties.getTableProperties(properties);
Location baseLocation;
Partitioning partitioning = null;
if (dataset instanceof PartitionedFileSet) {
partitioning = ((PartitionedFileSet) dataset).getPartitioning();
baseLocation = ((PartitionedFileSet) dataset).getEmbeddedFileSet().getBaseLocation();
} else {
baseLocation = ((FileSet) dataset).getBaseLocation();
}
CreateStatementBuilder createStatementBuilder = new CreateStatementBuilder(datasetID.getId(), tableName)
.setLocation(baseLocation)
.setPartitioning(partitioning)
.setTableProperties(tableProperties);
String schema = FileSetProperties.getExploreSchema(properties);
String format = FileSetProperties.getExploreFormat(properties);
if (format != null) {
if ("parquet".equals(format)) {
        return createStatementBuilder.setSchema(schema)
          .buildWithFileFormat("parquet");
}
      // text and csv are handled natively as delimited text
Preconditions.checkArgument("text".equals(format) || "csv".equals(format),
"Only text and csv are supported as native formats");
Preconditions.checkNotNull(schema, "for native formats, explore schema must be given in dataset properties");
String delimiter = null;
if ("text".equals(format)) {
delimiter = FileSetProperties.getExploreFormatProperties(properties).get("delimiter");
} else if ("csv".equals(format)) {
delimiter = ",";
}
return createStatementBuilder.setSchema(schema)
.setRowFormatDelimited(delimiter, null)
.buildWithFileFormat("TEXTFILE");
} else {
      // Avro tables do not require an explore schema: they can be created by
      // setting the avro.schema.literal table property instead.
if (schema != null) {
createStatementBuilder.setSchema(schema);
}
// format not given, look for serde, input format, etc.
String serde = FileSetProperties.getSerDe(properties);
String inputFormat = FileSetProperties.getExploreInputFormat(properties);
String outputFormat = FileSetProperties.getExploreOutputFormat(properties);
Preconditions.checkArgument(serde != null && inputFormat != null && outputFormat != null,
"All of SerDe, InputFormat and OutputFormat must be given in dataset properties");
return createStatementBuilder.setRowFormatSerde(serde)
.buildWithFormats(inputFormat, outputFormat);
}
}
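
  /**
   * Generates a {@code DROP TABLE IF EXISTS} statement for the given table name.
   */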
private String generateDeleteTableStatement(String name) {
return String.format("DROP TABLE IF EXISTS %s", tableNaming.cleanTableName(name));
}
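
  /**
   * Translates a {@link PartitionKey} into the partition specification used in Hive DDL statements.
   * For example, a key with the fields {@code year=2012} and {@code month="dec"} is rendered as
   * {@code (year=2012, month='dec')}.
   */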
private String generateHivePartitionKey(PartitionKey key) {
StringBuilder builder = new StringBuilder("(");
String sep = "";
for (Map.Entry<String, ? extends Comparable> entry : key.getFields().entrySet()) {
String fieldName = entry.getKey();
Comparable fieldValue = entry.getValue();
String quote = fieldValue instanceof String ? "'" : "";
builder.append(sep).append(fieldName).append("=").append(quote).append(fieldValue.toString()).append(quote);
sep = ", ";
}
builder.append(")");
return builder.toString();
  }

// TODO: replace with SchemaConverter.toHiveSchema when we tackle queries on Tables.
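  /**
   * Derives the Hive schema for a record type through reflection, as a comma-separated list of
   * column names and Hive types, for example {@code "id int, name string"}.
   */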
private String hiveSchemaFor(Type type) throws UnsupportedTypeException {
// This call will make sure that the type is not recursive
try {
new ReflectionSchemaGenerator().generate(type, false);
} catch (Exception e) {
throw new UnsupportedTypeException("Unable to derive schema from " + type, e);
}
ObjectInspector objectInspector = ObjectInspectorFactory.getReflectionObjectInspector(type);
if (!(objectInspector instanceof StructObjectInspector)) {
      throw new UnsupportedTypeException(String.format("Type must be a RECORD, but is %s", type));
}
StructObjectInspector structObjectInspector = (StructObjectInspector) objectInspector;
StringBuilder sb = new StringBuilder();
boolean first = true;
for (StructField structField : structObjectInspector.getAllStructFieldRefs()) {
if (first) {
first = false;
} else {
sb.append(", ");
}
ObjectInspector oi = structField.getFieldObjectInspector();
      String typeName = oi.getTypeName();
sb.append(structField.getFieldName()).append(" ").append(typeName);
}
return sb.toString();
}
}