/* * Copyright © 2014-2016 Cask Data, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package co.cask.cdap.data2.dataset2; import co.cask.cdap.api.dataset.Dataset; import co.cask.cdap.api.dataset.DatasetAdmin; import co.cask.cdap.api.dataset.DatasetManagementException; import co.cask.cdap.api.dataset.DatasetProperties; import co.cask.cdap.api.dataset.DatasetSpecification; import co.cask.cdap.api.dataset.InstanceConflictException; import co.cask.cdap.api.dataset.InstanceNotFoundException; import co.cask.cdap.api.dataset.module.DatasetModule; import co.cask.cdap.common.ServiceUnavailableException; import co.cask.cdap.data2.datafabric.dataset.type.DatasetClassLoaderProvider; import co.cask.cdap.data2.metadata.lineage.AccessType; import co.cask.cdap.proto.DatasetSpecificationSummary; import co.cask.cdap.proto.Id; import com.google.common.annotations.VisibleForTesting; import org.apache.twill.filesystem.Location; import java.io.IOException; import java.util.Collection; import java.util.Map; import javax.annotation.Nullable; /** * Provides access to the Datasets System. * * Typical usage example: * <tt> * DatasetFramework datasetFramework = ...; * datasetFramework.addModule("myDatasets", MyDatasetModule.class); * datasetFramework.addInstance("myTable", "table", DatasetProperties.EMPTY); * TableAdmin admin = datasetFramework.getAdmin("myTable"); * admin.create(); * Table table = datasetFramework.getDataset("myTable"); * try { * table.write("key", "value"); * } finally { * table.close(); * } * </tt> */ // todo: use dataset instead of dataset instance in namings public interface DatasetFramework { /** * Adds dataset types by adding dataset module to the system. Calling this method to add {@link DatasetModule} may * result in tracing class dependencies if the {@link DatasetModule} is not a system dataset, which can takes * couple seconds for the tracing. If the jar {@link Location} containing the {@link DatasetModule} is known, it's * better to call {@link #addModule(Id.DatasetModule, DatasetModule, Location)} instead. * * @param moduleId dataset module id * @param module dataset module * @throws ModuleConflictException when module with same name is already registered or this module registers a type * with a same name as one of the already registered by another module types * @throws DatasetManagementException in case of problems * @throws ServiceUnavailableException when the dataset service is not running */ void addModule(Id.DatasetModule moduleId, DatasetModule module) throws DatasetManagementException; /** * Adds dataset types by adding dataset module to the system with a jar location containing all dataset classes * needed by the module. * * @param moduleId dataset module id * @param module dataset module * @param jarLocation location of the jar file that contains the dataset classes needed by the module * @throws ModuleConflictException when module with same name is already registered or this module registers a type * with a same name as one of the already registered by another module types * @throws DatasetManagementException in case of problems * @throws ServiceUnavailableException when the dataset service is not running */ void addModule(Id.DatasetModule moduleId, DatasetModule module, Location jarLocation) throws DatasetManagementException; /** * Deletes dataset module and its types from the system. * * @param moduleId dataset module id * @throws ModuleConflictException when module cannot be deleted because of its dependant modules or instances * @throws DatasetManagementException * @throws ServiceUnavailableException when the dataset service is not running */ void deleteModule(Id.DatasetModule moduleId) throws DatasetManagementException; /** * Deletes dataset modules and its types in the specified namespace. * * @param namespaceId the {@link Id.Namespace} to delete all modules from. * @throws ModuleConflictException when some of modules can't be deleted because of its dependant modules or instances * @throws DatasetManagementException * @throws ServiceUnavailableException when the dataset service is not running */ void deleteAllModules(Id.Namespace namespaceId) throws DatasetManagementException; /** * Adds information about dataset instance to the system. * * This uses * {@link co.cask.cdap.api.dataset.DatasetDefinition#configure(String, DatasetProperties)} * method to build {@link co.cask.cdap.api.dataset.DatasetSpecification} which describes dataset instance * and later used to initialize {@link DatasetAdmin} and {@link Dataset} for the dataset instance. * * @param datasetTypeName dataset instance type name * @param datasetInstanceId dataset instance name * @param props dataset instance properties * @throws InstanceConflictException if dataset instance with this name already exists * @throws IOException when creation of dataset instance using its admin fails * @throws DatasetManagementException * @throws ServiceUnavailableException when the dataset service is not running */ void addInstance(String datasetTypeName, Id.DatasetInstance datasetInstanceId, DatasetProperties props) throws DatasetManagementException, IOException; /** * Updates the existing dataset instance in the system. * * This uses * {@link co.cask.cdap.api.dataset.DatasetDefinition#configure(String, DatasetProperties)} * method to build {@link co.cask.cdap.api.dataset.DatasetSpecification} with new properties, * which describes dataset instance and {@link DatasetAdmin} is used to upgrade * {@link Dataset} for the dataset instance. * @param datasetInstanceId dataset instance name * @param props dataset instance properties * @throws IOException when creation of dataset instance using its admin fails * @throws DatasetManagementException * @throws ServiceUnavailableException when the dataset service is not running */ void updateInstance(Id.DatasetInstance datasetInstanceId, DatasetProperties props) throws DatasetManagementException, IOException; /** * Get all dataset instances in the specified namespace * * @param namespaceId the specified namespace id * @return a collection of {@link DatasetSpecification}s for all datasets in the specified namespace */ Collection<DatasetSpecificationSummary> getInstances(Id.Namespace namespaceId) throws DatasetManagementException; /** * Gets the {@link DatasetSpecification} for the specified dataset instance id * * @param datasetInstanceId the {@link Id.DatasetInstance} for which the {@link DatasetSpecification} is desired * @return {@link DatasetSpecification} of the dataset or {@code null} if dataset not not exist */ @Nullable DatasetSpecification getDatasetSpec(Id.DatasetInstance datasetInstanceId) throws DatasetManagementException; /** * @param datasetInstanceId the {@link Id.DatasetInstance} to check for existence * @return true if instance exists, false otherwise * @throws DatasetManagementException * @throws ServiceUnavailableException when the dataset service is not running */ boolean hasInstance(Id.DatasetInstance datasetInstanceId) throws DatasetManagementException; /** * Checks if the specified type exists in the 'system' namespace * * @return true if type exists in the 'system' namespace, false otherwise * @throws DatasetManagementException * @throws ServiceUnavailableException when the dataset service is not running */ boolean hasSystemType(String typeName) throws DatasetManagementException; /** * Checks if the specified type exists in the specified namespace * * @return true if type exists in the specified namespace, false otherwise * @throws DatasetManagementException * @throws ServiceUnavailableException when the dataset service is not running */ @VisibleForTesting boolean hasType(Id.DatasetType datasetTypeId) throws DatasetManagementException; /** * Truncates a dataset instance. * * @param datasetInstanceId dataset instance name * @throws InstanceNotFoundException if dataset instance does not exist * @throws IOException when truncation of dataset instance using its admin fails * @throws DatasetManagementException * @throws ServiceUnavailableException when the dataset service is not running */ void truncateInstance(Id.DatasetInstance datasetInstanceId) throws DatasetManagementException, IOException; /** * Deletes dataset instance from the system. * * @param datasetInstanceId dataset instance name * @throws InstanceConflictException if dataset instance cannot be deleted because of its dependencies * @throws InstanceNotFoundException if dataset instance does not exist * @throws IOException when deletion of dataset instance using its admin fails * @throws DatasetManagementException * @throws ServiceUnavailableException when the dataset service is not running */ void deleteInstance(Id.DatasetInstance datasetInstanceId) throws DatasetManagementException, IOException; /** * Deletes all dataset instances in the specified namespace. * * @param namespaceId the specified namespace id * @throws IOException when deletion of dataset instance using its admin fails * @throws DatasetManagementException * @throws ServiceUnavailableException when the dataset service is not running */ void deleteAllInstances(Id.Namespace namespaceId) throws DatasetManagementException, IOException; /** * Gets dataset instance admin to be used to perform administrative operations. The given classloader must * be able to load all classes needed to instantiate the dataset admin. This means if the system classloader is * used, only system dataset admins can fetched. * * @param <T> dataset admin type * @param datasetInstanceId dataset instance name * @param classLoader classLoader to be used to load classes or {@code null} to use system classLoader * @return instance of dataset admin or {@code null} if dataset instance of this name doesn't exist. * @throws DatasetManagementException when there's trouble getting dataset meta info * @throws IOException when there's trouble to instantiate {@link DatasetAdmin} * @throws ServiceUnavailableException when the dataset service is not running */ @Nullable <T extends DatasetAdmin> T getAdmin(Id.DatasetInstance datasetInstanceId, @Nullable ClassLoader classLoader) throws DatasetManagementException, IOException; /** * Gets dataset instance admin to be used to perform administrative operations. The class loader provider * is used get classloaders for any dataset modules used by the specified dataset admin. This is because * the classloader(s) for a dataset admin may create some resources that need to be cleaned up on close. * * @param <T> dataset admin type * @param datasetInstanceId dataset instance name * @param classLoader parent classLoader to be used to load classes or {@code null} to use system classLoader * @param classLoaderProvider provider to get classloaders for different dataset modules * @return instance of dataset admin or {@code null} if dataset instance of this name doesn't exist. * @throws DatasetManagementException when there's trouble getting dataset meta info * @throws IOException when there's trouble to instantiate {@link DatasetAdmin} * @throws ServiceUnavailableException when the dataset service is not running */ @Nullable <T extends DatasetAdmin> T getAdmin(Id.DatasetInstance datasetInstanceId, @Nullable ClassLoader classLoader, DatasetClassLoaderProvider classLoaderProvider) throws DatasetManagementException, IOException; /** * Gets dataset to be used to perform data operations. * * @param <T> dataset type to be returned * @param datasetInstanceId dataset instance id * @param arguments runtime arguments for the dataset instance * @param classLoader classLoader to be used to load classes or {@code null} to use system classLoader * @param owners owners of the dataset * @return instance of dataset or {@code null} if dataset instance of this name doesn't exist. * @throws DatasetManagementException when there's trouble getting dataset meta info * @throws IOException when there's trouble to instantiate {@link co.cask.cdap.api.dataset.Dataset} * @throws ServiceUnavailableException when the dataset service is not running */ @Nullable <T extends Dataset> T getDataset(Id.DatasetInstance datasetInstanceId, @Nullable Map<String, String> arguments, @Nullable ClassLoader classLoader, @Nullable Iterable<? extends Id> owners) throws DatasetManagementException, IOException; /** * Gets dataset to be used to perform data operations. * * @param <T> dataset type to be returned * @param datasetInstanceId dataset instance id * @param arguments runtime arguments for the dataset instance * @param classLoader classLoader to be used to load classes or {@code null} to use system classLoader * @return instance of dataset or {@code null} if dataset instance of this name doesn't exist. * @throws DatasetManagementException when there's trouble getting dataset meta info * @throws IOException when there's trouble to instantiate {@link co.cask.cdap.api.dataset.Dataset} * @throws ServiceUnavailableException when the dataset service is not running */ @Nullable <T extends Dataset> T getDataset(Id.DatasetInstance datasetInstanceId, @Nullable Map<String, String> arguments, @Nullable ClassLoader classLoader) throws DatasetManagementException, IOException; /** * Gets dataset to be used to perform data operations. This one is used when the classloader(s) for a dataset may * create some resources that need to be cleaned up on close. * * @param <T> dataset type to be returned * @param datasetInstanceId dataset instance id * @param arguments runtime arguments for the dataset instance * @param classLoader parent classLoader to be used to load classes or {@code null} to use system classLoader * @param classLoaderProvider provider to get classloaders for different dataset modules * @param owners owners of the dataset * @return instance of dataset or {@code null} if dataset instance of this name doesn't exist. * @throws DatasetManagementException when there's trouble getting dataset meta info * @throws IOException when there's trouble to instantiate {@link co.cask.cdap.api.dataset.Dataset} * @throws ServiceUnavailableException when the dataset service is not running */ @Nullable <T extends Dataset> T getDataset(Id.DatasetInstance datasetInstanceId, @Nullable Map<String, String> arguments, @Nullable ClassLoader classLoader, DatasetClassLoaderProvider classLoaderProvider, @Nullable Iterable<? extends Id> owners) throws DatasetManagementException, IOException; /** * Gets dataset to be used to perform data operations. This one is used when the classloader(s) for a dataset may * create some resources that need to be cleaned up on close, and an access type is specified. * * @param <T> dataset type to be returned * @param datasetInstanceId dataset instance id * @param arguments runtime arguments for the dataset instance * @param classLoader parent classLoader to be used to load classes or {@code null} to use system classLoader * @param classLoaderProvider provider to get classloaders for different dataset modules * @param owners owners of the dataset * @param accessType accessType for this request * @return instance of dataset or {@code null} if dataset instance of this name doesn't exist. * @throws DatasetManagementException when there's trouble getting dataset meta info * @throws IOException when there's trouble to instantiate {@link co.cask.cdap.api.dataset.Dataset} * @throws ServiceUnavailableException when the dataset service is not running */ @Nullable <T extends Dataset> T getDataset(Id.DatasetInstance datasetInstanceId, @Nullable Map<String, String> arguments, @Nullable ClassLoader classLoader, DatasetClassLoaderProvider classLoaderProvider, @Nullable Iterable<? extends Id> owners, AccessType accessType) throws DatasetManagementException, IOException; /** * Write lineage for a particular dataset instance. * * @param datasetInstanceId dataset instance id * @param accessType accessType to be recorded */ void writeLineage(Id.DatasetInstance datasetInstanceId, AccessType accessType); /** * Creates a namespace in the Storage Providers - HBase/LevelDB, Hive and HDFS/Local File System. * * @param namespaceId the {@link Id.Namespace} to create */ void createNamespace(Id.Namespace namespaceId) throws DatasetManagementException; /** * Deletes a namespace in the Storage Providers - HBase/LevelDB, Hive and HDFS/Local File System. * * @param namespaceId the {@link Id.Namespace} to create */ void deleteNamespace(Id.Namespace namespaceId) throws DatasetManagementException; }