/* * Copyright © 2014-2016 Cask Data, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package co.cask.cdap.data2.dataset2; import co.cask.cdap.api.dataset.Dataset; import co.cask.cdap.api.dataset.DatasetAdmin; import co.cask.cdap.api.dataset.DatasetContext; import co.cask.cdap.api.dataset.DatasetDefinition; import co.cask.cdap.api.dataset.DatasetManagementException; import co.cask.cdap.api.dataset.DatasetProperties; import co.cask.cdap.api.dataset.DatasetSpecification; import co.cask.cdap.api.dataset.InstanceConflictException; import co.cask.cdap.api.dataset.InstanceNotFoundException; import co.cask.cdap.api.dataset.module.DatasetDefinitionRegistry; import co.cask.cdap.api.dataset.module.DatasetModule; import co.cask.cdap.common.conf.CConfiguration; import co.cask.cdap.common.conf.Constants; import co.cask.cdap.common.lang.ClassLoaders; import co.cask.cdap.data2.audit.AuditPublisher; import co.cask.cdap.data2.audit.AuditPublishers; import co.cask.cdap.data2.datafabric.dataset.DatasetsUtil; import co.cask.cdap.data2.datafabric.dataset.type.ConstantClassLoaderProvider; import co.cask.cdap.data2.datafabric.dataset.type.DatasetClassLoaderProvider; import co.cask.cdap.data2.dataset2.module.lib.DatasetModules; import co.cask.cdap.data2.metadata.lineage.AccessType; import co.cask.cdap.proto.DatasetSpecificationSummary; import co.cask.cdap.proto.Id; import co.cask.cdap.proto.audit.AuditPayload; import co.cask.cdap.proto.audit.AuditType; import co.cask.cdap.proto.id.NamespaceId; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.base.Supplier; import com.google.common.base.Throwables; import com.google.common.collect.HashBasedTable; import com.google.common.collect.HashMultimap; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.SetMultimap; import com.google.common.collect.Sets; import com.google.common.collect.Table; import com.google.common.collect.Tables; import com.google.inject.Inject; import com.google.inject.name.Named; import org.apache.twill.filesystem.Location; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.Collection; import java.util.HashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import javax.annotation.Nullable; /** * A simple implementation of {@link co.cask.cdap.data2.dataset2.DatasetFramework} that keeps its state in * memory */ @SuppressWarnings("unchecked") public class InMemoryDatasetFramework implements DatasetFramework { private static final Logger LOG = LoggerFactory.getLogger(InMemoryDatasetFramework.class); private final DatasetDefinitionRegistryFactory registryFactory; private final Set<Id.Namespace> namespaces; private final SetMultimap<Id.Namespace, String> nonDefaultTypes; // Id.Namespace is contained in Id.DatasetInstance. But we need to be able to get all instances in a namespace // and delete all instances in a namespace, so we keep it as a separate key private final Table<Id.Namespace, Id.DatasetInstance, DatasetSpecification> instances; private final Table<Id.Namespace, Id.DatasetModule, String> moduleClasses; private final Lock readLock; private final Lock writeLock; private final boolean allowDatasetUncheckedUpgrade; // NOTE: used only for "internal" operations, that doesn't return to client object of custom type // NOTE: for getting dataset/admin objects we construct fresh new one using all modules (no dependency management in // this in-mem implementation for now) and passed client (program) class loader // NOTE: We maintain one DatasetDefinitionRegistry per namespace private final Map<Id.Namespace, DatasetDefinitionRegistry> registries; private AuditPublisher auditPublisher; public InMemoryDatasetFramework(DatasetDefinitionRegistryFactory registryFactory, CConfiguration configuration) { this(registryFactory, new HashMap<String, DatasetModule>(), configuration); } @Inject public InMemoryDatasetFramework(DatasetDefinitionRegistryFactory registryFactory, @Named("defaultDatasetModules") Map<String, DatasetModule> defaultModules, CConfiguration configuration) { this.registryFactory = registryFactory; this.allowDatasetUncheckedUpgrade = configuration.getBoolean(Constants.Dataset.DATASET_UNCHECKED_UPGRADE); this.namespaces = Sets.newHashSet(); this.nonDefaultTypes = HashMultimap.create(); this.instances = HashBasedTable.create(); this.registries = Maps.newHashMap(); // the order in which module classes are inserted is important, // so we use a table where Map<Id.DatasetModule, String> is a LinkedHashMap Map<Id.Namespace, Map<Id.DatasetModule, String>> backingMap = Maps.newHashMap(); this.moduleClasses = Tables.newCustomTable(backingMap, new Supplier<Map<Id.DatasetModule, String>>() { @Override public Map<Id.DatasetModule, String> get() { return Maps.newLinkedHashMap(); } }); // add default dataset modules to system namespace namespaces.add(Id.Namespace.SYSTEM); DatasetDefinitionRegistry systemRegistry = registryFactory.create(); for (Map.Entry<String, DatasetModule> entry : defaultModules.entrySet()) { LOG.debug("Adding Default module {} to system namespace", entry.getKey()); String moduleName = entry.getKey(); DatasetModule module = entry.getValue(); entry.getValue().register(systemRegistry); // keep track of default module classes. These are used when creating registries for other namespaces, // which need to register system classes too. String moduleClassName = DatasetModules.getDatasetModuleClass(module).getName(); Id.DatasetModule moduleId = Id.DatasetModule.from(Id.Namespace.SYSTEM, moduleName); moduleClasses.put(Id.Namespace.SYSTEM, moduleId, moduleClassName); } registries.put(Id.Namespace.SYSTEM, systemRegistry); ReadWriteLock readWriteLock = new ReentrantReadWriteLock(); readLock = readWriteLock.readLock(); writeLock = readWriteLock.writeLock(); } @SuppressWarnings("unused") @Inject(optional = true) public void setAuditPublisher(AuditPublisher auditPublisher) { this.auditPublisher = auditPublisher; } @Override public void addModule(Id.DatasetModule moduleId, DatasetModule module) throws ModuleConflictException { writeLock.lock(); try { if (moduleClasses.contains(moduleId.getNamespace(), moduleId)) { throw new ModuleConflictException(String.format("Cannot add module '%s', it already exists.", moduleId)); } DatasetDefinitionRegistry registry = registries.get(moduleId.getNamespace()); if (registry == null) { registry = registryFactory.create(); registries.put(moduleId.getNamespace(), registry); } TypesTrackingRegistry trackingRegistry = new TypesTrackingRegistry(registry); module.register(trackingRegistry); String moduleClassName = DatasetModules.getDatasetModuleClass(module).getName(); moduleClasses.put(moduleId.getNamespace(), moduleId, moduleClassName); nonDefaultTypes.putAll(moduleId.getNamespace(), trackingRegistry.getTypes()); } finally { writeLock.unlock(); } } @Override public void addModule(Id.DatasetModule moduleId, DatasetModule module, Location jarLocation) throws DatasetManagementException { // Location is never used to create classloader for in memory dataset framework addModule(moduleId, module); } @Override public void deleteModule(Id.DatasetModule moduleId) { // todo: check if existing datasets or modules use this module writeLock.lock(); try { moduleClasses.remove(moduleId.getNamespace(), moduleId); LinkedHashSet<String> availableModuleClasses = getAvailableModuleClasses(moduleId.getNamespace()); // this will cleanup types DatasetDefinitionRegistry registry = createRegistry(availableModuleClasses, registries.getClass().getClassLoader()); registries.put(moduleId.getNamespace(), registry); } finally { writeLock.unlock(); } } @Override public void deleteAllModules(Id.Namespace namespaceId) throws ModuleConflictException { writeLock.lock(); try { // check if there are any datasets that use types from the namespace from which we want to remove all modules Set<String> typesInNamespace = nonDefaultTypes.get(namespaceId); for (DatasetSpecification spec : instances.row(namespaceId).values()) { if (typesInNamespace.contains(spec.getType())) { throw new ModuleConflictException( String.format("Cannot delete all modules in namespace '%s', some datasets use them", namespaceId)); } } moduleClasses.row(namespaceId).clear(); nonDefaultTypes.removeAll(namespaceId); registries.put(namespaceId, registryFactory.create()); } finally { writeLock.unlock(); } } @Override public void addInstance(String datasetType, Id.DatasetInstance datasetInstanceId, DatasetProperties props) throws DatasetManagementException, IOException { writeLock.lock(); try { if (!allowDatasetUncheckedUpgrade && instances.contains(datasetInstanceId.getNamespace(), datasetInstanceId)) { throw new InstanceConflictException(String.format("Dataset instance '%s' already exists.", datasetInstanceId)); } DatasetDefinition def = getDefinitionForType(datasetInstanceId.getNamespace(), datasetType); if (def == null) { throw new DatasetManagementException( String.format("Dataset type '%s' is neither registered in the '%s' namespace nor in the system namespace", datasetType, datasetInstanceId.getNamespaceId())); } DatasetSpecification spec = def.configure(datasetInstanceId.getId(), props); spec = spec.setOriginalProperties(props); if (props.getDescription() != null) { spec = spec.setDescription(props.getDescription()); } def.getAdmin(DatasetContext.from(datasetInstanceId.getNamespaceId()), spec, null).create(); instances.put(datasetInstanceId.getNamespace(), datasetInstanceId, spec); publishAudit(datasetInstanceId, AuditType.CREATE); LOG.info("Created dataset {} of type {}", datasetInstanceId, datasetType); } finally { writeLock.unlock(); } } @Override public void updateInstance(Id.DatasetInstance datasetInstanceId, DatasetProperties props) throws DatasetManagementException, IOException { writeLock.lock(); try { DatasetSpecification oldSpec = instances.get(datasetInstanceId.getNamespace(), datasetInstanceId); if (oldSpec == null) { throw new InstanceNotFoundException(datasetInstanceId.getId()); } DatasetDefinition def = getDefinitionForType(datasetInstanceId.getNamespace(), oldSpec.getType()); if (def == null) { throw new DatasetManagementException( String.format("Dataset type '%s' is neither registered in the '%s' namespace nor in the system namespace", oldSpec.getType(), datasetInstanceId.getNamespaceId())); } DatasetSpecification spec = def.configure(datasetInstanceId.getId(), props); spec = spec.setOriginalProperties(props); if (props.getDescription() != null) { spec = spec.setDescription(props.getDescription()); } instances.put(datasetInstanceId.getNamespace(), datasetInstanceId, spec); def.getAdmin(DatasetContext.from(datasetInstanceId.getNamespaceId()), spec, null).upgrade(); publishAudit(datasetInstanceId, AuditType.UPDATE); } finally { writeLock.unlock(); } } @Override public Collection<DatasetSpecificationSummary> getInstances(Id.Namespace namespaceId) { readLock.lock(); try { // don't expect this to be called a lot. // might be better to maintain this collection separately and just return it, but seems like its not worth it. Collection<DatasetSpecification> specs = instances.row(namespaceId).values(); ImmutableList.Builder<DatasetSpecificationSummary> specSummaries = ImmutableList.builder(); for (DatasetSpecification spec : specs) { specSummaries.add(new DatasetSpecificationSummary(spec.getName(), spec.getType(), spec.getProperties())); } return specSummaries.build(); } finally { readLock.unlock(); } } @Nullable @Override public DatasetSpecification getDatasetSpec(Id.DatasetInstance datasetInstanceId) { readLock.lock(); try { DatasetSpecification spec = instances.get(datasetInstanceId.getNamespace(), datasetInstanceId); return DatasetsUtil.fixOriginalProperties(spec); } finally { readLock.unlock(); } } @Override public boolean hasInstance(Id.DatasetInstance datasetInstanceId) { readLock.lock(); try { return instances.contains(datasetInstanceId.getNamespace(), datasetInstanceId); } finally { readLock.unlock(); } } @Override public boolean hasSystemType(String typeName) { return hasType(Id.DatasetType.from(Id.Namespace.SYSTEM, typeName)); } @VisibleForTesting @Override public boolean hasType(Id.DatasetType datasetTypeId) { return registries.containsKey(datasetTypeId.getNamespace()) && registries.get(datasetTypeId.getNamespace()).hasType(datasetTypeId.getTypeName()); } @Override public void truncateInstance(Id.DatasetInstance instanceId) throws DatasetManagementException, IOException { writeLock.lock(); try { DatasetSpecification spec = instances.get(instanceId.getNamespace(), instanceId); if (spec == null) { throw new InstanceNotFoundException(instanceId.getId()); } DatasetDefinition def = getDefinitionForType(instanceId.getNamespace(), spec.getType()); if (def == null) { throw new DatasetManagementException( String.format("Dataset type '%s' is neither registered in the '%s' namespace nor in the system namespace", spec.getType(), instanceId.getNamespaceId())); } def.getAdmin(DatasetContext.from(instanceId.getNamespaceId()), spec, null).truncate(); publishAudit(instanceId, AuditType.TRUNCATE); } finally { writeLock.unlock(); } } @Override public void deleteInstance(Id.DatasetInstance instanceId) throws DatasetManagementException, IOException { writeLock.lock(); try { DatasetSpecification spec = instances.remove(instanceId.getNamespace(), instanceId); if (spec == null) { throw new InstanceNotFoundException(instanceId.getId()); } DatasetDefinition def = getDefinitionForType(instanceId.getNamespace(), spec.getType()); if (def == null) { throw new DatasetManagementException( String.format("Dataset type '%s' is neither registered in the '%s' namespace nor in the system namespace", spec.getType(), instanceId.getNamespaceId())); } def.getAdmin(DatasetContext.from(instanceId.getNamespaceId()), spec, null).drop(); publishAudit(instanceId, AuditType.DELETE); } finally { writeLock.unlock(); } } @Override public void deleteAllInstances(Id.Namespace namespaceId) throws DatasetManagementException, IOException { writeLock.lock(); try { for (DatasetSpecification spec : instances.row(namespaceId).values()) { DatasetDefinition def = getDefinitionForType(namespaceId, spec.getType()); if (def == null) { throw new DatasetManagementException( String.format("Dataset type '%s' is neither registered in the '%s' namespace nor in the system namespace", spec.getType(), namespaceId)); } def.getAdmin(DatasetContext.from(namespaceId.getId()), spec, null).drop(); publishAudit(Id.DatasetInstance.from(namespaceId, spec.getName()), AuditType.DELETE); } instances.row(namespaceId).clear(); } finally { writeLock.unlock(); } } @Override public <T extends DatasetAdmin> T getAdmin(Id.DatasetInstance datasetInstanceId, @Nullable ClassLoader classLoader) throws IOException { return getAdmin(datasetInstanceId, classLoader, new ConstantClassLoaderProvider(classLoader)); } @Nullable @Override public <T extends DatasetAdmin> T getAdmin(Id.DatasetInstance datasetInstanceId, @Nullable ClassLoader classLoader, DatasetClassLoaderProvider classLoaderProvider) throws IOException { readLock.lock(); try { DatasetSpecification spec = instances.get(datasetInstanceId.getNamespace(), datasetInstanceId); if (spec == null) { return null; } LinkedHashSet<String> availableModuleClasses = getAvailableModuleClasses(datasetInstanceId.getNamespace()); DatasetDefinition impl = createRegistry(availableModuleClasses, classLoader).get(spec.getType()); return (T) impl.getAdmin(DatasetContext.from(datasetInstanceId.getNamespaceId()), spec, classLoader); } finally { readLock.unlock(); } } @Override public <T extends Dataset> T getDataset(Id.DatasetInstance datasetInstanceId, Map<String, String> arguments, @Nullable ClassLoader classLoader, @Nullable Iterable<? extends Id> owners) throws IOException { readLock.lock(); try { DatasetSpecification spec = instances.get(datasetInstanceId.getNamespace(), datasetInstanceId); if (spec == null) { return null; } LinkedHashSet<String> availableModuleClasses = getAvailableModuleClasses(datasetInstanceId.getNamespace()); DatasetDefinition def = createRegistry(availableModuleClasses, classLoader).get(spec.getType()); return (T) (def.getDataset(DatasetContext.from(datasetInstanceId.getNamespaceId()), spec, arguments, classLoader)); } finally { readLock.unlock(); } } @Override public <T extends Dataset> T getDataset(Id.DatasetInstance datasetInstanceId, Map<String, String> arguments, @Nullable ClassLoader classLoader) throws IOException { return getDataset(datasetInstanceId, arguments, classLoader, null); } @Nullable @Override public <T extends Dataset> T getDataset(Id.DatasetInstance datasetInstanceId, @Nullable Map<String, String> arguments, @Nullable ClassLoader parentClassLoader, DatasetClassLoaderProvider classLoaderProvider, @Nullable Iterable<? extends Id> owners) throws IOException { return getDataset(datasetInstanceId, arguments, parentClassLoader, classLoaderProvider, owners, AccessType.UNKNOWN); } @Nullable @Override public <T extends Dataset> T getDataset(Id.DatasetInstance datasetInstanceId, @Nullable Map<String, String> arguments, @Nullable ClassLoader parentClassLoader, DatasetClassLoaderProvider classLoaderProvider, @Nullable Iterable<? extends Id> owners, AccessType accessType) throws IOException { readLock.lock(); try { DatasetSpecification spec = instances.get(datasetInstanceId.getNamespace(), datasetInstanceId); if (spec == null) { return null; } LinkedHashSet<String> availableModuleClasses = getAvailableModuleClasses(datasetInstanceId.getNamespace()); DatasetDefinition def = createRegistry(availableModuleClasses, parentClassLoader).get(spec.getType()); return (T) (def.getDataset(DatasetContext.from(datasetInstanceId.getNamespaceId()), spec, arguments, parentClassLoader)); } finally { readLock.unlock(); } } @Override public void writeLineage(Id.DatasetInstance datasetInstanceId, AccessType accessType) { // no-op. The InMemoryDatasetFramework doesn't need to do anything. // The lineage should be recorded before this point. In fact, this should not even be called because // RemoteDatasetFramework's implementation of this is also a no-op. } @Override public void createNamespace(Id.Namespace namespaceId) throws DatasetManagementException { writeLock.lock(); try { if (!namespaces.add(namespaceId)) { throw new DatasetManagementException(String.format("Namespace %s already exists.", namespaceId.getId())); } } finally { writeLock.unlock(); } } @Override public void deleteNamespace(Id.Namespace namespaceId) throws DatasetManagementException { writeLock.lock(); try { Preconditions.checkArgument(!Id.Namespace.SYSTEM.equals(namespaceId), "Cannot delete system namespace."); if (!namespaces.remove(namespaceId)) { throw new DatasetManagementException(String.format("Namespace %s does not exist", namespaceId.getId())); } instances.row(namespaceId).clear(); moduleClasses.row(namespaceId).clear(); registries.remove(namespaceId); } finally { writeLock.unlock(); } } // because there may be dependencies between modules, it is important that they are ordered correctly. protected DatasetDefinitionRegistry createRegistry(LinkedHashSet<String> availableModuleClasses, @Nullable ClassLoader classLoader) { DatasetDefinitionRegistry registry = registryFactory.create(); for (String moduleClassName : availableModuleClasses) { // todo: this module loading and registering code somewhat duplicated in RemoteDatasetFramework Class<?> moduleClass; // try program class loader then cdap class loader try { moduleClass = ClassLoaders.loadClass(moduleClassName, classLoader, this); } catch (ClassNotFoundException e) { try { moduleClass = ClassLoaders.loadClass(moduleClassName, null, this); } catch (ClassNotFoundException e2) { LOG.error("Was not able to load dataset module class {}", moduleClassName, e); throw Throwables.propagate(e); } } try { DatasetModule module = DatasetModules.getDatasetModule(moduleClass); module.register(registry); } catch (Exception e) { LOG.error("Was not able to load dataset module class {}", moduleClassName, e); throw Throwables.propagate(e); } } return registry; } // gets all module class names available in the given namespace. Includes system modules first, then // namespace modules. protected LinkedHashSet<String> getAvailableModuleClasses(Id.Namespace namespace) { // order is important, system LinkedHashSet<String> availableModuleClasses = Sets.newLinkedHashSet(); availableModuleClasses.addAll(moduleClasses.row(Id.Namespace.SYSTEM).values()); availableModuleClasses.addAll(moduleClasses.row(namespace).values()); return availableModuleClasses; } @Nullable @VisibleForTesting DatasetDefinition getDefinitionForType(Id.Namespace namespaceId, String datasetType) { DatasetDefinitionRegistry registry = registries.get(namespaceId); if (registry != null && registry.hasType(datasetType)) { return registry.get(datasetType); } registry = registries.get(Id.Namespace.SYSTEM); if (registry != null && registry.hasType(datasetType)) { return registry.get(datasetType); } return null; } // NOTE: this class is needed to collect all types added by a module private class TypesTrackingRegistry implements DatasetDefinitionRegistry { private final DatasetDefinitionRegistry delegate; private final List<String> types = Lists.newArrayList(); TypesTrackingRegistry(DatasetDefinitionRegistry delegate) { this.delegate = delegate; } List<String> getTypes() { return types; } @Override public void add(DatasetDefinition def) { delegate.add(def); types.add(def.getName()); } @Override public <T extends DatasetDefinition> T get(String datasetTypeName) { // In real-world scenarios, default modules are guaranteed to always exist in the system namespace. // Hence, we could add a preconditions check here to verify that registries contains types from system namespace // However, a lot of our tests (DatasetFrameworkTestUtil) start without default modules, so not adding that check. // In any case, the pattern here of first looking for the definition in own namespace, then in system is valid // and the else block should throw an exception if the dataset type is not found in the current or the system // namespace. if (delegate.hasType(datasetTypeName)) { return delegate.get(datasetTypeName); } else if (registries.containsKey(Id.Namespace.SYSTEM)) { return registries.get(Id.Namespace.SYSTEM).get(datasetTypeName); } else { throw new IllegalStateException(String.format("Dataset type %s not found.", datasetTypeName)); } } @Override public boolean hasType(String datasetTypeName) { return delegate.hasType(datasetTypeName); } } private void publishAudit(Id.DatasetInstance datasetInstance, AuditType auditType) { // Don't publish audit for system datasets admin operations, there can be a deadlock if (NamespaceId.SYSTEM.getNamespace().equals(datasetInstance.getNamespaceId()) && auditType != AuditType.ACCESS) { return; } AuditPublishers.publishAudit(auditPublisher, datasetInstance, auditType, AuditPayload.EMPTY_PAYLOAD); } }