/*
 * Copyright © 2014-2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.data2.datafabric.dataset;

import co.cask.cdap.api.dataset.Dataset;
import co.cask.cdap.api.dataset.DatasetAdmin;
import co.cask.cdap.api.dataset.DatasetContext;
import co.cask.cdap.api.dataset.DatasetManagementException;
import co.cask.cdap.api.dataset.DatasetProperties;
import co.cask.cdap.api.dataset.DatasetSpecification;
import co.cask.cdap.api.dataset.InstanceConflictException;
import co.cask.cdap.api.dataset.module.DatasetModule;
import co.cask.cdap.common.conf.CConfiguration;
import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.common.io.Locations;
import co.cask.cdap.common.lang.ClassLoaders;
import co.cask.cdap.data2.datafabric.dataset.type.ConstantClassLoaderProvider;
import co.cask.cdap.data2.datafabric.dataset.type.DatasetClassLoaderProvider;
import co.cask.cdap.data2.dataset2.DatasetDefinitionRegistryFactory;
import co.cask.cdap.data2.dataset2.DatasetFramework;
import co.cask.cdap.data2.dataset2.SingleTypeModule;
import co.cask.cdap.data2.metadata.lineage.AccessType;
import co.cask.cdap.proto.DatasetMeta;
import co.cask.cdap.proto.DatasetSpecificationSummary;
import co.cask.cdap.proto.DatasetTypeMeta;
import co.cask.cdap.proto.Id;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Sets;
import com.google.common.io.ByteStreams;
import com.google.inject.Inject;
import org.apache.twill.discovery.DiscoveryServiceClient;
import org.apache.twill.filesystem.Location;
import org.apache.twill.internal.ApplicationBundler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import java.util.jar.JarEntry;
import java.util.jar.JarInputStream;
import java.util.jar.JarOutputStream;
import javax.annotation.Nullable;

/**
 * {@link co.cask.cdap.data2.dataset2.DatasetFramework} implementation that talks to DatasetFramework Service
 */
@SuppressWarnings("unchecked")
public class RemoteDatasetFramework implements DatasetFramework {
  private static final Logger LOG = LoggerFactory.getLogger(RemoteDatasetFramework.class);

  private final CConfiguration cConf;
  private final LoadingCache<Id.Namespace, DatasetServiceClient> clientCache;
  private final AbstractDatasetProvider instances;

  @Inject
  public RemoteDatasetFramework(final CConfiguration cConf, final DiscoveryServiceClient discoveryClient,
                                DatasetDefinitionRegistryFactory registryFactory) {
    this.cConf = cConf;
    this.clientCache = CacheBuilder.newBuilder().build(new CacheLoader<Id.Namespace, DatasetServiceClient>() {
      @Override
      public DatasetServiceClient load(Id.Namespace namespace) throws Exception {
        return new DatasetServiceClient(discoveryClient, namespace, cConf);
      }
    });
    this.instances = new AbstractDatasetProvider(registryFactory) {
      @Override
      public DatasetMeta getMeta(Id.DatasetInstance instance) throws Exception {
        return RemoteDatasetFramework.this.clientCache.getUnchecked(instance.getNamespace())
          .getInstance(instance.getId());
      }

      @Override
      public void createIfNotExists(Id.DatasetInstance instance, String type,
                                    DatasetProperties creationProps) throws Exception {
        try {
          RemoteDatasetFramework.this.addInstance(type, instance, creationProps);
        } catch (InstanceConflictException e) {
          // ignore, since this indicates the dataset already exists
        }
      }
    };
  }
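
  // Example usage (a sketch, not taken from this codebase): in practice this class is obtained
  // through Guice injection. The names "framework", "namespace", and "myTable" are illustrative,
  // and "table" is assumed to be an already-deployed dataset type.
  //
  //   Id.DatasetInstance instanceId = Id.DatasetInstance.from(namespace, "myTable");
  //   framework.addInstance("table", instanceId, DatasetProperties.EMPTY);
  //   DatasetSpecification spec = framework.getDatasetSpec(instanceId);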

  @Override
  public void addModule(Id.DatasetModule moduleId, DatasetModule module) throws DatasetManagementException {
    Class<?> moduleClass = getModuleClass(module);
    try {
      Location deploymentJar = createDeploymentJar(moduleClass);
      try {
        clientCache.getUnchecked(moduleId.getNamespace())
          .addModule(moduleId.getId(), moduleClass.getName(), deploymentJar);
      } finally {
        try {
          deploymentJar.delete();
        } catch (IOException e) {
          // Just log warning, since the add module operation can still proceed
          LOG.warn("Failed to delete temporary deployment jar {}", deploymentJar, e);
        }
      }
    } catch (IOException e) {
      String msg = String.format("Could not create jar for deploying dataset module %s with main class %s",
                                 moduleId, moduleClass.getName());
      LOG.error(msg, e);
      throw new DatasetManagementException(msg, e);
    }
  }

  @Override
  public void addModule(Id.DatasetModule moduleId, DatasetModule module,
                        Location jarLocation) throws DatasetManagementException {
    clientCache.getUnchecked(moduleId.getNamespace())
      .addModule(moduleId.getId(), getModuleClass(module).getName(), jarLocation);
  }
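
  // Example (a sketch): deploying a custom Dataset implementation as a module. "moduleId" and
  // "MyDataset" are illustrative names, assuming SingleTypeModule can wrap a single Dataset
  // class, as getModuleClass below unwraps it.
  //
  //   framework.addModule(moduleId, new SingleTypeModule(MyDataset.class));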

  @Override
  public void deleteModule(Id.DatasetModule moduleId) throws DatasetManagementException {
    clientCache.getUnchecked(moduleId.getNamespace()).deleteModule(moduleId.getId());
  }

  @Override
  public void deleteAllModules(Id.Namespace namespaceId) throws DatasetManagementException {
    clientCache.getUnchecked(namespaceId).deleteModules();
  }

  @Override
  public void addInstance(String datasetType, Id.DatasetInstance datasetInstanceId,
                          DatasetProperties props) throws DatasetManagementException {
    clientCache.getUnchecked(datasetInstanceId.getNamespace())
      .addInstance(datasetInstanceId.getId(), datasetType, props);
  }

  @Override
  public void updateInstance(Id.DatasetInstance datasetInstanceId,
                             DatasetProperties props) throws DatasetManagementException {
    clientCache.getUnchecked(datasetInstanceId.getNamespace())
      .updateInstance(datasetInstanceId.getId(), props);
  }

  @Override
  public Collection<DatasetSpecificationSummary> getInstances(Id.Namespace namespaceId)
    throws DatasetManagementException {
    return clientCache.getUnchecked(namespaceId).getAllInstances();
  }

  @Nullable
  @Override
  public DatasetSpecification getDatasetSpec(Id.DatasetInstance datasetInstanceId)
    throws DatasetManagementException {
    DatasetMeta meta = clientCache.getUnchecked(datasetInstanceId.getNamespace())
      .getInstance(datasetInstanceId.getId());
    return meta == null ? null : meta.getSpec();
  }

  @Override
  public boolean hasInstance(Id.DatasetInstance datasetInstanceId) throws DatasetManagementException {
    return clientCache.getUnchecked(datasetInstanceId.getNamespace())
      .getInstance(datasetInstanceId.getId()) != null;
  }

  @Override
  public boolean hasSystemType(String typeName) throws DatasetManagementException {
    return hasType(Id.DatasetType.from(Id.Namespace.SYSTEM, typeName));
  }

  @Override
  public boolean hasType(Id.DatasetType datasetTypeId) throws DatasetManagementException {
    return clientCache.getUnchecked(datasetTypeId.getNamespace()).getType(datasetTypeId.getTypeName()) != null;
  }

  @Override
  public void truncateInstance(Id.DatasetInstance datasetInstanceId) throws DatasetManagementException {
    clientCache.getUnchecked(datasetInstanceId.getNamespace()).truncateInstance(datasetInstanceId.getId());
  }

  @Override
  public void deleteInstance(Id.DatasetInstance datasetInstanceId) throws DatasetManagementException {
    clientCache.getUnchecked(datasetInstanceId.getNamespace()).deleteInstance(datasetInstanceId.getId());
  }

  @Override
  public void deleteAllInstances(Id.Namespace namespaceId) throws DatasetManagementException, IOException {
    // delete all one by one
    for (DatasetSpecificationSummary metaSummary : getInstances(namespaceId)) {
      Id.DatasetInstance datasetInstanceId = Id.DatasetInstance.from(namespaceId, metaSummary.getName());
      deleteInstance(datasetInstanceId);
    }
  }

  @Override
  public <T extends DatasetAdmin> T getAdmin(Id.DatasetInstance datasetInstanceId,
                                             ClassLoader classLoader) throws DatasetManagementException, IOException {
    return getAdmin(datasetInstanceId, classLoader, new ConstantClassLoaderProvider(classLoader));
  }

  @Nullable
  @Override
  public <T extends DatasetAdmin> T getAdmin(Id.DatasetInstance datasetInstanceId,
                                             @Nullable ClassLoader parentClassLoader,
                                             DatasetClassLoaderProvider classLoaderProvider)
    throws DatasetManagementException, IOException {
    DatasetMeta instanceInfo = clientCache.getUnchecked(datasetInstanceId.getNamespace())
      .getInstance(datasetInstanceId.getId());
    if (instanceInfo == null) {
      return null;
    }

    DatasetType type = instances.getType(instanceInfo.getType(), parentClassLoader, classLoaderProvider);
    return (T) type.getAdmin(DatasetContext.from(datasetInstanceId.getNamespaceId()), instanceInfo.getSpec());
  }

  @Override
  public <T extends Dataset> T getDataset(
    Id.DatasetInstance datasetInstanceId, Map<String, String> arguments, @Nullable ClassLoader classLoader,
    @Nullable Iterable<? extends Id> owners) throws DatasetManagementException, IOException {
    return getDataset(datasetInstanceId, arguments, classLoader, new ConstantClassLoaderProvider(classLoader), owners);
  }

  @Override
  public <T extends Dataset> T getDataset(
    Id.DatasetInstance datasetInstanceId, Map<String, String> arguments,
    @Nullable ClassLoader classLoader) throws DatasetManagementException, IOException {
    return getDataset(datasetInstanceId, arguments, classLoader, null);
  }

  @Nullable
  @Override
  public <T extends Dataset> T getDataset(
    Id.DatasetInstance datasetInstanceId, @Nullable Map<String, String> arguments,
    @Nullable ClassLoader classLoader, DatasetClassLoaderProvider classLoaderProvider,
    @Nullable Iterable<? extends Id> owners) throws DatasetManagementException, IOException {
    return getDataset(datasetInstanceId, arguments, classLoader, classLoaderProvider, owners, AccessType.UNKNOWN);
  }
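
  // Example (a sketch): obtaining a dataset instance with no runtime arguments and the default
  // classloader; "MyDataset" and "instanceId" are illustrative.
  //
  //   MyDataset dataset = framework.getDataset(instanceId, Collections.<String, String>emptyMap(), null);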

  @Nullable
  @Override
  public <T extends Dataset> T getDataset(Id.DatasetInstance datasetInstanceId,
                                          @Nullable Map<String, String> arguments,
                                          @Nullable ClassLoader classLoader,
                                          DatasetClassLoaderProvider classLoaderProvider,
                                          @Nullable Iterable<? extends Id> owners,
                                          AccessType accessType) throws DatasetManagementException, IOException {
    DatasetMeta instanceInfo = clientCache.getUnchecked(datasetInstanceId.getNamespace())
      .getInstance(datasetInstanceId.getId(), owners);
    if (instanceInfo == null) {
      return null;
    }

    return (T) instances.get(datasetInstanceId, instanceInfo.getType(), instanceInfo.getSpec(),
                             classLoaderProvider, classLoader, arguments);
  }

  @Override
  public void writeLineage(Id.DatasetInstance datasetInstanceId, AccessType accessType) {
    // no-op. The RemoteDatasetFramework doesn't need to do anything. The lineage should be recorded before this point.
  }

  @Override
  public void createNamespace(Id.Namespace namespaceId) throws DatasetManagementException {
    clientCache.getUnchecked(namespaceId).createNamespace();
  }

  @Override
  public void deleteNamespace(Id.Namespace namespaceId) throws DatasetManagementException {
    clientCache.getUnchecked(namespaceId).deleteNamespace();
  }

  private Location createDeploymentJar(Class<?> clz) throws IOException {
    File tempDir = new File(cConf.get(Constants.CFG_LOCAL_DATA_DIR),
                            cConf.get(Constants.AppFabric.TEMP_DIR)).getAbsoluteFile();
    tempDir.mkdirs();
    File tempFile = File.createTempFile(clz.getName(), ".jar", tempDir);
    try {
      // Create a bundle jar in a temp location
      ClassLoader remembered = ClassLoaders.setContextClassLoader(clz.getClassLoader());
      try {
        ApplicationBundler bundler = new ApplicationBundler(ImmutableList.of("co.cask.cdap.api",
                                                                             "org.apache.hadoop",
                                                                             "org.apache.hbase",
                                                                             "org.apache.hive"));
        bundler.createBundle(Locations.toLocation(tempFile), clz);
      } finally {
        ClassLoaders.setContextClassLoader(remembered);
      }

      // Create the program jar for deployment. It removes the "classes/" prefix as that's the convention taken
      // by the ApplicationBundler inside Twill.
      File destination = File.createTempFile(clz.getName(), ".jar", tempDir);
      try (
        JarOutputStream jarOutput = new JarOutputStream(new FileOutputStream(destination));
        JarInputStream jarInput = new JarInputStream(new FileInputStream(tempFile))
      ) {
        Set<String> seen = Sets.newHashSet();
        JarEntry jarEntry = jarInput.getNextJarEntry();
        while (jarEntry != null) {
          boolean isDir = jarEntry.isDirectory();
          String entryName = jarEntry.getName();
          if (!entryName.equals("classes/")) {
            if (entryName.startsWith("classes/")) {
              jarEntry = new JarEntry(entryName.substring("classes/".length()));
            } else {
              jarEntry = new JarEntry(entryName);
            }
            if (seen.add(jarEntry.getName())) {
              jarOutput.putNextEntry(jarEntry);
              if (!isDir) {
                ByteStreams.copy(jarInput, jarOutput);
              }
            }
          }
          jarEntry = jarInput.getNextJarEntry();
        }
        return Locations.toLocation(destination);
      }
    } finally {
      tempFile.delete();
    }
  }
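
  // Note: in the rewriting above, a bundle entry such as "classes/com/example/MyDataset.class"
  // (the path is illustrative) is copied as "com/example/MyDataset.class" in the deployment jar,
  // the bare "classes/" directory entry is dropped, and duplicate entry names are skipped.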

  // Can be used directly if the DatasetTypeMeta is already known, e.g. when a dataset is created
  // by the dataset ops executor service.

  /**
   * Returns an instance of the {@link DatasetType} corresponding to the given dataset modules. Uses the given
   * classloader as a parent for all dataset modules, and the given classloader provider to get classloaders for
   * each dataset module in the given dataset type meta. The order of dataset modules in the given
   * {@link DatasetTypeMeta} is important: the classloader for the first dataset module is used as the parent of
   * the second dataset module, and so on until the last dataset module. The classloader for the last dataset module
   * is then used as the classloader for the returned {@link DatasetType}.
   *
   * @param implementationInfo the dataset type metadata to instantiate the type from
   * @param classLoader the parent classloader to use for dataset modules
   * @param classLoaderProvider the classloader provider to get classloaders for each dataset module
   * @param <T> the type of DatasetType
   * @return an instance of the DatasetType
   */
  public <T extends DatasetType> T getDatasetType(DatasetTypeMeta implementationInfo,
                                                  ClassLoader classLoader,
                                                  DatasetClassLoaderProvider classLoaderProvider) {
    return instances.getType(implementationInfo, classLoader, classLoaderProvider);
  }

  /**
   * Returns the {@link Class} of the {@link DatasetModule}.
   *
   * We support an easier API for custom datasets: a user can implement a dataset and make it available for
   * others to use by implementing only {@link Dataset}, without having to implement the dataset module,
   * definition, and other classes. In that case we wrap the Dataset implementation with a
   * {@link SingleTypeModule}. But since we don't have a way to serde dataset modules, passing only
   * SingleTypeModule.class would lose the Dataset implementation info. Hence, as a workaround, we store the
   * Dataset implementation class in the MDS (on DatasetService) and wrap it with a SingleTypeModule when we
   * need to instantiate the module.
   *
   * todo: do proper serde for modules instead of just passing class name to server
   */
  private Class<?> getModuleClass(DatasetModule module) {
    if (module instanceof SingleTypeModule) {
      return ((SingleTypeModule) module).getDataSetClass();
    }
    return module.getClass();
  }
}
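
// Note (a sketch of the behavior above): for a module created as "new SingleTypeModule(MyDataset.class)"
// ("MyDataset" is illustrative), getModuleClass returns MyDataset.class rather than SingleTypeModule.class,
// so the server can recreate the module from the Dataset class alone; for any other module it returns the
// module's own class.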