/*
* Copyright © 2014 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package co.cask.cdap.hive.context;

import co.cask.cdap.api.dataset.DatasetManagementException;
import co.cask.cdap.api.dataset.DatasetSpecification;
import co.cask.cdap.api.metrics.MetricsCollectionService;
import co.cask.cdap.common.conf.CConfiguration;
import co.cask.cdap.common.conf.ConfigurationUtil;
import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.common.guice.ConfigModule;
import co.cask.cdap.common.guice.DiscoveryRuntimeModule;
import co.cask.cdap.common.guice.KafkaClientModule;
import co.cask.cdap.common.guice.LocationRuntimeModule;
import co.cask.cdap.common.guice.ZKClientModule;
import co.cask.cdap.common.lang.FilterClassLoader;
import co.cask.cdap.common.metrics.NoOpMetricsCollectionService;
import co.cask.cdap.data.dataset.SystemDatasetInstantiator;
import co.cask.cdap.data.dataset.SystemDatasetInstantiatorFactory;
import co.cask.cdap.data.runtime.DataFabricModules;
import co.cask.cdap.data.runtime.DataSetsModules;
import co.cask.cdap.data.stream.StreamAdminModules;
import co.cask.cdap.data.view.ViewAdminModules;
import co.cask.cdap.data2.audit.AuditModule;
import co.cask.cdap.data2.dataset2.DatasetFramework;
import co.cask.cdap.data2.transaction.stream.StreamAdmin;
import co.cask.cdap.data2.transaction.stream.StreamConfig;
import co.cask.cdap.explore.guice.ExploreClientModule;
import co.cask.cdap.hive.datasets.DatasetSerDe;
import co.cask.cdap.hive.stream.StreamSerDe;
import co.cask.cdap.notifications.feeds.client.NotificationFeedClientModule;
import co.cask.cdap.proto.Id;
import com.google.common.base.Objects;
import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import com.google.inject.Injector;
import com.google.inject.Scopes;
import org.apache.hadoop.conf.Configuration;
import org.apache.twill.zookeeper.ZKClientService;

import java.io.Closeable;
import java.io.IOException;
import javax.annotation.Nullable;

/**
* Stores/creates context for Hive queries to run in MapReduce jobs. The Context is used to get dataset and stream
 * information, such as their schemas. The context is also used to instantiate datasets.
*
 * This is an unusual class because it is used in two different code paths that call the same method.
 * When Hive executes a query, it calls the SerDe's initialize method. The {@link DatasetSerDe} and
 * {@link StreamSerDe} both use this ContextManager to look up required information. This call to initialize
 * happens both in the process that launches the Hive job (the explore service) and in the MapReduce job
 * that was launched.
 *
 * When called from a MapReduce job, we need to create a DatasetFramework, StreamAdmin, and ZKClientService.
 * This is done by deserializing CDAP's configuration from the Hadoop Configuration, creating an injector,
 * and instantiating those objects.
*
* When called from the explore service, we don't want to instantiate everything all over again for every
* query, especially since Hive calls initialize multiple times per query for some reason. In that scenario,
* the explore service calls {@link #saveContext(DatasetFramework, StreamAdmin, SystemDatasetInstantiatorFactory)}
* when it starts up, in order to cache the Context.
*
* Since there is no way for the SerDe to know if it's in a mapreduce job or in the explore service, it relies
* on whether the Context has been cached to determine whether to create a new Context.
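 *
 * The two call sites look roughly like this (a sketch; the variable names are illustrative):
 * <pre>{@code
 * // in the explore service, once at startup:
 * ContextManager.saveContext(datasetFramework, streamAdmin, datasetInstantiatorFactory);
 *
 * // in a SerDe's initialize method, in either code path:
 * Context context = ContextManager.getContext(conf);
 * }</pre>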
*/
public class ContextManager {
  private static Context savedContext;

/**
   * Create and save a context, so that any call to {@link #getContext(Configuration)} made in this JVM
* will return the context created from this call.
*/
public static void saveContext(DatasetFramework datasetFramework, StreamAdmin streamAdmin,
SystemDatasetInstantiatorFactory datasetInstantiatorFactory) {
savedContext = new Context(datasetFramework, streamAdmin, datasetInstantiatorFactory);
  }

/**
* If a context was saved using {@link #saveContext(DatasetFramework, StreamAdmin, SystemDatasetInstantiatorFactory)},
* returns the saved context. This is what happens in the Explore service.
* If no context was saved and the conf is not null, creates a context and returns it. The context must be closed by
* the caller. The context created will not be saved, meaning the next time this method is called,
   * a new context will be created. This is what happens in MapReduce jobs launched by Hive.
* If no context was saved and the conf is null, null is returned.
*
* The {@code conf} param is expected to contain serialized {@link co.cask.cdap.common.conf.CConfiguration} and
* {@link org.apache.hadoop.conf.Configuration} objects, as well as transaction information.
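   *
   * For example, a mapper of a Hive-launched job might use it like this (a sketch; the SerDe classes in
   * this package are the actual callers, and error handling is omitted):
   * <pre>{@code
   * Context context = ContextManager.getContext(jobConf);
   * try {
   *   DatasetSpecification spec = context.getDatasetSpec(datasetId);
   *   // use the spec, for example to derive the dataset's schema
   * } finally {
   *   context.close();
   * }
   * }</pre>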
*
* @param conf configuration used to create a context, if necessary. If it is null, return the saved context, which
* can also be null.
* @return Context of a query execution.
* @throws IOException when the configuration does not contain the required settings to create the context
*/
@Nullable
public static Context getContext(@Nullable Configuration conf) throws IOException {
if (conf != null && savedContext == null) {
return createContext(conf);
}
return savedContext;
  }

// this method is called by the mappers/reducers of jobs launched by Hive.
private static Context createContext(Configuration conf) throws IOException {
    // Creating the context needs to happen only when running as part of a MapReduce job.
    // In other cases, the ContextManager will have been initialized via the saveContext method.
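    // The launching side is expected to have serialized these two values into the job configuration
    // beforehand, presumably with matching ConfigurationUtil.set(...) calls, roughly (an assumption
    // about the producer side; only the get(...) side is visible in this file):
    //   ConfigurationUtil.set(conf, Constants.Explore.CCONF_KEY, CConfCodec.INSTANCE, cConf);
    //   ConfigurationUtil.set(conf, Constants.Explore.HCONF_KEY, HConfCodec.INSTANCE, hConf);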
CConfiguration cConf = ConfigurationUtil.get(conf, Constants.Explore.CCONF_KEY, CConfCodec.INSTANCE);
Configuration hConf = ConfigurationUtil.get(conf, Constants.Explore.HCONF_KEY, HConfCodec.INSTANCE);
Injector injector = Guice.createInjector(
new ConfigModule(cConf, hConf),
new ZKClientModule(),
new LocationRuntimeModule().getDistributedModules(),
new DiscoveryRuntimeModule().getDistributedModules(),
new DataFabricModules().getDistributedModules(),
new DataSetsModules().getDistributedModules(),
new ExploreClientModule(),
new ViewAdminModules().getDistributedModules(),
new StreamAdminModules().getDistributedModules(),
new NotificationFeedClientModule(),
new KafkaClientModule(),
new AuditModule().getDistributedModules(),
new AbstractModule() {
@Override
protected void configure() {
bind(MetricsCollectionService.class).to(NoOpMetricsCollectionService.class).in(Scopes.SINGLETON);
}
}
);
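    // Start the ZooKeeper client; the distributed modules above rely on it for service discovery
    // (for example, to reach the transaction service). It is stopped again in Context.close().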
ZKClientService zkClientService = injector.getInstance(ZKClientService.class);
zkClientService.startAndWait();
DatasetFramework datasetFramework = injector.getInstance(DatasetFramework.class);
StreamAdmin streamAdmin = injector.getInstance(StreamAdmin.class);
SystemDatasetInstantiatorFactory datasetInstantiatorFactory =
injector.getInstance(SystemDatasetInstantiatorFactory.class);
return new Context(datasetFramework, streamAdmin, zkClientService, datasetInstantiatorFactory);
  }

/**
   * Contains the DatasetFramework, StreamAdmin, and dataset instantiator factory required to run Hive
   * queries in MapReduce jobs.
*/
public static class Context implements Closeable {
private final DatasetFramework datasetFramework;
private final StreamAdmin streamAdmin;
private final ZKClientService zkClientService;
    private final SystemDatasetInstantiatorFactory datasetInstantiatorFactory;

public Context(DatasetFramework datasetFramework, StreamAdmin streamAdmin,
ZKClientService zkClientService,
SystemDatasetInstantiatorFactory datasetInstantiatorFactory) {
// This constructor is called from the MR job Hive launches.
this.datasetFramework = datasetFramework;
this.streamAdmin = streamAdmin;
this.zkClientService = zkClientService;
this.datasetInstantiatorFactory = datasetInstantiatorFactory;
    }

public Context(DatasetFramework datasetFramework, StreamAdmin streamAdmin,
SystemDatasetInstantiatorFactory datasetInstantiatorFactory) {
      // This constructor is called from the Hive server, that is, the Explore service.
this(datasetFramework, streamAdmin, null, datasetInstantiatorFactory);
    }

public StreamConfig getStreamConfig(Id.Stream streamId) throws IOException {
return streamAdmin.getConfig(streamId);
    }

public DatasetSpecification getDatasetSpec(Id.DatasetInstance datasetId) throws DatasetManagementException {
return datasetFramework.getDatasetSpec(datasetId);
    }

/**
* Get a {@link SystemDatasetInstantiator} that can instantiate datasets using the given classloader as the
* parent classloader for datasets. Must be closed after it is no longer needed, as dataset jars may be unpacked
* in order to create classloaders for custom datasets.
*
* The given parent classloader will be wrapped in a {@link FilterClassLoader}
     * to prevent CDAP dependencies from leaking through. For example, if a custom dataset has an Avro
     * dependency, the classloader should use the Avro classes from the custom dataset and not from CDAP.
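     *
     * For example (a sketch; it assumes the instantiator exposes a {@code getDataset} method for the
     * dataset id at hand, which is an assumption about its API, not something defined in this file):
     * <pre>{@code
     * try (SystemDatasetInstantiator instantiator = context.createDatasetInstantiator(loader)) {
     *   Dataset dataset = instantiator.getDataset(datasetId);
     *   // use the dataset; try-with-resources closes the instantiator afterwards
     * }
     * }</pre>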
*
* @param parentClassLoader the parent classloader to use when instantiating datasets. If null, the system
* classloader will be used
* @return a dataset instantiator that can be used to instantiate datasets
*/
public SystemDatasetInstantiator createDatasetInstantiator(@Nullable ClassLoader parentClassLoader) {
parentClassLoader = parentClassLoader == null ?
Objects.firstNonNull(Thread.currentThread().getContextClassLoader(), getClass().getClassLoader()) :
parentClassLoader;
return datasetInstantiatorFactory.create(FilterClassLoader.create(parentClassLoader));
    }

@Override
public void close() {
      // zkClientService is null when used by the Explore service, since Explore manages the lifecycle of
      // the ZooKeeper client itself. It is not null when used by a MapReduce job launched by Hive.
if (zkClientService != null) {
zkClientService.stopAndWait();
}
}
}
}