/* * Copyright © 2014 Cask Data, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package co.cask.cdap.hive.datasets; import co.cask.cdap.api.dataset.Dataset; import co.cask.cdap.api.dataset.DatasetManagementException; import co.cask.cdap.api.dataset.DatasetSpecification; import co.cask.cdap.common.DatasetNotFoundException; import co.cask.cdap.common.conf.ConfigurationUtil; import co.cask.cdap.common.conf.Constants; import co.cask.cdap.data.dataset.SystemDatasetInstantiator; import co.cask.cdap.hive.context.ContextManager; import co.cask.cdap.hive.context.TxnCodec; import co.cask.cdap.proto.Id; import co.cask.tephra.Transaction; import co.cask.tephra.TransactionAware; import com.google.common.base.Preconditions; import com.google.common.base.Strings; import org.apache.hadoop.conf.Configuration; import java.io.Closeable; import java.io.IOException; /** * Instantiates the dataset used during runtime of a Hive query. This means it's used in mappers and reducers, * and must use the Hadoop configuration to look up what dataset to instantiate. Should not be closed until the * dataset has been closed. Assumes the dataset name and namespace are settings in the configuration. * It may seem like this would not work if multiple datasets are used in a single query, but that is not the case. * It is not obvious, but dataset name and namespace are added as job properties in DatasetStorageHandler. This tells * Hive to add those properties to the Configuration object before passing it in to the methods of an InputFormat * or OutputFormat. So even if multiple datasets are used in the same query (a join query for example), dataset name * will not get clobbered. */ public class DatasetAccessor implements Closeable { private final Id.DatasetInstance datasetId; private final ContextManager.Context context; private final Transaction transaction; private final SystemDatasetInstantiator datasetInstantiator; private Dataset dataset; public DatasetAccessor(Configuration conf) throws IOException { String datasetName = conf.get(Constants.Explore.DATASET_NAME); String namespace = conf.get(Constants.Explore.DATASET_NAMESPACE); Preconditions.checkArgument(!Strings.isNullOrEmpty(datasetName), "dataset name not present in config"); Preconditions.checkArgument(!Strings.isNullOrEmpty(namespace), "namespace not present in config"); this.datasetId = Id.DatasetInstance.from(namespace, datasetName); this.context = ContextManager.getContext(conf); this.transaction = ConfigurationUtil.get(conf, Constants.Explore.TX_QUERY_KEY, TxnCodec.INSTANCE); this.datasetInstantiator = context.createDatasetInstantiator(conf.getClassLoader()); } public void initialize() throws IOException, DatasetManagementException, DatasetNotFoundException, ClassNotFoundException { dataset = datasetInstantiator.getDataset(datasetId); if (dataset instanceof TransactionAware) { ((TransactionAware) dataset).startTx(transaction); } } public Id.DatasetInstance getDatasetId() { return datasetId; } public <T extends Dataset> T getDataset() { return (T) dataset; } public DatasetSpecification getDatasetSpec() throws DatasetManagementException { return context.getDatasetSpec(datasetId); } @Override public void close() throws IOException { datasetInstantiator.close(); context.close(); } }