/*
* Copyright © 2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.data2.dataset2.lib.partitioned;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.dataset.DatasetAdmin;
import co.cask.cdap.api.dataset.DatasetContext;
import co.cask.cdap.api.dataset.DatasetDefinition;
import co.cask.cdap.api.dataset.DatasetProperties;
import co.cask.cdap.api.dataset.DatasetSpecification;
import co.cask.cdap.api.dataset.lib.AbstractDatasetDefinition;
import co.cask.cdap.api.dataset.lib.FileSet;
import co.cask.cdap.api.dataset.lib.FileSetArguments;
import co.cask.cdap.api.dataset.lib.IndexedTable;
import co.cask.cdap.api.dataset.lib.IndexedTableDefinition;
import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetArguments;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetProperties;
import co.cask.cdap.api.dataset.lib.Partitioning;
import co.cask.cdap.explore.client.ExploreFacade;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.inject.Inject;
import com.google.inject.Injector;
import com.google.inject.Provider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Map;
/**
 * Defines the partitioned dataset type. At this time, the partitions are not managed by the
 * partitioned dataset, so all admin is simply on the partition table.
 * TODO rethink this
 *
 * <p>A partitioned file set is composed of two embedded datasets: a {@link FileSet} registered under
 * the name {@link #FILESET_NAME} and an {@link IndexedTable} registered under the name
 * {@link #PARTITION_TABLE_NAME}.
 */
public class PartitionedFileSetDefinition extends AbstractDatasetDefinition<PartitionedFileSet, DatasetAdmin> {

  private static final Logger LOG = LoggerFactory.getLogger(PartitionedFileSetDefinition.class);

  /** Name of the embedded indexed table. */
  protected static final String PARTITION_TABLE_NAME = "partitions";

  /** Name of the embedded file set. */
  protected static final String FILESET_NAME = "files";

  // comma-separated names of the columns that the embedded table is configured to index
  private static final String INDEXED_COLS = Bytes.toString(PartitionedFileSetDataset.WRITE_PTR_COL) + ','
    + Bytes.toString(PartitionedFileSetDataset.CREATION_TIME_COL);

  protected final DatasetDefinition<? extends IndexedTable, ?> indexedTableDef;
  protected final DatasetDefinition<? extends FileSet, ?> filesetDef;

  // Field-injected; only used to look up the (optional) ExploreFacade in getExploreProvider().
  @Inject
  private Injector injector;

  /**
   * Creates the definition from the definitions of its two embedded datasets.
   *
   * @param name the name of this dataset type
   * @param filesetDef definition used for the embedded file set; must not be null
   * @param indexedTableDef definition used for the embedded indexed table; must not be null
   * @throws IllegalArgumentException if either definition is null
   */
  public PartitionedFileSetDefinition(String name,
                                      DatasetDefinition<? extends FileSet, ?> filesetDef,
                                      DatasetDefinition<? extends IndexedTable, ?> indexedTableDef) {
    super(name);
    Preconditions.checkArgument(indexedTableDef != null, "IndexedTable definition is required");
    Preconditions.checkArgument(filesetDef != null, "FileSet definition is required");
    this.filesetDef = filesetDef;
    this.indexedTableDef = indexedTableDef;
  }

  /**
   * Builds the specification for a dataset instance. The embedded file set is configured with the
   * given properties as-is; the embedded table is configured with the same properties plus the
   * index columns setting.
   *
   * @param instanceName name of the dataset instance
   * @param properties properties of the dataset instance
   * @return the specification, including the specifications of both embedded datasets
   */
  @Override
  public DatasetSpecification configure(String instanceName, DatasetProperties properties) {
    // define the columns for indexing on the partitionsTable
    DatasetProperties indexedTableProperties = DatasetProperties.builder()
      .addAll(properties.getProperties())
      .add(IndexedTableDefinition.INDEX_COLUMNS_CONF_KEY, INDEXED_COLS)
      .build();
    return DatasetSpecification.builder(instanceName, getName())
      .properties(properties.getProperties())
      .datasets(filesetDef.configure(FILESET_NAME, properties),
                indexedTableDef.configure(PARTITION_TABLE_NAME, indexedTableProperties))
      .build();
  }

  /**
   * Creates the admin for a dataset instance, wrapping the admins of the embedded file set and
   * the embedded table.
   *
   * @param datasetContext context of the dataset
   * @param spec specification of the dataset instance, as produced by {@link #configure}
   * @param classLoader class loader passed through to the embedded definitions
   * @return the admin for the partitioned file set
   * @throws IOException if thrown while obtaining the admin of an embedded dataset
   */
  @Override
  public DatasetAdmin getAdmin(DatasetContext datasetContext, DatasetSpecification spec,
                               ClassLoader classLoader) throws IOException {
    return new PartitionedFileSetAdmin(
      datasetContext, spec, getExploreProvider(),
      filesetDef.getAdmin(datasetContext, spec.getSpecification(FILESET_NAME), classLoader),
      indexedTableDef.getAdmin(datasetContext, spec.getSpecification(PARTITION_TABLE_NAME), classLoader));
  }

  /**
   * Creates the dataset instance. The runtime arguments are first passed through
   * {@link #updateArgumentsIfNeeded}, and the resulting arguments are handed to both embedded
   * datasets and to the partitioned file set itself.
   *
   * @param datasetContext context of the dataset
   * @param spec specification of the dataset instance; its properties must contain the partitioning
   * @param arguments runtime arguments for the dataset; not modified by this method
   * @param classLoader class loader passed through to the embedded definitions
   * @return the partitioned file set
   * @throws IOException if thrown while obtaining an embedded dataset
   */
  @Override
  public PartitionedFileSet getDataset(DatasetContext datasetContext, DatasetSpecification spec,
                                       Map<String, String> arguments, ClassLoader classLoader) throws IOException {
    // properties must contain the partitioning
    Partitioning partitioning = PartitionedFileSetProperties.getPartitioning(spec.getProperties());

    // make any necessary updates to the arguments
    arguments = updateArgumentsIfNeeded(arguments, partitioning);

    FileSet fileset = filesetDef.getDataset(datasetContext, spec.getSpecification(FILESET_NAME),
                                            arguments, classLoader);
    IndexedTable table = indexedTableDef.getDataset(datasetContext, spec.getSpecification(PARTITION_TABLE_NAME),
                                                    arguments, classLoader);

    return new PartitionedFileSetDataset(datasetContext, spec.getName(), partitioning, fileset, table, spec, arguments,
                                         getExploreProvider());
  }

  /**
   * If the arguments do not contain an output location, generates one from the output partition key
   * (if present) or, when a dynamic partitioner is configured, sets the base output path.
   *
   * <p>The passed-in map is never modified: whenever an argument has to be added, it is added to a
   * copy and the copy is returned. If no change is needed, the passed-in map itself is returned.
   *
   * @param arguments the runtime arguments of the dataset
   * @param partitioning the partitioning of the dataset, used to interpret the output partition key
   * @return the arguments to use; either {@code arguments} itself or an updated copy
   */
  protected static Map<String, String> updateArgumentsIfNeeded(Map<String, String> arguments,
                                                               Partitioning partitioning) {
    if (FileSetArguments.getOutputPath(arguments) != null) {
      // an explicit output path always wins
      return arguments;
    }

    PartitionKey key = PartitionedFileSetArguments.getOutputPartitionKey(arguments, partitioning);
    if (key != null) {
      // copy the map, to avoid modifying the passed-in map
      Map<String, String> updated = Maps.newHashMap(arguments);
      FileSetArguments.setOutputPath(updated, PartitionedFileSetDataset.getOutputPath(key, partitioning));
      return updated;
    }

    if (PartitionedFileSetArguments.getDynamicPartitioner(arguments) != null) {
      // when using DynamicPartitioner, use the baseLocation of the fileSet as the output location;
      // as above, write to a copy so that the caller's (possibly immutable) map is left untouched
      Map<String, String> updated = Maps.newHashMap(arguments);
      FileSetArguments.setBaseOutputPath(updated);
      return updated;
    }

    return arguments;
  }

  /**
   * Returns a provider that looks up the {@link ExploreFacade} from the injector each time
   * {@link Provider#get()} is called.
   *
   * @return a provider whose {@code get()} returns the explore facade, or {@code null} (after logging
   *         a warning) if the lookup throws any exception
   */
  protected Provider<ExploreFacade> getExploreProvider() {
    return new Provider<ExploreFacade>() {
      @Override
      public ExploreFacade get() {
        try {
          return injector.getInstance(ExploreFacade.class);
        } catch (Exception e) {
          // since explore is optional for this dataset, ignore but log it
          LOG.warn(String.format("Unable to get explore facade from injector for %s dataset.", getName()), e);
          return null;
        }
      }
    };
  }
}