/*
* Copyright © 2014 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.data2.datafabric.dataset;
import co.cask.cdap.api.dataset.Dataset;
import co.cask.cdap.api.dataset.DatasetDefinition;
import co.cask.cdap.api.dataset.DatasetManagementException;
import co.cask.cdap.api.dataset.DatasetProperties;
import co.cask.cdap.api.dataset.DatasetSpecification;
import co.cask.cdap.api.dataset.InstanceConflictException;
import co.cask.cdap.api.dataset.lib.FileSet;
import co.cask.cdap.api.dataset.lib.ObjectMappedTable;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetProperties;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSet;
import co.cask.cdap.api.dataset.table.Table;
import co.cask.cdap.data2.dataset2.DatasetFramework;
import co.cask.cdap.data2.dataset2.lib.file.FileSetDataset;
import co.cask.cdap.data2.metadata.lineage.LineageDataset;
import co.cask.cdap.data2.registry.UsageDataset;
import co.cask.cdap.proto.Id;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Throwables;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;
import javax.annotation.Nullable;
/**
 * Utility methods for dealing with Datasets.
 * todo: once we have a couple of methods, refactor out from "util" into something more sensible
 */
public final class DatasetsUtil {
  private static final Logger LOG = LoggerFactory.getLogger(DatasetsUtil.class);

  // Utility class: no instances.
  private DatasetsUtil() {}

  /**
   * Gets an instance of {@link Dataset}, adding the instance to the
   * {@link co.cask.cdap.data2.dataset2.DatasetFramework} and creating the physical dataset
   * if it doesn't exist yet.
   * NOTE: does a poor job guarding against races, i.e. only one client for this dataset instance
   * is supported at a time.
   *
   * @param datasetFramework framework used to look up and create the dataset instance
   * @param datasetInstanceId id of the dataset instance to get or create
   * @param typeName dataset type name, used when the instance must be created
   * @param props dataset properties, used when the instance must be created
   * @param arguments runtime arguments for instantiating the dataset
   * @param cl class loader to use for loading the dataset implementation
   * @return the dataset instance
   * @throws DatasetManagementException if the instance cannot be created or retrieved
   * @throws IOException if the physical dataset cannot be created
   */
  @SuppressWarnings("unchecked") // caller is responsible for requesting the correct dataset type
  public static <T extends Dataset> T getOrCreateDataset(DatasetFramework datasetFramework,
                                                         Id.DatasetInstance datasetInstanceId, String typeName,
                                                         DatasetProperties props,
                                                         Map<String, String> arguments,
                                                         ClassLoader cl)
    throws DatasetManagementException, IOException {
    createIfNotExists(datasetFramework, datasetInstanceId, typeName, props);
    // Pass the caller-supplied class loader through; previously null was passed, silently
    // ignoring the 'cl' parameter.
    return (T) datasetFramework.getDataset(datasetInstanceId, arguments, cl);
  }

  /**
   * Creates the dataset instance if it does not exist yet.
   *
   * @param datasetFramework framework in which to create the instance
   * @param datasetInstanceId id of the dataset instance to create
   * @param typeName dataset type name
   * @param props dataset properties
   * @throws DatasetManagementException if creation fails for a reason other than a concurrent create
   * @throws IOException if the physical dataset cannot be created
   */
  public static void createIfNotExists(DatasetFramework datasetFramework,
                                       Id.DatasetInstance datasetInstanceId, String typeName,
                                       DatasetProperties props) throws DatasetManagementException, IOException {
    if (!datasetFramework.hasInstance(datasetInstanceId)) {
      try {
        datasetFramework.addInstance(typeName, datasetInstanceId, props);
      } catch (InstanceConflictException e) {
        // Do nothing: someone created this instance in between, just continuing
      } catch (DatasetManagementException e) {
        LOG.error("Could NOT add dataset instance {} of type {} with props {}",
                  datasetInstanceId, typeName, props, e);
        // The method declares DatasetManagementException, so rethrow the original exception
        // instead of wrapping it in a RuntimeException (Throwables.propagate), which would
        // defeat callers catching the declared type.
        throw e;
      }
    }
  }

  /**
   * For a dataset spec that does not contain the original properties, we attempt to reconstruct them from
   * the properties at the top-level of the spec. For most datasets, these will be identical, however, there
   * are a few known dataset types whose {@link DatasetDefinition#configure(String, DatasetProperties)} method
   * adds additional properties. As of release 3.3, the set of built-in such dataset types is known. Any dataset
   * created or reconfigured beginning with 3.4 will have the original properties stored in the spec.
   *
   * @param spec a dataset spec that does not contain the original properties
   * @return the input spec if it is null or if it has original properties; otherwise a spec that has the
   *         original properties with which the dataset was created or reconfigured, at best effort.
   */
  @VisibleForTesting
  public static DatasetSpecification fixOriginalProperties(@Nullable DatasetSpecification spec) {
    if (spec == null || spec.getOriginalProperties() != null) {
      return spec;
    }
    Map<String, String> props = new TreeMap<>(spec.getProperties());
    if (!props.isEmpty()) {
      String type = spec.getType();
      // each branch strips properties known to be added by that type's configure()
      // file sets add a fileset version indicating how to handle absolute base paths
      if (FileSet.class.getName().equals(type) || "fileSet".equals(type)) {
        props.remove(FileSetDataset.FILESET_VERSION_PROPERTY);

      // TPFS adds the partitioning
      } else if (TimePartitionedFileSet.class.getName().equals(type) || "timePartitionedFileSet".equals(type)) {
        props.remove(PartitionedFileSetProperties.PARTITIONING_FIELDS);
        for (String key : spec.getProperties().keySet()) {
          if (key.startsWith(PartitionedFileSetProperties.PARTITIONING_FIELD_PREFIX)) {
            props.remove(key);
          }
        }

      // ObjectMappedTable adds the table schema and its row field name
      // (use equals(), not endsWith(): a suffix match would wrongly fire for unrelated types)
      } else if (ObjectMappedTable.class.getName().equals(type) || "objectMappedTable".equals(type)) {
        props.remove(Table.PROPERTY_SCHEMA);
        props.remove(Table.PROPERTY_SCHEMA_ROW_FIELD);

      // LineageDataset and UsageDataset add the conflict level of none
      // (compare fully-qualified names via getName() for both, consistent with the other branches;
      //  getSimpleName() can never equal a stored fully-qualified type name)
      } else if (UsageDataset.class.getName().equals(type) ||
                 LineageDataset.class.getName().equals(type) || "lineageDataset".equals(type)) {
        props.remove(Table.PROPERTY_CONFLICT_LEVEL);
      }
    }
    return spec.setOriginalProperties(props);
  }
}