/*
 * Copyright 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.data2.dataset2.lib.cube;

import co.cask.cdap.api.dataset.DatasetAdmin;
import co.cask.cdap.api.dataset.DatasetContext;
import co.cask.cdap.api.dataset.DatasetDefinition;
import co.cask.cdap.api.dataset.DatasetProperties;
import co.cask.cdap.api.dataset.DatasetSpecification;
import co.cask.cdap.api.dataset.lib.AbstractDatasetDefinition;
import co.cask.cdap.api.dataset.lib.CompositeDatasetAdmin;
import co.cask.cdap.api.dataset.lib.cube.Cube;
import co.cask.cdap.api.dataset.table.Table;
import co.cask.cdap.data2.dataset2.lib.table.MetricsTable;
import co.cask.cdap.data2.dataset2.lib.table.hbase.HBaseTableAdmin;
import co.cask.cdap.data2.dataset2.lib.timeseries.FactTable;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.gson.Gson;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Dataset definition of {@link CubeDataset}, the dataset that implements
 * {@link co.cask.cdap.api.dataset.lib.cube.Cube} to store and query
 * {@link co.cask.cdap.api.dataset.lib.cube.CubeFact}s.
 * <p/>
 * A Cube dataset can be configured with aggregation resolutions and aggregations. E.g.
 * <pre>
 * dataset.cube.resolutions=1,60
 * dataset.cube.aggregation.userPages.dimensions=user,page
 * dataset.cube.aggregation.userPages.requiredDimensions=page
 * dataset.cube.aggregation.userActions.dimensions=user,action
 * dataset.cube.aggregation.userActions.requiredDimensions=action
 * </pre>
 *
 * This example:
 * <ul>
 *   <li>
 *     configures the Cube to aggregate data at 1-second and 60-second resolutions
 *   </li>
 *   <li>
 *     configures a "userPages" aggregation (the name has no restricted format; any alphabetical name works) that
 *     aggregates measurements by user and page; this allows querying, e.g., the number of visits by a specific user
 *     to a specific page
 *   </li>
 *   <li>
 *     configures a "userActions" aggregation (the name has no restricted format; any alphabetical name works) that
 *     aggregates measurements by user and action; this allows querying, e.g., the number of specific actions
 *     performed by a specific user
 *   </li>
 * </ul>
 *
 * An aggregation is defined by a list of dimensions to aggregate by and a list of required dimensions
 * (the dataset.cube.aggregation.[agg_name].dimensions and dataset.cube.aggregation.[agg_name].requiredDimensions
 * properties, respectively). A {@link co.cask.cdap.api.dataset.lib.cube.CubeFact} measurement is aggregated within
 * an aggregation only if it contains all of that aggregation's required dimensions with non-null values.
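 * <p/>
 * For illustration, the same configuration could also be assembled programmatically when creating a dataset
 * instance. This is a minimal sketch, assuming the {@link DatasetProperties} builder API used elsewhere in this
 * class; the aggregation name and dimension values are illustrative only:
 * <pre>
 * DatasetProperties props = DatasetProperties.builder()
 *   .add("dataset.cube.resolutions", "1,60")
 *   .add("dataset.cube.aggregation.userPages.dimensions", "user,page")
 *   .add("dataset.cube.aggregation.userPages.requiredDimensions", "page")
 *   .build();
 * </pre>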
 */
public class CubeDatasetDefinition extends AbstractDatasetDefinition<CubeDataset, DatasetAdmin> {

  public static final String PROPERTY_AGGREGATION_PREFIX = "dataset.cube.aggregation.";
  public static final String PROPERTY_DIMENSIONS = "dimensions";
  public static final String PROPERTY_REQUIRED_DIMENSIONS = "requiredDimensions";

  // 1 second is the only default resolution
  public static final int[] DEFAULT_RESOLUTIONS = new int[] {1};

  private static final Gson GSON = new Gson();

  private final DatasetDefinition<? extends Table, ?> tableDef;
  private final DatasetDefinition<MetricsTable, ? extends DatasetAdmin> metricsTableDef;

  /**
   * Creates an instance of {@link CubeDatasetDefinition}.
   *
   * @param name this dataset type name
   * @param tableDef {@link Table} dataset definition, used to create the tables that store cube data
   * @param metricsTableDef {@link MetricsTable} dataset definition, used to create the table that stores encoding
   *                        mappings (see the "entity" table). It has to be non-transactional: the Cube dataset uses
   *                        an in-memory non-transactional cache in front of it, so all writes must be durable
   *                        independently of transaction completion.
   */
  public CubeDatasetDefinition(String name, DatasetDefinition<? extends Table, ?> tableDef,
                               DatasetDefinition<MetricsTable, ? extends DatasetAdmin> metricsTableDef) {
    super(name);
    this.tableDef = tableDef;
    this.metricsTableDef = metricsTableDef;
  }

  @Override
  public DatasetSpecification configure(String instanceName, DatasetProperties properties) {
    int[] resolutions = getResolutions(properties.getProperties());

    List<DatasetSpecification> datasetSpecs = Lists.newArrayList();

    // Configure the table that holds the encoding mappings for dimension names, values, and the like
    datasetSpecs.add(metricsTableDef.configure("entity", properties));

    // Configure the tables that hold the data of each specific resolution
    Map<String, Aggregation> aggregations = getAggregations(properties.getProperties());
    // Add pre-splitting for fact tables
    Map<String, String> preSplitProperties = configurePreSplits(aggregations);
    DatasetProperties factTableProperties = DatasetProperties.builder()
      .addAll(properties.getProperties())
      .addAll(preSplitProperties)
      .build();

    // NOTE: we create a table per resolution; we will later use that to, e.g., configure TTL separately for each
    for (int resolution : resolutions) {
      datasetSpecs.add(tableDef.configure(String.valueOf(resolution), factTableProperties));
    }

    return DatasetSpecification.builder(instanceName, getName())
      .properties(properties.getProperties())
      .datasets(datasetSpecs)
      .build();
  }

  @Override
  public DatasetAdmin getAdmin(DatasetContext datasetContext, DatasetSpecification spec,
                               ClassLoader classLoader) throws IOException {
    List<DatasetAdmin> admins = Lists.newArrayList();
    admins.add(metricsTableDef.getAdmin(datasetContext, spec.getSpecification("entity"), classLoader));

    int[] resolutions = getResolutions(spec.getProperties());
    for (int resolution : resolutions) {
      admins.add(tableDef.getAdmin(datasetContext, spec.getSpecification(String.valueOf(resolution)), classLoader));
    }

    return new CompositeDatasetAdmin(admins);
  }

  @Override
  public CubeDataset getDataset(DatasetContext datasetContext, DatasetSpecification spec,
                                Map<String, String> arguments, ClassLoader classLoader) throws IOException {

    MetricsTable entityTable =
      metricsTableDef.getDataset(datasetContext, spec.getSpecification("entity"), arguments, classLoader);

    int[] resolutions = getResolutions(spec.getProperties());
    Map<Integer, Table> resolutionTables = Maps.newHashMap();
    for (int resolution : resolutions) {
      resolutionTables.put(resolution,
                           tableDef.getDataset(datasetContext, spec.getSpecification(String.valueOf(resolution)),
                                               arguments, classLoader));
    }

    Map<String, Aggregation> aggregations = getAggregations(spec.getProperties());

    return new CubeDataset(spec.getName(), entityTable, resolutionTables, aggregations);
  }

  private Map<String, String> configurePreSplits(Map<String, Aggregation> aggregations) {
    byte[][] splits = FactTable.getSplits(aggregations.size());
    return ImmutableMap.of(HBaseTableAdmin.PROPERTY_SPLITS, GSON.toJson(splits));
  }

  private Map<String, Aggregation> getAggregations(Map<String, String> properties) {
    // Example of configuring one aggregation with two dimensions, "user" and "action", where "user" is required:
    //   dataset.cube.aggregation.1.dimensions=user,action
    //   dataset.cube.aggregation.1.requiredDimensions=user

    Map<String, List<String>> aggDimensions = Maps.newHashMap();
    Map<String, Set<String>> aggRequiredDimensions = Maps.newHashMap();
    for (Map.Entry<String, String> prop : properties.entrySet()) {
      if (prop.getKey().startsWith(PROPERTY_AGGREGATION_PREFIX)) {
        String aggregationProp = prop.getKey().substring(PROPERTY_AGGREGATION_PREFIX.length());
        String[] nameAndProp = aggregationProp.split("\\.", 2);
        if (nameAndProp.length != 2) {
          throw new IllegalArgumentException("Invalid property: " + prop.getKey());
        }
        String[] dimensions = prop.getValue().split(",");
        if (PROPERTY_DIMENSIONS.equals(nameAndProp[1])) {
          aggDimensions.put(nameAndProp[0], Arrays.asList(dimensions));
        } else if (PROPERTY_REQUIRED_DIMENSIONS.equals(nameAndProp[1])) {
          aggRequiredDimensions.put(nameAndProp[0], new HashSet<>(Arrays.asList(dimensions)));
        } else {
          throw new IllegalArgumentException("Invalid property: " + prop.getKey());
        }
      }
    }

    Map<String, Aggregation> aggregations = Maps.newHashMap();
    for (Map.Entry<String, List<String>> aggDimensionsEntry : aggDimensions.entrySet()) {
      Set<String> requiredDimensions = aggRequiredDimensions.get(aggDimensionsEntry.getKey());
      requiredDimensions = requiredDimensions == null ?
        Collections.<String>emptySet() : requiredDimensions;
      aggregations.put(aggDimensionsEntry.getKey(),
                       new DefaultAggregation(aggDimensionsEntry.getValue(), requiredDimensions));
    }

    return aggregations;
  }

  private int[] getResolutions(Map<String, String> propsMap) {
    // Example of configuring 1-second and 60-second resolutions:
    //   dataset.cube.resolutions=1,60

    String resProp = propsMap.get(Cube.PROPERTY_RESOLUTIONS);
    int[] resolutions;
    if (resProp == null) {
      resolutions = DEFAULT_RESOLUTIONS;
    } else {
      String[] seconds = resProp.split(",");
      if (seconds.length == 0) {
        throw new IllegalArgumentException(String.format("Invalid value %s for property %s.",
                                                         resProp, Cube.PROPERTY_RESOLUTIONS));
      }
      resolutions = new int[seconds.length];
      for (int i = 0; i < seconds.length; i++) {
        try {
          resolutions[i] = Integer.parseInt(seconds[i]);
        } catch (NumberFormatException e) {
          throw new IllegalArgumentException(String.format("Invalid resolution value %s in property %s.",
                                                           seconds[i], Cube.PROPERTY_RESOLUTIONS));
        }
      }
    }
    return resolutions;
  }
}
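
/*
 * For illustration only: a minimal sketch of how a definition like this might be wired into a dataset module.
 * It assumes CDAP's DatasetModule/DatasetDefinitionRegistry API; the module class name and the type names
 * "table", "metricsTable", and "cube" are illustrative assumptions, not taken from this file.
 *
 * public class CubeModule implements DatasetModule {
 *   @Override
 *   public void register(DatasetDefinitionRegistry registry) {
 *     // look up the delegate definitions this definition composes over
 *     DatasetDefinition<Table, DatasetAdmin> tableDef = registry.get("table");
 *     DatasetDefinition<MetricsTable, DatasetAdmin> metricsTableDef = registry.get("metricsTable");
 *     // register the cube type under an illustrative name
 *     registry.add(new CubeDatasetDefinition("cube", tableDef, metricsTableDef));
 *   }
 * }
 */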