/*
* Copyright © 2014-2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.data2.dataset2.lib.partitioned;
import co.cask.cdap.api.dataset.DataSetException;
import co.cask.cdap.api.dataset.DatasetContext;
import co.cask.cdap.api.dataset.DatasetSpecification;
import co.cask.cdap.api.dataset.lib.FileSet;
import co.cask.cdap.api.dataset.lib.IndexedTable;
import co.cask.cdap.api.dataset.lib.PartitionDetail;
import co.cask.cdap.api.dataset.lib.PartitionFilter;
import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.PartitionMetadata;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetProperties;
import co.cask.cdap.api.dataset.lib.Partitioning;
import co.cask.cdap.api.dataset.lib.TimePartitionDetail;
import co.cask.cdap.api.dataset.lib.TimePartitionOutput;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSet;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSetArguments;
import co.cask.cdap.explore.client.ExploreFacade;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Supplier;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.inject.Provider;
import java.util.Calendar;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
/**
* Implementation of partitioned datasets using a Table to store the meta data.
*/
public class TimePartitionedFileSetDataset extends PartitionedFileSetDataset implements TimePartitionedFileSet {
// the fixed partitioning that time maps to
private static final String FIELD_YEAR = "year";
private static final String FIELD_MONTH = "month";
private static final String FIELD_DAY = "day";
private static final String FIELD_HOUR = "hour";
private static final String FIELD_MINUTE = "minute";
public static final Partitioning PARTITIONING = Partitioning.builder()
.addIntField(FIELD_YEAR)
.addIntField(FIELD_MONTH)
.addIntField(FIELD_DAY)
.addIntField(FIELD_HOUR)
.addIntField(FIELD_MINUTE)
.build();
public TimePartitionedFileSetDataset(DatasetContext datasetContext, String name,
FileSet fileSet, IndexedTable partitionTable,
DatasetSpecification spec, Map<String, String> arguments,
Provider<ExploreFacade> exploreFacadeProvider) {
super(datasetContext, name, PARTITIONING, fileSet, partitionTable, spec, arguments,
exploreFacadeProvider);
// the first version of TPFS in CDAP 2.7 did not have the partitioning in the properties. It is not supported.
if (PartitionedFileSetProperties.getPartitioning(spec.getProperties()) == null) {
throw new DataSetException("Unsupported version of TimePartitionedFileSet. Dataset '" + name + "' is missing " +
"the partitioning property. This probably means that it was created in CDAP 2.7, " +
"which is not supported any longer.");
}
}
@Override
public void addPartition(long time, String path) {
addPartition(time, path, Collections.<String, String>emptyMap());
}
@Override
public void addPartition(long time, String path, Map<String, String> metadata) {
addPartition(partitionKeyForTime(time), path, metadata);
}
@Override
public void addMetadata(long time, String metadataKey, String metadataValue) {
addMetadata(partitionKeyForTime(time), metadataKey, metadataValue);
}
@Override
public void addMetadata(long time, Map<String, String> metadata) {
addMetadata(partitionKeyForTime(time), metadata);
}
@Override
public void dropPartition(long time) {
dropPartition(partitionKeyForTime(time));
}
@Nullable
@Override
public TimePartitionDetail getPartitionByTime(long time) {
PartitionDetail partitionDetail = getPartition(partitionKeyForTime(time));
return partitionDetail == null ? null
: new BasicTimePartitionDetail(this, partitionDetail.getRelativePath(), partitionDetail.getPartitionKey(),
partitionDetail.getMetadata());
}
@Override
public Set<TimePartitionDetail> getPartitionsByTime(long startTime, long endTime) {
final Set<TimePartitionDetail> partitions = Sets.newHashSet();
for (PartitionFilter filter : partitionFiltersForTimeRange(startTime, endTime)) {
super.getPartitions(filter, new PartitionedFileSetDataset.PartitionConsumer() {
@Override
public void consume(PartitionKey key, String path, @Nullable PartitionMetadata metadata) {
partitions.add(new BasicTimePartitionDetail(TimePartitionedFileSetDataset.this, path, key, metadata));
}
});
}
return partitions;
}
private Collection<String> getPartitionPathsByTime(long startTime, long endTime) {
final Set<String> paths = Sets.newHashSet();
for (PartitionFilter filter : partitionFiltersForTimeRange(startTime, endTime)) {
super.getPartitions(filter, new PartitionedFileSetDataset.PartitionConsumer() {
@Override
public void consume(PartitionKey key, String path, @Nullable PartitionMetadata metadata) {
paths.add(path);
}
});
}
return paths;
}
@Override
public TimePartitionOutput getPartitionOutput(long time) {
if (isExternal) {
throw new UnsupportedOperationException(
"Output is not supported for external time-partitioned file set '" + spec.getName() + "'");
}
PartitionKey key = partitionKeyForTime(time);
return new BasicTimePartitionOutput(this, getOutputPath(key), key);
}
@Override
@Nullable
protected Collection<String> computeFilterInputPaths() {
Long startTime = TimePartitionedFileSetArguments.getInputStartTime(getRuntimeArguments());
Long endTime = TimePartitionedFileSetArguments.getInputEndTime(getRuntimeArguments());
if (startTime == null && endTime == null) {
// no times specified; perhaps a partition filter was specified. super will deal with that
return super.computeFilterInputPaths();
}
if (startTime == null) {
throw new DataSetException("Start time for input time range must be given as argument.");
}
if (endTime == null) {
throw new DataSetException("End time for input time range must be given as argument.");
}
return getPartitionPathsByTime(startTime, endTime);
}
@VisibleForTesting
static PartitionKey partitionKeyForTime(long time) {
Calendar calendar = Calendar.getInstance();
calendar.setTimeInMillis(time);
int year = calendar.get(Calendar.YEAR);
int month = calendar.get(Calendar.MONTH) + 1; // otherwise January would be 0
int day = calendar.get(Calendar.DAY_OF_MONTH);
int hour = calendar.get(Calendar.HOUR_OF_DAY);
int minute = calendar.get(Calendar.MINUTE);
return PartitionKey.builder()
.addIntField(FIELD_YEAR, year)
.addIntField(FIELD_MONTH, month)
.addIntField(FIELD_DAY, day)
.addIntField(FIELD_HOUR, hour)
.addIntField(FIELD_MINUTE, minute)
.build();
}
@VisibleForTesting
static long timeForPartitionKey(PartitionKey key) {
int year = (Integer) key.getField(FIELD_YEAR);
int month = (Integer) key.getField(FIELD_MONTH) - 1;
int day = (Integer) key.getField(FIELD_DAY);
int hour = (Integer) key.getField(FIELD_HOUR);
int minute = (Integer) key.getField(FIELD_MINUTE);
Calendar calendar = Calendar.getInstance();
calendar.clear();
//noinspection MagicConstant
calendar.set(year, month, day, hour, minute);
return calendar.getTimeInMillis();
}
// returns a list of partition filters that cover that specified time range.
// this may return a list with a single null filter (in case the range is unbounded in both directions)
@VisibleForTesting
static List<PartitionFilter> partitionFiltersForTimeRange(long startTime, long endTime) {
// unsatisfiable range
if (startTime >= endTime) {
return Collections.emptyList();
}
PartitionKey keyLower = startTime <= 0 ? null : partitionKeyForTime(startTime);
PartitionKey keyUpper = endTime == Long.MAX_VALUE ? null : partitionKeyForTime(endTime);
// no bounds -> no filter
if (keyLower == null && keyUpper == null) {
return Collections.singletonList(null); // no filter needed to select all time
}
List<PartitionFilter> filters = Lists.newArrayList();
String[] allFields = PARTITIONING.getFields().keySet().toArray(new String[PARTITIONING.getFields().size()]);
// if there is no lower bound, we only need the filters for the upper bound
if (keyLower == null) {
addUpperFilters(allFields, 0, keyUpper, filters, initialSupplier());
return filters;
}
// if there is no upper bound, we only need the filters for the lower bound
if (keyUpper == null) {
addLowerFilters(allFields, 0, keyLower, filters, initialSupplier());
return filters;
}
return filtersFor(allFields, 0, keyLower, keyUpper, filters, initialSupplier());
}
// this generates the filters for a suffix of the fields in the partition key. All filters will be
// prefixed with the conditions that are generated by the provided partition builder supplier.
// for example, if fields only contains day, hour and minute, then the supplier will return builders
// that already have conditions for the year and the month.
private static List<PartitionFilter> filtersFor(String[] fields, int position,
PartitionKey keyLower, PartitionKey keyUpper,
List<PartitionFilter> filters,
final Supplier<PartitionFilter.Builder> supplier) {
// examined all fields? -> done, build a filter and return.
if (position >= fields.length) {
filters.add(supplier.get().build());
return filters;
}
String fieldName = fields[position];
int lower = (Integer) keyLower.getField(fieldName);
int upper = (Integer) keyUpper.getField(fieldName);
// both upper and lower bound specify the same value for this field.
// Add an equality constraint for this field and value and continue with the next field
if (lower == upper) {
return filtersFor(fields, position + 1, keyLower, keyUpper, filters, nextSupplier(supplier, fieldName, lower));
}
// we have two different value. For example, if year and month are already provided by the supplier, we are
// looking at field "day":
// - lower bound is year/month/15/h1/m1
// - upper bound is year/month/20/h2/m2
// The conditions we need are either one of (with fixed year and month):
// - day is 15, and hour/minute are greater or equal than h1/m1 (addLowerFilters)
// - day is in 16...19, (add a condition to the supplier and descend to next field, see few special cases)
// - day is 20, and hour/minute are less than h2/m2 (addUpperFilters)
// generate the filters for the lower bound on the next level
addLowerFilters(fields, position + 1, keyLower, filters, nextSupplier(supplier, fieldName, lower));
// if this field is at the finest granularity (minutes), we must include its value in the range and finish.
// for example, lower = y/m/d/h/10, upper = y/m/d/h/15, then we need to add a range condition: minute in [10...15]
if (fields.length - 1 == position) {
if (lower + 1 == upper) { // special case: range of size one is a single value
filters.add(supplier.get().addValueCondition(fieldName, lower).build());
} else {
filters.add(supplier.get().addRangeCondition(fieldName, lower, upper).build());
}
} else {
// it is not the minute field: we add a condition for this field and descend. Other than in the
// minute case, the lower key's value is not included in the range (it was processed by addLowerFilters())
// if upper == lower + 1, then there are no values between them and no filter is added here.
if (lower + 2 == upper) {
filters.add(supplier.get().addValueCondition(fieldName, lower + 1).build());
} else if (lower + 2 < upper) {
filters.add(supplier.get().addRangeCondition(fieldName, lower + 1, upper).build());
}
}
// generate the filters for the upper bound on the next level
return addUpperFilters(fields, position + 1, keyUpper, filters, nextSupplier(supplier, fieldName, upper));
}
// adds filters for the lower bound, starting at a given field, with conditions on the higher levels supplied
private static List<PartitionFilter> addLowerFilters(String[] fields, int position, PartitionKey keyLower,
List<PartitionFilter> filters,
Supplier<PartitionFilter.Builder> supplier) {
if (position >= fields.length) {
return filters;
}
String fieldName = fields[position];
int lower = (Integer) keyLower.getField(fieldName);
// if this field is at the finest granularity (minutes), we must include its value in the range
// otherwise we exclude it from the range and descend into the next finer granularity with a value
// constraints on the current field name. For example:
// - for hour:15/minute:10, we add a filter for hour>=16 (excluding 15) and descent for hour=15
// - now the remaining field is minute:10, we add a filter for minute>=10 (including 10)
int lowerBound = position == fields.length - 1 ? lower : lower + 1;
// only add the filter if this condition is satisfiable. For example, not for hour>=24 or month>=13
if (isSatisfiableLowerBound(fieldName, lowerBound)) {
filters.add(supplier.get().addRangeCondition(fieldName, lowerBound, null).build());
}
return addLowerFilters(fields, position + 1, keyLower, filters, nextSupplier(supplier, fieldName, lower));
}
// adds filters for the upper bound, starting at a given field, with conditions on the higher levels supplied
private static List<PartitionFilter> addUpperFilters(String[] fields, int position, PartitionKey keyUpper,
List<PartitionFilter> filters,
Supplier<PartitionFilter.Builder> supplier) {
if (position >= fields.length) {
return filters;
}
String fieldName = fields[position];
int upper = (Integer) keyUpper.getField(fieldName);
// only add the filter if this condition is satisfiable. For example, not for hour<0 or month<1
if (isSatisfiableUpperBound(fieldName, upper)) {
filters.add(supplier.get().addRangeCondition(fieldName, null, upper).build());
}
return addUpperFilters(fields, position + 1, keyUpper, filters, nextSupplier(supplier, fieldName, upper));
}
private static Supplier<PartitionFilter.Builder> initialSupplier() {
return new Supplier<PartitionFilter.Builder>() {
@Override
public PartitionFilter.Builder get() {
return PartitionFilter.builder();
}
};
}
private static Supplier<PartitionFilter.Builder> nextSupplier(final Supplier<PartitionFilter.Builder> supplier,
final String field, final int value) {
return new Supplier<PartitionFilter.Builder>() {
@Override
public PartitionFilter.Builder get() {
return supplier.get().addValueCondition(field, value);
}
};
}
// this is not the smartest... for example, some months have less than 31 days. So we will sometimes generate
// a filter that is no satisfiable. It has no effect because that filter will not match any partitions, but
// may add a small performance overhead in these cases. It does not seem worth the effort of dealing with the
// different months - and leap years - to make this accurate.
private static boolean isSatisfiableLowerBound(String fieldName, int lowerBound) {
if (fieldName.equals(FIELD_MONTH)) {
return lowerBound <= 12;
}
if (fieldName.equals(FIELD_DAY)) {
return lowerBound <= 31;
}
if (fieldName.equals(FIELD_HOUR)) {
return lowerBound <= 23;
}
if (fieldName.equals(FIELD_MINUTE)) {
return lowerBound <= 60;
}
return true;
}
private static boolean isSatisfiableUpperBound(String fieldName, int upperBound) {
if (fieldName.equals(FIELD_YEAR)) {
return upperBound > 1; // this could be 1968 because no time is before the epoch. But just to be sure...
}
if (fieldName.equals(FIELD_MONTH)) {
return upperBound > 1;
}
if (fieldName.equals(FIELD_DAY)) {
return upperBound > 1;
}
return upperBound > 0;
}
private static class BasicTimePartitionDetail extends BasicPartitionDetail implements TimePartitionDetail {
private final Long time;
private BasicTimePartitionDetail(TimePartitionedFileSetDataset timePartitionedFileSetDataset, String relativePath,
PartitionKey key, PartitionMetadata metadata) {
super(timePartitionedFileSetDataset, relativePath, key, metadata);
this.time = timeForPartitionKey(key);
}
@Override
public long getTime() {
return time;
}
}
private static class BasicTimePartitionOutput extends BasicPartitionOutput implements TimePartitionOutput {
private final Long time;
private BasicTimePartitionOutput(TimePartitionedFileSetDataset timePartitionedFileSetDataset, String relativePath,
PartitionKey key) {
super(timePartitionedFileSetDataset, relativePath, key);
this.time = timeForPartitionKey(key);
}
@Override
public long getTime() {
return time;
}
}
}