/*
 * Copyright © 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.api.dataset.lib;

import co.cask.cdap.api.Predicate;
import co.cask.cdap.api.annotation.Beta;
import co.cask.cdap.api.data.batch.InputFormatProvider;
import co.cask.cdap.api.data.batch.OutputFormatProvider;
import co.cask.cdap.api.dataset.DataSetException;
import co.cask.cdap.api.dataset.Dataset;
import co.cask.cdap.api.dataset.PartitionNotFoundException;

import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;

/**
 * Represents a dataset that is split into partitions that can be uniquely addressed
 * by partitioning keys along multiple dimensions. Each partition is a path in a file set,
 * with the partitioning keys attached as metadata.
 *
 * This dataset can be made available for querying with SQL (explore). This is enabled through dataset
 * properties when the dataset is created. See {@link FileSetProperties}
 * for details. If it is enabled for explore, a Hive external table will be created when the dataset is
 * created. The Hive table is partitioned by the same keys as this dataset.
 */
@Beta
public interface PartitionedFileSet extends Dataset, InputFormatProvider, OutputFormatProvider {

  /**
   * Get the partitioning declared for the file set.
   */
  Partitioning getPartitioning();

  /**
   * Add a partition for a given partition key, stored at a given path (relative to the file set's base path).
   */
  void addPartition(PartitionKey key, String path);

  /**
   * Add a partition for a given partition key, stored at a given path (relative to the file set's base path),
   * with the given metadata.
   */
  void addPartition(PartitionKey key, String path, Map<String, String> metadata);

  /**
   * Adds a new metadata entry for a particular partition.
   * Note that existing entries cannot be updated.
   *
   * @throws DataSetException when an attempt is made to update an existing entry
   * @throws PartitionNotFoundException when a partition for the given key is not found
   */
  void addMetadata(PartitionKey key, String metadataKey, String metadataValue);

  /**
   * Adds a set of new metadata entries for a particular partition.
   * Note that existing entries cannot be updated.
   *
   * @throws DataSetException when an attempt is made to update existing entries
   * @throws PartitionNotFoundException when a partition for the given key is not found
   */
  void addMetadata(PartitionKey key, Map<String, String> metadata);

  /**
   * Remove a partition for a given partition key, silently ignoring if the key is not found.
   */
  void dropPartition(PartitionKey key);

  /**
   * Return the partition for a specific partition key, or null if key is not found.
   */
  @Nullable
  PartitionDetail getPartition(PartitionKey key);

  /**
   * Return all partitions matching the partition filter.
   * @param filter If non null, only partitions that match this filter are returned. If null,
   *               all partitions are returned.
   */
  Set<PartitionDetail> getPartitions(@Nullable PartitionFilter filter);

  /**
   * Incrementally consumes partitions. This method can be used to retrieve partitions that have been created since the
   * last call to this method. Note that it is the client's responsibility to maintain state of the partitions processed
   * in the iterator returned in the PartitionConsumerResult.
   *
   * @param partitionConsumerState the state from which to start consuming
   * @return {@link PartitionConsumerResult} which holds the state of consumption as well as an iterator to the consumed
   *         {@link Partition}s
   */
  PartitionConsumerResult consumePartitions(PartitionConsumerState partitionConsumerState);

  /**
   * Incrementally consumes partitions. This method can be used to retrieve partitions that have been created since the
   * last call to this method. Note that it is the client's responsibility to maintain state of the partitions processed
   * in the iterator returned in the PartitionConsumerResult.
   *
   * @param partitionConsumerState the state from which to start consuming
   * @param limit number of partitions which, once reached, will not add more partitions committed by other
   *              transactions; the limit is checked after consuming all partitions of a transaction, so
   *              the total number of consumed partitions may be greater than this limit
   * @param predicate a predicate which determines the partitions to be consumed
   * @return {@link PartitionConsumerResult} which holds the state of consumption as well as an iterator to the consumed
   *         {@link Partition}s
   */
  PartitionConsumerResult consumePartitions(PartitionConsumerState partitionConsumerState, int limit,
                                            Predicate<PartitionDetail> predicate);

  /**
   * Return a partition output for a specific partition key, in preparation for creating a new partition.
   * Obtain the location to write from the PartitionOutput, then call the {@link PartitionOutput#addPartition}
   * to add the partition to this dataset.
   */
  PartitionOutput getPartitionOutput(PartitionKey key);

  /**
   * @return the underlying (embedded) file set.
   */
  FileSet getEmbeddedFileSet();

  /**
   * Allow direct access to the runtime arguments of this partitioned file set.
   *
   * @return the runtime arguments specified for this dataset.
   */
  Map<String, String> getRuntimeArguments();
}