/*
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kitesdk.data;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import java.net.URI;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.spi.DatasetRepository;
import org.kitesdk.data.spi.AbstractDataset;
import org.kitesdk.data.spi.Constraints;
import org.kitesdk.data.spi.Pair;
import org.kitesdk.data.spi.Registration;

/**
 * Methods for working with {@link Dataset} instances.
 * <p>
 * <b>URIs</b>
 * <p>
 * All methods require a URI that identifies a dataset, view, or
 * repository. The URI must begin with the scheme {@code dataset:},
 * {@code view:}, or {@code repo:}. The remainder of the URI is
 * implementation specific, depending on the dataset scheme.
 * <p>
 * For example, the URI {@code dataset:hive:movies/ratings}
 * references a dataset named <i>ratings</i> in the
 * <i>movies</i> namespace, stored in Hive.
 * <p>
 * The URI {@code view:hive:movies/ratings?year=2015&month=3}
 * references a view of the same <i>ratings</i> dataset. The view
 * is filtered to include only records from March 2015.
 * <p>
 * See <a href="http://kitesdk.org/docs/current/URIs.html">Dataset and View
 * URIs</a> for the available URI patterns.
 * <p>
 * <b>Dataset Descriptors</b>
 * <p>
 * Some methods require a {@link DatasetDescriptor} that encapsulates metadata
 * about a dataset. Descriptors are built using a
 * {@link DatasetDescriptor.Builder descriptor builder}.
 * <p>
 * <b>Entities</b>
 * <p>
 * <i>Entities</i> are analogous to <i>records</i> in database terminology.
 * The term is used in the API to emphasize that an entity can include not
 * only primitive objects, but also complex objects such as hash maps.
 * <p>
 * Some methods accept an entity class that Kite uses when returning
 * entities from a dataset or view.
 *
 * @since 0.8.0
 */
public class Datasets {

  /**
   * Load a {@link Dataset} or {@link View} for the given {@link URI}.
   * <p>
   * URIs must begin with {@code dataset:} or {@code view:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   * <p>
   * If you use a dataset URI, {@code load} returns the unfiltered dataset.
   * If you use a view URI, {@code load} returns a {@code View} configured to
   * read a subset of the dataset.
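   * <p>
   * For example, a Hive-backed dataset might be loaded and read as sketched
   * below. The {@code movies/ratings} name and the {@code Rating} entity
   * class are illustrative assumptions, not part of this API:
   * <pre>{@code
   * // a dataset: URI returns the unfiltered dataset
   * Dataset<Rating> ratings = Datasets.load(
   *     URI.create("dataset:hive:movies/ratings"), Rating.class);
   *
   * // read entities and close the reader when done
   * DatasetReader<Rating> reader = ratings.newReader();
   * try {
   *   for (Rating rating : reader) {
   *     // process each entity
   *   }
   * } finally {
   *   reader.close();
   * }
   * }</pre>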
   *
   * @param uri a {@code Dataset} or {@code View} URI
   * @param type a Java class that represents an entity in the dataset
   * @param <E> the type used for readers and writers created by this
   *          {@code Dataset}
   * @param <V> the type of {@code View} expected
   * @return a {@code View} for the given URI
   * @throws DatasetNotFoundException if there is no dataset for the given URI
   * @throws NullPointerException if any arguments are {@code null}
   * @throws IllegalArgumentException
   *          if {@code uri} is not a dataset or view URI
   */
  @SuppressWarnings("unchecked")
  public static <E, V extends View<E>> V load(URI uri, Class<E> type) {
    boolean isView = URIBuilder.VIEW_SCHEME.equals(uri.getScheme());
    Preconditions.checkArgument(isView ||
        URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
        "Not a dataset or view URI: " + uri);
    Preconditions.checkNotNull(type,
        "The entity type can't be null, use Object.class to have the type"
        + " determined by the schema.");

    Pair<DatasetRepository, Map<String, String>> pair =
        Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
    DatasetRepository repo = pair.first();
    Map<String, String> uriOptions = pair.second();

    Dataset<E> dataset = repo.load(
        uriOptions.get(URIBuilder.NAMESPACE_OPTION),
        uriOptions.get(URIBuilder.DATASET_NAME_OPTION),
        type);

    if (isView) {
      return Datasets.<E, V> view(dataset, uriOptions);
    } else {
      // if the URI isn't a view URI, only load the dataset
      return (V) dataset;
    }
  }

  /**
   * Load a {@link Dataset} or {@link View} for the given {@link URI}.
   * <p>
   * URIs must begin with {@code dataset:} or {@code view:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   * <p>
   * If you use a dataset URI, {@code load} returns the unfiltered dataset.
   * If you use a view URI, {@code load} returns a {@code View} configured to
   * read a subset of the dataset.
   *
   * @param uri a {@code Dataset} or {@code View} URI
   * @param <V> the type of {@code View} expected
   * @return a {@code View} for the given URI
   * @throws DatasetNotFoundException if there is no dataset for the given URI
   * @throws NullPointerException if any arguments are {@code null}
   * @throws IllegalArgumentException
   *          if {@code uri} is not a dataset or view URI
   */
  @SuppressWarnings("unchecked")
  public static <V extends View<GenericRecord>> V load(URI uri) {
    return Datasets.<GenericRecord, V>load(uri, GenericRecord.class);
  }

  /**
   * Load a {@link Dataset} or {@link View} for the given {@link URI}.
   * <p>
   * URIs must begin with {@code dataset:} or {@code view:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   * <p>
   * If you use a dataset URI, {@code load} returns the unfiltered dataset.
   * If you use a view URI, {@code load} returns a {@code View} configured to
   * read a subset of the dataset.
   *
   * @param uriString a {@code Dataset} or {@code View} URI
   * @param type a Java class that represents an entity in the dataset
   * @param <E> the type used for readers and writers created by this
   *          {@code Dataset}
   * @param <V> the type of {@code View} expected
   * @return a {@code View} for the given URI
   * @throws DatasetNotFoundException if there is no dataset for the given URI
   * @throws NullPointerException if any arguments are {@code null}
   * @throws IllegalArgumentException
   *          if {@code uri} is not a dataset or view URI
   */
  public static <E, V extends View<E>> V load(String uriString, Class<E> type) {
    return Datasets.<E, V> load(URI.create(uriString), type);
  }

  /**
   * Load a {@link Dataset} or {@link View} for the given {@link URI}.
   * <p>
   * URIs must begin with {@code dataset:} or {@code view:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   * <p>
   * If you use a dataset URI, {@code load} returns the unfiltered dataset.
   * If you use a view URI, {@code load} returns a {@code View} configured to
   * read a subset of the dataset.
   *
   * @param uriString a {@code Dataset} or {@code View} URI
   * @param <V> the type of {@code View} expected
   * @return a {@code View} for the given URI
   * @throws DatasetNotFoundException if there is no dataset for the given URI
   * @throws NullPointerException if any arguments are {@code null}
   * @throws IllegalArgumentException
   *          if {@code uri} is not a dataset or view URI
   */
  public static <V extends View<GenericRecord>> V load(String uriString) {
    return Datasets.<GenericRecord, V>load(uriString, GenericRecord.class);
  }

  /**
   * Create a {@link Dataset} for the given dataset or view URI.
   * {@code create} returns an empty dataset. You can use {@code DatasetWriter}
   * to populate your dataset.
   * <p>
   * URIs must begin with {@code dataset:} or {@code view:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme. If the
   * URI is a view URI, this method creates the underlying dataset and returns a
   * view of it.
   *
   * @param uri a {@code Dataset} or {@code View} URI
   * @param type a Java class that represents an entity in the dataset
   * @param <E> the type used for readers and writers created by this
   *          {@code Dataset}
   * @param <V> the type of {@code Dataset} or {@code View} expected
   * @return a newly created {@code Dataset} responsible for the given URI
   * @throws NullPointerException
   *          if {@code uri}, {@code descriptor}, or {@code type} is
   *          {@code null}
   * @throws IllegalArgumentException
   *          if {@code uri} is not a dataset or view URI
   * @throws DatasetExistsException
   *          if a {@code Dataset} for the given URI already exists
   * @throws IncompatibleSchemaException
   *          if the schema is not compatible with existing datasets with
   *          shared storage (for example, in the same HBase table)
   */
  @SuppressWarnings("unchecked")
  public static <E, V extends View<E>> V create(URI uri,
      DatasetDescriptor descriptor, Class<E> type) {
    boolean isView = URIBuilder.VIEW_SCHEME.equals(uri.getScheme());
    Preconditions.checkArgument(isView ||
        URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
        "Not a dataset or view URI: " + uri);
    Preconditions.checkNotNull(type,
        "The entity type can't be null, use Object.class to have the type"
        + " determined by the schema.");

    Pair<DatasetRepository, Map<String, String>> pair =
        Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
    DatasetRepository repo = pair.first();
    Map<String, String> uriOptions = pair.second();

    if (descriptor.getLocation() == null && uriOptions.containsKey("location")) {
      descriptor = new DatasetDescriptor.Builder(descriptor)
          .location(uriOptions.get("location"))
          .build();
    }

    Dataset<E> dataset = repo.create(
        uriOptions.get(URIBuilder.NAMESPACE_OPTION),
        uriOptions.get(URIBuilder.DATASET_NAME_OPTION),
        descriptor,
        type);

    if (isView) {
      return Datasets.<E, V> view(dataset, uriOptions);
    } else {
      return (V) dataset;
    }
  }

  /**
   * Create a {@link Dataset} for the given dataset or view URI.
   * {@code create} returns an empty dataset. You can use {@code DatasetWriter}
   * to populate your dataset.
   * <p>
   * URIs must begin with {@code dataset:} or {@code view:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   * If the URI is a view URI, this method creates the underlying dataset and
   * returns a view of it.
   *
   * @param uri a {@code Dataset} or {@code View} URI
   * @param <V> the type of {@code Dataset} or {@code View} expected
   * @return a newly created {@code Dataset} responsible for the given URI
   * @throws NullPointerException
   *          if {@code uri} or {@code descriptor} is {@code null}
   * @throws IllegalArgumentException
   *          if {@code uri} is not a dataset or view URI
   * @throws DatasetExistsException
   *          if a {@code Dataset} for the given URI already exists
   * @throws IncompatibleSchemaException
   *          if the schema is not compatible with existing datasets with
   *          shared storage (for example, in the same HBase table)
   */
  @SuppressWarnings("unchecked")
  public static <V extends View<GenericRecord>> V create(URI uri,
      DatasetDescriptor descriptor) {
    return Datasets.<GenericRecord, V>create(
        uri, descriptor, GenericRecord.class);
  }

  /**
   * Create a {@link Dataset} for the given dataset or view URI string.
   * {@code create} returns an empty dataset. You can use {@code DatasetWriter}
   * to populate your dataset.
   * <p>
   * URIs must begin with {@code dataset:} or {@code view:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme. If the
   * URI is a view URI, this method creates the underlying dataset and returns a
   * view of it.
   *
   * @param uri a {@code Dataset} or {@code View} URI string
   * @param type a Java class that represents an entity in the dataset
   * @param <E> the type used for readers and writers created by this
   *          {@code Dataset}
   * @param <V> the type of {@code Dataset} or {@code View} expected
   * @return a newly created {@code Dataset} responsible for the given URI
   * @throws NullPointerException
   *          if {@code uri}, {@code descriptor}, or {@code type} is
   *          {@code null}
   * @throws IllegalArgumentException
   *          if {@code uri} is not a dataset or view URI
   * @throws DatasetExistsException
   *          if a {@code Dataset} for the given URI already exists
   * @throws IncompatibleSchemaException
   *          if the schema is not compatible with existing datasets with
   *          shared storage (for example, in the same HBase table)
   */
  public static <E, V extends View<E>> V create(String uri,
      DatasetDescriptor descriptor, Class<E> type) {
    return Datasets.<E, V> create(URI.create(uri), descriptor, type);
  }

  /**
   * Create a {@link Dataset} for the given dataset or view URI string.
   * {@code create} returns an empty dataset. You can use {@code DatasetWriter}
   * to populate your dataset.
   * <p>
   * URIs must begin with {@code dataset:} or {@code view:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme. If the
   * URI is a view URI, this method creates the underlying dataset and returns a
   * view of it.
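   * <p>
   * For example, an empty dataset might be created from an Avro schema as
   * sketched below. The {@code movies/ratings} name and the {@code Rating}
   * record schema are illustrative assumptions, not part of this API:
   * <pre>{@code
   * // parse an Avro schema for the new dataset's entities
   * Schema schema = new Schema.Parser().parse(
   *     "{\"type\": \"record\", \"name\": \"Rating\", \"fields\": ["
   *     + "{\"name\": \"movieId\", \"type\": \"long\"},"
   *     + "{\"name\": \"rating\", \"type\": \"int\"}]}");
   * DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
   *     .schema(schema)
   *     .build();
   *
   * // the new dataset is empty; use a DatasetWriter to populate it
   * Dataset<GenericRecord> ratings = Datasets.create(
   *     "dataset:hive:movies/ratings", descriptor);
   * }</pre>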
   *
   * @param uri a {@code Dataset} or {@code View} URI string
   * @param <V> the type of {@code Dataset} or {@code View} expected
   * @return a newly created {@code Dataset} responsible for the given URI
   * @throws NullPointerException
   *          if {@code uri} or {@code descriptor} is {@code null}
   * @throws IllegalArgumentException
   *          if {@code uri} is not a dataset or view URI
   * @throws DatasetExistsException
   *          if a {@code Dataset} for the given URI already exists
   * @throws IncompatibleSchemaException
   *          if the schema is not compatible with existing datasets with
   *          shared storage (for example, in the same HBase table)
   */
  @SuppressWarnings("unchecked")
  public static <V extends View<GenericRecord>> V create(String uri,
      DatasetDescriptor descriptor) {
    return Datasets.<GenericRecord, V>create(
        uri, descriptor, GenericRecord.class);
  }

  /**
   * Update a {@link Dataset} for the given dataset URI.
   * <p>
   * You can add columns, remove columns, or change the data type of columns
   * in your dataset, provided you don't attempt a change that is incompatible
   * with written data. Avro defines rules for compatible schema evolution. See
   * <a href="http://kitesdk.org/docs/current/Schema-Evolution.html">Schema
   * Evolution</a>.
   * <p>
   * This method updates the dataset descriptor, so you can also add
   * or change properties.
   * <p>
   * The recommended way to update a dataset descriptor is to build it
   * based on an existing descriptor. Use
   * {@link DatasetDescriptor.Builder#Builder(DatasetDescriptor)} to
   * build a {@code DatasetDescriptor} based on an existing instance.
   * <p>
   * You cannot change a dataset format or partition strategy.
   * <p>
   * URIs must begin with {@code dataset:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI
   * @param type a Java class that represents an entity in the dataset
   * @param <E> the type used for readers and writers created by this
   *          {@code Dataset}
   * @param <D> the type of {@code Dataset} expected
   * @return a {@code Dataset} for the given URI
   * @throws NullPointerException
   *          if {@code uri}, {@code descriptor}, or {@code type} is
   *          {@code null}
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   * @throws DatasetNotFoundException
   *          if there is no dataset for the given URI
   * @throws UnsupportedOperationException
   *          if descriptor updates are not supported by the implementation
   * @throws ConcurrentSchemaModificationException
   *          if the {@code Dataset} schema is updated concurrently
   * @throws IncompatibleSchemaException
   *          if the schema is not compatible with previous schemas, or with
   *          existing datasets with shared storage (for example, in the same
   *          HBase table)
   */
  @SuppressWarnings("unchecked")
  public static <E, D extends Dataset<E>> D update(
      URI uri, DatasetDescriptor descriptor, Class<E> type) {
    Preconditions.checkArgument(
        URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
        "Not a dataset URI: " + uri);
    Preconditions.checkNotNull(type,
        "The entity type can't be null, use Object.class to have the type"
        + " determined by the schema.");

    Pair<DatasetRepository, Map<String, String>> pair =
        Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
    DatasetRepository repo = pair.first();
    Map<String, String> uriOptions = pair.second();

    return (D) repo.update(
        uriOptions.get(URIBuilder.NAMESPACE_OPTION),
        uriOptions.get(URIBuilder.DATASET_NAME_OPTION),
        descriptor,
        type);
  }

  /**
   * Update a {@link Dataset} for the given dataset URI.
   * <p>
   * You can add columns, remove columns, or change the data type of columns
   * in your dataset, provided you don't attempt a change that is incompatible
   * with written data. Avro defines rules for compatible schema evolution. See
   * <a href="http://kitesdk.org/docs/current/Schema-Evolution.html">Schema
   * Evolution</a>.
   * <p>
   * This method updates the dataset descriptor, so you can also add
   * or change properties.
   * <p>
   * The recommended way to update a dataset descriptor is to build it
   * based on an existing descriptor. Use
   * {@link DatasetDescriptor.Builder#Builder(DatasetDescriptor)} to
   * build a {@code DatasetDescriptor} based on an existing instance.
   * <p>
   * You cannot change a dataset format or partition strategy.
   * <p>
   * URIs must begin with {@code dataset:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI
   * @param <D> the type of {@code Dataset} expected
   * @return a {@code Dataset} for the given URI
   * @throws NullPointerException
   *          if {@code uri} or {@code descriptor} is {@code null}
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   * @throws DatasetNotFoundException
   *          if there is no dataset for the given URI
   * @throws UnsupportedOperationException
   *          if descriptor updates are not supported by the implementation
   * @throws ConcurrentSchemaModificationException
   *          if the {@code Dataset} schema is updated concurrently
   * @throws IncompatibleSchemaException
   *          if the schema is not compatible with previous schemas, or with
   *          existing datasets with shared storage (for example, in the same
   *          HBase table)
   */
  @SuppressWarnings("unchecked")
  public static <D extends Dataset<GenericRecord>> D update(
      URI uri, DatasetDescriptor descriptor) {
    return Datasets.<GenericRecord, D>update(
        uri, descriptor, GenericRecord.class);
  }

  /**
   * Update a {@link Dataset} for the given dataset URI string.
   * <p>
   * You can add columns, remove columns, or change the data type of columns
   * in your dataset, provided you don't attempt a change that is incompatible
   * with written data. Avro defines rules for compatible schema evolution. See
   * <a href="http://kitesdk.org/docs/current/Schema-Evolution.html">Schema
   * Evolution</a>.
   * <p>
   * This method updates the dataset descriptor, so you can also add
   * or change properties.
   * <p>
   * The recommended way to update a dataset descriptor is to build it
   * based on an existing descriptor. Use
   * {@link DatasetDescriptor.Builder#Builder(DatasetDescriptor)} to
   * build a {@code DatasetDescriptor} based on an existing instance.
   * <p>
   * You cannot change a dataset format or partition strategy.
   * <p>
   * URIs must begin with {@code dataset:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
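   * <p>
   * For example, a descriptor property might be added to an existing dataset
   * as sketched below. The {@code movies/ratings} name and the
   * {@code kite.writer.cache-size} property are illustrative assumptions,
   * not part of this API:
   * <pre>{@code
   * Dataset<GenericRecord> ratings = Datasets.load(
   *     "dataset:hive:movies/ratings");
   *
   * // build the new descriptor from the existing one, then apply the update
   * DatasetDescriptor updated = new DatasetDescriptor.Builder(
   *         ratings.getDescriptor())
   *     .property("kite.writer.cache-size", "20")
   *     .build();
   * Datasets.update("dataset:hive:movies/ratings", updated,
   *     GenericRecord.class);
   * }</pre>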
   *
   * @param uri a {@code Dataset} URI string
   * @param type a Java class that represents an entity in the dataset
   * @param <E> the type used for readers and writers created by this
   *          {@code Dataset}
   * @param <D> the type of {@code Dataset} expected
   * @return a {@code Dataset} for the given URI
   * @throws NullPointerException
   *          if {@code uri}, {@code descriptor}, or {@code type} is
   *          {@code null}
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   * @throws DatasetNotFoundException
   *          if there is no dataset for the given URI
   * @throws UnsupportedOperationException
   *          if descriptor updates are not supported by the implementation
   * @throws ConcurrentSchemaModificationException
   *          if the {@code Dataset} schema is updated concurrently
   * @throws IncompatibleSchemaException
   *          if the schema is not compatible with previous schemas, or with
   *          existing datasets with shared storage (for example, in the same
   *          HBase table)
   */
  public static <E, D extends Dataset<E>> D update(String uri,
      DatasetDescriptor descriptor, Class<E> type) {
    return Datasets.<E, D> update(URI.create(uri), descriptor, type);
  }

  /**
   * Update a {@link Dataset} for the given dataset URI string.
   * <p>
   * You can add columns, remove columns, or change the data type of columns
   * in your dataset, provided you don't attempt a change that is incompatible
   * with written data. Avro defines rules for compatible schema evolution. See
   * <a href="http://kitesdk.org/docs/current/Schema-Evolution.html">Schema
   * Evolution</a>.
   * <p>
   * This method updates the dataset descriptor, so you can also add
   * or change properties.
   * <p>
   * The recommended way to update a dataset descriptor is to build it
   * based on an existing descriptor. Use
   * {@link DatasetDescriptor.Builder#Builder(DatasetDescriptor)} to
   * build a {@code DatasetDescriptor} based on an existing instance.
   * <p>
   * You cannot change a dataset format or partition strategy.
   * <p>
   * URIs must begin with {@code dataset:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI string
   * @param <D> the type of {@code Dataset} expected
   * @return a {@code Dataset} for the given URI
   * @throws NullPointerException
   *          if {@code uri} or {@code descriptor} is {@code null}
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   * @throws DatasetNotFoundException
   *          if there is no dataset for the given URI
   * @throws UnsupportedOperationException
   *          if descriptor updates are not supported by the implementation
   * @throws ConcurrentSchemaModificationException
   *          if the {@code Dataset} schema is updated concurrently
   * @throws IncompatibleSchemaException
   *          if the schema is not compatible with previous schemas, or with
   *          existing datasets with shared storage (for example, in the same
   *          HBase table)
   */
  public static <D extends Dataset<GenericRecord>> D update(String uri,
      DatasetDescriptor descriptor) {
    return Datasets.<GenericRecord, D>update(
        uri, descriptor, GenericRecord.class);
  }

  /**
   * Delete a {@link Dataset} identified by the given dataset URI.
   * <p>
   * When you call this method using a dataset URI, both data and metadata are
   * deleted. After you call this method, the dataset no longer exists, unless
   * an exception is thrown.
   * <p>
   * When you call this method using a view URI, data in that view is deleted.
   * The dataset's metadata is not changed. This can throw an
   * {@code UnsupportedOperationException} if the delete requires additional
   * work.
   * For example, if some, but not all, of the data in an underlying data
   * file must be removed, then the implementation is allowed to reject the
   * deletion rather than copy the remaining records to a new file.
   * An implementation must document under what conditions it accepts deletes,
   * and under what conditions it rejects them.
   * <p>
   * URIs must begin with {@code dataset:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI
   * @return {@code true} if any data or metadata is removed, {@code false}
   *          otherwise
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   */
  public static boolean delete(URI uri) {
    return deleteWithTrash(uri, false);
  }

  /**
   * Delete a {@link Dataset} identified by the given dataset URI string.
   * <p>
   * When you call this method using a dataset URI, both data and metadata are
   * deleted. After you call this method, the dataset no longer exists, unless
   * an exception is thrown.
   * <p>
   * When you call this method using a view URI, data in that view is deleted.
   * The dataset's metadata is not changed. This can throw an
   * {@code UnsupportedOperationException} if the delete requires additional
   * work. For example, if some, but not all, of the data in an underlying data
   * file must be removed, then the implementation is allowed to reject the
   * deletion rather than copy the remaining records to a new file.
   * An implementation must document under what conditions it accepts deletes,
   * and under what conditions it rejects them.
   * <p>
   * URIs must begin with {@code dataset:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI string
   * @return {@code true} if any data or metadata is removed, {@code false}
   *          otherwise
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   */
  public static boolean delete(String uri) {
    return delete(URI.create(uri));
  }

  /**
   * Move a {@link Dataset} identified by the given dataset URI to trash.
   * <p>
   * When you call this method using a dataset URI, both data and metadata are
   * moved to trash. After you call this method, the dataset no longer exists,
   * unless an exception is thrown.
   * <p>
   * When you call this method using a view URI, data in that view is moved to
   * trash. The dataset's metadata is not changed. This can throw an
   * {@code UnsupportedOperationException} if the move to trash requires
   * additional work. For example, if some, but not all, of the data in an
   * underlying data file must be removed, then the implementation is allowed
   * to reject the move to trash rather than copy the remaining records to a
   * new file. An implementation must document under what conditions it
   * accepts moves to trash, and under what conditions it rejects them.
   * <p>
   * URIs must begin with {@code dataset:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
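   * <p>
   * For example, assuming the illustrative {@code movies/ratings} dataset
   * from the examples above (not part of this API), moving it to trash might
   * look like:
   * <pre>{@code
   * // returns true if any data or metadata was removed
   * boolean removed = Datasets.moveToTrash(
   *     URI.create("dataset:hive:movies/ratings"));
   * }</pre>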
   *
   * @param uri a {@code Dataset} URI
   * @return {@code true} if any data or metadata is removed, {@code false}
   *          otherwise
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   */
  public static boolean moveToTrash(URI uri) {
    return deleteWithTrash(uri, true);
  }

  private static boolean deleteWithTrash(URI uri, boolean useTrash) {
    Preconditions.checkArgument(
        URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
        "Not a dataset URI: " + uri);

    Pair<DatasetRepository, Map<String, String>> pair =
        Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
    DatasetRepository repo = pair.first();
    Map<String, String> uriOptions = pair.second();

    return useTrash ?
        repo.moveToTrash(uriOptions.get(URIBuilder.NAMESPACE_OPTION),
            uriOptions.get(URIBuilder.DATASET_NAME_OPTION)) :
        repo.delete(uriOptions.get(URIBuilder.NAMESPACE_OPTION),
            uriOptions.get(URIBuilder.DATASET_NAME_OPTION));
  }

  /**
   * Move a {@link Dataset} identified by the given dataset URI string to trash.
   * <p>
   * When you call this method using a dataset URI, both data and metadata are
   * moved to trash. After you call this method, the dataset no longer exists,
   * unless an exception is thrown.
   * <p>
   * When you call this method using a view URI, data in that view is moved to
   * trash. The dataset's metadata is not changed. This can throw an
   * {@code UnsupportedOperationException} if the move to trash requires
   * additional work. For example, if some, but not all, of the data in an
   * underlying data file must be removed, then the implementation is allowed
   * to reject the move to trash rather than copy the remaining records to a
   * new file. An implementation must document under what conditions it
   * accepts moves to trash, and under what conditions it rejects them.
   * <p>
   * URIs must begin with {@code dataset:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI string
   * @return {@code true} if any data or metadata is removed, {@code false}
   *          otherwise
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   */
  public static boolean moveToTrash(String uri) {
    return moveToTrash(URI.create(uri));
  }

  /**
   * Check whether a {@link Dataset} identified by the given URI exists.
   * <p>
   * URIs must begin with {@code dataset:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI
   * @return {@code true} if the dataset exists, {@code false} otherwise
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   */
  public static boolean exists(URI uri) {
    Preconditions.checkArgument(
        URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
        "Not a dataset URI: " + uri);

    Pair<DatasetRepository, Map<String, String>> pair =
        Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
    DatasetRepository repo = pair.first();
    Map<String, String> uriOptions = pair.second();

    return repo.exists(
        uriOptions.get(URIBuilder.NAMESPACE_OPTION),
        uriOptions.get(URIBuilder.DATASET_NAME_OPTION));
  }

  /**
   * Check whether a {@link Dataset} identified by the given URI string exists.
   * <p>
   * URIs must begin with {@code dataset:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
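   * <p>
   * For example, existence can be checked before attempting to load a
   * dataset. The {@code movies/ratings} name is an illustrative assumption,
   * not part of this API:
   * <pre>{@code
   * String uri = "dataset:hive:movies/ratings";
   * if (Datasets.exists(uri)) {
   *   Dataset<GenericRecord> ratings = Datasets.load(uri);
   * }
   * }</pre>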
   *
   * @param uri a {@code Dataset} URI string
   * @return {@code true} if the dataset exists, {@code false} otherwise
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   */
  public static boolean exists(String uri) {
    return exists(URI.create(uri));
  }

  /**
   * List the {@link Dataset} URIs in the repository identified by the URI.
   * <p>
   * URI formats are defined by {@code Dataset} implementations. The repository
   * URIs you pass to this method must begin with {@code repo:}. For example, to
   * list the {@code Dataset} URIs for the Hive repository, provide the URI
   * {@code repo:hive}.
   *
   * @param uri a {@code DatasetRepository} URI
   * @return the URIs present in the {@code DatasetRepository}
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException if {@code uri} is not a repository URI
   */
  public static Collection<URI> list(URI uri) {
    boolean isRepo = URIBuilder.REPO_SCHEME.equals(uri.getScheme());
    Preconditions.checkArgument(isRepo, "Not a repository URI: " + uri);
    DatasetRepository repo = Registration
        .open(URI.create(uri.getRawSchemeSpecificPart()));

    // build a URI for each dataset name
    URI repoUri = repo.getUri();
    List<URI> datasets = Lists.newArrayList();
    for (String namespace : repo.namespaces()) {
      for (String dataset : repo.datasets(namespace)) {
        datasets.add(new URIBuilder(repoUri, namespace, dataset).build());
      }
    }

    return datasets;
  }

  /**
   * List the {@link Dataset} URIs in the repository identified by the URI
   * string.
   * <p>
   * URI formats are defined by {@code Dataset} implementations. The repository
   * URIs you pass to this method must begin with {@code repo:}. For example, to
   * list the {@code Dataset} URIs for the Hive repository, provide the URI
   * {@code repo:hive}.
   *
   * @param uri a {@code DatasetRepository} URI string
   * @return the URIs present in the {@code DatasetRepository}
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException if {@code uri} is not a repository URI
   */
  public static Collection<URI> list(String uri) {
    return list(URI.create(uri));
  }

  @SuppressWarnings("unchecked")
  private static <E, V extends View<E>> V view(Dataset<E> dataset,
      Map<String, String> uriOptions) {
    if (dataset instanceof AbstractDataset) {
      DatasetDescriptor descriptor = dataset.getDescriptor();
      Schema schema = descriptor.getSchema();
      PartitionStrategy strategy = null;
      if (descriptor.isPartitioned()) {
        strategy = descriptor.getPartitionStrategy();
      }
      Constraints constraints = Constraints.fromQueryMap(
          schema, strategy, uriOptions);
      return (V) ((AbstractDataset) dataset).filter(constraints);
    } else {
      return (V) dataset;
    }
  }

}